Whoosh-2.5.7/
Whoosh-2.5.7/benchmark/
Whoosh-2.5.7/benchmark/dcvgr10.txt.gz
[binary gzip data omitted]
Whoosh-2.5.7/benchmark/dictionary.py

import os.path, gzip

from whoosh import analysis, fields
from whoosh.support.bench import Bench, Spec


class VulgarTongue(Spec):
    name = "dictionary"
    filename = "dcvgr10.txt.gz"
    headline_field = "head"

    def documents(self):
        path = os.path.join(self.options.dir, self.filename)
        f = gzip.GzipFile(path)

        head = body = None
        for line in f:
            line = line.decode("latin1")
            if line[0].isalpha():
                if head:
                    yield {"head": head, "body": head + body}
                head, body = line.split(".", 1)
            else:
                body += line

        if head:
            yield {"head": head, "body": head + body}

    def whoosh_schema(self):
        ana = analysis.StemmingAnalyzer()
        #ana = analysis.StandardAnalyzer()
        schema = fields.Schema(head=fields.ID(stored=True),
                               body=fields.TEXT(analyzer=ana, stored=True))
        return schema

    def zcatalog_setup(self, cat):
        from zcatalog import indexes  #@UnresolvedImport
        cat["head"] = indexes.FieldIndex(field_name="head")
        cat["body"] = indexes.TextIndex(field_name="body")


if __name__ == "__main__":
    Bench().run(VulgarTongue)
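For orientation, a minimal self-contained sketch (not part of the distribution) of the indexing pattern dictionary.py encodes: build the same head/body schema and feed documents()-shaped dicts to a Whoosh writer. It assumes only that whoosh is importable; the sample_docs generator, its placeholder entries, and the temporary index directory stand in for VulgarTongue.documents() and the benchmark's data/index directories.

# sketch_dictionary.py -- illustrative only, not part of the benchmark suite
import tempfile

from whoosh import analysis, fields, index


def sample_docs():
    # Stands in for VulgarTongue.documents(); yields dicts of the same shape.
    # The entries below are invented placeholders.
    yield {"head": u"Abbess", "body": u"Abbess. Placeholder definition text."}
    yield {"head": u"Academy", "body": u"Academy. Another placeholder entry."}


ana = analysis.StemmingAnalyzer()
schema = fields.Schema(head=fields.ID(stored=True),
                       body=fields.TEXT(analyzer=ana, stored=True))

ixdir = tempfile.mkdtemp()          # throwaway index directory (assumption)
ix = index.create_in(ixdir, schema)
with ix.writer() as w:              # writer commits on exiting the block
    for d in sample_docs():
        w.add_document(**d)

with ix.searcher() as s:
    print(s.doc_count(), "entries indexed")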
Whoosh-2.5.7/benchmark/enron.py

from __future__ import division
import os.path, tarfile
from email import message_from_string
from marshal import dump, load
from zlib import compress, decompress

try:
    import xappy
except ImportError:
    pass

from whoosh import analysis, fields
from whoosh.compat import urlretrieve, next
from whoosh.support.bench import Bench, Spec
from whoosh.util import now


# Benchmark class

class Enron(Spec):
    name = "enron"

    enron_archive_url = "http://www.cs.cmu.edu/~enron/enron_mail_082109.tar.gz"
    enron_archive_filename = "enron_mail_082109.tar.gz"
    cache_filename = "enron_cache.pickle"

    header_to_field = {"Date": "date", "From": "frm", "To": "to",
                       "Subject": "subject", "Cc": "cc", "Bcc": "bcc"}

    main_field = "body"
    headline_field = "subject"

    field_order = ("subject", "date", "from", "to", "cc", "bcc", "body")

    cachefile = None

    # Functions for downloading and then reading the email archive and caching
    # the messages in an easier-to-digest format

    def download_archive(self, archive):
        print("Downloading Enron email archive to %r..." % archive)
        t = now()
        urlretrieve(self.enron_archive_url, archive)
        print("Downloaded in ", now() - t, "seconds")

    @staticmethod
    def get_texts(archive):
        archive = tarfile.open(archive, "r:gz")
        while True:
            entry = next(archive)
            archive.members = []
            if entry is None:
                break
            f = archive.extractfile(entry)
            if f is not None:
                text = f.read()
                yield text

    @staticmethod
    def get_messages(archive, headers=True):
        header_to_field = Enron.header_to_field
        for text in Enron.get_texts(archive):
            message = message_from_string(text)
            body = message.as_string().decode("latin_1")
            blank = body.find("\n\n")
            if blank > -1:
                body = body[blank+2:]
            d = {"body": body}
            if headers:
                for k in message.keys():
                    fn = header_to_field.get(k)
                    if not fn:
                        continue
                    v = message.get(k).strip()
                    if v:
                        d[fn] = v.decode("latin_1")
            yield d

    def cache_messages(self, archive, cache):
        print("Caching messages in %s..." % cache)

        if not os.path.exists(archive):
            raise Exception("Archive file %r does not exist" % archive)

        t = now()
        f = open(cache, "wb")
        c = 0
        for d in self.get_messages(archive):
            c += 1
            dump(d, f)
            if not c % 1000:
                print(c)
        f.close()
        print("Cached messages in ", now() - t, "seconds")

    def setup(self):
        archive = os.path.abspath(os.path.join(self.options.dir,
                                               self.enron_archive_filename))
        cache = os.path.abspath(os.path.join(self.options.dir,
                                             self.cache_filename))

        if not os.path.exists(archive):
            self.download_archive(archive)
        else:
            print("Archive is OK")

        if not os.path.exists(cache):
            self.cache_messages(archive, cache)
        else:
            print("Cache is OK")

    def documents(self):
        if not os.path.exists(self.cache_filename):
            raise Exception("Message cache does not exist, use --setup")

        f = open(self.cache_filename, "rb")
        try:
            while True:
                self.filepos = f.tell()
                d = load(f)
                yield d
        except EOFError:
            pass
        f.close()

    def whoosh_schema(self):
        ana = analysis.StemmingAnalyzer(maxsize=40, cachesize=None)
        storebody = self.options.storebody
        schema = fields.Schema(body=fields.TEXT(analyzer=ana, stored=storebody),
                               filepos=fields.STORED,
                               date=fields.ID(stored=True),
                               frm=fields.ID(stored=True),
                               to=fields.IDLIST(stored=True),
                               subject=fields.TEXT(stored=True),
                               cc=fields.IDLIST,
                               bcc=fields.IDLIST)
        return schema

    def xappy_indexer_connection(self, path):
        conn = xappy.IndexerConnection(path)
        conn.add_field_action('body', xappy.FieldActions.INDEX_FREETEXT,
                              language='en')
        if self.options.storebody:
            conn.add_field_action('body', xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action('date', xappy.FieldActions.INDEX_EXACT)
        conn.add_field_action('date', xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action('frm', xappy.FieldActions.INDEX_EXACT)
        conn.add_field_action('frm', xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action('to', xappy.FieldActions.INDEX_EXACT)
        conn.add_field_action('to', xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action('subject', xappy.FieldActions.INDEX_FREETEXT,
                              language='en')
        conn.add_field_action('subject', xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action('cc', xappy.FieldActions.INDEX_EXACT)
        conn.add_field_action('bcc', xappy.FieldActions.INDEX_EXACT)
        return conn

    def zcatalog_setup(self, cat):
        from zcatalog import indexes
        for name in ("date", "frm"):
            cat[name] = indexes.FieldIndex(field_name=name)
        for name in ("to", "subject", "cc", "bcc", "body"):
            cat[name] = indexes.TextIndex(field_name=name)

    def process_document_whoosh(self, d):
        d["filepos"] = self.filepos
        if self.options.storebody:
            mf = self.main_field
            d["_stored_%s" % mf] = compress(d[mf], 9)

    def process_result_whoosh(self, d):
        mf = self.main_field
        if mf in d:
            d.fields()[mf] = decompress(d[mf])
        else:
            if not self.cachefile:
                self.cachefile = open(self.cache_filename, "rb")
            filepos = d["filepos"]
            self.cachefile.seek(filepos)
            dd = load(self.cachefile)
            d.fields()[mf] = dd[mf]
        return d

    def process_document_xapian(self, d):
        d[self.main_field] = " ".join([d.get(name, "")
                                       for name in self.field_order])


if __name__ == "__main__":
    Bench().run(Enron)


Whoosh-2.5.7/benchmark/marc21.py

from __future__ import with_statement, print_function
import fnmatch, logging, os.path, re

from whoosh import analysis, fields, index, qparser, query, scoring
from whoosh.util import now


log = logging.getLogger(__name__)


# Functions for reading MARC format

LEADER = (' ' * 10) + '22' + (' ' * 8) + '4500'
LEADER_LEN = len(LEADER)
DIRECTORY_ENTRY_LEN = 12
SUBFIELD_INDICATOR = "\x1F"
END_OF_FIELD = "\x1E"
END_OF_RECORD = "\x1D"
isbn_regex = re.compile(r'[-0-9xX]+')


def read_file(dbfile, tags=None):
    while True:
        pos = dbfile.tell()
        first5 = dbfile.read(5)
        if not first5:
            return
        if len(first5) < 5:
            raise Exception
        length = int(first5)
        chunk = dbfile.read(length - 5)
        yield parse_record(first5 + chunk, tags), pos


def read_record(filename, pos, tags=None):
    f = open(filename, "rb")
    f.seek(pos)
    first5 = f.read(5)
    length = int(first5)
    chunk = f.read(length - 5)
    return parse_record(first5 + chunk, tags)


def parse_record(data, tags=None):
    leader = data[:LEADER_LEN]
    assert len(leader) == LEADER_LEN

    dataoffset = int(data[12:17])
    assert dataoffset > 0
    assert dataoffset < len(data)

    # dataoffset - 1 to avoid END-OF-FIELD byte
    dirstart = LEADER_LEN
    dirend = dataoffset - 1

    # Number of fields in record
    assert (dirend - dirstart) % DIRECTORY_ENTRY_LEN == 0
    field_count = (dirend - dirstart) // DIRECTORY_ENTRY_LEN

    result = {}
    for i in xrange(field_count):
        start = dirstart + i * DIRECTORY_ENTRY_LEN
        end = start + DIRECTORY_ENTRY_LEN
        tag = data[start:start + 3]
        if tags and not tag in tags:
            continue

        entry = data[start:end]
        elen = int(entry[3:7])
        offset = dataoffset + int(entry[7:12])
        edata = data[offset:offset + elen - 1]

        if not (tag < "010" and tag.isdigit()):
            edata = edata.split(SUBFIELD_INDICATOR)[1:]
            if tag in result:
                result[tag].extend(edata)
            else:
                result[tag] = edata
        else:
            result[tag] = edata
    return result


def subfield(vs, code):
    for v in vs:
        if v.startswith(code):
            return v[1:]
    return None


def joinsubfields(vs):
    return " ".join(v[1:] for v in vs if v and v[0] != "6")


def getfields(d, *tags):
    return (d[tag] for tag in tags if tag in d)


def title(d):
    title = None
    if "245" in d:
        svs = d["245"]
        title = subfield(svs, "a")
        if title:
            t2 = subfield(svs, "b")
            if t2:
                title += t2
    return title


def isbn(d):
    if "020" in d:
        num = subfield(d["020"], "a")
        if num:
            match = isbn_regex.search(num)
            if match:
                return match.group(0).replace('-', '')


def author(d):
    if "100" in d:
        return joinsubfields(d["100"])
    elif "110" in d:
        return joinsubfields(d["110"])
    elif "111" in d:
        return joinsubfields(d["111"])


def uniform_title(d):
    if "130" in d:
        return joinsubfields(d["130"])
    elif "240" in d:
        return joinsubfields(d["240"])


subjectfields = ("600 610 611 630 648 650 651 653 654 655 656 657 658 662 "
                 "690 691 696 697 698 699").split()


def subjects(d):
    return " ".join(joinsubfields(vs) for vs in getfields(d, *subjectfields))


def physical(d):
    return joinsubfields(d["300"])


def location(d):
    return joinsubfields(d["852"])


def publisher(d):
    if "260" in d:
        return subfield(d["260"], "b")


def pubyear(d):
    if "260" in d:
        return subfield(d["260"], "c")


def uni(v):
    return u"" if v is None else v.decode("utf-8", "replace")


# Indexing and searching

def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True,
               glob="*.mrc"):
    if not os.path.exists(ixdir):
        os.mkdir(ixdir)

    # Multi-lingual stop words
    stoplist = (analysis.STOP_WORDS
                | set("de la der und le die et en al no von di du da "
                      "del zur ein".split()))

    # Schema
    ana = analysis.StemmingAnalyzer(stoplist=stoplist)
    schema = fields.Schema(title=fields.TEXT(analyzer=ana),
                           author=fields.TEXT(phrase=False),
                           subject=fields.TEXT(analyzer=ana, phrase=False),
                           file=fields.STORED, pos=fields.STORED,
                           )

    # MARC fields to extract
    mfields = set(subjectfields)  # Subjects
    mfields.update("100 110 111".split())  # Author
    mfields.add("245")  # Title

    print("Indexing with %d processor(s) and %d MB per processor"
          % (procs, limitmb))
    c = 0
    t = now()
    ix = index.create_in(ixdir, schema)
    with ix.writer(procs=procs, limitmb=limitmb,
                   multisegment=multisegment) as w:
        filenames = [filename for filename in os.listdir(basedir)
                     if fnmatch.fnmatch(filename, glob)]
        for filename in filenames:
            path = os.path.join(basedir, filename)
            print("Indexing", path)
            f = open(path, 'rb')
            for x, pos in read_file(f, mfields):
                w.add_document(title=uni(title(x)), author=uni(author(x)),
                               subject=uni(subjects(x)),
                               file=filename, pos=pos)
                c += 1
            f.close()
        print("Committing...")
    print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0))


def print_record(no, basedir, filename, pos):
    path = os.path.join(basedir, filename)
    record = read_record(path, pos)
    print("% 5d. %s" % (no + 1, title(record)))
    print(" ", author(record))
    print(" ", subjects(record))
    isbn_num = isbn(record)
    if isbn_num:
        print(" ISBN:", isbn_num)
    print()


def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True):
    ix = index.open_dir(ixdir)
    qp = qparser.QueryParser("title", ix.schema)
    q = qp.parse(qstring)

    with ix.searcher(weighting=scoring.PL2()) as s:
        if scores:
            r = s.search(q, limit=limit, optimize=optimize)
            for hit in r:
                print_record(hit.rank, basedir, hit["file"], hit["pos"])
            print("Found %d records in %0.06f seconds" % (len(r), r.runtime))
        else:
            t = now()
            for i, docnum in enumerate(s.docs_for_query(q)):
                if not limit or i < limit:
                    fields = s.stored_fields(docnum)
                    print_record(i, basedir, fields["file"], fields["pos"])
            print("Found %d records in %0.06f seconds" % (i, now() - t))


if __name__ == "__main__":
    from optparse import OptionParser

    p = OptionParser(usage="usage: %prog [options] query")
    # Common options
    p.add_option("-f", "--filedir", metavar="DIR", dest="basedir",
                 help="Directory containing the .mrc files to index",
                 default="data/HLOM")
    p.add_option("-d", "--dir", metavar="DIR", dest="ixdir",
                 help="Directory containing the index", default="marc_index")

    # Indexing options
    p.add_option("-i", "--index", dest="index",
                 help="Index the records", action="store_true", default=False)
    p.add_option("-p", "--procs", metavar="NPROCS", dest="procs",
                 help="Number of processors to use", default="1")
    p.add_option("-m", "--mb", metavar="MB", dest="limitmb",
                 help="Limit the indexer to this many MB of memory per writer",
                 default="128")
    p.add_option("-M", "--merge-segments", dest="multisegment",
                 help="If indexing with multiproc, merge the segments after"
                      " indexing", action="store_false", default=True)
    p.add_option("-g", "--match", metavar="GLOB", dest="glob",
                 help="Only index file names matching the given pattern",
                 default="*.mrc")

    # Search options
    p.add_option("-l", "--limit", metavar="NHITS", dest="limit",
                 help="Maximum number of search results to print (0=no limit)",
                 default="10")
    p.add_option("-O", "--no-optimize", dest="optimize",
                 help="Turn off searcher optimization (for debugging)",
                 action="store_false", default=True)
    p.add_option("-s", "--scoring", dest="scores",
                 help="Score the results", action="store_true", default=False)

    options, args = p.parse_args()

    if options.index:
        make_index(options.basedir, options.ixdir,
                   procs=int(options.procs),
                   limitmb=int(options.limitmb),
                   multisegment=options.multisegment,
                   glob=options.glob)

    if args:
        qstring = " ".join(args).decode("utf-8")
        limit = int(options.limit)
        if limit < 1:
            limit = None
        search(qstring, options.ixdir, options.basedir, limit=limit,
               optimize=options.optimize, scores=options.scores)
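Each entry in the MARC record directory that parse_record() walks above is a fixed 12-byte run: a 3-byte tag, a 4-byte zero-padded field length, and a 5-byte zero-padded offset into the data area that begins at dataoffset (taken from leader bytes 12:17). A tiny worked example with an invented entry, decoded with the same slicing:

# Hypothetical 12-byte directory entry; the values are made up for illustration.
entry = "245008900097"

tag = entry[:3]            # "245" -> title field
elen = int(entry[3:7])     # 89    -> field length, including the terminator byte
offset = int(entry[7:12])  # 97    -> field start, relative to dataoffset

# parse_record() then takes data[dataoffset + offset:
#                                dataoffset + offset + elen - 1]
# (the - 1 drops the END_OF_FIELD byte) and splits on SUBFIELD_INDICATOR.
print(tag, elen, offset)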
Whoosh-2.5.7/benchmark/reuters.py

import gzip, os.path

from whoosh import analysis, fields, index, qparser, query
from whoosh.support.bench import Bench, Spec
from whoosh.util import now


class Reuters(Spec):
    name = "reuters"
    filename = "reuters21578.txt.gz"
    main_field = "text"
    headline_text = "headline"

    def whoosh_schema(self):
        #ana = analysis.StemmingAnalyzer()
        ana = analysis.StandardAnalyzer()
        schema = fields.Schema(id=fields.ID(stored=True),
                               headline=fields.STORED,
                               text=fields.TEXT(analyzer=ana, stored=True))
        return schema

    def zcatalog_setup(self, cat):
        from zcatalog import indexes  #@UnresolvedImport
        cat["id"] = indexes.FieldIndex(field_name="id")
        cat["headline"] = indexes.TextIndex(field_name="headline")
        cat["body"] = indexes.TextIndex(field_name="text")

    def documents(self):
        path = os.path.join(self.options.dir, self.filename)
        f = gzip.GzipFile(path)

        for line in f:
            id, text = line.decode("latin1").split("\t")
            yield {"id": id, "text": text, "headline": text[:70]}


if __name__ == "__main__":
    Bench().run(Reuters)


Whoosh-2.5.7/benchmark/reuters21578.txt.gz
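Reuters.documents() above expects reuters21578.txt.gz to be gzipped lines of the form id<TAB>text. A minimal round-trip sketch of that format, using the same decode-and-split step; the ids, the article text, and the temporary path are invented placeholders, not samples from the real data file.

# sketch_reuters_format.py -- illustrative only, not part of the benchmark suite
import gzip, os, tempfile

path = os.path.join(tempfile.mkdtemp(), "reuters_sample.txt.gz")

# Write two placeholder records in the expected "id<TAB>text" layout.
with gzip.open(path, "wb") as f:
    f.write(b"10001\tCocoa exporters expect higher prices this season.\n")
    f.write(b"10002\tOil futures fell in quiet trading on Friday.\n")

# Parse them back the same way documents() does.
with gzip.open(path, "rb") as f:
    for line in f:
        id, text = line.decode("latin1").split("\t")
        print({"id": id, "text": text, "headline": text[:70]})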
!,5quOJJɐ0 ?2֬5Ajadg\{[2>Ӑ;DHYqD,-:+B SvL҂اAh.9jM_C6oD3% 89@,itg2C XQ֪Ee9cF A/~@(Λ?񶟘2QVXZ1x;'ʥyS$'$Z[X%:Nc|"0$U@"@NHBT BrS=xED>3 c0cDl%) NH]*;Ś֐x#DȧI4aw2Ԗxk>lʭ?UoxP$7G|08Lw^|tD@`\"$@O[@G2U~sHsQ `K 眩:+7sr'H!:LA6VlG+]\HaX^`ZGfށh0\[5#8z'Я 6BZ"l|n0rƒ',˓]9?**U/ Kܠjnl=nj6-Azn4+ o]9KuCƈ߭+~0[ږDGg{Vo59gnw9G3D=}j7|6J]o&JpBC|0քHzmĄb"ؒ{'[{1˷,!; FJGx١(.=QKX*PLȢuB&$<@R*~ow0ΰ )#R& hϳ5se&cqj_6z7{vbЯO靗3vm3i7Y#kp6- 2SF6uBvZVr0hH 'Ė vMF⋭(f$5JY$ A#' ^]W+jM-DNl)iz>j^91"p&_0 ]iDRp68&kVpa@}Kr@jsO2||Liƫ} Usf.ubD 1s{X'Ov^tLb;lďS54ԓ"J7LjiQw>Poʪ =,IDqd"PmtP,= e>;MI7 kl ҼʁƂ |d߉ynOD;1߾>Ay7^ߥi`-A=ɞI:M7h^Zջ7;NbR 5NuLCBǓ@PQW ׅعݭ @Y0Mv+îܯi~O40/P#,ǤdfB^9LǘF}'og[5'>hڒ&٦.b#=2vmF*fxKڔv&<ث19HaOR\E?!s!޼ڱ 3MhQF2On%8FÖ,˜i"!x$$Rb$ubsdI}B) bmԣ\2 bNH*,gol#|K-\vm (W83i`b.)Y4h>QOߔq{Y\ 3Dވd S kgʟ%Va\GvS2Mj\^FuP"Bsuv6PP1?l[nCTg<d#N[ IA-!=$0)9i!@5yYwZVjrSu׊*wj!@Hn8K6XaHy쒄n_ 3 W[\aA)?1͟CAe,'DmUq5CΥwhrr.\5J6ct֤H 6 L/&L=I<<_\=M׈SVe-36\ q]p.Bi0EEG8 4ªl2|o=r'cy@P^"bnзJtn:ŧ/[)44=po\ dm7F4z큍/s;?5ν?jLo~Ҁ:8O>=Tz/K!z \}8^ b޽TI.IXX 9S°bZ%ʗ6DA=?aӲ2dS\xU=jMS$~-baqK?c.nj(ET\Jrpy8&Spj%Wu"=!pĺ~26n.R #}88QPTZ8έƜK~ňP)"RpE) ;$/t\*HD,(M&9αMuM,gfk5MlpuٹmlL-1NJevRI͆K󧫕_/'_5®yjRj굊u~.,nA5٭IPk|a&&[;I'OS.=p.h)㢧7_zc} 0 TfpgD3ԙBpVp6;J ս~_A-c?"wYA3D(8:@|&vW/&m)洛F4{27A%j< rchz@II@?6M ɐd]tьg9\dǾ,$EA>i]Wb#Vt)#8.71TVw'VFU!-qDmH軀L96]Uev$+IH\-_;fW~Y"i'MiDQL˱5dE2'X5HBaIayGt AMQ9mSI{)4s_^m fYYuQ{k $tט}zSR F#aDT<5\; Be;fIP'/cɸloif=f6*Vn $ɥ1- R9~, ˰lW3Ԕ~FƺPĴiH|Mҽc鞌t;2&\Jo&&5E"LnjQN TpO+Z8u\*A۲@r!#k;hZ@y`Zaa:19d^WД;Vқ6}M!- -{I^M?:&Rkӭ+V<#)Mȶ8DiU{ ohBzwE81&ENͮ.wgіn sFMSf7= j8i&< "qB yY #ބ]۲H7 UWlR5&=:YXL\6ܑ霰 T⢢=_IX$lINR/g\mxŹ] '35LcrrK5$ %D!EBo-cSl!aIC\l#<{FE0Ei,aBIH)KpZn<\)>j{N>VvK$n\ovkL10_;; =j<l;.|Lc/mȨt,6OFjXb DVI+r1 Ve0>G\v=ѡhp'㻭~NeǟBamI+ߌ6W;t$coK=9VٕP6k򠏉\xaG+E&0e;1UqVCXe%!5ݵ&yuLaP򔹆iz ΋W-{m$!1#Wŧ ?Z5/>5/.#n|'vW?Bx!{I$?'=WcKaiU1߉*<4%YO"OB0%Wmk^n=7BrʷgFkV~M0;BVĂ9iI~b9}S( AQ-rӅ"qA "ߠ6p,LP"6zFqƺd3:D9H2Gב>Jxn::QGUU&XoyjN*I -?xBҲIXVZ">hRB5i&0h欄hVp%KIrtɕK&DUlyцИx%Zdl\Y5|rz!X3!n 9Ti8LG>t<D?PSkHHA)5xQEAd&eɥmȫ.\8Q׮v*!\'hed&/:=ț ;f6Cu춏3*&3T.t?ěGȄaY=&jD6}mL/r0dm eJy5,o!;GEA|CP<xFY@5<$,gI7ÕG4.KNʘ/6a5q7D-2sE u6C!]ewR$f!$[qQ*CDFE^eeS|q,R;va{!@ pд S ց;-s8縃_Y!*e@)E!o { #_E«$~^bwQ! ʊ!#lDgG,hFaGE2e8Z=}m%+A`"lt}Lba"^LW7AD\Yj}Yw𸧏DTJ0՛T,4 o Tޛ9Q^MU?2);y/XyCgx;V?35Rv_+PL`(&7 5|SkM^/#Xj=HؕEf״e#.-ieW_SWu&ǜk["{݉-&A$% jT~+\32Wɚ*'|"S5;Pm_M6N 9pAv$ޞjLaonbUݷ'eAx\9gw.Ou:kwND uB&2Cck bhY"6 o'Zf3sNԡn0WOC?\^ ֿVRڥ ^Jٳ\qK6͂/ ;gg,I gU7ґaH#9g.MD;HswSnje'@jU[n3t\^k[9ެH3ʦwrl1Tcύ;fVelr'ֆq:Tvr.@DJ wz iuqeC!OIW/.>6ͧ++V\^ V+9K65KI5ג7չ4'>d/>Fktf$9"J0,U7EK!{ ׬roq"wU د&[2z4%? 7F_!T^:r_xNwg,ڽ~\ioc l" ׯ|]Y~\Z)%-2ڗ. LKBV/g$QjQkf"ق5Qoȶ'7R})]e`q b2R7hbGO !_u.R>=Ozπra;[ϲ4з'hR M?>jAίML[. _v`U#l'}v % 3wZ45$Y!'z@&1Cn*ex`]Q: ]kM\䫹or$e^{ +U|Ii2b8})+E+g O3.% DW"vV"{#ˠ]߸^uf?n"NspQIFNqM9FaW6#s|fUpFsǔDZ\'W^CqHh}(cpnV" arwTC8=㐋'naE:-DuC]Y'cБb.02/ٙ 4M,hb}Dm2%tBzyKF}9kC\*z4R1ZGe+8qL?Wrٳ}dAEvtHhvߝVrOKI->s5 MV)㆓@GA_7W[/L~%M5)&0Dըp$u ]!h5JajSúJ:&Vnr,(ITx5z1SKe魤tgS6z3)7͇ϾkKQld칗K [m#h$:m#d7} ABOZ1֪c:n4> \nzH"I&A,a|5{*ߞmFڒqV~cw?#!nG|h1ĩdA"Lk8U)%n|Tf d^X$%E>55ԬT$e$壠nþmL^m]}y"-qwA6%eZ"3SbGxw)j!9jZN.g{j&ȽZlO׸T#{{%кLܚoKpYoñA,L%Gba˯sBfA/\Lr K"ݽ?܉%y7l.KܪV^9K;.Sv1\{OmG/ ʁU@ |ϱhNwt& m%1\+owxf/D p6fYk iag1ȓp3y8[οG.㏨zP2X&}O7T9S4,~v^ YMiYel W180ZA+e.וReQH.zDM#"WDBͫJ_Cb?뜋j+Vl\Mӟ(u1FPjPZv }͸]g8%1.^?.7Aurp/,֥H[-I7}1#-';{Xapfիٶ-Xx y27[w jksY5 +a'~!$SFܮ'"))@Ig1\k%Wu=qsc-A溯ys~}Te@A/'f*`{ғ#93哄Pan[ǚIZg|ľRaǂȵJVCԬhn|8,ӱ~$NɃ*,~qf(Rl?q{6Φ$=3 i4ssc"|i)sۺY,лu}FёfK]cWJ^cP<0f<+ 4>Ϣ~Wt\$$6f//{ :̧dOS!K71֚uCeӅ)<m Z^#@6QDXzSA2Y>U+e%IG>[GfD.-܀"̾Ke f,UxbQG@Ci U T/F53Lg%Ġ>^w{I{ kN+e{ !~qjH;J6CSgHA!n{7R17vI/2xD{[HQ``, +! 
7vdH١JPXWb,`յe(~ei8Yfi"ROv?wWOvrQPqٴ%6&&Zyłrk<,oEIS*Lw,͵_SJIGh,ͤ@|.KeK<bn;;&n8hVVTGgؽ 6-@w.zyʹ״uεg`g W1g FE6W e̦ae9SY&$xjlh蛹3 tn\VΈM do/:\Kd-CMld&ЦEb4x!4,#eDUknӷp/ eztn8l0 aTEôpjZD7~ yu11 \V7j44[w"d=sk9%5e*$G%pެ *(Mkg 2HNBf")IE7tSVqYᔷ{Zxm~Ƀn(;?()!1SW6{UœIe!zGvMPiWsFVdXnw1Dwmb^w[=Uv-2 !!Ѭج7m Q%F4궔t鏐 XlsKfd^ 0XNƝa{V\k^ŇUU 520Fgh&A-b .$Eu'aֻrOpZ?Jq uO[Cwwude=؍mҺҜ@XLo 5klHZI}W4*MDKxr~ 4W !<䨠.!gT";hde‚IԂj0".莺O|:$gc?vnDڭ|BA4}؍SXMq6"Jbc4OjHW/2eTw6R!O>iOSw.f6켘Я<Þ=t0%&n 6[Wn32_ҜR~Ď]ERxz!VQŹ^2&n-yj5ck R\I }2& WprEvK(ru?^g1*1Y?稚_Hx)3wOEWz,ajc.tj5yu?*_V X)eG!ڰ'FE +kb(7;Ϭb܇:"}ԫ.~]DPJUyο$x.e(5SX[Bi(jLʼn/.ʊ!.:2vz?lhEu$)!Aɭan&'78m80Rx Kaog~TJ_m+|v䥺oY>ϣ/ƛSƙTK Y\\3} |s)%#NiM~L_6%̺hFw^E+u]a0$g^H[PQub  n/+)^pzRׅm;ǡX~6IY sub<.4VN[ȵ+E @պ4jn?j=ֹuϙ>Y&JBE(r $ tIqW`55MTQXdX?̛9o 3Jxhqm\pGZjiJYcR weޖPOSU(QAQ:uKlid'q"jQ23W{M\SBE ܉s|plXX-UeLKFy&Rg=bD~uL95h.[t+Q7z2@yi21DwaO2+FdjX׳) OR5˶5ǖބ.y`+^ċh h &"܇> |*OIu\2MPP *8W50{~M,>NST0FY*Đ\$0a&hI2A28&v1]qI=3j.*a4DV莒't@KJZyG(-P=͍z_Ι~[pv}Pד#ybv] P ȮF$As%p=m#c1ʦߊy$֟[  SwV/cvn -Lg%U? mXg0S 'zE ]kb%1.C>0Q 't -ŗp{i$qTy2Qy<%!/Ѭ =pn'gIvޒSr Rz޽ǝ'iN]YVx ]ȟWExllWp r)KG9 OrucN`tE Q&6zAkPrt,7!ZRxvNFu;jW^0w'TKHv<ڠl*)7>4o Dfb~TmG5f.oRTM_5cB` irl  PJՍRO+eq}mK{a#׳o<7vy녮1?}O IwXuH Uo>c&XYx9P?ȸBoaOrFҼp|~ x&tz.PB0U-J&DiX)e*W_RMMuñuT"Q,d68'g @K3#zQa;g۴ nM#"D;g)@d!dž\GsMtAUT6yI.IZ8cহ0tE M8!f"`-,VKNJLoD։`M#$ {@0CAЬ` Rs)L!w߯).9Eetm(ƪB\. 9f#0UŰrr3m &V˂)ԓMI>pWB8WKGֶN-͉Ԓq=J)4FStutWVϔn47NdxsJNr>yE* IFkrՀ机*!4>&c@dQ1X~\'-p%7}bneba=q Ly5b{cur!6kOMS=i@-3Rג &Bc#G&|>/:`#̐7BH#[lηY*1Mh_.sTd }{IMz|Dn}fIׯ@pMBE*;4PV;t*Tf΢wL! ) 9G%=H&-K<k&':t}yS᫆!]̓rkVU¿~6iD`)_=97Xnز ǽeL^GHɲ*\oҧj3._|O!YWH@Cdt|IW e$ vq=vEѕEiLfyb ճ\  -G﷔,Oze( h loC$,S)稈ʇG7_XSk,e͡ͅN)"H; hkЎc ןuIR=Ao,^ K_@T'G{ʘB +.ATD Jg]wXV3U[~t?}I[9 EVm#1(Jгzx >4d 2RK~i3zKV!^|v̦ӵl}d1~5HnN ՂУ5ڸÿfln6 9o7fSL>LG/Yvs+s[+SB H=ݵskv Oyڃ%Fz%̒k߱:v_,eMiV:h^5"ٲO!W2EM[ {SJ!D!N>bRqt. n{!v J,Jy>>K#cB~-gB@SwG_Hͧn{sCew`ۈ ه̧]>J=1>z8KywV# ~DA =2/K0B$b"b*GE:Gcn=T#/˧۫-ɝww\G\wzarǕ=r(&oݪXֿ' %mbaܘ2t( r "J\#ƧRG_fԐ1yUxI3QntZF3+{|=v[`8 #`=|EZf\`8UH9k՘իj.cU ֤H&H`LeSq] `0\䟂*qxu\w16HHqVo_?-~p/"»ވJ+*ۏVt s޸vp%pT{2顚Q@C{M1JV,iᾲDM"%Y`F8oz7D܈CU,؅_nW?O=\A_R 挧If> Sߵ};lEp2}R0zH@(:c07FZYoCĒp+BLaǏ:p_0,\o d8yV.k/6 ^BZ,a->*"G8]GZ Gu gZ _̖RCD{D> :+$΍*C %pӺZQ`U<1 3GW?t=RZ Js8>FU= 7Gͳ狉.l|羔\>E[Ѥ dZgz>:/#V$T9i~ɦεll3gCx,+T*en'C݃r$_,N& y+$n}ld`兛?=OS'뭏 usxvT$*t'V'+S_t)1x@[+L}`MDX504 0b֒O-tgHieEMZBjm2@<Ђ!u7칵wPϭcs_֓O+0_·wnZmFwThn~t;Mwa H;ݲ;HYsL՚O'FFj@3- tnc,}[[ a]SKYB?Yp򢅗g% {X{j ɞ1J?ش=T}qgA{ȋiI7{T8@1,F,K[H@xSؒ!c87#&FH@R\ohB'$b!P,jg --W61,?<ӏo/MDcUWBFm`dM(x㨱X5G<VmCHY"21!A1Z q`*Q)티J֠ (vu Л֍3/뎾&>/S):DJB"aq~yrg\ң) |-||?l}@yiB17#jgpd2O<9K x{l{ZH 6,;dU6Z%)Kj*qVk-V*Ɣ!~yV)t 1> AWxPkAz5$hmk?鑎G-loeY{F #A(E.RQZV96p:KxL 7ӱܳ[vk-z}Դy[X rVw1+/p(]GyyN 5*" tAČ/4 +/GklO4ig9ܝR` O*%O2AI71P.PO/6tFC5x [<˧g7Gdau܂!f4`MXrٽ^&LV|Xc0[ /YE%Rwi:|AQ-9دfGtQ͖OS}"C|SH&dފ9dM>?د""RkH#H|w! )X3syBAEQqv;ߟV\̡GOg(%*U6'J&{vpσ9_82+A̩ ͒[̙FhJ驆GjFQC _J_&imSD'䌴~E`HijI95o5|_>&tna[f;4'0ޜG532Dk^כǠOuL_ÐInaۂ B+b%ɔ@YBwpTܭ M2)X}y/{ O!yC8ך3?*^x%@L:^PINRdei'F}Va]՟S^ dZDmQmO`_Uj9j"(P`'4w߹ s'.dFm<[4QP2Pmg<*_/Mf`*y 5}UG q2a[K`V\j105NL| .GӮjTw~۬Myb]쯊,["sTN+G|JSߕV|u Et@Z|  >(LH!":2'[w +h=*TH 8,v5/@V%tL[ ϊuš=}%sN+N`AMI'TUhn\\fSUѶxcyv!d…a#Y0[=(S眂Ұ]la]CB,ܩ*emOʀRՍ'tJ3*ULChQݨʞ_&:;,=N)0c/D# 7mdoY0DiQ3c,^ܽGF 2ߜ:R/B/%alʄPj3;Sװ3q:2OK:8%hD.n3CLVw)ĐٝjmmG 6x(@Yc;=ïQYEHp&?FILl ~wV&B5~wzШ/ 9W,!ܣ Aqs37vҙ Nx:nGΐcI#8#?5!< mS=:F%G;jY/?H\8XX.{C9&py^-/cHOƕ$Sr-(Pj }pyH! 
.Ŷ\>-Yզn ^ehjk4g;/C7"wE4Τ*R/: P곘~'o29=TPP].5p6^W{_m q1QSZP JI :'8D~o֘aBiӑ3NfEzp\͡o_5k?S;+_ۑB2xmd/]CMCZ)ugշ>$c(o nԟQ,N`i.=ri-)ShFQ bƦ}D^Qw)}cO|"W䋤mmİj +P {B} )KxfIuM!apOݫOn69|5\K2zہv+yJ^j.׽/s=~ш>.\B+ ,MX gE טx:8YV8@H&C`i|dIq8?_NU\4 W|0VOppxX?;oY o3)HC4yg^@cGN}"4ZsSλ&Nw"@Yz+/9݈ KK?jG׺$F8yhO|;6^e 0YdG@c#|D$)OkoPۢ" }sǨ~Ś@}tE^ W3[^K>n˼l 2tn?Z4($OB0 ˣrZckC1ε3s-@AX.$TF~>ggxF^F~} 71W9T ⬹kBo[_V$ [7w/$o>&{3^ >db,7 p )M{7B3zrW,t*GIڸu"mkKJԱqd(q?\r*9-aYk 0AUboZp=󌉍i&s kC,㴃A@"3C~E6"&x ̥(R7fP9#Zk9=}l%RT; /K"c=GJJgC3$Vi3YM\ >*:?7W̗T ;i5*B. 0 yBu׵JS Բ^DqUKTקdE8GCuJxY@br.w>Rqr,g ChATOeiCrXWr¾2\ֿxYL<ŲHV{qR?1}X 6E2:j~0?_AJAj6O|bCJ~gtBq CSe{rlyI%t@ R@de#jR,.].jl 6"M-ƞ6kn'>a^3 ̾m"wa"}Eԧդ$ҟ*1I~ϔxC ur_bqNqUxtTtz0cׂKeokKtL+c|*/G_ʫ aMR0Ĉ~J֐i5=ϭ޹3ovw~1i/b'Sii[EWeZZT(,[b89ԏr5po8iN7X͙7В6Ouҷۺ&p䜒CYÄMJf&vygܦIH-I E!6y ̗Q2p?G4Zj$A=]2vs&>yu~n/#=| =< $Hl6䳵s(&Ɲ]noԞhJp1W̨9 ,D "0~%!'!>Jc*lMRsVq6#kk{Yh޽>o$%:rﯝMk{>O_ee1:^%Y}=bxL8=0}~EE)a(>Y !L Ztt@^Px.X԰"n64v[A&Ǜ J.%Cy#tI>N0&?W2_ (ȂR[SRWY }nX/v=݃`  f rmR*iiQ lژ\j:eH-^ԺV.:.I+w:'\·(0`6"Z$w&n輸Qϻ%=3a#]Y.ܩe*PQZ *YfX0) >b8'^CҞ5, F><\h.çt瓇݁{?@e t?>gƌ3x,h\> S4t.e-n2 &?Ć4sqR!#@{_F#.s“ "l4 Ok QM;bzH؅0YK]$ͅYQr5u'6bf L,~#B&\Jn2c@9 {6ƚ ݘM?w/Hb~G$cb;![?m?TiX8173sGOsb Y pr=3E16&~|s (кtʳd0#\Kkۗz)3|58?_STQC3UᴶVYL oWKC#Cgi J@]S|c*u[fUiIVew8d5"+΄^%3Jr,eH-@bo,2fK6IjrgiOSX}.2I2~A^ڡ0`\$ L%1U)E0,_AxEnp0 30.g9".V@)Ո% DE68;Rq xhLp0TJ4al}`j,'eݸ!gٹJ͍0zXq5CQ,9%wX͏nІs}n1Oa $|W~$$.E2ݦ'uJw/39ȱ\ZoO&)`T:Wa4ZKAjwb xU3hc 0 k0#qrvaB/8k؇L,5RΎfE gGṟ%j wr]{`㞉e~-x@ACNDY.ql?8ZWɅI󋔾 4~ydܯ?9jy6}DQ'hCwQ?=hlkA`CM80;s+ 鳺HcS |j hؼ5}\!g.@$A%\3'G7(?@Yg"ﯞEYZ%PdEy%kj>hfN^I$^VXJ|pGq^_P7˫b]}XmJ4W!Zw_ޗ/Apr ѫlvH*Cp(xi.q5͹xl -𓮭$ ƅVkݘCg7iOOޟl?]6tm?b)gG#3Rf?9 XT DasvhtZ.}*vx!_AQu\~ҳYXΣ-}J%6O(ϱ լ(Zb0rnDs_zbvNܦn%7̈Lg=v'A|YFIXUquc.v#wݾ?؏oxղJ@`De|qÝ'k,{.>)!#A姏 &7w^k40?o(;{H%$D5q䦝"2 FoŚeT-LCnZ(%ѡfbXV?*+joM5zl,CiQRp_k3B;FByTdžb~)繘W1d5jqF5UأhB[# I|5tϼ>[6qDk´n7EHeWB7>Pt5e!MATkPkJAsEзʝq")KBkJTqY;n?7hx?t*ZrzlK&DJ:\5mGq H#I8m)5! vl+nsgn{5۵d@ ڏ֘x2&t^2LI+G?KYdZ88?`R&R+s Րı DoE^a(|Vt,H(QO;돺{w7 㺹wPSkER͖DY)K~GZ -#21 94< S@tʲd4Vy<(G% 52DD# 솠ekD<(hWP(?HJ?%3cDnB:?΢}_< m)0mtw&+$#YDB$0yᗈjHXx10ܠx%T*Mȿ…>' t}vÝʎJiaiUx_;PJT RqC<vN O1[XMmlѩوFkjw=A*/WL~a%Px7kz|ߠzXclHB^2R֧ s>6DqEbH'/9qAVB[rNdE?p'ZEf붿gv{x.yJe7vtg$* MTxD3R2`-ֈӔH/-"gTgbXB[>-ѷqZ;8o4Ak$qq\?=>lԉ<2@q/ɭ7rk4QZR$3e,45'K:#4 H߽Nןx, / 嚭n)Lģ,:LQ3aAH8{^SsȐeypz"HKO#)aP(3[WDEK%ot-)E83~Z ]rkXGp^m_I^WA,i;n#QSsC:ϝb$~w8* 2uf{Xn O0}Q*cλ뛑g0R`[R}xE6V@س%/xM_etYyy.L{0M 5e1Nؤ1&^s3'nPB} 8<2VNڑ<!i>lLYt.͓V<6Ah M-7gC'Z{# -Q-Z&Gs?ьpjoNi1[n/S5)fӞ!V$N_@M%%*4>({`:P Ҷԩ(#*Ucظ}+?G*%a*sw$'o+5~b{CY<@hi~!P^ 55`:3/ `EAVPaBZ+Z9lx#d4dȋ0kPeFEeUK=ujjbO<@`Qx7a9S3P{䉧"nm2C.cQ0~wK[>D?Vn$n`,!2Ȭ?Z . 
4t #3 -"gWv Alț[;&7=ӀDnzz?c%zF-#VN4-UsHPFS̘ ^*DpER3h6;pɳj!<7@oO3V6zeH [@q ck{iKi,iIlk-oV2}ט(hBߌޖB-1LM f1osW|yJᄐ2Qr΍V#(GN]2!ku=>N3Cj*EgXkM%.H<ըu|$Ѧ[zot]ԗJ7I+{|jYQQټ)58 eR~ǥ%=, RÓLgIyt\}z%< 4DES)<3 il .Nb(J٬=kwqK2_-+է!+b?b`~N<6b0{\ge(U3]aim8"*@ԙB"yKqv_3}uM^RŒ rɭ[BsX>y.iv IVaE022qHW)|BUL2%RH-55`$$ө,ļP\Rlf["Z$3TiMsbVł 0NKKyqIA@mm8'n(jrɀ>N 0ҀjbOm 2QuaOW5Ǭ5 @g#o'7KIr !׸9Y, BԱODIC3K!ҤxKy R*YWBebg$UZ?6m>"lNL/_coXF{YlwpTE Ϙ9erzB AT[RUK3 d}" Rn(Z6w]?LV(Ub'#8@X&.k] n@6܍IQr=C`J'0KJ;R J!ɩ> #eZy ٝtFhMgPS!ϦK 2EBF yӴ0&#]Tё=-o5AZ׼׌#"h8H9wϓ_"U^\YK牁diGR>Fo>7lAt5 a< J w@NKq [Ϟvzx77Gz*dےvL=ZS+2&Xc1 X&!+26nQo) FTUXYwq4\{|(|_s-̭}`?Yuzj&d|߳3jhoMnj)$)IjmӰD|!@%zDE͒ Exl 0tl xFєZ,~eCA7qNrfyx#EÍ@g3ڃL ,``44#ոG?]N#]A}h8e(u7ن=.sK".kX[[=7}jR` B߹Re~>z;ߔUsBxT5Xpm bZʱj'ӈdOԕ G|D<?H|h2Z+OUA̪]AkEy$1.N.%w-_xJ| GQ(@Ef@ZI_*2;Fp !&C!=d;Ss]`Ђ s{C+8^Ɵ7TdWV恠@S.@'10d;W,S,?:[P}WLY?k'7(5i*]e]Tq=i1{6 : ÝjT/7j**R-/fXnt͏SJEk" 7pV l$z "t $+F'+Lc6Ǻ2FU r9ٓ+2/tB|h̳gha95}e -bb?CIB9JR9,k2GD̉Cz?Κ#MAbe|yc l♦E RLcL&r<_t.eQS=R."6(ic~q f3U^9S!LA۫Bn}{nC'Q,~Ru]j&>^3i7[(RET:1Et0B^,-#<`H572Om9j-+h)@ !qh*^140M<Ɠ٠eMa K_ ([]0OY0-} :E[R(u{;b[Gи-VO^;ί /9(6m& xB!{|/&S(qʂ_ryȅZ4H_ȥѥsql.kbFںDP~Qy z; ge`zpuR&C[.7Lpk52?͙4BI[$eaKQ:\nKYNӎ)O))gCZүKT2a$0m.O2! !{+9(ʨzQX h7CSY"n%/Dc+U:o ֫9T9P NN<3TBr+njQjDp-58zKoTr)B*lnBEk~%@/#oڐvEpzhֈ,#1H jo$b4r=P&$ B*K$9 %)XE?זJh,FD.fWWhX*sy3irFfJj׶r  :FRtU=4<^ڑwBi6cL=pL++WyAS/ +pk#vo[H-O7SP࠽4}Ե$/ʂ10 p򾘛(HhR]E hpDk_ȵV3a iygE \bvW<= /__z_ܸ_tb䳊4Y`Weɨ <N2G׵GtF;ݥTBVyQELy՞([7M~^&wKKK1Qo!<[4N$r̗zlP@>&4jݡ,,&5۱ݯMZ ezjyY'Iʤ=Z&hVHYWAmQ.ZWyeM:?P؈89 wOI͛;unq׽\$_;?gRD޸cp]cS=Έ;eY$C I^Ƅ|6K7ś^( ^rnJo"~K=jJwY&-GrVt_viZ<͝ۼ*2H!1Fȶ(d3(+F;4xw,z`⮠On~?"\6l3Lnܶh&7mw%Gb*"(ƗQ)2S7,JGgRnJ 1$9T>j֞sp=;@ t(X+_Q“\5.c̄j\ fF6rn;VDT>|(IckoF} i8>gLJ?gF П ]5:[℅ `j: - icǐKKvLjZ%/B+LjR*aLRh=G9/X2_-1VS"dJDre'I=sغ[ԔUopeYqoou X,Awv{/UWCT2S88 j5 cD9ȉ8J%EmIZ%{+})|g̫%#RK/ شӘvp$jX[+Y NE;i! 
{jJ>.L-g] jZ ޻f"@lٗÓzeãfz|F9<|R%|yo 4`7(G?ҪEa{Iy:9ĨL?=CH:ųA=Gg55yU3?b58ỶE,#lȷC"u֖KeZlMKClX zUYk4*C5V\&`CGʕ8i31֏Z!Q~ߵeX?ŠZjPFd7|,XBvə-CTH+0AAVYQ;u?Jkgǵ pl GɗNsr9-`jKg?8JXyvwI#y IHJ_+SLn9J8`[̣G< !WZ<ȣɱjrkFt P X{78ԈrΎOCpO8$m=p/nf D=XKC09 *#UQ;<ưȂ igmxLF4äm$x_}Yq\;9ژYI -[*<<:N]j2;Z9xٚZѬ4Gӣqk-]& оtnsVTMZ}VGk لY @K(>RLǨep!O#Eo!kvZ ifZARڮLs ~'ţD C;n%5IDNǵݑt95f/e=UOЗqN&ăJU-OdjsgA+x$UŊ>Jz5" ҎGbҖ&f(7E2?YH$9?#C6Uz&mA R%&f_CKS\UiMq|{Zҗ}{I duj!ݳ#] *mkh5'?y~/PY>o~yzwpW?ߛV|*;O{rz?ӺGϦ/_vZV/qsN۝cr8ctC7O;u*Mpx8Y''ڂA!\O j_{g~x"gV?pNI'ھ3lkǍZjΎ~b I 5.X&Qwf 4$4ȋkAP^ zQl5ˆn#۠",j)9\Z+H1aXB4]<{" fUdi}LZDYB}O+o>c@c4ċნMipd'X0'UZgֵogLq-ڣ!`\tW*V/ c?C]|MrH "G#L[d6dy9ӂbr1K@Rg7X"zu*prN8YpVپi]RFU]rz |7InJ'!|vx&нV# ֒Ɖ5Am=j՚3ߒZ)']M9;jS?:jU[NXBԿud =Yuݰ'hul$L1'^^3v;~4hpmn껟CHfJ,*-^9> Zu`5f=ɞD#k,{QJ[YR) -q8JXL 6^l<<}Ε_`v<np\2V X7}xSGQtF30YuĽȮgQ"\A8N[Sw+t_x5NťyILFDݬ7_+|=jXdߒoIu"FmǛ`E"bF +F~, Jspu,n{)-wʚwMsFPսikq?’SI!eeMU[tب76V!hz901qd4o K{V!I{| Q#T+qP?|]E!?^%W-:N5ޞ)rjXΘ[8Gē$?"Tl98-;;Sl++Q;/d7<.+ma ƋX:;D( gF0)QfUVQ.X^+kB횥lY;KB+_WkD?5@s1?B*"Pfgf<%Mˆ[VsL7v"vn^uqw [`n ˔js\Mi3֙+L9DV&EdWE16L8uoaݤG(Ln*%Z/srKg%yfΥȡ J3F V}-y%*U}H@'Et'YzA|IYhZr_6oX=4Ⱥ8hko-$F$)NHWQ.ڳ~[W#EK|b۟KLvZʪ3ju{˂ %"#5'w12䠹^@F;^kI^=K]/~m>tq:7/cضnmV.f\d R_0 J<‚W&\Y c5@JISy8wMOɬ >}P|Qe(2骏k7"۟^ldxuk7>n _s:s;@sLa ܸddHXr)hW-7^=|$Gt[ a^_\NrΨ *eJ5X?hh5)U]s+80sBP5*<*n)e8%FXN (1V߱`cDp>b-DȲ5%"1oGבXtbNT)2xV,iAB* P4Y{UlL%Vxď@z 5"Tٿ D"fnZNADyRFT&wv[QCibb2 G\ћ3UH=CR,y} TQujXjfT(5 X4 AQhщxds:A͎5"ˇuysTr~®Jq0K A\޺DuCKK_!!Pk5ޞGɫPNck*U(5䥧/SSku{Wа}/?a^ycPx'ED0)S-`{(EeU@k@?mid%wI}݀Ka'Wy7WʟlaT@K3Z+ԗ#ێVG܆bU{TdGi,q.ӞBwԒ$ꏘ ~|]:^ryT%hCv;_{"j['P7b@X)z%K1TE+cOA 46{ *9nwЀ*eI%P-)9K?&g$`3^Bhool6֢fUj\K)@ alתD8tҕ) J,I\GEnvL,wU@ |嗚8u V޷U{T$cT|`-漌I`U@,(r:mmlﳥIڸ+L<=u Ɛ]q6B"h"F2vFԽLWYr9T 7.Uζ\}9)(gԼcr/Ւx~w%,>FtrRAQ|DֶTh_@ջc蛯*jd$- Έ2~S""?هvZ/P)P@t<'esxCv__lBm"Ԏ2jΎ!Ry<8Ѧu"B?`Pݺ: -U9ƞ9p0"m^/S23m$iҐ"l^9.L9t,n4wrK6`M<j Y@>io}i-jlrbMK&wiZ`e8Lɯ(4*fuwqnnozs;*s&peΙ2pu+ z3 ԹX5Ό$܃Ԉ0- k8+̮o*=h, *j4NG#nfV)Jy2moc_3|,@€7:s-K&TD%*bt6v~KR*VnR+_6L(,#{ Үð3$_78w>n RP;*+L㣒×뛯>F\!BiYDu-.&QOxFSB?CBC@_ǺNSx]3C x@:!!I|(#2謖1"viqS-85[c U2O`{];Wӥjanx>c]cL՟Q|*d^[X .[N2j>sÝ;R%8Xg87p[MM!E,),cy=B)py@\voUj2@ڧs|dTCd3aLoO~N>,dD𴬀ir=7Ѐv]ښv)J t&γ4kF"bTM>r[yNH۷:p⟲>%`(rqGǖ%CU$cRCz} vfɥ{M'-:w P;h% ]݋Thj`'7gf8x.@Cuh^=qQԜ)&h(:Շ3'Pl-;ó }/of| c:͌ OoN{FYRW 8l g+-ŸzUݢɁ-Up %nPeGX{պymqj,"`4нA(@!֮dgO>QGwM5<-LridΙ 5^8DqxQ9(8aBrXK-+Q\TMfq&>^i׵'`&ǍTJ٥^L5nd^1Y!@qwR^יYwv^xZ B㣠Ä]r\;%XՅ>GoA tE)7Io:m5=~7/Rs]dAWcb)#IuQWdӇ"$7WLE0CM@ kSKଞͪ2yMUcVK$( #AA,d @1/>q~ifK^>{uVa,Q2Ntߑ̄ :PGgj{jw_hf<8 7aɟNwe^g8FߺhNԝM=^A"ʡ>▟6^̅Oܓ} vD`,(F%ų~xٞa=7Rkw{亀\ 樇"Kz/f EJ}(P+&B'm9@T3VJ#Q7+Ϝ MX0-"Ȅ42ٱ1&~sZ<Ƣ| СT 2 zYOC@}WeeN{\wйY/}huQo(~H"hԬX֧ ϳ3X բYתr).%K_6 $o#\I5u^\[5Djv/)אV_9hJ rhUK|^CH6Gm~q(b)]&~\.L-4T16M0n nTRkq WUkTĵ[4c?:αCp%`>3t#㟋ad%E4&";7\ӭB ~WTbmFHM,$?eg&ɔ%\EܫcnEͧ/$$w{p2*={ 0^BT䦶܅z!SIvdӉ;g%Бcd|tлE߲9@( 0x%{&[@\@f=\ / DKn߉RR$Y>dH8ٷǾ'X&qLNnsyEd2zeJ˩QP!s@}×Ϥ~!a_g]7rDsڥ7ڗ&?| :(Rn1Ar @fޚ1:';.cUەԓXšҎ֜Xno HH1 |8xZW>ܸd@ܳfE>wGw`v%Y0d?l$WZ<Θ\U] r+ݶ7 )P?!U!zR > 2Gq Mפ,t(BBFifn*j9 &^١4ReO CUO<#"1j?0MZJ[z'F3!3NJbM(s.,JCO2#@}0j~rXyT %.ͯ\(? 
ĭDHL*x_C !b[k>]& ED^JGѵ./E܉bOKJ: Mj*:b$[GN=+(J VJଓkkkCƚ?ㅟk^[󾍄@wۋ?!3%RTP.:7!PxWP0}P1+#7`J]RN]DP:hYIj؄gTx-vsH > `E/d9W܀tH0YGŠ}GS#0W@^ 5qZ:!8/cc7AQ}@ ʳȵUSٜP@GV@f;cXVM%!)5yUiރb Sz6o\/OG17}ap됲ؙPqٴݍÝͅ8XL1vI6;d n ڰF&m5$,u "/c!45cB"V(G-~] Vqx}r?IZ \mO{/PpV[+wlSt4o5{'Ԯ=̚WtZuPVÃ;A-Ҏ B-qm'ԺTf^Z^:?~Ou&^sm`xu=pպ~d[kzFWE,[7HT{շhǗnzVa5h7)Y <V[_&#ҙ$ɸ=QJ%>5l*[ӕswV(G,]B H7L̎>?Cw-WsexAY*b~tO.tk{JeSj(e<7`DIqD)=ZsF3d-)y-~Y"c5N@ޠ$gIr \$QJ잣2!AbL_'s)< O05a˪n8K$pKY0+ Ay{)k$$!V:(XVz &Lt׎ Q,8z`~qx3$"+ [u}ja[_j*ST\.b3z12T1YUN=/iÝ?l|.q WAXEͪEE:<6kVS}2\aݾ/I*Roh8L/nIYk3ze}`*|:_{ oeΰ`HL9E~S)fds?ªSOm3N&؃埢"t`OO㵃qB^ ta3SCp終5`!a*S:K>U֖UCG (QmG'B? ]81MdR&[#GxDyEM/qXTj;Z`/ )[ߺv^иxʗN=@T 6IF1ϕF"ZV$IXnp2a3o]'TLϦ6լ`[12{7F& }f%6jUoy&@ _5soڭ# R%*%iK^(Kݳ29Ϧj&g9<\ 4[|o -Ph؏,Ȍ^.F`FR , M%跬(c,a'X7T&9ʇO)P8T9 g"%JVSuu1Ϗʝ}MF AZ{n@@ cW)<`H҈r= #l'yZfr.YGfAMd5'Q5:ɰ׺]g8uN>H" @Wbxfd &$(ŽT_c+g1/5O~*LSa,u[Crئ53&N¦PY􆷰#ua*<6om4688|ս$绡]܎n":wVŵw7nWJGi: a&hjZfZ,^T|vo6bzfUKd." Bu =>b5Mg) v;e *CKP$-}jG͘YU(@_X6C .&0 ڧߘ7YOm԰m2hzf;v{_df&w9 K(ieޯ'%h7O"ᲜC X>|Y;<> ?^5I925ֹ-L- /0WfbӴ"%<(p~lz"rI*&0:v~NL^0d#t@KOKLW8+~ct2ϖ1 d.543t-ҋ8W1(!g'mp)<=䞄bZN5D@͟5e5yG 77wd<(Q❣A/-a+QIUR!&oko]5"/akjTൕFV?-">X8t F#/_x>*vEX:3@ߴ>}НWnqfS h]Շ!ŀCg7]wQТZP6" PRD3q}/XBX6MeV]<3vІ0Z2utwo/N0`2$80OC ƬlY=i,T@-UE},kih@(3JByDrS5Ga |+ O%\HJH\>D̦HWWVO:]XaL|b Di1qZ{MW"լIc=WI&2#$ M;Bera.5_g/ͮ-)EZ2]_ŝ&D+wfق?C-$JI+_W⮵h#w` %2  @|`T|*~: ^+t#R 303/<(5"B+YQa$MUYT)ROaݜ.]-C6EsB{dז1R(8:<,?.Z o $Fe<f"veȞ.}l?/MJq7DrS3P~Qo]BL=-WsVX';;wg]37D|smzO+- 6!{4&=ogX(bfkU^ngmd]r,gP]HUCmk<'؋X'&µۂˮģ F_JឯIش6,:nk?4//"ɪ3Q@$/d>\BC+M>@͐эiXkB߭J `D9XU)oalDx oLL=L|a ޝD_69ޝ8aK0NZ,/ ЍB282x+l?AYG˾-rN%ÕW-ǓW6Q iۨͅ1^=zDUo`sV֚^:*TiWɂܲ<- < kUx=_WAy[0{ VjBUT)pVmeWoi;bX=h3lpKO4 [^ҋw;OE2THu6/ō ִ$q 2Tu4- K F-RY18p$y,_?[5 D?#,OeT;t?7/Av 7r)ɭdf}ҿ趒BG۫(iKa4;ZcB)"yֽ_ f%Οu'䱎|Y5ػN"mO},WwAԆkUNCeQ5#Ya{^6՛G;.1_|# .&Puz& W|Gm߽BlyY貦\w5R[ؚӸ{bf&HS>DD+RE`4w /+i1Δ@sV~2BAw-R{۠eƮ{Fm Xr7ps(*\;\3Ee sj oZ>giYΛjU#V0JG LTv5 Pw{ sGz ơi(Dq@dv(䁘#TvQ[> sCr LցU,`"j!%DTՃA*ڤ4AErp0DaTX֚ DHb1?KYt6|yjRQ%mՁras>;hGOhm51wcV^ehcTizV/_LKyYnٸI sI>E꤈+P50{AjJIm{+rqَl)J\Sz/jo@f{@;F/yn>$3պFXMnɍe\,ȻS|5bheFTH'>g-9NYFh)zrSOl-U+Bl1Js2?Z=9)+~LUZ;ҫOT8cg}I"gIYq\ QZ̥; AvZP@DnRc_U6]eֻTq/. ]YN"X0bOe)_fjVEYqW /[pPnalX뱢G٥Byp ^Mq% wTCbv ]/rӔ} 4e1c~O_?+whC-FhWV<w]hV`a=I ak5Ǥ ֛ݿeUGH>-`kYn.+=Z9ݦҏ$ n Trxopb|"cV3wαOyqm'U,q$oVL?da({oT8Vo5654=Ak/5d5/)KŒ Ot0:4;d@bA9ëA"( r.ϒ[Yix'QMn laR= ȄCg [?H7g1%C!yiU au-N"!mj9(o$b#X;GUC#dnȧjLS##Or<ΛyYйûXE) Hn"*kYd]WL>F&r‘P463ZݴE%ؼ[6+ǒ[NTh_$so+|zmy $ Q-t`=̶{ɻdB^udv7 Ekգ Īщʻ(L2pEakY<3# ! 8%Hl9kKA د Y8RhrKdXctnNFx~š,-\kK*=|!$T9>,plШM)ok\[ܷޖ.W彛*_ QhאDhOy ޠviݰL6r|Q<[p`R~x _ꟹV2 Y7#s468BS(8EC:,Co5yg\EX7XN*! %Xzu2A:(1[[$g 7-.\xW)ۢ{,8FDf%HOE~oˢti8B |x~)z@B[g斓I!t?. }#AQa{yXX:l}(VH#<{7 nܵ Ҝc=!tGm/VJzezjHՈz(굽2H:=xj r[(ϱKG|?䚮_W1W>E;XMMLH*l2K #|Lq(]S ?jUf+Zֲ2:Ug2ȿ PJda-b5434Q Ԣ(x Ft8SlqOztWE<D|9UF"ggC!MLOIĞ~ƒ O>,< MSaѐ cVh-2sTcpqTUXI; g6Le+|84(Jn)ͭژW[>̷ǔ W,5=|Hnji:|+.Q1tSʻcG$^#ׅ^ݾTQ\!S1mЀ,"2a o+Hb2r(B$x38= U8.F=E^j;R%#Vҕ[&k b!0t7UB- Irqs;܎&]&s$p\n?wܔFN -ʮbdDw9ov;v=Q,T"P`?"O߭e2˳A&HK\2m7[ l[X =^ qVm? :vd$4i/!Ѡ D̰sbAwW$ -VBrӭtRdCf±cAN"PN1ZNYR78QZB$Ba37Jɋg)ZAZ+L-:R8~u]<|O&o*ܣv)]aL&\aweDGM0 r(x!t)Zat{{r<;cY;h]q{&R ]{N%""8Eسp"nw=s\XkoJ#kdԶDiHo9]T#REz{@z^D=boG|_ 8\ehn% @B{ muǙu$I1t< \낒FHl,b 0nnPcL>Z=s'YDsj[ L"b{cq dgc1PiF`k5T4C{5[^ҧP mԮ3]qӥ3^}}تx_`qB /5Y5Ivrɯ--znMzȲcBbM[T4e_2sL[_XIt! 
Fw|wl}Y5'pرQ,Qv\зtRF4ͪ%moDs&Jk&%U}oI4Rw)$fD^gWtVm5Cd@ru(]ZB )5 Fo`$Y,:1 ~'e@;_x;̀"0PvK9qZ힁=?A*KU}\L:Ok\hJ׾J][g^lRjaל971(Tf?*=ifUk1սKP:yyfQ!V=q4GyVw%k ޔUJc"î~q6U:"PӴ,픚k?fgFT;fgug2TXB x=:=u;^D<7^}YmO7(gfvE2N-B egGtpV{p|דBu)ޡs)-a@7M~Ww_d8 (UTy!{m)mnPZT[ZYpֵ-]^o/Bx=._lP2iLt^w_Z__8)Auw=Hx2f1owQ++RzHʑg=K谢>k-F|`m9c]HenHs+;篷qA+wn-U^1v|o롵H'Gv)dO!] B1ҋĕgsoy~7;a.:CEWE&__A,5ZZ]z clEQ%jΉr^;_LhA g~=A `dϒ^0c)psDȽ_1d!#xZy:dn=y'w-Z0͗EB Bū}9'ܽ(+-PO Ng$6'pS0rΉʝkn6T+`[^ąԾ:=>09k- Aza$E8<^T:* ve/x8WX`֥:jo|{$IRԄPaeRN.fSjPP0 Y;zՈ<}fs:QXzL}CtuK/=pw@r R# V&xAD)Tz_[v}}٭u6Sm2,][>2fx3ϥ }F!^Ja1|P|Y}@;g]Ɏh祯wIeK7xeL ICqZh*DE=2Rş VAZRL!C;$1^;Vqר|1&sR:7g9H< \dI/g"k>ӣ Arӣ$a{pWEEwMLL$E(|f$m՚L$Ѹb jX=G)ܭg)L(d?֛æD2Ձqs-lg ˡqPt0fSHVs җWu皥rwi1^݋Z{+<65Նr+͠Bu&?-X MuJ(ʐm 6*袘W0k1~ 30o),E{Wo^nodʝHDkԯp5 O"7Ġb ^~JL9LMc&DW4#(E*(D T+)̪1֘QϪK~ƅkpi菱WJ$:Yc#ψewG?.4K_:*y{b̅ Oj(4j5iØS.bXm-+a\17,BuH85)LmT2#6Lw-V$-/$/-Ӭ$.RuY>2%^}ks{i{ {2nJ*Ahk.ѦBe,oZAkC:Vks[>!>wLX}_T·bKK|ë5~Ė,~N@d?:DhVǨۿVa[Z_WO/n͵$q-+_K \jooU^V^ڏ8:!*Sjs.qqCTH%_|-=VMnp-?YSLJ[zP?8؊f}͢$պl7CP G[ϸ&L<Q )В xޙѱf(l<;{|ĸ% wt"<+㉦ m0_ 0vԐ~lxh0# b"c`4- c뀷nI{{;t0NS]o<}.m ߊ8H6RC )2sյ\pt.(kw[qT#l9,&GШuMn'v )TMC&@^mӴ^ m%pD \ Jd.M_z]r",@`In֍'uA)5ju5tGY\fЧԸNw?>&W?>[."}]S !$X)V.y<]HP&5<1ks[~yK0=g=OEw^ ɇ^kwNw8]\|c70n[mG.8 =OuzQK.VM1y1.$Ğ3pk_^?-tt1w'Wf+Y,`@Եrݵfi4ҙѽBjBW:폸1'ʫk >L:6 r͞4ĕnzdl>-H>Xm9;h,x>X<0d=vP{Q@rtF<Rj`sӜ[hvOe\m!e2$хMw,]矝=5k<] an::Y[jqA X"5&s~/&rcė[[*[˚y؀cjl2 RWf2i>82/P.nfIԻH65Myk4w>O$캥02]Ceyqȳ@@#]KľI!17ViHi4PrŸ3P!jS0+LB3*L/Xj9E)Wn6fng-P#9r"zm(zu5QsWr]MĤZ}ϖ 7>'̮Cq 25}4FM9?2,;N*Vq1kuwջ>qg5iUs5nT]vK?:Ǎ*Oo6EH9 )uwr"#yxyaVe^"J&OPWYI%0 9j U0͓q?}?6R鈏{6PklA)0C *>8l؞B!Y|ћ @M?Ӵ%b+ID`B՞(,e s0peG]R,2?u\g${JN>@u6Ő^U-*obIc+lvU)_wD3JJ$|ąEJ\N­z΃w[ ?([Q݅ce{o*( Kr A<J- (S6J|*ln$2rm[C@eщ=.C(oXvskh@xXZBL_v~nyr#eW(!sdf,!-8(ȭJ9\lk9Ҿ pdaRܗ+[YjBh~/(N;_us=vu9nr: 2}i90#{:Aj$#X1 f@lhEE]ol "#zʵ=%`fY]9ȩ)%i(m@plN²8v3wwznfZVqu st ^aqE`kdMŐt&&pOyz-Eh* ȥaU , f X&j[ @=܋}gVxBWLYv z׻bOYܳ*:}Ub yYO`OCPqÙta\B>GP[B$GLL7 X|xswh偎ѰgA2T*>@^)OZ)kVfy5<U|,VI I1;PeMJ#H^5K:pN6i#^pP&*]Bt-YOrj)%ݞkxHk ײ9?-3ӻC5|-V5j 3u99`LƸ`͆;QO4 YKzIڋCiX"s1ʠbQ;r8S]31ΗCNgV4c)-ǚuLJ(NjF9J6@nLkOZա{ [KU$u-?׃͹ @$j4+drNCຸyi+.,] N[,~ V}V&9wGj͙3c .j]N~TcdFhv-mjFJ -/|M6* #Y 'yL{ GIfG+,#`R\^R-ZlrVɎ9(JDy[Bd .9uj9·B*d 5™"v܃`j˧,VUR$6HܲgK܏/Kŭ n Q&p#ȕYԝ,\c}ȍ>lh0իm:L*S. 4?"NVQ@:4ݴE|p0Q|P5!A<0q y^_/bRj;\'߯҄p/VLD1)oXr|U₻ŏ${hBE-Bj!m3V ^LhN͝,Kw?ŒLF؂VA|DFM J e0+:)$i+>SfH"ql6˄W ^hY&&L9>g0{H죊RO SK J|YPPڊYUGj1hKrvl\@4Dw A=wtΚɊ+6\(#K#…!+1z%2e@ui֙=SoZ{ 0<2n:@n=PJ`Dmh"t)K`/Tmc2S8<&=#^FD9(8'`d^1vZy"DJO8gDE6q"4i#-ԁ4AT(AɡƯx6}\Jx.=+j!*5A- |$0Tw6H ZfJQKF\d a)<|D?H rGݝV oqT yW&X'{ #fU#N; tAǛ΢O㏊)O$ #.B&ށ56/3μ43ԐX* 7" *a Qy䩂{wwַ\I)nKF|1L/;#nxV,d:Q˩̡izљycqs6i3=.,t)ٳD8қ~*7ď|햗/wȽcWCI [9HJ:W [ g<<8\Hh{Cms:S1DRiύ ExΤBX̝r%x88{Y#8&͓*뾊9BOhG?8/o;ݫOD%,.;K#n[~2%KM3J菋Z?HbĜei֚L8|HΏs$V^.4%]J-AM5bt4hTYW+pJڷ"+ZO)4hT4%yq.Kg%},i'ާݜf jɧ2+kģ{7Z1z:ώ ugQٖ+uX<KEa9h AD?wx$kaOKRZQS$(ya,  3<\ /VYLqLϋM=KTD?w[, ֭i?PWO'-֓ .T8C n`kGϖ-pRF(I\h1kB[0s FD [xo6$TާtFBuY"OlV+(gzB7O{ á^h˝I([DW\<ɞVx4c?:e^|={$6QJL5BLCMΐĖ\ynx$q}=l'-e"Kjsfb{1qYXi802g.ω.]差>ԮvY;03 GCNe>ΟEBħ`޺ӍT^MEZ.}IkQbQ9 @WȵIT*roj,GHn>^ !$jgAt. ֭I1 $tp pw~sȒ7)daµq'CǢ {)c.ZLFh)rd:fbڃncocM}MkKZwvmr{w~~ . 
ƾt}7Z7ێWbJsө;bk{\yOsYsf8se%I3y>/ǫ2Ydsq 5S/0}k KėE&$?@pTH3%]n{PODز"ui5HrdY%FܨA.|HybeDM0#|9C`DHF% 9=J2Z< Q>7t ݠӾ%gk;9ezMkQ:";北XIB(ӷ?h2ؖ7}{NKg6p‰d|LF)c;!S1ҡy霍RcښfPQ:r0¨Fii%a%WnY4quZM蠅QdTh iIS77FHI\d _B^}34IvnA12ln6zrH3yH$T e!u {=,J l@Fov0] q7NLkPƹZmO+,tJL)/)F냤/GeOw=kM,;GFxCO\~'[Fq}hwK&WPsfq.ĨBU&Z&B^.^Й}r2W%eW'=DKxGU,U3ljypu}Z[B BٵZ1@Du*{;ezG/5u$Vy<.kɛQDVc{ DLljIE,jgu^r1굆ɇ닏<& ĥ\G\{R|]#aIPøJZOiE `|1J^BC^Θi-ĝ y9GT+9% -R*و3R`x!%>KxEL 2ګlT:Vup_=/"rPKĺ~ 5NYf8=Vݥg MDJ5nD""ӳ B4~>]ԽrU߹m4^kO%QQZ09ޯ_:QOq –^*䅇b fZUz2ȢhT A8,(_/`i$ŏlZ򅀟|Ԑ =3B@jN_cMdu+7nJC1I λO])FݾvúlRPiFsK))0_qk5ndQ܋sivWqQ!,̨Edʀ0J<oEah/̫SzJ+P\|v5`샙P6ECi4XxR#j14,We&Z<+[Q\AS(݁J|U]nO;04ZR60`&I hU'}=0$˴~m[^#wE'iv-oNr@籲I]muR!+7"*!"F4t'2_,ɽHwA_(R5jq( I_CS^H%& !B!C&׫U;8О<;漵B31^^o<=R)>)i>IQSF>WI LȐc<67ݒ L^ukg__/ŶE88$~Zn˝709#cQE4i}a!ٞ]nGwoC|iREtiLJTjl,/G1Y*v91DMFf۫w0K{C\R2PVIXzXM2-ޠ32v[Պm'{Q-ʷ:>]5)Nvl*9- d:#sF3!ǣǡʧ!+^ilYiv׭<4pl#7>dvҰkR=`~Qg5݅U[(պOV< d  HL9\ Q Sw{M!L07WQ !eRNm0'4g7IYQFGyNփ&ÓJE=OluK-:̹09nt%7VSt>z o֑'>%g2[_>\{p>pK'vpN0cc7"Z=ƋoB>Xh$*+qD$)_}LJ#.tƒ+l:*pt1AYC[x!}MUY²β5=PBPOXˡb*UmO]x l쌄L9: mq!: N&qrxM/,a.'T8s \Q-A%75̛6(;g{ndFH*JF<@79zb8W{`d<8`D-zFAN)C'7#z\)A)Q+[F_ZlEwH5|p{;B\C"%us5g3phME^$38;^E R"L&Q﹦۲JDlixԙ+w J^X~S^|owaK4825B|) >Rљնyۊuc7"~4Ǎ!:غr1\Q%F `D278.˔gHiOJ\o)M\.E$hk jDrY zq#c/jgYSy:nMes7̡?:lfȀS q^14ڻ]G&T?Iyc>Y_lߺigQ[)8kV[rPo?%,g ## 'ckHwiuHR@&=FOgg5t8FʧJs݌m _>1=~pgSMR&ZҕʮIiO:Fuu2h‹Z`dncSđe f~[ k?ˀw7ߟ6Ae/uYDRôKCL4~3o&ÖWVBx D_.hsTۜl{8~=XruۧSF~@{GcȪF oȾGk HFL3>Cr!y\gx;|eHE!Ќܹ,RK:Uu~SQ2ƑEݛQ "D}hIŵ6(oKJn2  ىИ/YV;KiЫg{̤ ĞL,wT3.ՇM{!ҭ F2 hEk74.K]H iL|hR`e/AFÎbߚIlQy|*3]b&@-/-@D.BJǘs{H>fB u A@Qh1,}%_8Nųa礝5!^SφAز>k}ӜpTMɏ*mAxaH`=5hFTBf 'pcQ&4PC|_Z;d (# n'ɛU؍ϕZȀbYЬ`R >QL<ETx^)[HĶy坈G&H8~raA#kq,E!UG%K;>;8$I,xRb 2H3nXETE}pJ8nLr"e~_FoEz L?b*f]'F ̀31x\ +˿3Ulch+zy: HJ Ϛ?HYV5{jue cCJ:vwxTV[g),Wv_F12_7?w*qD̷_2)M@z.֢s Nld@Z+4?˗bm!Ӥ1,pSګY?lޖsH|Gj yx4V"yh- 1ZvOBڜ!:BLԌXGSzeo*LE]FܪH? ' =Y0N.g`=vv9BÀgI VVlA=# k4 Y 0`Kz(w^kq@W9EvCyHe=25+.${ b^F䯃FbYb RZ)ϱ"̄^2s.FnT$A~(1ARyXu=8O)K&7SVB09 Liyk ಙ\6Nӫj߹ki2Xj~ۥ&V(7Q-eY}BΊ;Ͷ< 9t`f|NZZQw ,v͐8Q?6BS W(zFx" sjkwc7s#h$A}UTF׿gJ9֫C='u ! /w!ҕy{J+@r?GaipH͚vY1 RD]vvΓr~Hf^ɘi\c5K/R)24/+y~Nc|ڊ\DZ^YXvYt'γȨ\[ygD* ז 9;ɦj; UWv԰Y hW;ю7t x|s=u'EONDe9=9&hx zVPTDXIqTui#rȡҭzĹE.p:%؈ 7K2fL9bJo)p­ZB¿זk 6CB q/# FROR|b'h7=0Fdl%-m ݸN] $ଂp dYTqЖDQT" Ji9\ۅQ*MńU@ O=|dHMY EG4m)>$DW<52v׏+SNW{cb(ʱXK^nY8]k5Y߳.Эtnb{brҪՎDMӫpG \~Bp].hKq+/?dH"JlJv19dؓQp]GKaͭz̭ ѬVNefٗiFCǁ cxCnT Y pd>DX+ 5"3Vsȥ#;K)60w.7⟙ӣG =#EC1Ioo߁ `|"C78D>>>0w|79/'f.z;oS\=]r.>s""!9G0FG.'QT/ܻ,][+ in$,m'sX<#pgΖ[#CEO.]y\Sف{TLg-q|={˄cww_a_V*vNU;>SF jVymo?Q_=:ףiZ?Gљ^c۹~~^kF۬תkfȍ{\qYgIR7;$ hݻ4j t3?|نOno~NFGNI3uhnOЫ̻ksgi%Wjo=4oTZXN$ب:VZbyGج:@::c+f}@lװ%]1bM 5yx5޼0^7/F9P}<2MrznfK"@Iӝ;>[T^]eÞslUuZu1NWvo jT=T8h;K2w8dAEvhܛS[W,79g&s>k]]:iT 0s#W$tqEGV[7m}=Nux{9qڭLJyr*IپA:exu۴z vuCG͟aZK묅n p]Y&(,5ky3o) v5P~8Juf}8[_NfWBrs_YM;/X ԹuYt+9\z3nou"`O~a>D`4vmzu5g3;!JZ  ȍnIE5L1=]CWΌUVoeC4d_ ()YVVm[FY՝C)}hv$@0UíBGEUùb eFTQ2伭p<}EcI`B }+-!_egR2xR"RӾTOFx#~E5S2آ[5vI9| ų~!37bO}/Dd2N-"ĉvc v: 8>}to}lj>kQ*y%tFrYڷ~ʊ5e2קu-M"5l=Zl`+9L(lOX(\CQoKof߬w5Y׸|xM|j^3+|*_R2f?$C,N5\qzx;>yڍi#;(v^oVU3̯=~&3;E`"u'iSy7of\ͻ`dl-E[]vu~>{/P߭^jRM 3T҃KjLL\Q.كjZ~kX=H× Yܜ?__hjW|nW)ܨ 6*m_9N "u{>Ҷ'j[Ǻ{ex[86ds8. 
?n݄~k\,ϳc}8YNG8/qɁ_7xDeO1=>VKme,ņ%gY{7(cj}jOLD8cyhv1/q6H4nKEDw$ޱLH`avlqP2=\<5}0 %ë?[^,G/ëK;_S$1οgK7XuFJ#<V_QNf:U2V עvg~=\ "5yr[@B*J?uRRyQ;ߵ܅ mx}UEhr=ktf+s".ÀQ?1gMZP+!?-H?¹OKx)|Onoy) { gN-fZR4#gL^ǭܺiS=V5A\Xv +SbZyG{ٿuFmE0h z8^4xeU?g@H'ԫ:#жf)Y}n30Fc/"~r VvI6f=SHt'cdk=]d)HHNmu {(OcxFSE ?#n܍7hߜGw.KdMFPkKg@;-݈E`K"mS:&bb(hP:alʷ%4@fGt iT6KHd~ڶ;5E=н3 (W.&4iWlGoXpsx˜HSvG*,k$85L;@4IlBUeb1z_ۺ%2RbaF4W:nsuh xtnepj3PES$N e^wq[-ݺKPsn|~I8~&UD>S:vč,vmv(xɑ9_Q`\9{{ss/kO0﷐mE885] }X1 Ɵ|tqF6B'קltN4tӛ)Nswq:A85E'y[BCZT s T~tgx=K>^T/dUnsEݜ>S 1yvIV(`B{[˃ZNX*$佧rHN香|L.2 R=Ec5Zօ!zZymrb:]CoabG':q",sFl3? Ŵ9W߯bpsIsGf < HB6>==yԏ++Bl:*M*>tJ٫\ӫǕα$FJER&xk>2Q) 1ߚ-x"JE@"5+Z8KLrJ^DfK׬pP/HQMDkL@N$bx'.=֌yY>/Φ˫8hjDw2a̘j3a??xy*T*n^R&M|tL'\]&_+et~_ǃ2j>K9MuHbC]#5A#,JQGW8?J.G=W2/5*T a%Mm=ޑO0$GNb0)͈a(sj]"wճZ*JaU5"}]@"TlV{72]{<oX\% :Jma.ء%?(u!7n7۬d+XW-m;^j o-xdrd=gƎn-A^-Jn()ɛ!ZSOҲ*8Z.a0'kSɷw4czW##W/B#,9CQI7*hD9OQ:T X(be1hNԎO:Q$")o8Zz35L-j.3)YݩpOp5%vܲlo%h⃏=HEF8)6JR ߹_0 enVpZ8$mqcl5v <:\ ԫZ>\:ՠYO5K 5;k1V.LhWFJ;|H!^?<;tL!#k4?|\L˼8=Z'PR| !bnEnR>%SR#aw!zT8Iμ' ##&lWO8CQƚ^/Ǥ0×FTJYDA$YoGHZ!~\fK= ӥV&~D1Hܤ<WsvHHjS\r"K_Ԃ`q9: sMqbLg9 CIp%`qw|S R?>aB]|r'o~A0 ǽDP;ՇvF.L!& ٷdB>`|ә -#G$%aSdq . r{͜`}Y-=!f$+ K4FU,=Lȵ| +0[RcﯭlE ocWDOu .Y2 㕔tsON"5+ ׄt&i:?љ1>;RDT;|FC@t҅ !`f(Y+u6NmB"=kQ+=-)XCvQåYy56%N)d rK>LW^_֖mԕmW :z|J1#dӸ} +f21%u$Js3rt>Ft$ie }uxNɧ` _$?U'Փ[O'Iݭ3Q[GV1+5+FҨ3֬4J0q79n5J]^ӫKQ .0nEĺMu9צ|0<=Gb99=\ qt߿`؍.epwJu:s2-)gM!H\H= Ϥn;t(j`9(>Ғ1ږv@A_3pjd#ܻE][]M\:sS,ժ{Ӭ,$ 1jV qgaCۄX=9Ѭ#g6پH2;zt:< "K"kv5iBa!Z~ ?C.D.axIOOc_FUr@ñi O%cJJY2-XMN* Kh?1#2KIu1 οHl#K~ԻzPxiugU7n,[ӤT=WƧVZVHC.G9 x{ZV/dc}[sUZY>/gjq(`X8GuZ.,[M=RYaow%RfԵ=ak< ɏgDe*E] Z9|U4_ߌO xtf=4_5ٟ޳Jڹ=:k tznՐ4NZpn !l:.ީC:n+8֒ hhLUfv;7[)yHUElvr"wΔ ˪ùyKGs-$ȴSM%G"V(\2'9.&okMC:Ʃ¼x@CajeL%Nuv숍k/ p:BvV 7s^ %MKJE`Z5W7~ ((`,үmU9>#~L>m̦) t?W Xǟ-kʕLa|X#dYɄB- !.yN˜-&¹w_8[ oFq]\dSnh\c[;fWjmwE`*Oy ]Ը;Nߺ]"a{*~FV)-!"I;KyE\^WXVn{s|Mpwݯ|+ԆHaqtjdv>5;& "۪w@Ηouf[[( xDvZxtwg ;'ujR׎WOwI\dHknH8{8Q(Wӓ:5 dgg_$]Z' ߕx9왠-W˺tԝj]i&jt%أ+uR*ICsK㗉..Iy`t(ЌTʍJoӛ)G %'\dYNO'[mePX]I*;o_@IZ-*1MSaA]/= -Ow$ƃ]*mE}įN%(DÇpBIiijUlY6{,'; OP"#ZC r!o(@n tmZ9[~i0v! ,RC54UUu-/we_ZӤ7n#P>e6|Ջ£!5`0j6?8RW<̓7W_~3SE} $_5sl V5k7[,&=2{.¶ k͚#gx Oy$WZC~a^3)sUR7=<ղv)܅Ptj{c'c3[Jy#Df'dvnT$$[CN8`:vi5 vZ9&l- ;\3lo_ꖆ& 9^^Tq`Kzpҿ ~3h<=B 9?vs5eڙDS0'-+).VRn}b=(+5ĵ~˅i}h͜ yx83g9 IGb ۶5I}7j7gwQmxzO|?qTG#?Cl?~^+zCe0LNnKɍ[`z㾓1Ylsv9AcQlӥ%zx w !EaMRD85?)'n'Q>92\ڂ%"؍=ˤh]0SʹN3ÊNDdFN"c^y I>oΣyCz18-COXo-iG^l)zcK^,s Z bg]mךA5\2-YIkʊ6ߜ Ud'I9\W,Ù$SQDNڶYQx}PʚM (S|քzDLuǑ* KGϩֵ(̖Q#V 0-sS4u8o3P1+=Y#gZ/ ~ioP P_nW3A}|ZmPQyGgYg&ַ d@jl5w>黍*>v.R(u|Υӥ'",v2M}75w*xx}(ͿtU#FA XlgLvcq鮝.[V-\i5v%i @jloJks^AL\wES)OmjZVT?[$CV;.6' NnN$&G:LΝD$ *$)qbC! 
Dx泖jhUM%I#f䯉rJ4fg|4[onv54z2ZJMwDڞ^n3٤B]~u>uI8&W]HmTU; GۅF8ӫ]N=&^zSb>4 Fl5p&J[uJSpFmnn*B_|HV|y>*_aT;/;/t/};[|E)/]>g-C_m?LƃSt: !`I$ ֽ6M[t~JXccwAyzTv6Xh0[Jt-ZNOڿ!ܯu8&!Ee7EEoĈu/rZ:=JuF{ Rab4rbv|B- mBo7(VvsX@^XJugb=m7Q}Ba{ rgV35s7iQv8N8 y!4.z_/ou)֨WZF]1<׮b۔*wMI`9e ^JV䕏m.؉zםh5cXWg;a*I z0e) YƘajZ:+ɥ[Ak(BvMa>6_DZgwRɨ?u{2 N/Acz]C@e TGF@P\WK)]j6ttLMI2:u84) i#ѣHj/aؾIҤSlȉ.7R<0^b:ގ\fУ;=W5*^fGK]}Cp|ٿBV|Ʃ\G(i>NoqR>LBR.*T5}Tb@,C.07`.Z%i;>Ղ̧RE!s7nZkoԮun$#0.2)7SF1v?ek[mt3=$u# Ir0v;&?#ίDžq̈67`<*"pS3k4"_d1F'iGgV ףhdLt8 #W-^%e*gy PGͻ=Ho<+ٛE''jhmaA5rn\- Fr6cx6's`-7^i:Uf-2r%.MgT@esqAqoW\1\8bwb}QKuRꕟ & \1*`9x;>(Qٰ^h|a>{64V^tz&Dkw4v'OU*>KGEگOI/b ţxk,IlNFgarKcy/R;We0MW\jh Ez"aR2@/6Y`W.!~w9 \'+tFpX}1tnYyer 33 xĐW<qY8<̵0bTP>%,7Of 1d|oa/%5[} 9A49_Nfw!ZGϊ43wi7 }([a oVfCڲخ$.Ɩ*nzc72v`!M MJ&>}h|/Nψ ?6?9 Y\ u/e22C|MBy&\8rtzyjrox@8{Z?R)-;΍N_bӭw4BrJTXNeRu"ʻŐS (Q%s{S,җ[AiV,{2CRavb_vrUܭth%3hC F,!J2laq_T5zk 71'7닛Iip2鏿 #_2=ZAY:\-23rUf}"OA!ȣ> *,~PTtN~$_W?@Ѣ>Q\0!J8(+3.i3:_LKhGSG -Qi(ݫ{IR?V M)?|%x|&rrE4Sjy4PQxW1+k heURg*l5!Z{X!"3 꼿 13Rx%*o;<{ZC;0% "_|ԚE7kx1 #OL?l鉲"x,X.fUs\;~ƺݪ&^wlUL\ ՠI.^m5_9RGdA2t]vzx6s#JDd F"UD$J:(+cЬ㉁K¾5ʟy&2(%NjхZ6ܓx,Wnn'܊vVIoswWҶVKF@O wy:;,k}2HHx{D г(EL?BΠ Yk~% zکiirY$ =B_ XʬfGWYoQ! QCFRV,szk{xxW_ieڛ3(*ir Nx)^G'᝻ 댽!A`ah4-V ^D&+tng8LuOXKu!z΍BOw_3 &Jd3鍎_9ӻTЊ[wR4eJFmwa<6-ģu 'gٙgTczsX̓6Ζ֏ׂw(qtVPITL*/SYeY3S)zݓu%xp71K2"r< w:%%'+r[RGg jêf֑BTku oPd7n_{/'[v_q[+ Sy3q+ųr5?թ5+z7sr`uϗX*> NvA9Hkh1@F}RU׎YիtPըtJ#lq[7[rf٩Uڍ.^i+M&JVKGS`|E[i8vYdj+ǍfƧ+zn쎨qܩ:qiD5HVY?HK1T }TkAxJ(=ڭ4OK/,Q I%TLHС-t&}dCкvҫ7Tw36\M9|8?;5'dOGxmrj\ Y!u9 0ƾԂnnZixD2x/l2.@=v5E+ЎȦP/:C7}pG:ǴGߪ[.3M$Ɯo?F k֛_tu4bɖNrL%e<Zbrc"98z{V!,z{kk )@/X:UA宍K }J-8āw !j;k!.;'[]w󾔭-6ZhHH?`G !*\м]F/VD(xZE \0rr>&(j&\h|:^:`>CE?B+&/fձ}1x%t}骣qz!I *M; 8~OP-i/Bnnw*)ve( `ϯL!p@2eTZ'Pm~\;?qv|g')GhdǡM2Yډ;; {V7F@Eeh }G`S4i*Q%(IUB\'~;]q )u ʈ<]+&"7zj%Ӂ&u"t1\2`J<'6ąsV)Q@懥J6O)R잯4*}˒ݜMaބH[ԢI.c/p ar\ν3`_tWr3qMӆHQ= d3?L'V8sdr90|4VɅU +w&E$T/_IlS@VCp@w*TzLC@:fҽ7 mߐmn&9P,;x8h2);RDww7~-VᓯU5}z!6/EMc3[ ,,憱 ߝ)2yȰ xXlΙog{b6.M) 23U>@; ei-<y3p&qt.ɫT3aqh:+*YyUQF6W5 N[NB{̞g:ժ9Фwz4R,yUdN~*!HͰ- ǭo| K„(#}<)7 ]rMr,w,̳T_R 77hQö am PY . ȼjc09:om<\8ˆ䗉O㻒CV`- Eq;IԸ Z ͋7gKw( BxMYpW,<кۿ ?m- ø1/W@\ . ";sX=Cf]Ȥnj;h+* _ ۺ^,t!`̣FJyScas#L|֡C"z-Ժ*q-"ybſB3/G: 9"ٔ&7Ó%+C-Jq`fzTW%ctX,ʔ5~W\VKđ܉ZMLIDo)xyHɖkg+D ӱzzEM3LoŃ8eEeZczLal̤gdtY񍉴盰5VNgEN#2Ybh zF=F eBio.+}pvUd_A"/ZW.i|ژөxQ0XS YL-ph}rSN}Igh0 3 N-ьAy8:a;z/&́^6&'Q 7@h0t19]^_ B70 _ tFm_"} i35dGF7 DR:In]?(]V8MY (ٷ+uL{_ 6&_UgsE"f%JԉFGYW&(*pȕ{m H'ӱYխѵ91zJխ״ oV=2wP%2W QIݿϏP|ycڿI MSiP9k^GI<8~d˺}+/KS`a vXtqֶy[pxCՇr> (XEp`E/Ez-b<"$J\hYWb@{  %qzUމ:!aWU~ y}7Z}Yw"D_`יp~CKL~+70#4jk- e_s%j>+K@%)h!@yodAV68ŽKKdyȧd_/ P o^jJa?6 V̑Z_;,/Ž Ԯ &R^K穈ZNQɮX:)E}s)/oE0-oX eFFV>ܨ\'3.%)dbg(cyTL Gbjnj# `FgTs al1K-CmV W-1K_4<_,%n$ip~eM3c%=_U O4qac}/C,Z8Y(Q {[e0pdX' xΪ^"U4r|Jw] )53'SݮkPR|–4ÊS lFՖkO+vL65_(w&T^SZH(Ctr)\n3֣mu 0j`'Ndr]a``CƙA(yxC=W\˟-meR"K/.US]9hO`ᨀPIW+$/FA'OqK;8|YD-)"zꎄSV-ҩ&%,?OڽM%=I+&9SY w0T"-QEqfUy_X%$ Zى&tz9"W{v\/4e+=y;g(4kV)zl@•Dʛ*AL^ ΘF4Pd~ix,l~++˕H0+Z!ϑV]~,"ȍ}[ˑ^g^m]N9#B/7-W& Z}gVD=,S[`8lVe%_5ӔdQd@,GĮ~x[2cf8HmhP`ARm1cv:g3gwH4b)6A+f} A4JCD%;ӝo %C_I0>&K^,LiYB7=I3w)d*<Ë\՗*Ӱ?mև6[&#=|5B̐"]4"vm\@ɹ"P_Umn#DOr^XBI\1Fѿp;.}:$8r`ltCa`QK}E5PbdQcF>n&G)'{5ľ.Ɠ9Qoč,A<-I͝Ga:|щ\dWIc"Կ@= ""u=2ݩtڍq8ޜeKŽ]+'(H$@(B~c \:Dz!azsH^ԄYVLqĥDeVX:jcy>[ yZ4ɦх'幊D%eJ',}b)^E3h=@Jzi }q/k"tZy5節CzzH#W#K7,s[]Yc{c˺,bgi47uZ :wx%gΠ%,BNǃ>5k!UIҢHpKz$BI z}$$ZMJ )@ a< KeT7yT &S%?P'؟T>y:hFCwBBxf0ER,NH?)vqc2={R.ݛ}E(`ȟ,޳F5޹jɕ?]جה0.bUZ&R˵zyQo)sV|tNm#'* E>ϩW e%,xbg|!3-=.ChJj,p'<mpk9v{~mzɅ%' Θ>^n&(܋t !!WR.;il(,ju}L}EhR+C |$NJø #J7;|.0T_}  >. 
:{oOI Aa|ENK?OoSY!$xLdl hKC5R}A(),;8HnÇz1hGK{pdh*#mQH~Jާ fC?wpI<"ڻ֫0hAC<:JMP[""kɋl'hx\w^m m,JJ024ac% @3*_@3$ݴY ]VҎ{쬤(boӂX=nV@>T{yVTCT=B2` i I_OKֳS&r$0 0RPz DL3\n)6Cwv!Z.6r<Iw| r}gLEZgZ47 ӕqёU`e:#"/ęG͈BJpD8[<z<~mz2^|x#Y:Q-M׺5myf6[q0&+J@Pr}-AË'c|-}ky W`: .inr\<6/8_~σbsr>~4::;$p*F5F+"aydR>a+5A+X P8 0=eAS{"'}7 ֏WvKs=H3  LWg3e i}rÄ(<$j.ݳLl846KSUP)j=垟?X"Ua! `',8FZ4zh[pOW)7%L3 !sӮNVe +- =1mE$J3,dKAp_ ~ xFGq6gzXe34j`F֙HlGvW2:OzaV?d<{ V,+SR"*%:)p\EI5H7-sCyuw;&fe("$trk@LzGOaaW9hё!vNJ3l2YdRyN\ =Al.'}^^dNt]BٟGjn;={9;ˏKZ+:`XniNU2޷)ȫ9R,(n:9s cFծ_-2~]SzLmȱ~ZmN{g0 ")\J7ϾkÃx{kf-e]н#=JL:O4 VӜrlsԾ]uk T!tz9$4-ֻ\7B m=읅Fr.˪=GW"?.3V放e햒48[,jO.KΆ49ǣ8knia13p9D g')?P(U30_QW͖YeYʞ(. tp'Nڝ%I|_GXpHDLC7nVq4cĸMh֢f= 8ų,75Wʷ<ҿwkыk5SLw$Zh<[ d)~Wu@%ہ'B#zhǓ; JØKLca@ ܃n~gAZnle4D{&KaD[T%K{]+NBYS+ի>L ߋ[.d@=W+m=}֐K% >v"983Z_tCh_pj6<| q-hJֲ`L t ҫKA)_m3.A]Υ7s0͙>t=Te[3O@/x>&ʢA]aWB TX=a*9pm]4b 5},e::8=*|iÜ|Z&*Ϸ,Vk,CÇ'}0CO#lLTFO=&ݢ9GZ UsftsAb j#-IC6FT-bg+kLXd}7 O%RkV[1l&'))ԅ7R=&hB} zZ¦M fU#?Ҟ~ &|ϲ8J8^B60{$tRiFq*OwZ ,);J#j#"vIAe om&M&4Ҹ*>W Uy`re47ɝt1s?߮k</"5{V n62O f֠Tث$pCʞow#5veQmEZM ʣf7sRs*C@pNGܧ WskOd ̘KI 11ȍ K[][kFb5\fkm.bPʕtoaƔS%z5[HƺҴ/*U|%Y辱Mll-)hx<ˏG,@橤YtcnjKx7ɵ}!쨭.SWDBhL-#a>|u{%4aŬV"\0b{/w9)c6{T^:{I.PbQɅcĐ|2|4U vHG~Kn;`hp}= 8~'6bU4Al*GLS<}tJ;7y`m̩?k?vvE'm%"66&ɩWbH&.@D]Q0[^Fؘ~N,(JhP(!WZOA݂BLV4sf%\fkTfo%ß T. T3'vb Z%cpv[EQZ: ~ݢϒqBzlDٍK7 Y(جT(,tZ$k9?,^=lbS.y `kEZ\`8dD!fME_]>+ޖдGr&:==- }zZ|lnpvҋx5&q?% ƪ& `(:5ُee/@<7{\!!T(-OK!O5?|XEYUpV)2BsZCraf7"U[)fK1VL 6"a:!ΐ`nu)on2A5dwmr|DŐ[ R`9#í{D@̤ 'h.aMc~4^ =v;Qu|(MJ Ԥe֖I\&,j;7+q\#l TBU|PA~ai7%S[j-ɶhZ]Z]ly+ 饕&x}Nm|GNG:קSKidt/%]y얦{%׎}6]{{܈;__G}$g_5թ7"F|VG- ZF>9??Ft~>axљz ĸj;s<$L?Q bA ȐweE2;)aSrË%)*ougCn׷۹X ,ic% ߲UJ)EG* XUH% 3UuH)+n0qf&4Q#a SE^,J,]7F(tn9z`\o;_ϞcaESK)}2G]Rȳ|l6K0o11$fIyUG];c3L% wF^MI{Z-'!K-)sNB[5;Tp0L<$PE887/FMC}IR.)dEQ|bH=/KG4-(H3'%/8e$VMW$0TϢ FϤ)T!ޑ$] w8!X:ad7BamMQ,eTjcms,z# d6;ju}U:(e񄚍|[j9s"Fq nb@ T7{7DtE6`"P`.*yv@ŒZ{W'Ǫs_EQ2bAO~#91SN^V[K9 Fh@IՓyS-{aCh;+,~Fm{)2*>DiZŭv>ē+Ոש~&O(: ΗRaQ2= >|[aZvLI$hf5"xG{v ~'mVs(+}of=c>nBݵspVzeLe3;2$:}ƎfidoH?Ʈ9 "N}kԚs"% ,TFVpwn9`w qQY51MZBT1__̾*@Pm`(m} */T@Z2~܉\;IKy AƵNQL,TGPDxqH.lꮁYC'00q7·/9B32Ubb_0vߞaim죒:sbcvV{u[_NC8"T"jRYi- :)%.-#P~K 6eV1NA,4|u$<|FzE؂u&`U=: fªcMYk{GUēfQ(;tܱ,I@dPY%RKZM"wbi={gA1[ePn.#5PΨw`B[&jIJ7y{C̾縗?fũ/ ~KVJB*߸pz<kmb@AۮX&REw.x2 eEk+9hkڋB>6b% GH7lh{`;=җuή]ۜ\2RUyH/8=@D,YPヒ b1IQ6odJpr`3 拔UV4w KwSe>_]3}%6&"]ZB={C~umMW%.վ}gf?1k"El{0/6r\"O>/v{G:r9nBRh~q yfM ZH/fֿ"j*N"9 8o 8ϒ b7K&r>0@ Z-t4̢X(uο?B.Q$ ϫɅ6t[Aaɶ }_ijR(`3.PnY9,Ȓtuz NuV+_۽o@a#kRPTCP t Jכ/XB.̉Qȅ17[ޘbjG׃@SdEO27ܝ;2_%n?Z5dk:qpİuqB8~cDg%=Sh*DAF6 m:,љnAȼR3Y $zda3~N'~i*Z>9`k4^ 쾇q5*8M:_ @&2τ T@N0f%J N0"sɇK% ^ $BeR[Rwm: ơ؋֋z%lB, (GV3IܠxX@dmLMy%e:b{e4} ~s|γ,}%R98~Ċ&EA>}JnX8-J58_),Wّ{袝l>[nV /ҳ"s&wN3#xԧ|2YE|st586o$Av4d Ǥ[{x\ _R+ؑO#QǦOu?jG,7Z+=}|HfzXgGJ$-c;U9ܗ&ge̊v׳5 M_]Cƙ[ȡe&Y-%䨃z`?ø{LXnJh!`F`V9߆#F{(b҇XWqZ=s?65ڿQY&юӖ[qIȆ?t/R'ZP09=E xZv_1cKWz*Dxiq0 x">mϚ A,4u=YTZ ~q[v(uXk&ϭ4BX+67ߟ'mpGj]( kY{* ?$NůYh¡иa~D% "#us8 6K;MP2XoF|Cm͕+cuŠ3\mecxQ"SIИ A԰ |ռs k,3%w[.O.9 }lf#ha՝&d)bkZ C6V)c8O,zbͣe-4H>I EӯiRGoNhbEcvۢ'5? 
"D(lym+2xΜh.X6?bvX;,Y&q{M³^d8Fց3Dօz([簚ǻߡ\#4d?ܼ Dicj(` bdB+eOZrI,2ݯ i&!+rqJ:Y;g|e1݄lA*~;(%5"#`j%GE*L(.;Kz/ҏlDF K)cqk[~keK,H__Dh2O8;b2#r༛PjrpP5F-IkD^)vHX2r/v1@9|ٓ`yÚ7l`?dXy~#% ZMyZ]3smn[c~*l!F GCT7@<\s;%ՇTb ;< P•Bd"~兀t;I4tDQP@kv@Mtn"!ƹf"yF CnSv{.r"uE=<ȯ3'Smc #3kGH+K&Bj Nf+İ@%Llkc^R˔_\D2xo0S5y|^5ݘxgeBy o"I%]t$^P[nGZ*#i}|_z((drv]iaOc]`e}%vZ# 6+ZIݸZ CN2 [IoG$b<^~z_>2^LיS=Z}ł6XʆY-ӿȻ|:+rs.xp0uaI+#ZvcA+V"ʳN#4uU~HG7W@s!ړI.8&]Y L)wڃKzQ5JꄔeQK4lnѲNN|n'xάV%SP(ŊQV$w$BH4&ۛ:W8 >9zn|Tx RPFlI}#b`(RQN RccQ87B4;uҗ$5-GUz r/Z ,dTzelΝUjo]cƌ 0nЉn Wfr(V]pNܿ]M\YlWp-k ESAjru\h0cŨQm<WФHzcp<6 Oi4 ƌ.|4< k]uZG鷡*J}\i4{Q^'9)s1?ܭU!IQԛxZ~0Vu+iۥf-^ >'|Xa\R,KdchyN-'pN*+9oolvQ|SIҨlnVZc_v!/=+^Mv(\ ufu{j^S &:M^Tڝ [ QiՍJܝ9kdmШlw>󜮪gJ2qs{ݶ !zpW:mq1F2hcb8{.tz)a^?P$U; 'g}`^%]SEFT[kȮ4SV}ܮTc[H eh8HN `=o|m@AK=_}:Od(<v]PEN̡:Ols" bFΎ~ڈTk gMid-R aN?[cnj՝6]lLf 6Ԁgr|mB~s_嘸L<~,s!.˜+9Yh*23_]v|ڂȴ~}dMJ 7և/3qɹr :-vE_FM`Gg4;4Nm[FNbEg qsX.I7lYIS3%X.ҩvlPng4{*$R`)TNfvXӄ+4Ë='3eA:Юtp*#tJU} ٧{NIUlaP T RnBgyLOv"MQ8e44V?a)ogLeG˖EZF OЩ&-]lSbj;S̞ }9J1if3㉷f[ DH=K4U5FQXHTANƁO}EfpFӿt<\&ëGٓ)ǵ@ea?BB&F{^XK45&Ʊv~н5qoy2@r(j}=.o F!@C(! 5XhU<9kI|v$R@v@2'-CGLʛM7ϼՒwByܢbғX0"4b?L GaQyo9g[TUEXptJ]VS}c!7^k}2Mm:NS +J&N^\jي!)ݴ7n-Q =2;9}\WP*x'"0^:5 ӛ`i +;:JW:/3)ON@ƃʓI]VYH;NtZJ`/`۫=^}#U bI({G 9YQr]<\s1#FL}uL{[r5--d}*],M{ûu~FZ^/6'ވZ]oSeKD8[U7cN(5`䙡d@h"s n` hex7rVq6wb樓I~eOVЩ-rVսp=媔qi9T}uMbwO )7cn$ N;_*:'ʲu='j!ywdx[ܱyhq/ غrg쯿7pId&TtJrS5xM rUySzJm *hVV2^%ˇAiևMki_+K|[Pd)[_ p;Gv68ޢ2d9UFجuJx \sW6LٍGB f E706:yG4`曨2rb6{v_ǯcG^o$eQ CbE|_\$x3v::2!1:%X vNUm$Kּ9D&QP)O&=? gqٕ`D@ac] [)gIX)dQ<"Op2$nF009딭2+b􀟭9 3"g]r jB#}>5\TSjBqx:K{X(.TC"D7,2s.әYl&ReCQn#"ׇX@`PuqkC.Wk%?&竉ҒMn%_S">Un@ TjЇu(β{2]=Eei^p;#񲼹4pȕRۻ˵NUV0GALYxH~ 'Z82G\$O>{%q2\axEُ S; =W%uMZ0Lԁ DКS"i?qۉE,P% ( A%qYACk4Ou~lpڿ4CZtO9Kk޲qMj4,rVR J3>z @&q95[/uR+n'Kc`SƚL5~=cDKxNǢm M6M<_{1R@Nhp;C{,$ՃUyp圥8wR%LJo90YRa+f b)z_'SOD!/^Fy't."ɝ%Np]B;gmq 46Z1ubxVDA,! #G=XdNw㤍yoQ:TkLM+2.Tv+G+/ڼ@_Yݐ_G X57t⒆h̓. U{٪xPJը4uoAŭaY`T=7-^w^r{CXMjrRM"{ jvܩtp?+7q4o$:J>ƗN\Gfaʣlpdkz 6N:fH,/p> z9߁r}sUԏoR,Di1d7$t& 2W@$iwG6'ҴEt&謘k AkһyHMθwKE{ɍ@#.n%bXڗӘS8s\Π}1Qn;%Ɉ1LnP!ɧqr2rv7SpnNٖݣϋt.VYq]H[i EÕe5%wX=iijGW ,qӱ&fͳ?$/ڍcۼqN'dʎ:J (p+]R쳤&bgƅb Qy~o]Bk.خHPPw^'lfsr]Վ^5Xn FVk F1W { ֎m[!Tҕ)>(v|@Xk`|ȍW<`p "Ek;rޕۦc}zێ wNo[.J#WO,3~ZfE #Nq(t[>OFKAQ9+͎pcnfH6EKQ""rK0:D)rI}1D7!+.Tjtgͽ~L(\Kx8sK?~ryҧ7B5XTOY9ɀ$3:f^݋'$*h!%7US,E4(> GxN5jNW%ԣj]q|Hs3t-@c[x\f3za7q0vȔH{D!)v}! 2薃Zߢ ƩPppwY[]F/q.#i ʍbGO뵤sa+Qx4ܽ^BsSt8_.CZ|ҽ 87#NՁVM&ȯT*u:6`Z:KNGS墂ʷ9CB e{bzʽoι*Ww L aڻX=̈́~Tޑ`:KU]7FÀ QlNNl2|?%Cnb{{LJ2Ll LAԷe.DKfr4‚e>N3Zvj3kZ" f-7$vą_MF8qb%ڳtq|[$N ͘1jUN+g{-P{͛Q}D iI.ch( '@]\ږbKzguj[6 dKНi͎+n#Gŷ6h;9gaAVV䜳βZ]>7ái# (*QT,3 e;!0Z1/5LjSZOfsl5'⵫§^&xVZSVu$?p.W("~5jj!ț$7$K՚ ɦo`ru3{[jx+o8P|M§V'-c-n$prtp;~5AMGZove|f/ ~},c4OCOަ^zn ;‡^!ZvG?4݇O0noTs"Z|Ov[%YޒEl\}q k=HI9&&+9L |8{sBVgO6߬dU3f.gRv~rA-K}ש 'ܢ;dSSի'!Yv5Xdzn45ZUg+1j;v[k^r?c׺YO٫{ܩ6{wָݮ$&gKGҩtLj',(+)Ϻ;n/O` ʶnRr:$0vFg#0 t-wUwp Ttc*SSS#?rҔ;p zpG |lʨ#?ݥoV~6 #Yz`cKoS^QK?uіuK޸ʏ;QWWm[~tv>ip?7MZ}ѐPu1K ȟCIR^:`˿`_-q?xhCZk84lɉ/~Xc'awҟ;6j7B6ԂpFIAQGuq%N6,ْ o!:C R3!v[?c6FK6~q ?n: Ḇ=t7'6piŝO֮A&Ο)wWm͛l=~Gp.? 
Whoosh-2.5.7/docs/0000755000076500000240000000000012277504634014026 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/build/0000755000076500000240000000000012277504634015125 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/build/html/0000755000076500000240000000000012277504634016071 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/build/html/_sources/0000755000076500000240000000000012277504634017713 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/build/html/_sources/analysis.txt0000644000076500000240000003200212254366350022270 0ustar mattstaff00000000000000=============== About analyzers =============== Overview ======== An analyzer is a function or callable class (a class with a ``__call__`` method) that takes a unicode string and returns a generator of tokens.
Usually a "token" is a word, for example the string "Mary had a little lamb" might yield the tokens "Mary", "had", "a", "little", and "lamb". However, tokens do not necessarily correspond to words. For example, you might tokenize Chinese text into individual characters or bi-grams. Tokens are the units of indexing, that is, they are what you are able to look up in the index. An analyzer is basically just a wrapper for a tokenizer and zero or more filters. The analyzer's ``__call__`` method will pass its parameters to a tokenizer, and the tokenizer will usually be wrapped in a few filters. A tokenizer is a callable that takes a unicode string and yields a series of ``analysis.Token`` objects. For example, the provided :class:`whoosh.analysis.RegexTokenizer` class implements a customizable, regular-expression-based tokenizer that extracts words and ignores whitespace and punctuation. :: >>> from whoosh.analysis import RegexTokenizer >>> tokenizer = RegexTokenizer() >>> for token in tokenizer(u"Hello there my friend!"): ... print repr(token.text) u'Hello' u'there' u'my' u'friend' A filter is a callable that takes a generator of Tokens (either a tokenizer or another filter) and in turn yields a series of Tokens. For example, the provided :meth:`whoosh.analysis.LowercaseFilter` filters tokens by converting their text to lowercase. The implementation is very simple:: def LowercaseFilter(tokens): """Uses lower() to lowercase token text. For example, tokens "This","is","a","TEST" become "this","is","a","test". """ for t in tokens: t.text = t.text.lower() yield t You can wrap the filter around a tokenizer to see it in operation:: >>> from whoosh.analysis import LowercaseFilter >>> for token in LowercaseFilter(tokenizer(u"These ARE the things I want!")): ... print repr(token.text) u'these' u'are' u'the' u'things' u'i' u'want' An analyzer is just a means of combining a tokenizer and some filters into a single package. You can implement an analyzer as a custom class or function, or compose tokenizers and filters together using the ``|`` character:: my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() The first item must be a tokenizer and the rest must be filters (you can't put a filter first or a tokenizer after the first item). Note that this only works if at least the tokenizer is a subclass of ``whoosh.analysis.Composable``, as all the tokenizers and filters that ship with Whoosh are. See the :mod:`whoosh.analysis` module for information on the available analyzers, tokenizers, and filters shipped with Whoosh. Using analyzers =============== When you create a field in a schema, you can specify your analyzer as a keyword argument to the field object:: schema = Schema(content=TEXT(analyzer=StemmingAnalyzer())) Advanced Analysis ================= Token objects ------------- The ``Token`` class has no methods. It is merely a place to record certain attributes. A ``Token`` object actually has two kinds of attributes: *settings* that record what kind of information the ``Token`` object does or should contain, and *information* about the current token. Token setting attributes ------------------------ A ``Token`` object should always have the following attributes. A tokenizer or filter can check these attributes to see what kind of information is available and/or what kind of information they should be setting on the ``Token`` object. These attributes are set by the tokenizer when it creates the Token(s), based on the parameters passed to it from the Analyzer. 
Filters **should not** change the values of these attributes. ====== ================ =================================================== ========= Type Attribute name Description Default ====== ================ =================================================== ========= str mode The mode in which the analyzer is being called, '' e.g. 'index' during indexing or 'query' during query parsing bool positions Whether term positions are recorded in the token False bool chars Whether term start and end character indices are False recorded in the token bool boosts Whether per-term boosts are recorded in the token False bool removestops Whether stop-words should be removed from the True token stream ====== ================ =================================================== ========= Token information attributes ---------------------------- A ``Token`` object may have any of the following attributes. The ``text`` attribute should always be present. The original attribute may be set by a tokenizer. All other attributes should only be accessed or set based on the values of the "settings" attributes above. ======== ========== ================================================================= Type Name Description ======== ========== ================================================================= unicode text The text of the token (this should always be present) unicode original The original (pre-filtered) text of the token. The tokenizer may record this, and filters are expected not to modify it. int pos The position of the token in the stream, starting at 0 (only set if positions is True) int startchar The character index of the start of the token in the original string (only set if chars is True) int endchar The character index of the end of the token in the original string (only set if chars is True) float boost The boost for this token (only set if boosts is True) bool stopped Whether this token is a "stop" word (only set if removestops is False) ======== ========== ================================================================= So why are most of the information attributes optional? Different field formats require different levels of information about each token. For example, the ``Frequency`` format only needs the token text. The ``Positions`` format records term positions, so it needs them on the ``Token``. The ``Characters`` format records term positions and the start and end character indices of each term, so it needs them on the token, and so on. The ``Format`` object that represents the format of each field calls the analyzer for the field, and passes it parameters corresponding to the types of information it needs, e.g.:: analyzer(unicode_string, positions=True) The analyzer can then pass that information to a tokenizer so the tokenizer initializes the required attributes on the ``Token`` object(s) it produces. Performing different analysis for indexing and query parsing ------------------------------------------------------------ Whoosh sets the ``mode`` setting attribute to indicate whether the analyzer is being called by the indexer (``mode='index'``) or the query parser (``mode='query'``). This is useful if there's a transformation that you only want to apply at indexing or query parsing:: class MyFilter(Filter): def __call__(self, tokens): for t in tokens: if t.mode == 'query': ... else: ... 
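For instance, a complete filter along these lines might lowercase token text only at query-parsing time and leave indexed tokens untouched. This is only an illustrative sketch (the filter below is not part of Whoosh); it subclasses the ``whoosh.analysis.Filter`` base class so it can be composed with the ``|`` operator like the built-in filters::

    from whoosh.analysis import Filter, RegexTokenizer

    class QueryLowercaseFilter(Filter):
        """Hypothetical filter: lowercases token text only when the
        analyzer is called in query-parsing mode (t.mode == 'query').
        """

        def __call__(self, tokens):
            for t in tokens:
                if t.mode == 'query':
                    t.text = t.text.lower()
                yield t

    # Compose it like any other filter
    my_analyzer = RegexTokenizer() | QueryLowercaseFilter()
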
The :class:`whoosh.analysis.MultiFilter` filter class lets you specify different filters to use based on the mode setting:: intraword = MultiFilter(index=IntraWordFilter(mergewords=True, mergenums=True), query=IntraWordFilter(mergewords=False, mergenums=False)) Stop words ---------- "Stop" words are words that are so common it's often counter-productive to index them, such as "and", "or", "if", etc. The provided ``analysis.StopFilter`` lets you filter out stop words, and includes a default list of common stop words. :: >>> from whoosh.analysis import StopFilter >>> stopper = StopFilter() >>> for token in stopper(LowercaseFilter(tokenizer(u"These ARE the things I want!"))): ... print repr(token.text) u'these' u'things' u'want' However, this seemingly simple filter idea raises a couple of minor but slightly thorny issues: renumbering term positions and keeping or removing stopped words. Renumbering term positions -------------------------- Remember that analyzers are sometimes asked to record the position of each token in the token stream: ============= ========== ========== ========== ========== Token.text u'Mary' u'had' u'a' u'lamb' Token.pos 0 1 2 3 ============= ========== ========== ========== ========== So what happens to the ``pos`` attribute of the tokens if ``StopFilter`` removes the words ``had`` and ``a`` from the stream? Should it renumber the positions to pretend the "stopped" words never existed? I.e.: ============= ========== ========== Token.text u'Mary' u'lamb' Token.pos 0 1 ============= ========== ========== or should it preserve the original positions of the words? I.e: ============= ========== ========== Token.text u'Mary' u'lamb' Token.pos 0 3 ============= ========== ========== It turns out that different situations call for different solutions, so the provided ``StopFilter`` class supports both of the above behaviors. Renumbering is the default, since that is usually the most useful and is necessary to support phrase searching. However, you can set a parameter in StopFilter's constructor to tell it not to renumber positions:: stopper = StopFilter(renumber=False) Removing or leaving stop words ------------------------------ The point of using ``StopFilter`` is to remove stop words, right? Well, there are actually some situations where you might want to mark tokens as "stopped" but not remove them from the token stream. For example, if you were writing your own query parser, you could run the user's query through a field's analyzer to break it into tokens. In that case, you might want to know which words were "stopped" so you can provide helpful feedback to the end user (e.g. "The following words are too common to search for:"). In other cases, you might want to leave stopped words in the stream for certain filtering steps (for example, you might have a step that looks at previous tokens, and want the stopped tokens to be part of the process), but then remove them later. The ``analysis`` module provides a couple of tools for keeping and removing stop-words in the stream. The ``removestops`` parameter passed to the analyzer's ``__call__`` method (and copied to the ``Token`` object as an attribute) specifies whether stop words should be removed from the stream or left in. 
:: >>> from whoosh.analysis import StandardAnalyzer >>> analyzer = StandardAnalyzer() >>> [(t.text, t.stopped) for t in analyzer(u"This is a test")] [(u'test', False)] >>> [(t.text, t.stopped) for t in analyzer(u"This is a test", removestops=False)] [(u'this', True), (u'is', True), (u'a', True), (u'test', False)] The ``analysis.unstopped()`` filter function takes a token generator and yields only the tokens whose ``stopped`` attribute is ``False``. .. note:: Even if you leave stopped words in the stream in an analyzer you use for indexing, the indexer will ignore any tokens where the ``stopped`` attribute is ``True``. Implementation notes -------------------- Because object creation is slow in Python, the stock tokenizers do not create a new ``analysis.Token`` object for each token. Instead, they create one ``Token`` object and yield it over and over. This is a nice performance shortcut but can lead to strange behavior if your code tries to remember tokens between loops of the generator. Because the analyzer only has one ``Token`` object, of which it keeps changing the attributes, if you keep a copy of the Token you get from a loop of the generator, it will be changed from under you. For example:: >>> list(tokenizer(u"Hello there my friend")) [Token(u"friend"), Token(u"friend"), Token(u"friend"), Token(u"friend")] Instead, do this:: >>> [t.text for t in tokenizer(u"Hello there my friend")] That is, save the attributes, not the token object itself. If you implement your own tokenizer, filter, or analyzer as a class, you should implement an ``__eq__`` method. This is important to allow comparison of ``Schema`` objects. The mixing of persistent "setting" and transient "information" attributes on the ``Token`` object is not especially elegant. If I ever have a better idea I might change it. ;) Nothing requires that an Analyzer be implemented by calling a tokenizer and filters. Tokenizers and filters are simply a convenient way to structure the code. You're free to write an analyzer any way you want, as long as it implements ``__call__``. Whoosh-2.5.7/docs/build/html/_sources/api/0000755000076500000240000000000012277504634020464 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/build/html/_sources/api/analysis.txt0000644000076500000240000000247012254366350023047 0ustar mattstaff00000000000000=================== ``analysis`` module =================== .. automodule:: whoosh.analysis Analyzers ========= .. autoclass:: IDAnalyzer .. autoclass:: KeywordAnalyzer .. autoclass:: RegexAnalyzer .. autoclass:: SimpleAnalyzer .. autoclass:: StandardAnalyzer .. autoclass:: StemmingAnalyzer .. autoclass:: FancyAnalyzer .. autoclass:: NgramAnalyzer .. autoclass:: NgramWordAnalyzer .. autoclass:: LanguageAnalyzer Tokenizers ========== .. autoclass:: IDTokenizer .. autoclass:: RegexTokenizer .. autoclass:: CharsetTokenizer .. autoclass:: SpaceSeparatedTokenizer .. autoclass:: CommaSeparatedTokenizer .. autoclass:: NgramTokenizer .. autoclass:: PathTokenizer Filters ======= .. autoclass:: PassFilter .. autoclass:: LoggingFilter .. autoclass:: MultiFilter .. autoclass:: TeeFilter .. autoclass:: ReverseTextFilter .. autoclass:: LowercaseFilter .. autoclass:: StripFilter .. autoclass:: StopFilter .. autoclass:: StemFilter .. autoclass:: CharsetFilter .. autoclass:: NgramFilter .. autoclass:: IntraWordFilter .. autoclass:: CompoundWordFilter .. autoclass:: BiWordFilter .. autoclass:: ShingleFilter .. autoclass:: DelimitedAttributeFilter .. autoclass:: DoubleMetaphoneFilter .. 
autoclass:: SubstitutionFilter Token classes and functions =========================== .. autoclass:: Token .. autofunction:: unstopped Whoosh-2.5.7/docs/build/html/_sources/api/api.txt0000644000076500000240000000012312254366350021766 0ustar mattstaff00000000000000========== Whoosh API ========== .. toctree:: :glob: :maxdepth: 1 ** Whoosh-2.5.7/docs/build/html/_sources/api/codec/0000755000076500000240000000000012277504634021541 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/build/html/_sources/api/codec/base.txt0000644000076500000240000000063512254366350023214 0ustar mattstaff00000000000000===================== ``codec.base`` module ===================== .. automodule:: whoosh.codec.base Classes ======= .. autoclass:: Codec :members: .. autoclass:: PerDocumentWriter :members: .. autoclass:: FieldWriter :members: .. autoclass:: PostingsWriter :members: .. autoclass:: TermsReader :members: .. autoclass:: PerDocumentReader :members: .. autoclass:: Segment :members: Whoosh-2.5.7/docs/build/html/_sources/api/collectors.txt0000644000076500000240000000111012254366350023363 0ustar mattstaff00000000000000===================== ``collectors`` module ===================== .. automodule:: whoosh.collectors Base classes ============ .. autoclass:: Collector :members: .. autoclass:: ScoredCollector :members: .. autoclass:: WrappingCollector :members: Basic collectors ================ .. autoclass:: TopCollector .. autoclass:: UnlimitedCollector .. autoclass:: SortingCollector Wrappers ======== .. autoclass:: FilterCollector .. autoclass:: FacetCollector .. autoclass:: CollapseCollector .. autoclass:: TimeLimitCollector .. autoclass:: TermsCollector Whoosh-2.5.7/docs/build/html/_sources/api/columns.txt0000644000076500000240000000120312254366350022675 0ustar mattstaff00000000000000===================== ``columns`` module ===================== .. automodule:: whoosh.columns Base classes ============ .. autoclass:: Column :members: .. autoclass:: ColumnWriter :members: .. autoclass:: ColumnReader :members: Basic columns ============= .. autoclass:: VarBytesColumn .. autoclass:: FixedBytesColumn .. autoclass:: RefBytesColumn .. autoclass:: NumericColumn Technical columns ================= .. autoclass:: BitColumn .. autoclass:: CompressedBytesColumn .. autoclass:: StructColumn .. autoclass:: PickleColumn Experimental columns ==================== .. autoclass:: ClampedNumericColumn Whoosh-2.5.7/docs/build/html/_sources/api/fields.txt0000644000076500000240000000117012254366350022466 0ustar mattstaff00000000000000================= ``fields`` module ================= .. automodule:: whoosh.fields Schema class ============ .. autoclass:: Schema :members: .. autoclass:: SchemaClass FieldType base class ==================== .. autoclass:: FieldType :members: Pre-made field types ==================== .. autoclass:: ID .. autoclass:: IDLIST .. autoclass:: STORED .. autoclass:: KEYWORD .. autoclass:: TEXT .. autoclass:: NUMERIC .. autoclass:: DATETIME .. autoclass:: BOOLEAN .. autoclass:: NGRAM .. autoclass:: NGRAMWORDS Exceptions ========== .. autoexception:: FieldConfigurationError .. autoexception:: UnknownFieldError Whoosh-2.5.7/docs/build/html/_sources/api/filedb/0000755000076500000240000000000012277504634021711 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/build/html/_sources/api/filedb/filestore.txt0000644000076500000240000000073312254366350024445 0ustar mattstaff00000000000000=========================== ``filedb.filestore`` module =========================== .. 
automodule:: whoosh.filedb.filestore Base class ========== .. autoclass:: Storage :members: Implementation classes ====================== .. autoclass:: FileStorage .. autoclass:: RamStorage Helper functions ================ .. autofunction:: copy_storage .. autofunction:: copy_to_ram Exceptions ========== .. autoexception:: ReadOnlyError Whoosh-2.5.7/docs/build/html/_sources/api/filedb/filetables.txt0000644000076500000240000000055012254366350024560 0ustar mattstaff00000000000000============================ ``filedb.filetables`` module ============================ .. automodule:: whoosh.filedb.filetables Hash file ========= .. autoclass:: HashWriter :members: .. autoclass:: HashReader :members: Ordered Hash file ================= .. autoclass:: OrderedHashWriter .. autoclass:: OrderedHashReader Whoosh-2.5.7/docs/build/html/_sources/api/filedb/structfile.txt0000644000076500000240000000040012254366350024624 0ustar mattstaff00000000000000============================ ``filedb.structfile`` module ============================ .. automodule:: whoosh.filedb.structfile Classes ======= .. autoclass:: StructFile :members: .. autoclass:: BufferFile .. autoclass:: ChecksumFile Whoosh-2.5.7/docs/build/html/_sources/api/formats.txt0000644000076500000240000000051412254366350022674 0ustar mattstaff00000000000000================== ``formats`` module ================== .. automodule:: whoosh.formats Base class ========== .. autoclass:: Format :members: Formats ======= .. autoclass:: Existence .. autoclass:: Frequency .. autoclass:: Positions .. autoclass:: Characters .. autoclass:: PositionBoosts .. autoclass:: CharacterBoosts Whoosh-2.5.7/docs/build/html/_sources/api/highlight.txt0000644000076500000240000000135712254366350023176 0ustar mattstaff00000000000000==================== ``highlight`` module ==================== .. automodule:: whoosh.highlight See :doc:`how to highlight terms in search results `. Manual highlighting =================== .. autoclass:: Highlighter :members: .. autofunction:: highlight Fragmenters =========== .. autoclass:: Fragmenter :members: .. autoclass:: WholeFragmenter .. autoclass:: SentenceFragmenter .. autoclass:: ContextFragmenter .. autoclass:: PinpointFragmenter Scorers ======= .. autoclass:: FragmentScorer .. autoclass:: BasicFragmentScorer Formatters ========== .. autoclass:: UppercaseFormatter .. autoclass:: HtmlFormatter .. autoclass:: GenshiFormatter Utility classes =============== .. autoclass:: Fragment :members: Whoosh-2.5.7/docs/build/html/_sources/api/idsets.txt0000644000076500000240000000055512254366350022521 0ustar mattstaff00000000000000============================ ``support.bitvector`` module ============================ .. automodule:: whoosh.idsets Base classes ============ .. autoclass:: DocIdSet :members: .. autoclass:: BaseBitSet Implementation classes ====================== .. autoclass:: BitSet .. autoclass:: OnDiskBitSet .. autoclass:: SortedIntSet .. autoclass:: MultiIdSet Whoosh-2.5.7/docs/build/html/_sources/api/index.txt0000644000076500000240000000107712254366350022335 0ustar mattstaff00000000000000================ ``index`` module ================ .. automodule:: whoosh.index Functions ========= .. autofunction:: create_in .. autofunction:: open_dir .. autofunction:: exists_in .. autofunction:: exists .. autofunction:: version_in .. autofunction:: version Base class ========== .. autoclass:: Index :members: Implementation ============== .. autoclass:: FileIndex Exceptions ========== .. autoexception:: LockError .. autoexception:: IndexError .. 
autoexception:: IndexVersionError .. autoexception:: OutOfDateError .. autoexception:: EmptyIndexError Whoosh-2.5.7/docs/build/html/_sources/api/lang/0000755000076500000240000000000012277504634021405 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/build/html/_sources/api/lang/morph_en.txt0000644000076500000240000000021712254366350023751 0ustar mattstaff00000000000000======================== ``lang.morph_en`` module ======================== .. automodule:: whoosh.lang.morph_en .. autofunction:: variations Whoosh-2.5.7/docs/build/html/_sources/api/lang/porter.txt0000644000076500000240000000020112254366350023446 0ustar mattstaff00000000000000====================== ``lang.porter`` module ====================== .. automodule:: whoosh.lang.porter .. autofunction:: stem Whoosh-2.5.7/docs/build/html/_sources/api/lang/wordnet.txt0000644000076500000240000000045512254366350023630 0ustar mattstaff00000000000000======================== ``lang.wordnet`` module ======================== .. automodule:: whoosh.lang.wordnet Thesaurus ========= .. autoclass:: Thesaurus :members: Low-level functions =================== .. autofunction:: parse_file .. autofunction:: synonyms .. autofunction:: make_index Whoosh-2.5.7/docs/build/html/_sources/api/matching.txt0000644000076500000240000000127412254366350023017 0ustar mattstaff00000000000000=================== ``matching`` module =================== .. automodule:: whoosh.matching Matchers ======== .. autoclass:: Matcher :members: .. autoclass:: NullMatcher .. autoclass:: ListMatcher .. autoclass:: WrappingMatcher .. autoclass:: MultiMatcher .. autoclass:: FilterMatcher .. autoclass:: BiMatcher .. autoclass:: AdditiveBiMatcher .. autoclass:: UnionMatcher .. autoclass:: DisjunctionMaxMatcher .. autoclass:: IntersectionMatcher .. autoclass:: AndNotMatcher .. autoclass:: InverseMatcher .. autoclass:: RequireMatcher .. autoclass:: AndMaybeMatcher .. autoclass:: ConstantScoreMatcher Exceptions ========== .. autoexception:: ReadTooFar .. autoexception:: NoQualityAvailable Whoosh-2.5.7/docs/build/html/_sources/api/qparser.txt0000644000076500000240000000305312254366350022677 0ustar mattstaff00000000000000================== ``qparser`` module ================== .. automodule:: whoosh.qparser Parser object ============= .. autoclass:: QueryParser :members: Pre-made configurations ----------------------- The following functions return pre-configured QueryParser objects. .. autofunction:: MultifieldParser .. autofunction:: SimpleParser .. autofunction:: DisMaxParser Plug-ins ======== .. autoclass:: Plugin :members: .. autoclass:: SingleQuotePlugin .. autoclass:: PrefixPlugin .. autoclass:: WildcardPlugin .. autoclass:: RegexPlugin .. autoclass:: BoostPlugin .. autoclass:: GroupPlugin .. autoclass:: EveryPlugin .. autoclass:: FieldsPlugin .. autoclass:: PhrasePlugin .. autoclass:: RangePlugin .. autoclass:: OperatorsPlugin .. autoclass:: PlusMinusPlugin .. autoclass:: GtLtPlugin .. autoclass:: MultifieldPlugin .. autoclass:: FieldAliasPlugin .. autoclass:: CopyFieldPlugin Syntax node objects =================== Base nodes ---------- .. autoclass:: SyntaxNode :members: Nodes ----- .. autoclass:: FieldnameNode .. autoclass:: TextNode .. autoclass:: WordNode .. autoclass:: RangeNode .. autoclass:: MarkerNode Group nodes ----------- .. autoclass:: GroupNode .. autoclass:: BinaryGroup .. autoclass:: ErrorNode .. autoclass:: AndGroup .. autoclass:: OrGroup .. autoclass:: AndNotGroup .. autoclass:: AndMaybeGroup .. autoclass:: DisMaxGroup .. autoclass:: RequireGroup .. 
autoclass:: NotGroup Operators --------- .. autoclass:: Operator .. autoclass:: PrefixOperator .. autoclass:: PostfixOperator .. autoclass:: InfixOperator Whoosh-2.5.7/docs/build/html/_sources/api/query.txt0000644000076500000240000000264612254366350022376 0ustar mattstaff00000000000000================ ``query`` module ================ .. automodule:: whoosh.query See also :mod:`whoosh.qparser` which contains code for parsing user queries into query objects. Base classes ============ The following abstract base classes are subclassed to create the "real" query operations. .. autoclass:: Query :members: .. autoclass:: CompoundQuery .. autoclass:: MultiTerm .. autoclass:: ExpandingTerm .. autoclass:: WrappingQuery Query classes ============= .. autoclass:: Term .. autoclass:: Variations .. autoclass:: FuzzyTerm .. autoclass:: Phrase .. autoclass:: And .. autoclass:: Or .. autoclass:: DisjunctionMax .. autoclass:: Not .. autoclass:: Prefix .. autoclass:: Wildcard .. autoclass:: Regex .. autoclass:: TermRange .. autoclass:: NumericRange .. autoclass:: DateRange .. autoclass:: Every .. autoclass:: NullQuery Binary queries ============== .. autoclass:: Require .. autoclass:: AndMaybe .. autoclass:: AndNot .. autoclass:: Otherwise Span queries ============ .. autoclass:: Span :members: .. autoclass:: SpanQuery .. autoclass:: SpanFirst .. autoclass:: SpanNear .. autoclass:: SpanNear2 .. autoclass:: SpanNot .. autoclass:: SpanOr .. autoclass:: SpanContains .. autoclass:: SpanBefore .. autoclass:: SpanCondition Special queries =============== .. autoclass:: NestedParent .. autoclass:: NestedChildren .. autoclass:: ConstantScoreQuery Exceptions ========== .. autoexception:: QueryError Whoosh-2.5.7/docs/build/html/_sources/api/reading.txt0000644000076500000240000000041712254366350022634 0ustar mattstaff00000000000000================== ``reading`` module ================== .. automodule:: whoosh.reading Classes ======= .. autoclass:: IndexReader :members: .. autoclass:: MultiReader .. autoclass:: TermInfo :members: Exceptions ========== .. autoexception:: TermNotFound Whoosh-2.5.7/docs/build/html/_sources/api/scoring.txt0000644000076500000240000000103712254366350022666 0ustar mattstaff00000000000000================== ``scoring`` module ================== .. automodule:: whoosh.scoring Base classes ============ .. autoclass:: WeightingModel :members: .. autoclass:: BaseScorer :members: .. autoclass:: WeightScorer .. autoclass:: WeightLengthScorer Scoring algorithm classes ========================= .. autoclass:: BM25F .. autoclass:: TF_IDF .. autoclass:: Frequency Scoring utility classes ======================= .. autoclass:: FunctionWeighting .. autoclass:: MultiWeighting .. autoclass:: ReverseWeighting Whoosh-2.5.7/docs/build/html/_sources/api/searching.txt0000644000076500000240000000063512254366350023170 0ustar mattstaff00000000000000==================== ``searching`` module ==================== .. automodule:: whoosh.searching Searching classes ================= .. autoclass:: Searcher :members: Results classes =============== .. autoclass:: Results :members: .. autoclass:: Hit :members: .. autoclass:: ResultsPage :members: Exceptions ========== .. autoexception:: NoTermsException .. autoexception:: TimeLimit Whoosh-2.5.7/docs/build/html/_sources/api/sorting.txt0000644000076500000240000000125612254366350022712 0ustar mattstaff00000000000000================== ``sorting`` module ================== .. automodule:: whoosh.sorting Base types ========== .. autoclass:: FacetType :members: .. 
autoclass:: Categorizer :members: Facet types =========== .. autoclass:: FieldFacet .. autoclass:: QueryFacet .. autoclass:: RangeFacet .. autoclass:: DateRangeFacet .. autoclass:: ScoreFacet .. autoclass:: FunctionFacet .. autoclass:: MultiFacet .. autoclass:: StoredFieldFacet Facets object ============= .. autoclass:: Facets :members: FacetType objects ================= .. autoclass:: FacetMap :members: .. autoclass:: OrderedList .. autoclass:: UnorderedList .. autoclass:: Count .. autoclass:: Best Whoosh-2.5.7/docs/build/html/_sources/api/spelling.txt0000644000076500000240000000076312254366350023044 0ustar mattstaff00000000000000=================== ``spelling`` module =================== See :doc:`correcting errors in user queries <../spelling>`. .. automodule:: whoosh.spelling Corrector objects ================= .. autoclass:: Corrector :members: .. autoclass:: ReaderCorrector .. autoclass:: GraphCorrector :members: .. autoclass:: MultiCorrector QueryCorrector objects ====================== .. autoclass:: QueryCorrector :members: .. autoclass:: SimpleQueryCorrector .. autoclass:: Correction Whoosh-2.5.7/docs/build/html/_sources/api/support/0000755000076500000240000000000012277504634022200 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/build/html/_sources/api/support/charset.txt0000644000076500000240000000045312254366350024370 0ustar mattstaff00000000000000========================== ``support.charset`` module ========================== .. automodule:: whoosh.support.charset .. data:: default_charset An extensive case- and accent folding charset table. Taken from http://speeple.com/unicode-maps.txt .. autofunction:: charset_table_to_dict Whoosh-2.5.7/docs/build/html/_sources/api/support/levenshtein.txt0000644000076500000240000000030212254366350025254 0ustar mattstaff00000000000000============================== ``support.levenshtein`` module ============================== .. automodule:: whoosh.support.levenshtein .. autofunction:: relative .. autofunction:: distance Whoosh-2.5.7/docs/build/html/_sources/api/util.txt0000644000076500000240000000013412254366350022174 0ustar mattstaff00000000000000=============== ``util`` module =============== .. automodule:: whoosh.util :members: Whoosh-2.5.7/docs/build/html/_sources/api/writing.txt0000644000076500000240000000051012254366350022700 0ustar mattstaff00000000000000================== ``writing`` module ================== .. automodule:: whoosh.writing Writer ====== .. autoclass:: IndexWriter :members: Utility writers =============== .. autoclass:: BufferedWriter :members: .. autoclass:: AsyncWriter :members: Exceptions ========== .. autoexception:: IndexingError Whoosh-2.5.7/docs/build/html/_sources/batch.txt0000644000076500000240000000750712254366350021542 0ustar mattstaff00000000000000=================================== Tips for speeding up batch indexing =================================== Overview ======== Indexing documents tends to fall into two general patterns: adding documents one at a time as they are created (as in a web application), and adding a bunch of documents at once (batch indexing). The following settings and alternate workflows can make batch indexing faster. StemmingAnalyzer cache ====================== The stemming analyzer by default uses a least-recently-used (LRU) cache to limit the amount of memory it uses, to prevent the cache from growing very large if the analyzer is reused for a long period of time. 
However, the LRU cache can slow down indexing by almost 200% compared to a stemming analyzer with an "unbounded" cache. When you're indexing in large batches with a one-shot instance of the analyzer, consider using an unbounded cache:: w = myindex.writer() # Get the analyzer object from a text field stem_ana = w.schema["content"].format.analyzer # Set the cachesize to -1 to indicate unbounded caching stem_ana.cachesize = -1 # Reset the analyzer to pick up the changed attribute stem_ana.clear() # Use the writer to index documents... The ``limitmb`` parameter ========================= The ``limitmb`` parameter to :meth:`whoosh.index.Index.writer` controls the *maximum* memory (in megabytes) the writer will use for the indexing pool. The higher the number, the faster indexing will be. The default value of ``128`` is actually somewhat low, considering many people have multiple gigabytes of RAM these days. Setting it higher can speed up indexing considerably:: from whoosh import index ix = index.open_dir("indexdir") writer = ix.writer(limitmb=256) .. note:: The actual memory used will be higher than this value because of interpreter overhead (up to twice as much!). It is very useful as a tuning parameter, but not for trying to exactly control the memory usage of Whoosh. The ``procs`` parameter ======================= The ``procs`` parameter to :meth:`whoosh.index.Index.writer` controls the number of processors the writer will use for indexing (via the ``multiprocessing`` module):: from whoosh import index ix = index.open_dir("indexdir") writer = ix.writer(procs=4) Note that when you use multiprocessing, the ``limitmb`` parameter controls the amount of memory used by *each process*, so the actual memory used will be ``limitmb * procs``:: # Each process will use a limit of 128, for a total of 512 writer = ix.writer(procs=4, limitmb=128) The ``multisegment`` parameter ============================== The ``procs`` parameter causes the default writer to use multiple processors to do much of the indexing, but then still uses a single process to merge the pool of each sub-writer into a single segment. You can get much better indexing speed by also using the ``multisegment=True`` keyword argument, which instead of merging the results of each sub-writer, simply has them each just write out a new segment:: from whoosh import index ix = index.open_dir("indexdir") writer = ix.writer(procs=4, multisegment=True) The drawback is that instead of creating a single new segment, this option creates a number of new segments **at least** equal to the number of processes you use. For example, if you use ``procs=4``, the writer will create four new segments. (If you merge old segments or call ``add_reader`` on the parent writer, the parent writer will also write a segment, meaning you'll get five new segments.) So, while ``multisegment=True`` is much faster than a normal writer, you should only use it for large batch indexing jobs (or perhaps only for indexing from scratch). It should not be the only method you use for indexing, because otherwise the number of segments will tend to increase forever! Whoosh-2.5.7/docs/build/html/_sources/dates.txt0000644000076500000240000001461212254366350021554 0ustar mattstaff00000000000000================================ Indexing and parsing dates/times ================================ Indexing dates ============== Whoosh lets you index and search dates/times using the :class:`whoosh.fields.DATETIME` field type. 
Instead of passing text for the field in ``add_document()``, you use a Python ``datetime.datetime`` object:: from datetime import datetime, timedelta from whoosh import fields, index schema = fields.Schema(title=fields.TEXT, content=fields.TEXT, date=fields.DATETIME) ix = index.create_in("indexdir", schema) w = ix.writer() w.add_document(title="Document 1", content="Rendering images from the command line", date=datetime.utcnow()) w.add_document(title="Document 2", content="Creating shaders using a node network", date=datetime.utcnow() + timedelta(days=1)) w.commit() Parsing date queries ==================== Once you've have an indexed ``DATETIME`` field, you can search it using a rich date parser contained in the :class:`whoosh.qparser.dateparse.DateParserPlugin`:: from whoosh import index from whoosh.qparser import QueryParser from whoosh.qparser.dateparse import DateParserPlugin ix = index.open_dir("indexdir") # Instatiate a query parser qp = QueryParser("content", ix.schema) # Add the DateParserPlugin to the parser qp.add_plugin(DateParserPlugin()) With the ``DateParserPlugin``, users can use date queries such as:: 20050912 2005 sept 12th june 23 1978 23 mar 2005 july 1985 sep 12 today yesterday tomorrow now next friday last tuesday 5am 10:25:54 23:12 8 PM 4:46 am oct 31 2010 last tuesday to today today to next friday jan 2005 to feb 2008 -1 week to now now to +2h -1y6mo to +2 yrs 23d Normally, as with other types of queries containing spaces, the users need to quote date queries containing spaces using single quotes:: render date:'last tuesday' command date:['last tuesday' to 'next friday'] If you use the ``free`` argument to the ``DateParserPlugin``, the plugin will try to parse dates from unquoted text following a date field prefix:: qp.add_plugin(DateParserPlugin(free=True)) This allows the user to type a date query with spaces and special characters following the name of date field and a colon. The date query can be mixed with other types of queries without quotes:: date:last tuesday render date:oct 15th 2001 5:20am command If you don't use the ``DateParserPlugin``, users can still search DATETIME fields using a simple numeric form ``YYYY[MM[DD[hh[mm[ss]]]]]`` that is built into the ``DATETIME`` field:: from whoosh import index from whoosh.qparser import QueryParser ix = index.open_dir("indexdir") qp = QueryParser("content", schema=ix.schema) # Find all datetimes in 2005 q = qp.parse(u"date:2005") # Find all datetimes on June 24, 2005 q = qp.parse(u"date:20050624") # Find all datetimes from 1am-2am on June 24, 2005 q = qp.parse(u"date:2005062401") # Find all datetimes from Jan 1, 2005 to June 2, 2010 q = qp.parse(u"date:[20050101 to 20100602]") About time zones and basetime ============================= The best way to deal with time zones is to always index ``datetime``\ s in native UTC form. Any ``tzinfo`` attribute on the ``datetime`` object is *ignored* by the indexer. If you are working with local datetimes, you should convert them to native UTC datetimes before indexing. Date parser notes ================= Please note that the date parser is still somewhat experimental. Setting the base datetime ------------------------- When you create the ``DateParserPlugin`` you can pass a ``datetime`` object to the ``basedate`` argument to set the datetime against which relative queries (such as ``last tuesday`` and ``-2 hours``) are measured. 
By default, the basedate is ``datetime.utcnow()`` at the moment the plugin is instantiated:: qp.add_plugin(DateParserPlugin(basedate=my_datetime)) Registering an error callback ----------------------------- To avoid user queries causing exceptions in your application, the date parser attempts to fail silently when it can't parse a date query. However, you can register a callback function to be notified of parsing failures so you can display feedback to the user. The argument to the callback function is the date text that could not be parsed (this is an experimental feature and may change in future versions):: errors = [] def add_error(msg): errors.append(msg) qp.add_plugin(DateParserPlug(callback=add_error)) q = qp.parse(u"date:blarg") # errors == [u"blarg"] Using free parsing ------------------ While the ``free`` option is easier for users, it may result in ambiguities. As one example, if you want to find documents containing reference to a march and the number 2 in documents from the year 2005, you might type:: date:2005 march 2 This query would be interpreted correctly as a date query and two term queries when ``free=False``, but as a single date query when ``free=True``. In this case the user could limit the scope of the date parser with single quotes:: date:'2005' march 2 Parsable formats ---------------- The date parser supports a wide array of date and time formats, however it is not my intention to try to support *all* types of human-readable dates (for example ``ten to five the friday after next``). The best idea might be to pick a date format that works and try to train users on it, and if they use one of the other formats that also works consider it a happy accident. Limitations =========== * Since it's based on Python's ``datetime.datetime`` object, the ``DATETIME`` field shares all the limitations of that class, such as no support for dates before year 1 on the proleptic Gregorian calendar. The ``DATETIME`` field supports practically unlimited dates, so if the ``datetime`` object is every improved it could support it. An alternative possibility might be to add support for ``mxDateTime`` objects someday. * The ``DateParserPlugin`` currently only has support for English dates. The architecture supports creation of parsers for other languages, and I hope to add examples for other languages soon. * ``DATETIME`` fields do not currently support open-ended ranges. You can simulate an open ended range by using an endpoint far in the past or future. Whoosh-2.5.7/docs/build/html/_sources/facets.txt0000644000076500000240000006722712254366350021733 0ustar mattstaff00000000000000==================== Sorting and faceting ==================== .. note:: The API for sorting and faceting changed in Whoosh 3.0. Overview ======== Sorting and faceting search results in Whoosh is based on **facets**. Each facet associates a value with each document in the search results, allowing you to sort by the keys or use them to group the documents. Whoosh includes a variety of **facet types** you can use for sorting and grouping (see below). Sorting ======= By default, the results of a search are sorted with the highest-scoring documents first. You can use the ``sortedby`` keyword argument to order the results by some other criteria instead, such as the value of a field. 
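As a quick preview, here is a minimal sketch of sorting a search, assuming an existing index in ``indexdir`` whose ``price`` field was created as sortable (the index and field names are only for illustration; the sections below explain the details and give a complete example)::

    from whoosh import index, sorting
    from whoosh.qparser import QueryParser

    ix = index.open_dir("indexdir")
    with ix.searcher() as s:
        q = QueryParser("title", ix.schema).parse(u"big")

        # Ascending by the per-document values of the "price" field
        results = s.search(q, sortedby="price")

        # Descending, using a facet object to reverse the order
        results = s.search(q, sortedby=sorting.FieldFacet("price", reverse=True))
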
Making fields sortable ---------------------- In order to sort on a field, you should create the field using the ``sortable=True`` keyword argument:: schema = fields.Schema(title=fields.TEXT(sortable=True), content=fields.TEXT, modified=fields.DATETIME(sortable=True) ) It's possible to sort on a field that doesn't have ``sortable=True``, but this requires Whoosh to load the unique terms in the field into memory. Using ``sortable`` is much more efficient. About column types ------------------ When you create a field using ``sortable=True``, you are telling Whoosh to store per-document values for that field in a *column*. A column object specifies the format to use to store the per-document values on disk. The :mod:`whoosh.columns` module contains several different column object implementations. Each field type specifies a reasonable default column type (for example, the default for text fields is :class:`whoosh.columns.VarBytesColumn`, the default for numeric fields is :class:`whoosh.columns.NumericColumn`). However, if you want maximum efficiency you may want to use a different column type for a field. For example, if all document values in a field are a fixed length, you can use a :class:`whoosh.columns.FixedBytesColumn`. If you have a field where many documents share a relatively small number of possible values (an example might be a "category" field, or "month" or other enumeration type fields), you might want to use :class:`whoosh.columns.RefBytesColumn` (which can handle both variable and fixed-length values). There are column types for storing per-document bit values, structs, pickled objects, and compressed byte values. To specify a custom column object for a field, pass it as the ``sortable`` keyword argument instead of ``True``:: from whoosh import columns, fields category_col = columns.RefBytesColumn() schema = fields.Schema(title=fields.TEXT(sortable=True), category=fields.KEYWORD(sortable=category_col) Using a COLUMN field for custom sort keys ----------------------------------------- When you add a document with a sortable field, Whoosh uses the value you pass for the field as the sortable value. For example, if "title" is a sortable field, and you add this document:: writer.add_document(title="Mr. Palomar") ...then ``Mr. Palomar`` is stored in the field column as the sorting key for the document. This is usually good, but sometimes you need to "massage" the sortable key so it's different from the value the user searches and/or sees in the interface. For example, if you allow the user to sort by title, you might want to use different values for the visible title and the value used for sorting:: # Visible title title = "The Unbearable Lightness of Being" # Sortable title: converted to lowercase (to prevent different ordering # depending on uppercase/lowercase), with initial article moved to the end sort_title = "unbearable lightness of being, the" The best way to do this is to use an additional field just for sorting. You can use the :class:`whoosh.fields.COLUMN` field type to create a field that is not indexed or stored, it only holds per-document column values:: schema = fields.Schema(title=fields.TEXT(stored=True), sort_title=fields.COLUMN(columns.VarBytesColumn()) ) The single argument to the :class:`whoosh.fields.COLUMN` initializer is a :class:`whoosh.columns.ColumnType` object. You can use any of the various column types in the :mod:`whoosh.columns` module. 
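Putting these pieces together, here is a minimal sketch of indexing a display
title alongside a massaged sort key and then sorting on the column field
(``myquery`` stands in for a parsed query, as in the other examples here; note
that ``VarBytesColumn`` stores raw bytes, so this sketch passes the sort key
as a byte string)::

    from whoosh import columns, fields, index

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           sort_title=fields.COLUMN(columns.VarBytesColumn()))
    ix = index.create_in("indexdir", schema)

    with ix.writer() as w:
        # Index/store the visible title, and supply the lowercased,
        # article-moved key for the sorting-only column field
        w.add_document(title=u"The Unbearable Lightness of Being",
                       sort_title=b"unbearable lightness of being, the")

    with ix.searcher() as s:
        # Sorting on the column field still lets you display the stored title
        results = s.search(myquery, sortedby="sort_title")
        for hit in results:
            print(hit["title"])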
As another example, say you are indexing documents that have a custom sorting order associated with each document, such as a "priority" number:: name=Big Wheel price=100 priority=1 name=Toss Across price=40 priority=3 name=Slinky price=25 priority=2 ... You can use a column field with a numeric column object to hold the "priority" and use it for sorting:: schema = fields.Schema(name=fields.TEXT(stored=True), price=fields.NUMERIC(stored=True), priority=fields.COLUMN(columns.NumericColumn("i"), ) (Note that :class:`columns.NumericColumn` takes a type code character like the codes used by Python's ``struct`` and ``array`` modules.) Making existing fields sortable ------------------------------- If you have an existing index from before the ``sortable`` argument was added in Whoosh 3.0, or you didn't think you needed a field to be sortable but now you find that you need to sort it, you can add "sortability" to an existing index using the :func:`whoosh.sorting.add_sortable` utility function:: from whoosh import columns, fields, index, sorting # Say we have an existing index with this schema schema = fields.Schema(title=fields.TEXT, price=fields.NUMERIC) # To use add_sortable, first open a writer for the index ix = index.open_dir("indexdir") with ix.writer() as w: # Add sortable=True to the "price" field using field terms as the # sortable values sorting.add_sortable(w, "price", sorting.FieldFacet("price")) # Add sortable=True to the "title" field using the # stored field values as the sortable value sorting.add_sortable(w, "title", sorting.StoredFieldFacet("title")) You can specify a custom column type when you call ``add_sortable`` using the ``column`` keyword argument:: add_sortable(w, "chapter", sorting.FieldFacet("chapter"), column=columns.RefBytesColumn()) See the documentation for :func:`~whoosh.sorting.add_sortable` for more information. Sorting search results ---------------------- When you tell Whoosh to sort by a field (or fields), it uses the per-document values in the field's column as sorting keys for the documents. Normally search results are sorted by descending relevance score. You can tell Whoosh to use a different ordering by passing the ``sortedby`` keyword argument to the :meth:`~whoosh.searching.Searcher.search` method:: from whoosh import fields, index, qparser schema = fields.Schema(title=fields.TEXT(stored=True), price=fields.NUMERIC(sortable=True)) ix = index.create_in("indexdir", schema) with ix.writer() as w: w.add_document(title="Big Deal", price=20) w.add_document(title="Mr. Big", price=10) w.add_document(title="Big Top", price=15) with ix.searcher() as s: qp = qparser.QueryParser("big", ix.schema) q = qp.parse(user_query_string) # Sort search results from lowest to highest price results = s.search(q, sortedby="price") for hit in results: print(hit["title"]) You can use any of the following objects as ``sortedby`` values: A ``FacetType`` object Uses this object to sort the documents. See below for the available facet types. A field name string Converts the field name into a ``FieldFacet`` (see below) and uses it to sort the documents. A list of ``FacetType`` objects and/or field name strings Bundles the facets together into a ``MultiFacet`` so you can sort by multiple keys. Note that this shortcut does not allow you to reverse the sort direction of individual facets. To do that, you need to construct the ``MultiFacet`` object yourself. .. 
note:: You can use the ``reverse=True`` keyword argument to the ``Searcher.search()`` method to reverse the overall sort direction. This is more efficient than reversing each individual facet. Examples -------- Sort by the value of the size field:: results = searcher.search(myquery, sortedby="size") Sort by the reverse (highest-to-lowest) order of the "price" field:: facet = sorting.FieldFacet("price", reverse=True) results = searcher.search(myquery, sortedby=facet) Sort by ascending size and then descending price:: mf = sorting.MultiFacet() mf.add_field("size") mf.add_field("price", reverse=True) results = searcher.search(myquery, sortedby=mf) # or... sizes = sorting.FieldFacet("size") prices = sorting.FieldFacet("price", reverse=True) results = searcher.search(myquery, sortedby=[sizes, prices]) Sort by the "category" field, then by the document's score:: cats = sorting.FieldFacet("category") scores = sorting.ScoreFacet() results = searcher.search(myquery, sortedby=[cats, scores]) Accessing column values ----------------------- Per-document column values are available in :class:`~whoosh.searching.Hit` objects just like stored field values:: schema = fields.Schema(title=fields.TEXT(stored=True), price=fields.NUMERIC(sortable=True)) ... results = searcher.search(myquery) for hit in results: print(hit["title"], hit["price"]) ADVANCED: if you want to access abitrary per-document values quickly you can get a column reader object:: with ix.searcher() as s: reader = s.reader() colreader = s.reader().column_reader("price") for docnum in reader.all_doc_ids(): print(colreader[docnum]) Grouping ======== It is often very useful to present "faceted" search results to the user. Faceting is dynamic grouping of search results into categories. The categories let users view a slice of the total results based on the categories they're interested in. For example, if you are programming a shopping website, you might want to display categories with the search results such as the manufacturers and price ranges. ==================== ================= Manufacturer Price -------------------- ----------------- Apple (5) $0 - $100 (2) Sanyo (1) $101 - $500 (10) Sony (2) $501 - $1000 (1) Toshiba (5) ==================== ================= You can let your users click the different facet values to only show results in the given categories. Another useful UI pattern is to show, say, the top 5 results for different types of found documents, and let the user click to see more results from a category they're interested in, similarly to how the Spotlight quick results work on Mac OS X. The ``groupedby`` keyword argument ---------------------------------- You can use the following objects as ``groupedby`` values: A ``FacetType`` object Uses this object to group the documents. See below for the available facet types. A field name string Converts the field name into a ``FieldFacet`` (see below) and uses it to sort the documents. The name of the field is used as the facet name. A list or tuple of field name strings Sets up multiple field grouping criteria. A dictionary mapping facet names to ``FacetType`` objects Sets up multiple grouping criteria. A ``Facets`` object This object is a lot like using a dictionary, but has some convenience methods to make setting up multiple groupings a little easier. 
Examples -------- Group by the value of the "category" field:: results = searcher.search(myquery, groupedby="category") Group by the value of the "category" field and also by the value of the "tags" field and a date range:: cats = sorting.FieldFacet("category") tags = sorting.FieldFacet("tags", allow_overlap=True) results = searcher.search(myquery, groupedby={"category": cats, "tags": tags}) # ...or, using a Facets object has a little less duplication facets = sorting.Facets() facets.add_field("category") facets.add_field("tags", allow_overlap=True) results = searcher.search(myquery, groupedby=facets) To group results by the *intersected values of multiple fields*, use a ``MultiFacet`` object (see below). For example, if you have two fields named ``tag`` and ``size``, you could group the results by all combinations of the ``tag`` and ``size`` field, such as ``('tag1', 'small')``, ``('tag2', 'small')``, ``('tag1', 'medium')``, and so on:: # Generate a grouping from the combination of the "tag" and "size" fields mf = MultiFacet("tag", "size") results = searcher.search(myquery, groupedby={"tag/size": mf}) Getting the faceted groups -------------------------- The ``Results.groups("facetname")`` method returns a dictionary mapping category names to lists of **document IDs**:: myfacets = sorting.Facets().add_field("size").add_field("tag") results = mysearcher.search(myquery, groupedby=myfacets) results.groups("size") # {"small": [8, 5, 1, 2, 4], "medium": [3, 0, 6], "large": [7, 9]} If there is only one facet, you can just use ``Results.groups()`` with no argument to access its groups:: results = mysearcher.search(myquery, groupedby=myfunctionfacet) results.groups() By default, the values in the dictionary returned by ``groups()`` are lists of document numbers in the same relative order as in the results. 
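For the ``tag``/``size`` ``MultiFacet`` set up above, the keys of the
dictionary returned by ``groups()`` are tuples combining the sub-facet values.
A sketch of what the result might look like (the document numbers here are
invented purely for illustration)::

    results = searcher.search(myquery, groupedby={"tag/size": mf})
    print(results.groups("tag/size"))
    # e.g. {("tag1", "small"): [1, 4], ("tag2", "small"): [0],
    #       ("tag1", "medium"): [2, 3], ("tag2", "medium"): [5]}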
You can use the ``Searcher`` object's ``stored_fields()`` method to take a document number and return the document's stored fields as a dictionary:: for category_name in categories: print "Top 5 documents in the %s category" % category_name doclist = categories[category_name] for docnum, score in doclist[:5]: print " ", searcher.stored_fields(docnum) if len(doclist) > 5: print " (%s more)" % (len(doclist) - 5) If you want different information about the groups, for example just the count of documents in each group, or you don't need the groups to be ordered, you can specify a :class:`whoosh.sorting.FacetMap` type or instance with the ``maptype`` keyword argument when creating the ``FacetType``:: # This is the same as the default myfacet = FieldFacet("size", maptype=sorting.OrderedList) results = mysearcher.search(myquery, groupedby=myfacet) results.groups() # {"small": [8, 5, 1, 2, 4], "medium": [3, 0, 6], "large": [7, 9]} # Don't sort the groups to match the order of documents in the results # (faster) myfacet = FieldFacet("size", maptype=sorting.UnorderedList) results = mysearcher.search(myquery, groupedby=myfacet) results.groups() # {"small": [1, 2, 4, 5, 8], "medium": [0, 3, 6], "large": [7, 9]} # Only count the documents in each group myfacet = FieldFacet("size", maptype=sorting.Count) results = mysearcher.search(myquery, groupedby=myfacet) results.groups() # {"small": 5, "medium": 3, "large": 2} # Only remember the "best" document in each group myfacet = FieldFacet("size", maptype=sorting.Best) results = mysearcher.search(myquery, groupedby=myfacet) results.groups() # {"small": 8, "medium": 3, "large": 7} Alternatively you can specify a ``maptype`` argument in the ``Searcher.search()`` method call which applies to all facets:: results = mysearcher.search(myquery, groupedby=["size", "tag"], maptype=sorting.Count) (You can override this overall ``maptype`` argument on individual facets by specifying the ``maptype`` argument for them as well.) Facet types =========== FieldFacet ---------- This is the most common facet type. It sorts or groups based on the value in a certain field in each document. This generally works best (or at all) if each document has only one term in the field (e.g. an ID field):: # Sort search results by the value of the "path" field facet = sorting.FieldFacet("path") results = searcher.search(myquery, sortedby=facet) # Group search results by the value of the "parent" field facet = sorting.FieldFacet("parent") results = searcher.search(myquery, groupedby=facet) parent_groups = results.groups("parent") By default, ``FieldFacet`` only supports **non-overlapping** grouping, where a document cannot belong to multiple facets at the same time (each document will be sorted into one category arbitrarily.) To get overlapping groups with multi-valued fields, use the ``allow_overlap=True`` keyword argument:: facet = sorting.FieldFacet(fieldname, allow_overlap=True) This supports overlapping group membership where documents have more than one term in a field (e.g. KEYWORD fields). If you don't need overlapping, don't use ``allow_overlap`` because it's *much* slower and uses more memory (see the secion on ``allow_overlap`` below). QueryFacet ---------- You can set up categories defined by arbitrary queries. 
For example, you can group names using prefix queries:: # Use queries to define each category # (Here I'll assume "price" is a NUMERIC field, so I'll use # NumericRange) qdict = {} qdict["A-D"] = query.TermRange("name", "a", "d") qdict["E-H"] = query.TermRange("name", "e", "h") qdict["I-L"] = query.TermRange("name", "i", "l") # ... qfacet = sorting.QueryFacet(qdict) r = searcher.search(myquery, groupedby={"firstltr": qfacet}) By default, ``QueryFacet`` only supports **non-overlapping** grouping, where a document cannot belong to multiple facets at the same time (each document will be sorted into one category arbitrarily). To get overlapping groups with multi-valued fields, use the ``allow_overlap=True`` keyword argument:: facet = sorting.QueryFacet(querydict, allow_overlap=True) RangeFacet ---------- The ``RangeFacet`` is for NUMERIC field types. It divides a range of possible values into groups. For example, to group documents based on price into buckets $100 "wide":: pricefacet = sorting.RangeFacet("price", 0, 1000, 100) The first argument is the name of the field. The next two arguments are the full range to be divided. Value outside this range (in this example, values below 0 and above 1000) will be sorted into the "missing" (None) group. The fourth argument is the "gap size", the size of the divisions in the range. The "gap" can be a list instead of a single value. In that case, the values in the list will be used to set the size of the initial divisions, with the last value in the list being the size for all subsequent divisions. For example:: pricefacet = sorting.RangeFacet("price", 0, 1000, [5, 10, 35, 50]) ...will set up divisions of 0-5, 5-15, 15-50, 50-100, and then use 50 as the size for all subsequent divisions (i.e. 100-150, 150-200, and so on). The ``hardend`` keyword argument controls whether the last division is clamped to the end of the range or allowed to go past the end of the range. For example, this:: facet = sorting.RangeFacet("num", 0, 10, 4, hardend=False) ...gives divisions 0-4, 4-8, and 8-12, while this:: facet = sorting.RangeFacet("num", 0, 10, 4, hardend=True) ...gives divisions 0-4, 4-8, and 8-10. (The default is ``hardend=False``.) .. note:: The ranges/buckets are always **inclusive** at the start and **exclusive** at the end. DateRangeFacet -------------- This is like ``RangeFacet`` but for DATETIME fields. The start and end values must be ``datetime.datetime`` objects, and the gap(s) is/are ``datetime.timedelta`` objects. For example:: from datetime import datetime, timedelta start = datetime(2000, 1, 1) end = datetime.now() gap = timedelta(days=365) bdayfacet = sorting.DateRangeFacet("birthday", start, end, gap) As with ``RangeFacet``, you can use a list of gaps and the ``hardend`` keyword argument. ScoreFacet ---------- This facet is sometimes useful for sorting. For example, to sort by the "category" field, then for documents with the same category, sort by the document's score:: cats = sorting.FieldFacet("category") scores = sorting.ScoreFacet() results = searcher.search(myquery, sortedby=[cats, scores]) The ``ScoreFacet`` always sorts higher scores before lower scores. .. note:: While using ``sortedby=ScoreFacet()`` should give the same results as using the default scored ordering (``sortedby=None``), using the facet will be slower because Whoosh automatically turns off many optimizations when sorting. FunctionFacet ------------- This facet lets you pass a custom function to compute the sorting/grouping key for documents. 
(Using this facet type may be easier than subclassing FacetType and Categorizer to set up some custom behavior.) The function will be called with the index searcher and index document ID as arguments. For example, if you have an index with term vectors:: schema = fields.Schema(id=fields.STORED, text=fields.TEXT(stored=True, vector=True)) ix = RamStorage().create_index(schema) ...you could use a function to sort documents higher the closer they are to having equal occurances of two terms:: def fn(searcher, docnum): v = dict(searcher.vector_as("frequency", docnum, "text")) # Sort documents that have equal number of "alfa" and "bravo" first return 0 - (1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0)) facet = sorting.FunctionFacet(fn) results = searcher.search(myquery, sortedby=facet) StoredFieldFacet ---------------- This facet lets you use stored field values as the sorting/grouping key for documents. This is usually slower than using an indexed field, but when using ``allow_overlap`` it can actually be faster for large indexes just because it avoids the overhead of reading posting lists. :class:`~whoosh.sorting.StoredFieldFacet` supports ``allow_overlap`` by splitting the stored value into separate keys. By default it calls the value's ``split()`` method (since most stored values are strings), but you can supply a custom split function. See the section on ``allow_overlap`` below. MultiFacet ========== This facet type returns a composite of the keys returned by two or more sub-facets, allowing you to sort/group by the intersected values of multiple facets. ``MultiFacet`` has methods for adding facets:: myfacet = sorting.RangeFacet(0, 1000, 10) mf = sorting.MultiFacet() mf.add_field("category") mf.add_field("price", reverse=True) mf.add_facet(myfacet) mf.add_score() You can also pass a list of field names and/or ``FacetType`` objects to the initializer:: prices = sorting.FieldFacet("price", reverse=True) scores = sorting.ScoreFacet() mf = sorting.MultiFacet("category", prices, myfacet, scores) Missing values ============== * When sorting, documents without any terms in a given field, or whatever else constitutes "missing" for different facet types, will always sort to the end. * When grouping, "missing" documents will appear in a group with the key ``None``. Using overlapping groups ======================== The common supported workflow for grouping and sorting is where the given field has *one value for document*, for example a ``path`` field containing the file path of the original document. By default, facets are set up to support this single-value approach. Of course, there are situations where you want documents to be sorted into multiple groups based on a field with multiple terms per document. The most common example would be a ``tags`` field. The ``allow_overlap`` keyword argument to the :class:`~whoosh.sorting.FieldFacet`, :class:`~whoosh.sorting.QueryFacet`, and :class:`~whoosh.sorting.StoredFieldFacet` allows this multi-value approach. However, there is an important caveat: using ``allow_overlap=True`` is slower than the default, potentially *much* slower for very large result sets. This is because Whoosh must read every posting of every term in the field to create a temporary "forward index" mapping documents to terms. If a field is indexed with *term vectors*, ``FieldFacet`` will use them to speed up ``allow_overlap`` faceting for small result sets, but for large result sets, where Whoosh has to open the vector list for every matched document, this can still be very slow. 
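For example, here is a sketch of indexing a multi-valued ``tags`` field with
term vectors so that overlapped faceting on it can take advantage of them
(this assumes ``KEYWORD`` accepts ``vector=True`` in the same way ``TEXT``
does)::

    from whoosh import fields, sorting

    # Store term vectors for the multi-valued field at indexing time
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           tags=fields.KEYWORD(vector=True))

    # ...after indexing documents as usual...
    tags_facet = sorting.FieldFacet("tags", allow_overlap=True)
    results = searcher.search(myquery, groupedby=tags_facet)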
For very large indexes and result sets, if a field is stored, you can get
faster overlapped faceting using :class:`~whoosh.sorting.StoredFieldFacet`
instead of ``FieldFacet``. While reading stored values is usually slower than
using the index, in this case avoiding the overhead of opening large numbers
of posting readers can make it worthwhile.

``StoredFieldFacet`` supports ``allow_overlap`` by loading the stored value
for the given field and splitting it into multiple values. The default is to
call the value's ``split()`` method. For example, if you've stored the
``tags`` field as a string like ``"tag1 tag2 tag3"``::

    schema = fields.Schema(name=fields.TEXT(stored=True),
                           tags=fields.KEYWORD(stored=True))
    ix = index.create_in("indexdir", schema)
    with ix.writer() as w:
        w.add_document(name="A Midsummer Night's Dream", tags="comedy fairies")
        w.add_document(name="Hamlet", tags="tragedy denmark")
        # etc.

...Then you can use a ``StoredFieldFacet`` like this::

    ix = index.open_dir("indexdir")
    with ix.searcher() as s:
        sff = sorting.StoredFieldFacet("tags", allow_overlap=True)
        results = s.search(myquery, groupedby={"tags": sff})

For stored Python objects other than strings, you can supply a split function
(using the ``split_fn`` keyword argument to ``StoredFieldFacet``). The
function should accept a single argument (the stored value) and return a list
or tuple of grouping keys.


Using a custom sort order
=========================

It is sometimes useful to have a custom sort order per search. For example,
different languages use different sort orders. If you have a function that
returns the sorting order you want for a given field value, such as an
implementation of the Unicode Collation Algorithm (UCA), you can customize the
sort order for the user's language.

The :class:`whoosh.sorting.TranslateFacet` lets you apply a function to the
value of another facet. This lets you "translate" a field value into an
arbitrary sort key, such as with UCA::

    from pyuca import Collator

    # The Collator object has a sort_key() method which takes a unicode
    # string and returns a sort key
    c = Collator("allkeys.txt")

    # Make a facet object for the field you want to sort on
    nf = sorting.FieldFacet("name")

    # Wrap the facet in a TranslateFacet with the translation function
    # (the Collator object's sort_key method)
    tf = sorting.TranslateFacet(c.sort_key, nf)

    # Use the facet to sort the search results
    results = searcher.search(myquery, sortedby=tf)

(You can pass multiple "wrapped" facets to the ``TranslateFacet``, and it will
call the function with the values of the facets as multiple arguments.)

The TranslateFacet can also be very useful with numeric fields to sort on the
output of some formula::

    # Sort based on the average of two numeric fields
    def average(a, b):
        return (a + b) / 2.0

    # Create two facets for the fields and pass them with the function to
    # TranslateFacet
    af = sorting.FieldFacet("age")
    wf = sorting.FieldFacet("weight")
    facet = sorting.TranslateFacet(average, af, wf)

    results = searcher.search(myquery, sortedby=facet)

Remember that you can still sort by multiple facets. For example, you could
sort by a numeric value transformed by a quantizing function first, and then
if that is equal sort by the value of another field::

    # Sort by a quantized size first, then by name
    tf = sorting.TranslateFacet(quantize, sorting.FieldFacet("size"))
    results = searcher.search(myquery, sortedby=[tf, "name"])


Expert: writing your own facet
==============================

TBD.
Whoosh-2.5.7/docs/build/html/_sources/fieldcaches.txt0000644000076500000240000000326212254366350022705 0ustar mattstaff00000000000000============ Field caches ============ The default (``filedb``) backend uses *field caches* in certain circumstances. The field cache basically pre-computes the order of documents in the index to speed up sorting and faceting. Generating field caches can take time the first time you sort/facet on a large index. The field cache is kept in memory (and by default written to disk when it is generated) so subsequent sorted/faceted searches should be faster. The default caching policy never expires field caches, so reused searchers and/or sorting a lot of different fields could use up quite a bit of memory with large indexes. Customizing cache behaviour =========================== (The following API examples refer to the default ``filedb`` backend.) *By default*, Whoosh saves field caches to disk. To prevent a reader or searcher from writing out field caches, do this before you start using it:: searcher.set_caching_policy(save=False) By default, if caches are written to disk they are saved in the index directory. To tell a reader or searcher to save cache files to a different location, create a storage object and pass it to the ``storage`` keyword argument:: from whoosh.filedb.filestore import FileStorage mystorage = FileStorage("path/to/cachedir") reader.set_caching_policy(storage=mystorage) Creating a custom caching policy ================================ Expert users who want to implement a custom caching policy (for example, to add cache expiration) should subclass :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`. Then you can pass an instance of your policy object to the ``set_caching_policy`` method:: searcher.set_caching_policy(MyPolicy()) Whoosh-2.5.7/docs/build/html/_sources/glossary.txt0000644000076500000240000000563512254366350022324 0ustar mattstaff00000000000000.. _glossary: ======== Glossary ======== .. glossary:: Analysis The process of breaking the text of a field into individual *terms* to be indexed. This consists of tokenizing the text into terms, and then optionally filtering the tokenized terms (for example, lowercasing and removing *stop words*). Whoosh includes several different analyzers. Corpus The set of documents you are indexing. Documents The individual pieces of content you want to make searchable. The word "documents" might imply files, but the data source could really be anything -- articles in a content management system, blog posts in a blogging system, chunks of a very large file, rows returned from an SQL query, individual email messages from a mailbox file, or whatever. When you get search results from Whoosh, the results are a list of documents, whatever "documents" means in your search engine. Fields Each document contains a set of fields. Typical fields might be "title", "content", "url", "keywords", "status", "date", etc. Fields can be indexed (so they're searchable) and/or stored with the document. Storing the field makes it available in search results. For example, you typically want to store the "title" field so your search results can display it. Forward index A table listing every document and the words that appear in the document. Whoosh lets you store *term vectors* that are a kind of forward index. Indexing The process of examining documents in the corpus and adding them to the *reverse index*. 
Postings The *reverse index* lists every word in the corpus, and for each word, a list of documents in which that word appears, along with some optional information (such as the number of times the word appears in that document). These items in the list, containing a document number and any extra information, are called *postings*. In Whoosh the information stored in postings is customizable for each *field*. Reverse index Basically a table listing every word in the corpus, and for each word, the list of documents in which it appears. It can be more complicated (the index can also list how many times the word appears in each document, the positions at which it appears, etc.) but that's how it basically works. Schema Whoosh requires that you specify the *fields* of the index before you begin indexing. The Schema associates field names with metadata about the field, such as the format of the *postings* and whether the contents of the field are stored in the index. Term vector A *forward index* for a certain field in a certain document. You can specify in the Schema that a given field should store term vectors. Whoosh-2.5.7/docs/build/html/_sources/highlight.txt0000644000076500000240000003246212254366350022426 0ustar mattstaff00000000000000================================================ How to create highlighted search result excerpts ================================================ Overview ======== The highlighting system works as a pipeline, with four component types. * **Fragmenters** chop up the original text into __fragments__, based on the locations of matched terms in the text. * **Scorers** assign a score to each fragment, allowing the system to rank the best fragments by whatever criterion. * **Order functions** control in what order the top-scoring fragments are presented to the user. For example, you can show the fragments in the order they appear in the document (FIRST) or show higher-scoring fragments first (SCORE) * **Formatters** turn the fragment objects into human-readable output, such as an HTML string. Requirements ============ Highlighting requires that you have the text of the indexed document available. You can keep the text in a stored field, or if the original text is available in a file, database column, etc, just reload it on the fly. Note that you might need to process the text to remove e.g. HTML tags, wiki markup, etc. How to ====== Get search results:: results = mysearcher.search(myquery) for hit in results: print(hit["title"]) You can use the :meth:`~whoosh.searching.Hit.highlights` method on the :class:`whoosh.searching.Hit` object to get highlighted snippets from the document containing the search terms. The first argument is the name of the field to highlight. If the field is stored, this is the only argument you need to supply:: results = mysearcher.search(myquery) for hit in results: print(hit["title"]) # Assume "content" field is stored print(hit.highlights("content")) If the field is not stored, you need to retrieve the text of the field some other way. For example, reading it from the original file or a database. 
Then you can supply the text to highlight with the ``text`` argument:: results = mysearcher.search(myquery) for hit in results: print(hit["title"]) # Assume the "path" stored field contains a path to the original file with open(hit["path"]) as fileobj: filecontents = fileobj.read() print(hit.highlights("content", text=filecontents)) The character limit =================== By default, Whoosh only pulls fragments from the first 32K characters of the text. This prevents very long texts from bogging down the highlighting process too much, and is usually justified since important/summary information is usually at the start of a document. However, if you find the highlights are missing information (for example, very long encyclopedia articles where the terms appear in a later section), you can increase the fragmenter's character limit. You can change the character limit on the results object like this:: results = mysearcher.search(myquery) results.fragmenter.charlimit = 100000 To turn off the character limit:: results.fragmenter.charlimit = None If you instantiate a custom fragmenter, you can set the character limit on it directly:: sf = highlight.SentenceFragmenter(charlimit=100000) results.fragmenter = sf See below for information on customizing the highlights. If you increase or disable the character limit to highlight long documents, you may need to use the tips in the "speeding up highlighting" section below to make highlighting faster. Customizing the highlights ========================== Number of fragments ------------------- You can use the ``top`` keyword argument to control the number of fragments returned in each snippet:: # Show a maximum of 5 fragments from the document print hit.highlights("content", top=5) Fragment size ------------- The default fragmenter has a ``maxchars`` attribute (default 200) controlling the maximum length of a fragment, and a ``surround`` attribute (default 20) controlling the maximum number of characters of context to add at the beginning and end of a fragment:: # Allow larger fragments results.fragmenter.maxchars = 300 # Show more context before and after results.fragmenter.surround = 50 Fragmenter ---------- A fragmenter controls how to extract excerpts from the original text. The ``highlight`` module has the following pre-made fragmenters: :class:`whoosh.highlight.ContextFragmenter` (the default) This is a "smart" fragmenter that finds matched terms and then pulls in surround text to form fragments. This fragmenter only yields fragments that contain matched terms. :class:`whoosh.highlight.SentenceFragmenter` Tries to break the text into fragments based on sentence punctuation (".", "!", and "?"). This object works by looking in the original text for a sentence end as the next character after each token's 'endchar'. Can be fooled by e.g. source code, decimals, etc. :class:`whoosh.highlight.WholeFragmenter` Returns the entire text as one "fragment". This can be useful if you are highlighting a short bit of text and don't need to fragment it. The different fragmenters have different options. For example, the default :class:`~whoosh.highlight.ContextFragmenter` lets you set the maximum fragment size and the size of the context to add on either side:: my_cf = highlight.ContextFragmenter(maxchars=100, surround=30) See the :mod:`whoosh.highlight` docs for more information. 
To use a different fragmenter:: results.fragmenter = my_cf Scorer ------ A scorer is a callable that takes a :class:`whoosh.highlight.Fragment` object and returns a sortable value (where higher values represent better fragments). The default scorer adds up the number of matched terms in the fragment, and adds a "bonus" for the number of __different__ matched terms. The highlighting system uses this score to select the best fragments to show to the user. As an example of a custom scorer, to rank fragments by lowest standard deviation of the positions of matched terms in the fragment:: def StandardDeviationScorer(fragment): """Gives higher scores to fragments where the matched terms are close together. """ # Since lower values are better in this case, we need to negate the # value return 0 - stddev([t.pos for t in fragment.matched]) To use a different scorer:: results.scorer = StandardDeviationScorer Order ----- The order is a function that takes a fragment and returns a sortable value used to sort the highest-scoring fragments before presenting them to the user (where fragments with lower values appear before fragments with higher values). The ``highlight`` module has the following order functions. ``FIRST`` (the default) Show fragments in the order they appear in the document. ``SCORE`` Show highest scoring fragments first. The ``highlight`` module also includes ``LONGER`` (longer fragments first) and ``SHORTER`` (shorter fragments first), but they probably aren't as generally useful. To use a different order:: results.order = highlight.SCORE Formatter --------- A formatter contols how the highest scoring fragments are turned into a formatted bit of text for display to the user. It can return anything (e.g. plain text, HTML, a Genshi event stream, a SAX event generator, or anything else useful to the calling system). The ``highlight`` module contains the following pre-made formatters. :class:`whoosh.highlight.HtmlFormatter` Outputs a string containing HTML tags (with a class attribute) around the matched terms. :class:`whoosh.highlight.UppercaseFormatter` Converts the matched terms to UPPERCASE. :class:`whoosh.highlight.GenshiFormatter` Outputs a Genshi event stream, with the matched terms wrapped in a configurable element. The easiest way to create a custom formatter is to subclass ``highlight.Formatter`` and override the ``format_token`` method:: class BracketFormatter(highlight.Formatter): """Puts square brackets around the matched terms. """ def format_token(self, text, token, replace=False): # Use the get_text function to get the text corresponding to the # token tokentext = highlight.get_text(text, token) # Return the text as you want it to appear in the highlighted # string return "[%s]" % tokentext To use a different formatter:: brf = BracketFormatter() results.formatter = brf If you need more control over the formatting (or want to output something other than strings), you will need to override other methods. See the documentation for the :class:`whoosh.highlight.Formatter` class. Highlighter object ================== Rather than setting attributes on the results object, you can create a reusable :class:`whoosh.highlight.Highlighter` object. 
Keyword arguments let you change the ``fragmenter``, ``scorer``, ``order``, and/or ``formatter``:: hi = highlight.Highlighter(fragmenter=my_cf, scorer=sds) You can then use the :meth:`whoosh.highlight.Highlighter.highlight_hit` method to get highlights for a ``Hit`` object:: for hit in results: print(hit["title"]) print(hi.highlight_hit(hit)) (When you assign to a ``Results`` object's ``fragmenter``, ``scorer``, ``order``, or ``formatter`` attributes, you're actually changing the values on the results object's default ``Highlighter`` object.) Speeding up highlighting ======================== Recording which terms matched in which documents during the search may make highlighting faster, since it will skip documents it knows don't contain any matching terms in the given field:: # Record per-document term matches results = searcher.search(myquery, terms=True) PinpointFragmenter ------------------ Usually the highlighting system uses the field's analyzer to re-tokenize the document's text to find the matching terms in context. If you have long documents and have increased/disabled the character limit, and/or if the field has a very complex analyzer, re-tokenizing may be slow. Instead of retokenizing, Whoosh can look up the character positions of the matched terms in the index. Looking up the character positions is not instantaneous, but is usually faster than analyzing large amounts of text. To use :class:`whoosh.highlight.PinpointFragmenter` and avoid re-tokenizing the document text, you must do all of the following: Index the field with character information (this will require re-indexing an existing index):: # Index the start and end chars of each term schema = fields.Schema(content=fields.TEXT(stored=True, chars=True)) Record per-document term matches in the results:: # Record per-document term matches results = searcher.search(myquery, terms=True) Set a :class:`whoosh.highlight.PinpointFragmenter` as the fragmenter:: results.fragmenter = highlight.PinpointFragmenter() PinpointFragmenter limitations ------------------------------ When the highlighting system does not re-tokenize the text, it doesn't know where any other words are in the text except the matched terms it looked up in the index. Therefore when the fragmenter adds surrounding context, it just adds or a certain number of characters blindly, and so doesn't distinguish between content and whitespace, or break on word boundaries, for example:: >>> hit.highlights("content") 're when the fragmenter\n ad' (This can be embarassing when the word fragments form dirty words!) 
One way to avoid this is to not show any surrounding context, but then fragments containing one matched term will contain ONLY that matched term:: >>> hit.highlights("content") 'fragmenter' Alternatively, you can normalize whitespace in the text before passing it to the highlighting system:: >>> text = searcher.stored_ >>> re.sub("[\t\r\n ]+", " ", text) >>> hit.highlights("content", text=text) ...and use the ``autotrim`` option of ``PinpointFragmenter`` to automatically strip text before the first space and after the last space in the fragments:: >>> results.fragmenter = highlight.PinpointFragmenter(autotrim=True) >>> hit.highlights("content") 'when the fragmenter' Using the low-level API ======================= Usage ----- The following function lets you retokenize and highlight a piece of text using an analyzer:: from whoosh.highlight import highlight excerpts = highlight(text, terms, analyzer, fragmenter, formatter, top=3, scorer=BasicFragmentScorer, minscore=1, order=FIRST) ``text`` The original text of the document. ``terms`` A sequence or set containing the query words to match, e.g. ("render", "shader"). ``analyzer`` The analyzer to use to break the document text into tokens for matching against the query terms. This is usually the analyzer for the field the query terms are in. ``fragmenter`` A :class:`whoosh.highlight.Fragmenter` object, see below. ``formatter`` A :class:`whoosh.highlight.Formatter` object, see below. ``top`` The number of fragments to include in the output. ``scorer`` A :class:`whoosh.highlight.FragmentScorer` object. The only scorer currently included with Whoosh is :class:`~whoosh.highlight.BasicFragmentScorer`, the default. ``minscore`` The minimum score a fragment must have to be considered for inclusion. ``order`` An ordering function that determines the order of the "top" fragments in the output text. Whoosh-2.5.7/docs/build/html/_sources/index.txt0000644000076500000240000000144512254366350021563 0ustar mattstaff00000000000000============================== Whoosh |release| documentation ============================== Whoosh was created by `Matt Chaput `_. You can view outstanding issues on the `Whoosh Bitbucket page `_ and get help on the `Whoosh mailing list `_. Contents ======== .. toctree:: :maxdepth: 2 releases/index quickstart intro glossary schema indexing searching parsing querylang dates query analysis stemming ngrams facets highlight keywords spelling fieldcaches batch threads nested recipes api/api tech/index Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` Whoosh-2.5.7/docs/build/html/_sources/indexing.txt0000644000076500000240000003701012254366350022256 0ustar mattstaff00000000000000====================== How to index documents ====================== Creating an Index object ======================== To create an index in a directory, use ``index.create_in``:: import os, os.path from whoosh import index if not os.path.exists("indexdir"): os.mkdir("indexdir") ix = index.create_in("indexdir", schema) To open an existing index in a directory, use ``index.open_dir``:: import whoosh.index as index ix = index.open_dir("indexdir") These are convenience methods for:: from whoosh.filedb.filestore import FileStorage storage = FileStorage("indexdir") # Create an index ix = storage.create_index(schema) # Open an existing index storage.open_index() The schema you created the index with is pickled and stored with the index. 
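Because the schema is pickled into the index, you do not need the original
schema object to open the index later. A quick sketch (the field names shown
are just an example)::

    from whoosh import index

    ix = index.open_dir("indexdir")
    # The schema saved at creation time is loaded automatically
    print(ix.schema.names())   # e.g. ['content', 'path', 'title']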
You can keep multiple indexes in the same directory using the indexname keyword argument:: # Using the convenience functions ix = index.create_in("indexdir", schema=schema, indexname="usages") ix = index.open_dir("indexdir", indexname="usages") # Using the Storage object ix = storage.create_index(schema, indexname="usages") ix = storage.open_index(indexname="usages") Clearing the index ================== Calling ``index.create_in`` on a directory with an existing index will clear the current contents of the index. To test whether a directory currently contains a valid index, use ``index.exists_in``:: exists = index.exists_in("indexdir") usages_exists = index.exists_in("indexdir", indexname="usages") (Alternatively you can simply delete the index's files from the directory, e.g. if you only have one index in the directory, use ``shutil.rmtree`` to remove the directory and then recreate it.) Indexing documents ================== Once you've created an ``Index`` object, you can add documents to the index with an ``IndexWriter`` object. The easiest way to get the ``IndexWriter`` is to call ``Index.writer()``:: ix = index.open_dir("index") writer = ix.writer() Creating a writer locks the index for writing, so only one thread/process at a time can have a writer open. .. note:: Because opening a writer locks the index for writing, in a multi-threaded or multi-process environment your code needs to be aware that opening a writer may raise an exception (``whoosh.store.LockError``) if a writer is already open. Whoosh includes a couple of example implementations (:class:`whoosh.writing.AsyncWriter` and :class:`whoosh.writing.BufferedWriter`) of ways to work around the write lock. .. note:: While the writer is open and during the commit, the index is still available for reading. Existing readers are unaffected and new readers can open the current index normally. Once the commit is finished, existing readers continue to see the previous version of the index (that is, they do not automatically see the newly committed changes). New readers will see the updated index. The IndexWriter's ``add_document(**kwargs)`` method accepts keyword arguments where the field name is mapped to a value:: writer = ix.writer() writer.add_document(title=u"My document", content=u"This is my document!", path=u"/a", tags=u"first short", icon=u"/icons/star.png") writer.add_document(title=u"Second try", content=u"This is the second example.", path=u"/b", tags=u"second short", icon=u"/icons/sheep.png") writer.add_document(title=u"Third time's the charm", content=u"Examples are many.", path=u"/c", tags=u"short", icon=u"/icons/book.png") writer.commit() You don't have to fill in a value for every field. Whoosh doesn't care if you leave out a field from a document. Indexed fields must be passed a unicode value. Fields that are stored but not indexed (i.e. the ``STORED`` field type) can be passed any pickle-able object. Whoosh will happily allow you to add documents with identical values, which can be useful or annoying depending on what you're using the library for:: writer.add_document(path=u"/a", title=u"A", content=u"Hello there") writer.add_document(path=u"/a", title=u"A", content=u"Deja vu!") This adds two documents to the index with identical path and title fields. See "updating documents" below for information on the ``update_document`` method, which uses "unique" fields to replace old documents instead of appending. 
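As the note above mentions, ``ix.writer()`` raises ``whoosh.store.LockError``
if another writer already holds the lock. One possible workaround (a sketch
only; see :class:`whoosh.writing.AsyncWriter` for the exact behaviour) is to
fall back to an ``AsyncWriter``, which waits for the lock in a background
thread instead of failing immediately::

    from whoosh.store import LockError
    from whoosh.writing import AsyncWriter

    try:
        writer = ix.writer()
    except LockError:
        # AsyncWriter keeps retrying in the background rather than raising
        writer = AsyncWriter(ix)

    writer.add_document(title=u"Fourth try", content=u"Written despite the lock",
                        path=u"/d", tags=u"short", icon=u"/icons/clock.png")
    writer.commit()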
Indexing and storing different values for the same field -------------------------------------------------------- If you have a field that is both indexed and stored, you can index a unicode value but store a different object if necessary (it's usually not, but sometimes this is really useful) using a "special" keyword argument ``_stored_``. The normal value will be analyzed and indexed, but the "stored" value will show up in the results:: writer.add_document(title=u"Title to be indexed", _stored_title=u"Stored title") Finishing adding documents -------------------------- An ``IndexWriter`` object is kind of like a database transaction. You specify a bunch of changes to the index, and then "commit" them all at once. Calling ``commit()`` on the ``IndexWriter`` saves the added documents to the index:: writer.commit() Once your documents are in the index, you can search for them. If you want to close the writer without committing the changes, call ``cancel()`` instead of ``commit()``:: writer.cancel() Keep in mind that while you have a writer open (including a writer you opened and is still in scope), no other thread or process can get a writer or modify the index. A writer also keeps several open files. So you should always remember to call either ``commit()`` or ``cancel()`` when you're done with a writer object. Merging segments ================ A Whoosh ``filedb`` index is really a container for one or more "sub-indexes" called segments. When you add documents to an index, instead of integrating the new documents with the existing documents (which could potentially be very expensive, since it involves resorting all the indexed terms on disk), Whoosh creates a new segment next to the existing segment. Then when you search the index, Whoosh searches both segments individually and merges the results so the segments appear to be one unified index. (This smart design is copied from Lucene.) So, having a few segments is more efficient than rewriting the entire index every time you add some documents. But searching multiple segments does slow down searching somewhat, and the more segments you have, the slower it gets. So Whoosh has an algorithm that runs when you call ``commit()`` that looks for small segments it can merge together to make fewer, bigger segments. To prevent Whoosh from merging segments during a commit, use the ``merge`` keyword argument:: writer.commit(merge=False) To merge all segments together, optimizing the index into a single segment, use the ``optimize`` keyword argument:: writer.commit(optimize=True) Since optimizing rewrites all the information in the index, it can be slow on a large index. It's generally better to rely on Whoosh's merging algorithm than to optimize all the time. (The ``Index`` object also has an ``optimize()`` method that lets you optimize the index (merge all the segments together). It simply creates a writer and calls ``commit(optimize=True)`` on it.) For more control over segment merging, you can write your own merge policy function and use it as an argument to the ``commit()`` method. See the implementation of the ``NO_MERGE``, ``MERGE_SMALL``, and ``OPTIMIZE`` functions in the ``whoosh.writing`` module. Deleting documents ================== You can delete documents using the following methods on an ``IndexWriter`` object. You then need to call ``commit()`` on the writer to save the deletions to disk. ``delete_document(docnum)`` Low-level method to delete a document by its internal document number. 
``is_deleted(docnum)`` Low-level method, returns ``True`` if the document with the given internal number is deleted. ``delete_by_term(fieldname, termtext)`` Deletes any documents where the given (indexed) field contains the given term. This is mostly useful for ``ID`` or ``KEYWORD`` fields. ``delete_by_query(query)`` Deletes any documents that match the given query. :: # Delete document by its path -- this field must be indexed ix.delete_by_term('path', u'/a/b/c') # Save the deletion to disk ix.commit() In the ``filedb`` backend, "deleting" a document simply adds the document number to a list of deleted documents stored with the index. When you search the index, it knows not to return deleted documents in the results. However, the document's contents are still stored in the index, and certain statistics (such as term document frequencies) are not updated, until you merge the segments containing deleted documents (see merging above). (This is because removing the information immediately from the index would essentially involving rewriting the entire index on disk, which would be very inefficient.) Updating documents ================== If you want to "replace" (re-index) a document, you can delete the old document using one of the ``delete_*`` methods on ``Index`` or ``IndexWriter``, then use ``IndexWriter.add_document`` to add the new version. Or, you can use ``IndexWriter.update_document`` to do this in one step. For ``update_document`` to work, you must have marked at least one of the fields in the schema as "unique". Whoosh will then use the contents of the "unique" field(s) to search for documents to delete:: from whoosh.fields import Schema, ID, TEXT schema = Schema(path = ID(unique=True), content=TEXT) ix = index.create_in("index") writer = ix.writer() writer.add_document(path=u"/a", content=u"The first document") writer.add_document(path=u"/b", content=u"The second document") writer.commit() writer = ix.writer() # Because "path" is marked as unique, calling update_document with path="/a" # will delete any existing documents where the "path" field contains "/a". writer.update_document(path=u"/a", content="Replacement for the first document") writer.commit() The "unique" field(s) must be indexed. If no existing document matches the unique fields of the document you're updating, ``update_document`` acts just like ``add_document``. "Unique" fields and ``update_document`` are simply convenient shortcuts for deleting and adding. Whoosh has no inherent concept of a unique identifier, and in no way enforces uniqueness when you use ``add_document``. Incremental indexing ==================== When you're indexing a collection of documents, you'll often want two code paths: one to index all the documents from scratch, and one to only update the documents that have changed (leaving aside web applications where you need to add/update documents according to user actions). Indexing everything from scratch is pretty easy. 
Here's a simple example:: import os.path from whoosh import index from whoosh.fields import Schema, ID, TEXT def clean_index(dirname): # Always create the index from scratch ix = index.create_in(dirname, schema=get_schema()) writer = ix.writer() # Assume we have a function that gathers the filenames of the # documents to be indexed for path in my_docs(): add_doc(writer, path) writer.commit() def get_schema() return Schema(path=ID(unique=True, stored=True), content=TEXT) def add_doc(writer, path): fileobj = open(path, "rb") content = fileobj.read() fileobj.close() writer.add_document(path=path, content=content) Now, for a small collection of documents, indexing from scratch every time might actually be fast enough. But for large collections, you'll want to have the script only re-index the documents that have changed. To start we'll need to store each document's last-modified time, so we can check if the file has changed. In this example, we'll just use the mtime for simplicity:: def get_schema() return Schema(path=ID(unique=True, stored=True), time=STORED, content=TEXT) def add_doc(writer, path): fileobj = open(path, "rb") content = fileobj.read() fileobj.close() modtime = os.path.getmtime(path) writer.add_document(path=path, content=content, time=modtime) Now we can modify the script to allow either "clean" (from scratch) or incremental indexing:: def index_my_docs(dirname, clean=False): if clean: clean_index(dirname) else: incremental_index(dirname) def incremental_index(dirname) ix = index.open_dir(dirname) # The set of all paths in the index indexed_paths = set() # The set of all paths we need to re-index to_index = set() with ix.searcher() as searcher: writer = ix.writer() # Loop over the stored fields in the index for fields in searcher.all_stored_fields(): indexed_path = fields['path'] indexed_paths.add(indexed_path) if not os.path.exists(indexed_path): # This file was deleted since it was indexed writer.delete_by_term('path', indexed_path) else: # Check if this file was changed since it # was indexed indexed_time = fields['time'] mtime = os.path.getmtime(indexed_path) if mtime > indexed_time: # The file has changed, delete it and add it to the list of # files to reindex writer.delete_by_term('path', indexed_path) to_index.add(indexed_path) # Loop over the files in the filesystem # Assume we have a function that gathers the filenames of the # documents to be indexed for path in my_docs(): if path in to_index or path not in indexed_paths: # This is either a file that's changed, or a new file # that wasn't indexed before. So index it! add_doc(writer, path) writer.commit() The ``incremental_index`` function: * Loops through all the paths that are currently indexed. * If any of the files no longer exist, delete the corresponding document from the index. * If the file still exists, but has been modified, add it to the list of paths to be re-indexed. * If the file exists, whether it's been modified or not, add it to the list of all indexed paths. * Loops through all the paths of the files on disk. * If a path is not in the set of all indexed paths, the file is new and we need to index it. * If a path is in the set of paths to re-index, we need to index it. * Otherwise, we can skip indexing the file. Clearing the index ================== In some cases you may want to re-index from scratch. To clear the index without disrupting any existing readers:: from whoosh import writing with myindex.writer() as mywriter: # You can optionally add documents to the writer here # e.g. 
mywriter.add_document(...) # Using mergetype=CLEAR clears all existing segments so the index will # only have any documents you've added to this writer mywriter.mergetype = writing.CLEAR Or, if you don't use the writer as a context manager and call ``commit()`` directly, do it like this:: mywriter = myindex.writer() # ... mywriter.commit(mergetype=writing.CLEAR) .. note:: If you don't need to worry about existing readers, a more efficient method is to simply delete the contents of the index directory and start over. Whoosh-2.5.7/docs/build/html/_sources/intro.txt0000644000076500000240000000421712254366350021607 0ustar mattstaff00000000000000====================== Introduction to Whoosh ====================== About Whoosh ------------ Whoosh was created by `Matt Chaput `_. It started as a quick and dirty search server for the online documentation of the `Houdini `_ 3D animation software package. Side Effects Software generously allowed Matt to open source the code in case it might be useful to anyone else who needs a very flexible or pure-Python search engine (or both!). * Whoosh is fast, but uses only pure Python, so it will run anywhere Python runs, without requiring a compiler. * By default, Whoosh uses the `Okapi BM25F `_ ranking function, but like most things the ranking function can be easily customized. * Whoosh creates fairly small indexes compared to many other search libraries. * All indexed text in Whoosh must be *unicode*. * Whoosh lets you store arbitrary Python objects with indexed documents. What is Whoosh? --------------- Whoosh is a fast, pure Python search engine library. The primary design impetus of Whoosh is that it is pure Python. You should be able to use Whoosh anywhere you can use Python, no compiler or Java required. Like one of its ancestors, Lucene, Whoosh is not really a search engine, it's a programmer library for creating a search engine [1]_. Practically no important behavior of Whoosh is hard-coded. Indexing of text, the level of information stored for each term in each field, parsing of search queries, the types of queries allowed, scoring algorithms, etc. are all customizable, replaceable, and extensible. .. [1] It would of course be possible to build a turnkey search engine on top of Whoosh, like Nutch and Solr use Lucene. What can Whoosh do for you? --------------------------- Whoosh lets you index free-form or structured text and then quickly find matching documents based on simple or complex search criteria. Getting help with Whoosh ------------------------ You can view outstanding issues on the `Whoosh Bitbucket page `_ and get help on the `Whoosh mailing list `_. Whoosh-2.5.7/docs/build/html/_sources/keywords.txt0000644000076500000240000000732412254366350022325 0ustar mattstaff00000000000000======================================= Query expansion and Key word extraction ======================================= Overview ======== Whoosh provides methods for computing the "key terms" of a set of documents. For these methods, "key terms" basically means terms that are frequent in the given documents, but relatively infrequent in the indexed collection as a whole. Because this is a purely statistical operation, not a natural language processing or AI function, the quality of the results will vary based on the content, the size of the document collection, and the number of documents for which you extract keywords. These methods can be useful for providing the following features to users: * Search term expansion. 
You can extract key terms for the top N results from a query and suggest them to the user as additional/alternate query terms to try. * Tag suggestion. Extracting the key terms for a single document may yield useful suggestions for tagging the document. * "More like this". You can extract key terms for the top ten or so results from a query (and removing the original query terms), and use those key words as the basis for another query that may find more documents using terms the user didn't think of. Usage ===== * Get more documents like a certain search hit. *This requires that the field you want to match on is vectored or stored, or that you have access to the original text (such as from a database)*. Use :meth:`~whoosh.searching.Hit.more_like_this`:: results = mysearcher.search(myquery) first_hit = results[0] more_results = first_hit.more_like_this("content") * Extract keywords for the top N documents in a :class:`whoosh.searching.Results` object. *This requires that the field is either vectored or stored*. Use the :meth:`~whoosh.searching.Results.key_terms` method of the :class:`whoosh.searching.Results` object to extract keywords from the top N documents of the result set. For example, to extract *five* key terms from the ``content`` field of the top *ten* documents of a results object:: keywords = [keyword for keyword, score in results.key_terms("content", docs=10, numterms=5) * Extract keywords for an arbitrary set of documents. *This requires that the field is either vectored or stored*. Use the :meth:`~whoosh.searching.Searcher.document_number` or :meth:`~whoosh.searching.Searcher.document_numbers` methods of the :class:`whoosh.searching.Searcher` object to get the document numbers for the document(s) you want to extract keywords from. Use the :meth:`~whoosh.searching.Searcher.key_terms` method of a :class:`whoosh.searching.Searcher` to extract the keywords, given the list of document numbers. For example, let's say you have an index of emails. To extract key terms from the ``content`` field of emails whose ``emailto`` field contains ``matt@whoosh.ca``:: with email_index.searcher() as s: docnums = s.document_numbers(emailto=u"matt@whoosh.ca") keywords = [keyword for keyword, score in s.key_terms(docnums, "body")] * Extract keywords from arbitrary text not in the index. Use the :meth:`~whoosh.searching.Searcher.key_terms_from_text` method of a :class:`whoosh.searching.Searcher` to extract the keywords, given the text:: with email_index.searcher() as s: keywords = [keyword for keyword, score in s.key_terms_from_text("body", mytext)] Expansion models ================ The ``ExpansionModel`` subclasses in the :mod:`whoosh.classify` module implement different weighting functions for key words. These models are translated into Python from original Java implementations in Terrier. Whoosh-2.5.7/docs/build/html/_sources/nested.txt0000644000076500000240000002345712254366350021745 0ustar mattstaff00000000000000=========================================== Indexing and searching document hierarchies =========================================== Overview ======== Whoosh's full-text index is essentially a flat database of documents. However, Whoosh supports two techniques for simulating the indexing and querying of hierarchical documents, that is, sets of documents that form a parent-child hierarchy, such as "Chapter - Section - Paragraph" or "Module - Class - Method". 
You can specify parent-child relationships *at indexing time*, by grouping documents in the same hierarchy, and then use the :class:`whoosh.query.NestedParent` and/or :class:`whoosh.query.NestedChildren` to find parents based on their children or vice-versa. Alternatively, you can use *query time joins*, essentially like external key joins in a database, where you perform one search to find a relevant document, then use a stored value on that document (for example, a ``parent`` field) to look up another document. Both methods have pros and cons. Using nested document indexing ============================== Indexing -------- This method works by indexing a "parent" document and all its "child" documents *as a "group"* so they are guaranteed to end up in the same segment. You can use the context manager returned by ``IndexWriter.group()`` to group documents:: with ix.writer() as w: with w.group(): w.add_document(kind="class", name="Index") w.add_document(kind="method", name="add document") w.add_document(kind="method", name="add reader") w.add_document(kind="method", name="close") with w.group(): w.add_document(kind="class", name="Accumulator") w.add_document(kind="method", name="add") w.add_document(kind="method", name="get result") with w.group(): w.add_document(kind="class", name="Calculator") w.add_document(kind="method", name="add") w.add_document(kind="method", name="add all") w.add_document(kind="method", name="add some") w.add_document(kind="method", name="multiply") w.add_document(kind="method", name="close") with w.group(): w.add_document(kind="class", name="Deleter") w.add_document(kind="method", name="add") w.add_document(kind="method", name="delete") Alternatively you can use the ``start_group()`` and ``end_group()`` methods:: with ix.writer() as w: w.start_group() w.add_document(kind="class", name="Index") w.add_document(kind="method", name="add document") w.add_document(kind="method", name="add reader") w.add_document(kind="method", name="close") w.end_group() Each level of the hierarchy should have a query that distinguishes it from other levels (for example, in the above index, you can use ``kind:class`` or ``kind:method`` to match different levels of the hierarchy). Once you've indexed the hierarchy of documents, you can use two query types to find parents based on children or vice-versa. (There is currently no support in the default query parser for nested queries.) NestedParent query ------------------ The :class:`whoosh.query.NestedParent` query type lets you specify a query for child documents, but have the query return an "ancestor" document from higher in the hierarchy:: # First, we need a query that matches all the documents in the "parent" # level we want of the hierarchy all_parents = query.Term("kind", "class") # Then, we need a query that matches the children we want to find wanted_kids = query.Term("name", "close") # Now we can make a query that will match documents where "name" is # "close", but the query will return the "parent" documents of the matching # children q = query.NestedParent(all_parents, wanted_kids) # results = Index, Calculator Note that in a hierarchy with more than two levels, you can specify a "parents" query that matches any level of the hierarchy, so you can return the top-level ancestors of the matching children, or the second level, third level, etc. 
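As with any other query object, you can pass the resulting ``NestedParent`` to a searcher. The following is only a sketch, assuming the small ``kind``/``name`` index built above and that the ``name`` field is stored in the schema::

    from whoosh import query

    with ix.searcher() as s:
        all_parents = query.Term("kind", "class")
        wanted_kids = query.Term("name", "close")

        # The matching is done on the children, but each hit is a "class"
        # (parent) document
        for hit in s.search(query.NestedParent(all_parents, wanted_kids)):
            print(hit["name"])  # assumes "name" is a stored field
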
The query works by first building a bit vector representing which documents are "parents":: Index | Calculator | | 1000100100000100 | | | Deleter Accumulator Then for each match of the "child" query, it calculates the previous parent from the bit vector and returns it as a match (it only returns each parent once no matter how many children match). This parent lookup is very efficient:: 1000100100000100 | |<-+ close NestedChildren query -------------------- The opposite of ``NestedParent`` is :class:`whoosh.query.NestedChildren`. This query lets you match parents but return their children. This is useful, for example, to search for an album title and return the songs in the album:: # Query that matches all documents in the "parent" level we want to match # at all_parents = query.Term("kind", "album") # Parent documents we want to match wanted_parents = query.Term("album_title", "heaven") # Now we can make a query that will match parent documents where "album_title" # contains "heaven", but the query will return the "child" documents of the # matching parents q1 = query.NestedChildren(all_parents, wanted_parents) You can then combine that query with an ``AND`` clause, for example to find songs with "hell" in the song title that occur on albums with "heaven" in the album title:: q2 = query.And([q1, query.Term("song_title", "hell")]) Deleting and updating hierarchical documents -------------------------------------------- The drawback of the index-time method is *updating and deleting*. Because the implementation of the queries depends on the parent and child documents being contiguous in the segment, you can't update/delete just one child document. You can only update/delete an entire top-level document at once (for example, if your hierarchy is "Chapter - Section - Paragraph", you can only update or delete entire chapters, not a section or paragraph). If the top-level of the hierarchy represents very large blocks of text, this can involve a lot of deleting and reindexing. Currently ``Writer.update_document()`` does not automatically work with nested documents. You must manually delete and re-add document groups to update them. To delete nested document groups, use the ``Writer.delete_by_query()`` method with a ``NestedParent`` query:: # Delete the "Accumulator" class all_parents = query.Term("kind", "class") to_delete = query.Term("name", "Accumulator") q = query.NestedParent(all_parents, to_delete) with myindex.writer() as w: w.delete_by_query(q) Using query-time joins ====================== A second technique for simulating hierarchical documents in Whoosh involves using a stored field on each document to point to its parent, and then using the value of that field at query time to find parents and children. 
For example, if we index a hierarchy of classes and methods using pointers to parents instead of nesting:: # Store a pointer to the parent on each "method" document with ix.writer() as w: w.add_document(kind="class", c_name="Index", docstring="...") w.add_document(kind="method", m_name="add document", parent="Index") w.add_document(kind="method", m_name="add reader", parent="Index") w.add_document(kind="method", m_name="close", parent="Index") w.add_document(kind="class", c_name="Accumulator", docstring="...") w.add_document(kind="method", m_name="add", parent="Accumulator") w.add_document(kind="method", m_name="get result", parent="Accumulator") w.add_document(kind="class", c_name="Calculator", docstring="...") w.add_document(kind="method", m_name="add", parent="Calculator") w.add_document(kind="method", m_name="add all", parent="Calculator") w.add_document(kind="method", m_name="add some", parent="Calculator") w.add_document(kind="method", m_name="multiply", parent="Calculator") w.add_document(kind="method", m_name="close", parent="Calculator") w.add_document(kind="class", c_name="Deleter", docstring="...") w.add_document(kind="method", m_name="add", parent="Deleter") w.add_document(kind="method", m_name="delete", parent="Deleter") # Now do manual joins at query time with ix.searcher() as s: # Tip: Searcher.document() and Searcher.documents() let you look up # documents by field values more easily than using Searcher.search() # Children to parents: # Print the docstrings of classes on which "close" methods occur for child_doc in s.documents(m_name="close"): # Use the stored value of the "parent" field to look up the parent # document parent_doc = s.document(c_name=child_doc["parent"]) # Print the parent document's stored docstring field print(parent_doc["docstring"]) # Parents to children: # Find classes with "big" in the docstring and print their methods q = query.Term("kind", "class") & query.Term("docstring", "big") for hit in s.search(q, limit=None): print("Class name=", hit["c_name"], "methods:") for child_doc in s.documents(parent=hit["c_name"]): print(" Method name=", child_doc["m_name"]) This technique is more flexible than index-time nesting in that you can delete/update individual documents in the hierarchy piece by piece, although it doesn't support finding different parent levels as easily. It is also slower than index-time nesting (potentially much slower), since you must perform additional searches for each found document. Future versions of Whoosh may include "join" queries to make this process more efficient (or at least more automatic). Whoosh-2.5.7/docs/build/html/_sources/ngrams.txt0000644000076500000240000000375612254366350021752 0ustar mattstaff00000000000000============================== Indexing and searching N-grams ============================== Overview ======== N-gram indexing is a powerful method for getting fast, "search as you type" functionality like iTunes. It is also useful for quick and effective indexing of languages such as Chinese and Japanese without word breaks. N-grams refers to groups of N characters... bigrams are groups of two characters, trigrams are groups of three characters, and so on. Whoosh includes two methods for analyzing N-gram fields: an N-gram tokenizer, and a filter that breaks tokens into N-grams. :class:`whoosh.analysis.NgramTokenizer` tokenizes the entire field into N-grams. This is more useful for Chinese/Japanese/Korean languages, where it's useful to index bigrams of characters rather than individual characters. 
Using this tokenizer with roman languages leads to spaces in the tokens. :: >>> ngt = NgramTokenizer(minsize=2, maxsize=4) >>> [token.text for token in ngt(u"hi there")] [u'hi', u'hi ', u'hi t',u'i ', u'i t', u'i th', u' t', u' th', u' the', u'th', u'the', u'ther', u'he', u'her', u'here', u'er', u'ere', u're'] :class:`whoosh.analysis.NgramFilter` breaks individual tokens into N-grams as part of an analysis pipeline. This is more useful for languages with word separation. :: >>> my_analyzer = StandardAnalyzer() | NgramFilter(minsize=2, maxsize=4) >>> [token.text for token in my_analyzer(u"rendering shaders")] [u'ren', u'rend', u'end', u'ende', u'nde', u'nder', u'der', u'deri', u'eri', u'erin', u'rin', u'ring', u'ing', u'sha', u'shad', u'had', u'hade', u'ade', u'ader', u'der', u'ders', u'ers'] Whoosh includes two pre-configured field types for N-grams: :class:`whoosh.fields.NGRAM` and :class:`whoosh.fields.NGRAMWORDS`. The only difference is that ``NGRAM`` runs all text through the N-gram filter, including whitespace and punctuation, while ``NGRAMWORDS`` extracts words from the text using a tokenizer, then runs each word through the N-gram filter. TBD. Whoosh-2.5.7/docs/build/html/_sources/parsing.txt0000644000076500000240000003525312254366350022123 0ustar mattstaff00000000000000==================== Parsing user queries ==================== Overview ======== The job of a query parser is to convert a *query string* submitted by a user into *query objects* (objects from the :mod:`whoosh.query` module). For example, the user query: .. code-block:: none rendering shading might be parsed into query objects like this:: And([Term("content", u"rendering"), Term("content", u"shading")]) Whoosh includes a powerful, modular parser for user queries in the :mod:`whoosh.qparser` module. The default parser implements a query language similar to the one that ships with Lucene. However, by changing plugins or using functions such as :func:`whoosh.qparser.MultifieldParser`, :func:`whoosh.qparser.SimpleParser` or :func:`whoosh.qparser.DisMaxParser`, you can change how the parser works, get a simpler parser or change the query language syntax. (In previous versions of Whoosh, the query parser was based on ``pyparsing``. The new hand-written parser is less brittle and more flexible.) .. note:: Remember that you can directly create query objects programmatically using the objects in the :mod:`whoosh.query` module. If you are not processing actual user queries, this is preferable to building a query string just to parse it. Using the default parser ======================== To create a :class:`whoosh.qparser.QueryParser` object, pass it the name of the *default field* to search and the schema of the index you'll be searching. :: from whoosh.qparser import QueryParser parser = QueryParser("content", schema=myindex.schema) .. tip:: You can instantiate a ``QueryParser`` object without specifying a schema, however the parser will not process the text of the user query. This is useful for debugging, when you want to see how QueryParser will build a query, but don't want to make up a schema just for testing. Once you have a ``QueryParser`` object, you can call ``parse()`` on it to parse a query string into a query object:: >>> parser.parse(u"alpha OR beta gamma") Or([Term("content", u"alpha"), Term("content", "beta")]) See the :doc:`query language reference ` for the features and syntax of the default parser's query language. 
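In practice you usually parse the user's string and hand the resulting query object straight to a searcher. This is only a sketch, assuming an existing index ``myindex`` whose schema has a ``content`` field::

    from whoosh.qparser import QueryParser

    parser = QueryParser("content", schema=myindex.schema)
    user_query = parser.parse(u"alpha OR beta gamma")

    with myindex.searcher() as searcher:
        results = searcher.search(user_query)
        for hit in results:
            print(hit)
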
Common customizations ===================== Searching for any terms instead of all terms by default ------------------------------------------------------- If the user doesn't explicitly specify ``AND`` or ``OR`` clauses:: physically based rendering ...by default, the parser treats the words as if they were connected by ``AND``, meaning all the terms must be present for a document to match:: physically AND based AND rendering To change the parser to use ``OR`` instead, so that any of the terms may be present for a document to match, i.e.:: physically OR based OR rendering ...configure the QueryParser using the ``group`` keyword argument like this:: from whoosh import qparser parser = qparser.QueryParser(fieldname, schema=myindex.schema, group=qparser.OrGroup) The Or query lets you specify that documents that contain more of the query terms score higher. For example, if the user searches for ``foo bar``, a document with four occurances of ``foo`` would normally outscore a document that contained one occurance each of ``foo`` and ``bar``. However, users usually expect documents that contain more of the words they searched for to score higher. To configure the parser to produce Or groups with this behavior, use the ``factory()`` class method of ``OrGroup``:: og = qparser.OrGroup.factory(0.9) parser = qparser.QueryParser(fieldname, schema, group=og) where the argument to ``factory()`` is a scaling factor on the bonus (between 0 and 1). Letting the user search multiple fields by default -------------------------------------------------- The default QueryParser configuration takes terms without explicit fields and assigns them to the default field you specified when you created the object, so for example if you created the object with:: parser = QueryParser("content", schema=myschema) And the user entered the query: .. code-block:: none three blind mice The parser would treat it as: .. code-block:: none content:three content:blind content:mice However, you might want to let the user search *multiple* fields by default. For example, you might want "unfielded" terms to search both the ``title`` and ``content`` fields. In that case, you can use a :class:`whoosh.qparser.MultifieldParser`. This is just like the normal QueryParser, but instead of a default field name string, it takes a *sequence* of field names:: from whoosh.qparser import MultifieldParser mparser = MultifieldParser(["title", "content"], schema=myschema) When this MultifieldParser instance parses ``three blind mice``, it treats it as: .. code-block:: none (title:three OR content:three) (title:blind OR content:blind) (title:mice OR content:mice) Simplifying the query language ------------------------------ Once you have a parser:: parser = qparser.QueryParser("content", schema=myschema) you can remove features from it using the :meth:`~whoosh.qparser.QueryParser.remove_plugin_class` method. For example, to remove the ability of the user to specify fields to search:: parser.remove_plugin_class(qparser.FieldsPlugin) To remove the ability to search for wildcards, which can be harmful to query performance:: parser.remove_plugin_class(qparser.WildcardPlugin) See :doc:`/api/qparser` for information about the plugins included with Whoosh's query parser. 
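These customizations can be combined. For example, here is one possible sketch (not the only reasonable setup) of a deliberately forgiving parser for an end-user search box, assuming ``title`` and ``content`` fields in ``myschema``::

    from whoosh import qparser
    from whoosh.qparser import MultifieldParser

    # Score documents higher when they contain more of the query terms
    og = qparser.OrGroup.factory(0.9)
    parser = MultifieldParser(["title", "content"], schema=myschema, group=og)

    # Casual users rarely intend special syntax, so drop field specifiers
    # and wildcards
    parser.remove_plugin_class(qparser.FieldsPlugin)
    parser.remove_plugin_class(qparser.WildcardPlugin)

    q = parser.parse(u"three blind mice")
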
Changing the AND, OR, ANDNOT, ANDMAYBE, and NOT syntax ------------------------------------------------------ The default parser uses English keywords for the AND, OR, ANDNOT, ANDMAYBE, and NOT functions:: parser = qparser.QueryParser("content", schema=myschema) You can replace the default ``CompoundsPlugin`` and ``NotPlugin`` objects to replace the default English tokens with your own regular expressions. The :class:`whoosh.qparser.CompoundsPlugin` implements the ability to use AND, OR, ANDNOT, and ANDMAYBE clauses in queries. You can instantiate a new ``CompoundsPlugin`` and use the ``And``, ``Or``, ``AndNot``, and ``AndMaybe`` keyword arguments to change the token patterns:: # Use Spanish equivalents instead of AND and OR cp = qparser.CompoundsPlugin(And=" Y ", Or=" O ") parser.replace_plugin(cp) The :class:`whoosh.qparser.NotPlugin` implements the ability to logically NOT subqueries. You can instantiate a new ``NotPlugin`` object with a different token:: np = qparser.NotPlugin("NO ") parser.replace_plugin(np) The arguments can be pattern strings or precompiled regular expression objects. For example, to change the default parser to use typographic symbols instead of words for the AND, OR, ANDNOT, ANDMAYBE, and NOT functions:: parser = qparser.QueryParser("content", schema=myschema) # These are regular expressions, so we have to escape the vertical bar cp = qparser.CompoundsPlugin(And="&", Or="\\|", AndNot="&!", AndMaybe="&~") parser.replace_plugin(cp) parser.replace_plugin(qparser.NotPlugin("!")) Adding less-than, greater-than, etc. ------------------------------------ Normally, the way you match all terms in a field greater than "apple" is with an open ended range:: field:{apple to] The :class:`whoosh.qparser.GtLtPlugin` lets you specify the same search like this:: field:>apple The plugin lets you use ``>``, ``<``, ``>=``, ``<=``, ``=>``, or ``=<`` after a field specifier, and translates the expression into the equivalent range:: date:>='31 march 2001' date:[31 march 2001 to] Adding fuzzy term queries ------------------------- Fuzzy queries are good for catching misspellings and similar words. The :class:`whoosh.qparser.FuzzyTermPlugin` lets you search for "fuzzy" terms, that is, terms that don't have to match exactly. The fuzzy term will match any similar term within a certain number of "edits" (character insertions, deletions, and/or transpositions -- this is called the "Damerau-Levenshtein edit distance"). To add the fuzzy plugin:: parser = qparser.QueryParser("fieldname", my_index.schema) parser.add_plugin(qparser.FuzzyTermPlugin()) Once you add the fuzzy plugin to the parser, you can specify a fuzzy term by adding a ``~`` followed by an optional maximum edit distance. If you don't specify an edit distance, the default is ``1``. For example, the following "fuzzy" term query:: cat~ would match ``cat`` and all terms in the index within one "edit" of cat, for example ``cast`` (insert ``s``), ``at`` (delete ``c``), and ``act`` (transpose ``c`` and ``a``). If you wanted ``cat`` to match ``bat``, it requires two edits (delete ``c`` and insert ``b``) so you would need to set the maximum edit distance to ``2``:: cat~2 Because each additional edit you allow increases the number of possibilities that must be checked, edit distances greater than ``2`` can be very slow. It is often useful to require that the first few characters of a fuzzy term match exactly. This is called a prefix. You can set the length of the prefix by adding a slash and a number after the edit distance. 
For example, to use a maximum edit distance of ``2`` and a prefix length of ``3``:: johannson~2/3 You can specify a prefix without specifying an edit distance:: johannson~/3 The default prefix distance is ``0``. Allowing complex phrase queries ------------------------------- The default parser setup allows phrase (proximity) queries such as:: "whoosh search library" The default phrase query tokenizes the text between the quotes and creates a search for those terms in proximity. If you want to do more complex proximity searches, you can replace the phrase plugin with the :class:`whoosh.qparser.SequencePlugin`, which allows any query between the quotes. For example:: "(john OR jon OR jonathan~) peters*" The sequence syntax lets you add a "slop" factor just like the regular phrase:: "(john OR jon OR jonathan~) peters*"~2 To replace the default phrase plugin with the sequence plugin:: parser = qparser.QueryParser("fieldname", my_index.schema) parser.remove_plugin_class(qparser.PhrasePlugin) parser.add_plugin(qparser.SequencePlugin()) Alternatively, you could keep the default phrase plugin and give the sequence plugin different syntax by specifying a regular expression for the start/end marker when you create the sequence plugin. The regular expression should have a named group ``slop`` for the slop factor. For example:: parser = qparser.QueryParser("fieldname", my_index.schema) parser.add_plugin(qparser.SequencePlugin("!(~(?P[1-9][0-9]*))?")) This would allow you to use regular phrase queries and sequence queries at the same time:: "regular phrase" AND !sequence query~2! Advanced customization ====================== QueryParser arguments --------------------- QueryParser supports two extra keyword arguments: ``group`` The query class to use to join sub-queries when the user doesn't explicitly specify a boolean operator, such as ``AND`` or ``OR``. This lets you change the default operator from ``AND`` to ``OR``. This will be the :class:`whoosh.qparser.AndGroup` or :class:`whoosh.qparser.OrGroup` class (*not* an instantiated object) unless you've written your own custom grouping syntax you want to use. ``termclass`` The query class to use to wrap single terms. This must be a :class:`whoosh.query.Query` subclass (*not* an instantiated object) that accepts a fieldname string and term text unicode string in its ``__init__`` method. The default is :class:`whoosh.query.Term`. This is useful if you want to change the default term class to :class:`whoosh.query.Variations`, or if you've written a custom term class you want the parser to use instead of the ones shipped with Whoosh. :: >>> from whoosh.qparser import QueryParser, OrGroup >>> orparser = QueryParser("content", schema=myschema, group=OrGroup) Configuring plugins ------------------- The query parser's functionality is provided by a set of plugins. You can remove plugins to remove functionality, add plugins to add functionality, or replace default plugins with re-configured or rewritten versions. The :meth:`whoosh.qparser.QueryParser.add_plugin`, :meth:`whoosh.qparser.QueryParser.remove_plugin_class`, and :meth:`whoosh.qparser.QueryParser.replace_plugin` methods let you manipulate the plugins in a ``QueryParser`` object. See :doc:`/api/qparser` for information about the available plugins. .. _custom-op: Creating custom operators ------------------------- * Decide whether you want a ``PrefixOperator``, ``PostfixOperator``, or ``InfixOperator``. 
* Create a new :class:`whoosh.qparser.syntax.GroupNode` subclass to hold nodes affected by your operator. This object is responsible for generating a :class:`whoosh.query.Query` object corresponding to the syntax. * Create a regular expression pattern for the operator's query syntax. * Create an ``OperatorsPlugin.OpTagger`` object from the above information. * Create a new ``OperatorsPlugin`` instance configured with your custom operator(s). * Replace the default ``OperatorsPlugin`` in your parser with your new instance. For example, if you were creating a ``BEFORE`` operator:: from whoosh import qparser, query optype = qparser.InfixOperator pattern = " BEFORE " class BeforeGroup(qparser.GroupNode): merging = True qclass = query.Ordered Create an OpTagger for your operator:: btagger = qparser.OperatorPlugin.OpTagger(pattern, BeforeGroup, qparser.InfixOperator) By default, infix operators are left-associative. To make a right-associative infix operator, do this:: btagger = qparser.OperatorPlugin.OpTagger(pattern, BeforeGroup, qparser.InfixOperator, leftassoc=False) Create an :class:`~whoosh.qparser.plugins.OperatorsPlugin` instance with your new operator, and replace the default operators plugin in your query parser:: qp = qparser.QueryParser("text", myschema) my_op_plugin = qparser.OperatorsPlugin([(btagger, 0)]) qp.replace_plugin(my_op_plugin) Note that the list of operators you specify with the first argument is IN ADDITION TO the default operators (AND, OR, etc.). To turn off one of the default operators, you can pass None to the corresponding keyword argument:: cp = qparser.OperatorsPlugin([(optagger, 0)], And=None) If you want ONLY your list of operators and none of the default operators, use the ``clean`` keyword argument:: cp = qparser.OperatorsPlugin([(optagger, 0)], clean=True) Operators earlier in the list bind more closely than operators later in the list. Whoosh-2.5.7/docs/build/html/_sources/query.txt0000644000076500000240000000033212254366350021613 0ustar mattstaff00000000000000============= Query objects ============= The classes in the :mod:`whoosh.query` module implement *queries* you can run against the index. TBD. See :doc:`searching` for how to search the index using query objects. Whoosh-2.5.7/docs/build/html/_sources/querylang.txt0000644000076500000240000001145712254366350022467 0ustar mattstaff00000000000000========================== The default query language ========================== .. highlight:: none Overview ======== A query consists of *terms* and *operators*. There are two types of terms: single terms and *phrases*. Multiple terms can be combined with operators such as *AND* and *OR*. Whoosh supports indexing text in different *fields*. You must specify the *default field* when you create the :class:`whoosh.qparser.QueryParser` object. This is the field in which any terms the user does not explicitly specify a field for will be searched. Whoosh's query parser is capable of parsing different and/or additional syntax through the use of plug-ins. See :doc:`parsing`. Individual terms and phrases ============================ Find documents containing the term ``render``:: render Find documents containing the phrase ``all was well``:: "all was well" Note that a field must store Position information for phrase searching to work in that field. Normally when you specify a phrase, the maximum difference in position between each word in the phrase is 1 (that is, the words must be right next to each other in the document). 
For example, the following matches if a document has ``library`` within 5 words after ``whoosh``:: "whoosh library"~5 Boolean operators ================= Find documents containing ``render`` *and* ``shading``:: render AND shading Note that AND is the default relation between terms, so this is the same as:: render shading Find documents containing ``render``, *and* also either ``shading`` *or* ``modeling``:: render AND shading OR modeling Find documents containing ``render`` but *not* modeling:: render NOT modeling Find documents containing ``alpha`` but not either ``beta`` or ``gamma``:: alpha NOT (beta OR gamma) Note that when no boolean operator is specified between terms, the parser will insert one, by default AND. So this query:: render shading modeling is equivalent (by default) to:: render AND shading AND modeling See :doc:`customizing the default parser ` for information on how to change the default operator to OR. Group operators together with parentheses. For example to find documents that contain both ``render`` and ``shading``, or contain ``modeling``:: (render AND shading) OR modeling Fields ====== Find the term ``ivan`` in the ``name`` field:: name:ivan The ``field:`` prefix only sets the field for the term it directly precedes, so the query:: title:open sesame Will search for ``open`` in the ``title`` field and ``sesame`` in the *default* field. To apply a field prefix to multiple terms, group them with parentheses:: title:(open sesame) This is the same as:: title:open title:sesame Of course you can specify a field for phrases too:: title:"open sesame" Inexact terms ============= Use "globs" (wildcard expressions using ``?`` to represent a single character and ``*`` to represent any number of characters) to match terms:: te?t test* *b?g* Note that a wildcard starting with ``?`` or ``*`` is very slow. Note also that these wildcards only match *individual terms*. For example, the query:: my*life will **not** match an indexed phrase like:: my so called life because those are four separate terms. Ranges ====== You can match a range of terms. For example, the following query will match documents containing terms in the lexical range from ``apple`` to ``bear`` *inclusive*. For example, it will match documents containing ``azores`` and ``be`` but not ``blur``:: [apple TO bear] This is very useful when you've stored, for example, dates in a lexically sorted format (i.e. YYYYMMDD):: date:[20050101 TO 20090715] The range is normally *inclusive* (that is, the range will match all terms between the start and end term, *as well as* the start and end terms themselves). You can specify that one or both ends of the range are *exclusive* by using the ``{`` and/or ``}`` characters:: [0000 TO 0025} {prefix TO suffix} You can also specify *open-ended* ranges by leaving out the start or end term:: [0025 TO] {TO suffix} Boosting query elements ======================= You can specify that certain parts of a query are more important for calculating the score of a matched document than others. 
For example, to specify that ``ninja`` is twice as important as other words, and ``bear`` is half as important:: ninja^2 cowboy bear^0.5 You can apply a boost to several terms using grouping parentheses:: (open sesame)^2.5 roc Making a term from literal text =============================== If you need to include characters in a term that are normally treated specially by the parser, such as spaces, colons, or brackets, you can enclose the term in single quotes:: path:'MacHD:My Documents' 'term with spaces' title:'function()' Whoosh-2.5.7/docs/build/html/_sources/quickstart.txt0000644000076500000240000002242212254366350022644 0ustar mattstaff00000000000000=========== Quick start =========== Whoosh is a library of classes and functions for indexing text and then searching the index. It allows you to develop custom search engines for your content. For example, if you were creating blogging software, you could use Whoosh to add a search function to allow users to search blog entries. A quick introduction ==================== :: >>> from whoosh.index import create_in >>> from whoosh.fields import * >>> schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT) >>> ix = create_in("indexdir", schema) >>> writer = ix.writer() >>> writer.add_document(title=u"First document", path=u"/a", ... content=u"This is the first document we've added!") >>> writer.add_document(title=u"Second document", path=u"/b", ... content=u"The second one is even more interesting!") >>> writer.commit() >>> from whoosh.qparser import QueryParser >>> with ix.searcher() as searcher: ... query = QueryParser("content", ix.schema).parse("first") ... results = searcher.search(query) ... results[0] ... {"title": u"First document", "path": u"/a"} The ``Index`` and ``Schema`` objects ==================================== To begin using Whoosh, you need an *index object*. The first time you create an index, you must define the index's *schema*. The schema lists the *fields* in the index. A field is a piece of information for each document in the index, such as its title or text content. A field can be *indexed* (meaning it can be searched) and/or *stored* (meaning the value that gets indexed is returned with the results; this is useful for fields such as the title). This schema has two fields, "title" and "content":: from whoosh.fields import Schema, TEXT schema = Schema(title=TEXT, content=TEXT) You only need to do create the schema once, when you create the index. The schema is pickled and stored with the index. When you create the ``Schema`` object, you use keyword arguments to map field names to field types. The list of fields and their types defines what you are indexing and what's searchable. Whoosh comes with some very useful predefined field types, and you can easily create your own. :class:`whoosh.fields.ID` This type simply indexes (and optionally stores) the entire value of the field as a single unit (that is, it doesn't break it up into individual words). This is useful for fields such as a file path, URL, date, category, etc. :class:`whoosh.fields.STORED` This field is stored with the document, but not indexed. This field type is not indexed and not searchable. This is useful for document information you want to display to the user in the search results. :class:`whoosh.fields.KEYWORD` This type is designed for space- or comma-separated keywords. This type is indexed and searchable (and optionally stored). To save space, it does not support phrase searching. 
:class:`whoosh.fields.TEXT` This type is for body text. It indexes (and optionally stores) the text and stores term positions to allow phrase searching. :class:`whoosh.fields.NUMERIC` This type is for numbers. You can store integers or floating point numbers. :class:`whoosh.fields.BOOLEAN` This type is for boolean (true/false) values. :class:`whoosh.fields.DATETIME` This type is for ``datetime`` objects. See :doc:`dates` for more information. :class:`whoosh.fields.NGRAM` and :class:`whoosh.fields.NGRAMWORDS` These types break the field text or individual terms into N-grams. See :doc:`ngrams` for more information. (As a shortcut, if you don't need to pass any arguments to the field type, you can just give the class name and Whoosh will instantiate the object for you.) :: from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT schema = Schema(title=TEXT(stored=True), content=TEXT, path=ID(stored=True), tags=KEYWORD, icon=STORED) See :doc:`schema` for more information. Once you have the schema, you can create an index using the ``create_in`` function:: import os.path from whoosh.index import create_in if not os.path.exists("index"): os.mkdir("index") ix = create_in("index", schema) (At a low level, this creates a *Storage* object to contain the index. A ``Storage`` object represents that medium in which the index will be stored. Usually this will be ``FileStorage``, which stores the index as a set of files in a directory.) After you've created an index, you can open it using the ``open_dir`` convenience function:: from whoosh.index import open_dir ix = open_dir("index") The ``IndexWriter`` object ========================== OK, so we've got an ``Index`` object, now we can start adding documents. The ``writer()`` method of the ``Index`` object returns an ``IndexWriter`` object that lets you add documents to the index. The IndexWriter's ``add_document(**kwargs)`` method accepts keyword arguments where the field name is mapped to a value:: writer = ix.writer() writer.add_document(title=u"My document", content=u"This is my document!", path=u"/a", tags=u"first short", icon=u"/icons/star.png") writer.add_document(title=u"Second try", content=u"This is the second example.", path=u"/b", tags=u"second short", icon=u"/icons/sheep.png") writer.add_document(title=u"Third time's the charm", content=u"Examples are many.", path=u"/c", tags=u"short", icon=u"/icons/book.png") writer.commit() Two important notes: * You don't have to fill in a value for every field. Whoosh doesn't care if you leave out a field from a document. * Indexed text fields must be passed a unicode value. Fields that are stored but not indexed (``STORED`` field type) can be passed any pickle-able object. If you have a text field that is both indexed and stored, you can index a unicode value but store a different object if necessary (it's usually not, but sometimes this is really useful) using this trick:: writer.add_document(title=u"Title to be indexed", _stored_title=u"Stored title") Calling commit() on the ``IndexWriter`` saves the added documents to the index:: writer.commit() See :doc:`indexing` for more information. Once your documents are committed to the index, you can search for them. 
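If you'd rather not call ``commit()`` yourself, the writer can also be used as a context manager; it commits automatically when the ``with`` block exits without an error (and cancels the changes if an exception is raised). A small sketch using the same schema as above::

    with ix.writer() as writer:
        writer.add_document(title=u"Fourth document", path=u"/d",
                            content=u"This document was added in a with block",
                            tags=u"short")
        # No explicit commit() needed -- it happens when the block exits
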
The ``Searcher`` object ======================= To begin searching the index, we'll need a ``Searcher`` object:: searcher = ix.searcher() You'll usually want to open the searcher using a ``with`` statement so the searcher is automatically closed when you're done with it (searcher objects represent a number of open files, so if you don't explicitly close them and the system is slow to collect them, you can run out of file handles):: with ix.searcher() as searcher: ... This is of course equivalent to:: try: searcher = ix.searcher() ... finally: searcher.close() The Searcher's ``search()`` method takes a *Query object*. You can construct query objects directly or use a query parser to parse a query string. For example, this query would match documents that contain both "apple" and "bear" in the "content" field:: # Construct query objects directly from whoosh.query import * myquery = And([Term("content", u"apple"), Term("content", "bear")]) To parse a query string, you can use the default query parser in the ``qparser`` module. The first argument to the ``QueryParser`` constructor is the default field to search. This is usually the "body text" field. The second optional argument is a schema to use to understand how to parse the fields:: # Parse a query string from whoosh.qparser import QueryParser parser = QueryParser("content", ix.schema) myquery = parser.parse(querystring) Once you have a ``Searcher`` and a query object, you can use the ``Searcher``'s ``search()`` method to run the query and get a ``Results`` object:: >>> results = searcher.search(myquery) >>> print(len(results)) 1 >>> print(results[0]) {"title": "Second try", "path": "/b", "icon": "/icons/sheep.png"} The default ``QueryParser`` implements a query language very similar to Lucene's. It lets you connect terms with ``AND`` or ``OR``, eleminate terms with ``NOT``, group terms together into clauses with parentheses, do range, prefix, and wilcard queries, and specify different fields to search. By default it joins clauses together with ``AND`` (so by default, all terms you specify must be in the document for the document to match):: >>> print(parser.parse(u"render shade animate")) And([Term("content", "render"), Term("content", "shade"), Term("content", "animate")]) >>> print(parser.parse(u"render OR (title:shade keyword:animate)")) Or([Term("content", "render"), And([Term("title", "shade"), Term("keyword", "animate")])]) >>> print(parser.parse(u"rend*")) Prefix("content", "rend") Whoosh includes extra features for dealing with search results, such as * Sorting results by the value of an indexed field, instead of by relelvance. * Highlighting the search terms in excerpts from the original documents. * Expanding the query terms based on the top few documents found. * Paginating the results (e.g. "Showing results 1-20, page 1 of 4"). See :doc:`searching` for more information. Whoosh-2.5.7/docs/build/html/_sources/recipes.txt0000644000076500000240000001437412254366350022113 0ustar mattstaff00000000000000============== Whoosh recipes ============== General ======= Get the stored fields for a document from the document number ------------------------------------------------------------- :: stored_fields = searcher.stored_fields(docnum) Analysis ======== Eliminate words shorter/longer than N ------------------------------------- Use a :class:`~whoosh.analysis.StopFilter` and the ``minsize`` and ``maxsize`` keyword arguments. 
If you just want to filter based on size and not common words, set the ``stoplist`` to ``None``:: sf = analysis.StopFilter(stoplist=None, minsize=2, maxsize=40) Allow optional case-sensitive searches -------------------------------------- A quick and easy way to do this is to index both the original and lowercased versions of each word. If the user searches for an all-lowercase word, it acts as a case-insensitive search, but if they search for a word with any uppercase characters, it acts as a case-sensitive search:: class CaseSensitivizer(analysis.Filter): def __call__(self, tokens): for t in tokens: yield t if t.mode == "index": low = t.text.lower() if low != t.text: t.text = low yield t ana = analysis.RegexTokenizer() | CaseSensitivizer() [t.text for t in ana("The new SuperTurbo 5000", mode="index")] # ["The", "the", "new", "SuperTurbo", "superturbo", "5000"] Searching ========= Find every document ------------------- :: myquery = query.Every() iTunes-style search-as-you-type ------------------------------- Use the :class:`whoosh.analysis.NgramWordAnalyzer` as the analyzer for the field you want to search as the user types. You can save space in the index by turning off positions in the field using ``phrase=False``, since phrase searching on N-gram fields usually doesn't make much sense:: # For example, to search the "title" field as the user types analyzer = analysis.NgramWordAnalyzer() title_field = fields.TEXT(analyzer=analyzer, phrase=False) schema = fields.Schema(title=title_field) See the documentation for the :class:`~whoosh.analysis.NgramWordAnalyzer` class for information on the available options. Shortcuts ========= Look up documents by a field value ---------------------------------- :: # Single document (unique field value) stored_fields = searcher.document(id="bacon") # Multiple documents for stored_fields in searcher.documents(tag="cake"): ... Sorting and scoring =================== See :doc:`facets`. Score results based on the position of the matched term ------------------------------------------------------- The following scoring function uses the position of the first occurance of a term in each document to calculate the score, so documents with the given term earlier in the document will score higher:: from whoosh import scoring def pos_score_fn(searcher, fieldname, text, matcher): poses = matcher.value_as("positions") return 1.0 / (poses[0] + 1) pos_weighting = scoring.FunctionWeighting(pos_score_fn) with myindex.searcher(weighting=pos_weighting) as s: ... Results ======= How many hits were there? ------------------------- The number of *scored* hits:: found = results.scored_length() Depending on the arguments to the search, the exact total number of hits may be known:: if results.has_exact_length(): print("Scored", found, "of exactly", len(results), "documents") Usually, however, the exact number of documents that match the query is not known, because the searcher can skip over blocks of documents it knows won't show up in the "top N" list. If you call ``len(results)`` on a query where the exact length is unknown, Whoosh will run an unscored version of the original query to get the exact number. This is faster than the scored search, but may still be noticeably slow on very large indexes or complex queries. 
As an alternative, you might display the *estimated* total hits:: found = results.scored_length() if results.has_exact_length(): print("Scored", found, "of exactly", len(results), "documents") else: low = results.estimated_min_length() high = results.estimated_length() print("Scored", found, "of between", low, "and", high, "documents") Which terms matched in each hit? -------------------------------- :: # Use terms=True to record term matches for each hit results = searcher.search(myquery, terms=True) for hit in results: # Which terms matched in this hit? print("Matched:", hit.matched_terms()) # Which terms from the query didn't match in this hit? print("Didn't match:", myquery.all_terms() - hit.matched_terms()) Global information ================== How many documents are in the index? ------------------------------------ :: # Including documents that are deleted but not yet optimized away numdocs = searcher.doc_count_all() # Not including deleted documents numdocs = searcher.doc_count() What fields are in the index? ----------------------------- :: return myindex.schema.names() Is term X in the index? ----------------------- :: return ("content", "wobble") in searcher How many times does term X occur in the index? ---------------------------------------------- :: # Number of times content:wobble appears in all documents freq = searcher.frequency("content", "wobble") # Number of documents containing content:wobble docfreq = searcher.doc_frequency("content", "wobble") Is term X in document Y? ------------------------ :: # Check if the "content" field of document 500 contains the term "wobble" # Without term vectors, skipping through list... postings = searcher.postings("content", "wobble") postings.skip_to(500) return postings.id() == 500 # ...or the slower but easier way docset = set(searcher.postings("content", "wobble").all_ids()) return 500 in docset # If field has term vectors, skipping through list... vector = searcher.vector(500, "content") vector.skip_to("wobble") return vector.id() == "wobble" # ...or the slower but easier way wordset = set(searcher.vector(500, "content").all_ids()) return "wobble" in wordset Whoosh-2.5.7/docs/build/html/_sources/releases/0000755000076500000240000000000012277504634021516 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/build/html/_sources/releases/0_3.txt0000644000076500000240000000467512254366350022650 0ustar mattstaff00000000000000======================== Whoosh 0.3 release notes ======================== * Major improvements to reading/writing of postings and query performance. * Changed default post limit (run size) from 4 MB to 32 MB. * Finished migrating backend-specific code into ``whoosh.filedb`` package. * Moved formats from whoosh.fields module into new whoosh.formats module. * DocReader and TermReader classes combined into new IndexReader interface. You can get an IndexReader implementation by calling Index.reader(). Searcher is now a wrapper around an IndexReader. * Range query object changed, with new signature and new syntax in the default query parser. Now you can use ``[start TO end]`` in the query parser for an inclusive range, and ``{start TO end}`` for an exclusive range. You can also mix the delimiters, for example ``[start TO end}`` for a range with an inclusive start but exclusive end term. 
* Added experimental DATETIME field type lets you pass a ``datetime.datetime`` object as a field value to ``add_document``:: from whoosh.fields import Schema, ID, DATETIME from whoosh.filedb.filestore import RamStorage from datetime import datetime schema = Schema(id=ID, date=DATETIME) storage = RamStorage() ix = storage.create_index(schema) w = ix.writer() w.add_document(id=u"A", date=datetime.now()) w.close() Internally, the DATETIME field indexes the datetime object as text using the format (4 digit year + 2 digit month + 2 digit day + 'T' + 2 digit hour + 2 digit minute + 2 digit second + 6 digit microsecond), for example ``20090817T160203109000``. * The default query parser now lets you use quoted strings in prefix and range queries, e.g. ``["2009-05" TO "2009-12"]``, ``"alfa/bravo"*``, making it easier to work with terms containing special characters. * ``DocReader.vector_as(docnum, fieldid, astype)`` is now ``IndexReader.vector_as(astype, docnum, fieldid)`` (i.e. the astype argument has moved from the last to the first argument), e.g. ``v = ixreader.vector_as("frequency", 102, "content")``. * Added whoosh.support.charset for translating Sphinx charset table files. * Added whoosh.analysis.CharsetTokenizer and CharsetFilter to enable case and accent folding. * Added experimental ``whoosh.ramdb`` in-memory backend. * Added experimental ``whoosh.query.FuzzyTerm`` query type. * Added ``whoosh.lang.wordnet`` module containing ``Thesaurus`` object for using WordNet synonym database. Whoosh-2.5.7/docs/build/html/_sources/releases/1_0.txt0000644000076500000240000004344712254366350022646 0ustar mattstaff00000000000000======================== Whoosh 1.x release notes ======================== Whoosh 1.8.3 ============ Whoosh 1.8.3 contains important bugfixes and new functionality. Thanks to all the mailing list and BitBucket users who helped with the fixes! Fixed a bad ``Collector`` bug where the docset of a Results object did not match the actual results. You can now pass a sequence of objects to a keyword argument in ``add_document`` and ``update_document`` (currently this will not work for unique fields in ``update_document``). This is useful for non-text fields such as ``DATETIME`` and ``NUMERIC``, allowing you to index multiple dates/numbers for a document:: writer.add_document(shoe=u"Saucony Kinvara", sizes=[10.0, 9.5, 12]) This version reverts to using the CDB hash function for hash files instead of Python's ``hash()`` because the latter is not meant to be stored externally. This change maintains backwards compatibility with old files. The ``Searcher.search`` method now takes a ``mask`` keyword argument. This is the opposite of the ``filter`` argument. Where the ``filter`` specifies the set of documents that can appear in the results, the ``mask`` specifies a set of documents that must not appear in the results. Fixed performance problems in ``Searcher.more_like``. This method now also takes a ``filter`` keyword argument like ``Searcher.search``. Improved documentation. Whoosh 1.8.2 ============ Whoosh 1.8.2 fixes some bugs, including a mistyped signature in Searcher.more_like and a bad bug in Collector that could screw up the ordering of results given certain parameters. Whoosh 1.8.1 ============ Whoosh 1.8.1 includes a few recent bugfixes/improvements: - ListMatcher.skip_to_quality() wasn't returning an integer, resulting in a "None + int" error. - Fixed locking and memcache sync bugs in the Google App Engine storage object. - MultifieldPlugin wasn't working correctly with groups. 
- The binary matcher trees of Or and And are now generated using a Huffman-like algorithm instead perfectly balanced. This gives a noticeable speed improvement because less information has to be passed up/down the tree. Whoosh 1.8 ========== This release relicensed the Whoosh source code under the Simplified BSD (A.K.A. "two-clause" or "FreeBSD") license. See LICENSE.txt for more information. Whoosh 1.7.7 ============ Setting a TEXT field to store term vectors is now much easier. Instead of having to pass an instantiated whoosh.formats.Format object to the vector= keyword argument, you can pass True to automatically use the same format and analyzer as the inverted index. Alternatively, you can pass a Format subclass and Whoosh will instantiate it for you. For example, to store term vectors using the same settings as the inverted index (Positions format and StandardAnalyzer):: from whoosh.fields import Schema, TEXT schema = Schema(content=TEXT(vector=True)) To store term vectors that use the same analyzer as the inverted index (StandardAnalyzer by default) but only store term frequency:: from whoosh.formats import Frequency schema = Schema(content=TEXT(vector=Frequency)) Note that currently the only place term vectors are used in Whoosh is keyword extraction/more like this, but they can be useful for expert users with custom code. Added :meth:`whoosh.searching.Searcher.more_like` and :meth:`whoosh.searching.Hit.more_like_this` methods, as shortcuts for doing keyword extraction yourself. Return a Results object. "python setup.py test" works again, as long as you have nose installed. The :meth:`whoosh.searching.Searcher.sort_query_using` method lets you sort documents matching a given query using an arbitrary function. Note that like "complex" searching with the Sorter object, this can be slow on large multi-segment indexes. Whoosh 1.7 ========== You can once again perform complex sorting of search results (that is, a sort with some fields ascending and some fields descending). You can still use the ``sortedby`` keyword argument to :meth:`whoosh.searching.Searcher.search` to do a simple sort (where all fields are sorted in the same direction), or you can use the new :class:`~whoosh.sorting.Sorter` class to do a simple or complex sort:: searcher = myindex.searcher() sorter = searcher.sorter() # Sort first by the group field, ascending sorter.add_field("group") # Then by the price field, descending sorter.add_field("price", reverse=True) # Get the Results results = sorter.sort_query(myquery) See the documentation for the :class:`~whoosh.sorting.Sorter` class for more information. Bear in mind that complex sorts will be much slower on large indexes because they can't use the per-segment field caches. You can now get highlighted snippets for a hit automatically using :meth:`whoosh.searching.Hit.highlights`:: results = searcher.search(myquery, limit=20) for hit in results: print hit["title"] print hit.highlights("content") See :meth:`whoosh.searching.Hit.highlights` for more information. Added the ability to filter search results so that only hits in a Results set, a set of docnums, or matching a query are returned. The filter is cached on the searcher. # Search within previous results newresults = searcher.search(newquery, filter=oldresults) # Search within the "basics" chapter results = searcher.search(userquery, filter=query.Term("chapter", "basics")) You can now specify a time limit for a search. 
If the search does not finish in the given time, a :class:`whoosh.searching.TimeLimit` exception is raised, but you can still retrieve the partial results from the collector. See the ``timelimit`` and ``greedy`` arguments in the :class:`whoosh.searching.Collector` documentation. Added back the ability to set :class:`whoosh.analysis.StemFilter` to use an unlimited cache. This is useful for one-shot batch indexing (see :doc:`../batch`). The ``normalize()`` method of the ``And`` and ``Or`` queries now merges overlapping range queries for more efficient queries. Query objects now have ``__hash__`` methods allowing them to be used as dictionary keys. The API of the highlight module has changed slightly. Most of the functions in the module have been converted to classes. However, most old code should still work. The ``NullFragmeter`` is now called ``WholeFragmenter``, but the old name is still available as an alias. Fixed MultiPool so it won't fill up the temp directory with job files. Fixed a bug where Phrase query objects did not use their boost factor. Fixed a bug where a fieldname after an open parenthesis wasn't parsed correctly. The change alters the semantics of certain parsing "corner cases" (such as ``a:b:c:d``). Whoosh 1.6 ========== The ``whoosh.writing.BatchWriter`` class is now called :class:`whoosh.writing.BufferedWriter`. It is similar to the old ``BatchWriter`` class but allows you to search and update the buffered documents as well as the documents that have been flushed to disk:: writer = writing.BufferedWriter(myindex) # You can update (replace) documents in RAM without having to commit them # to disk writer.add_document(path="/a", text="Hi there") writer.update_document(path="/a", text="Hello there") # Search committed and uncommited documents by getting a searcher from the # writer instead of the index searcher = writer.searcher() (BatchWriter is still available as an alias for backwards compatibility.) The :class:`whoosh.qparser.QueryParser` initialization method now requires a schema as the second argument. Previously the default was to create a ``QueryParser`` without a schema, which was confusing:: qp = qparser.QueryParser("content", myindex.schema) The :meth:`whoosh.searching.Searcher.search` method now takes a ``scored`` keyword. If you search with ``scored=False``, the results will be in "natural" order (the order the documents were added to the index). This is useful when you don't need scored results but want the convenience of the Results object. Added the :class:`whoosh.qparser.GtLtPlugin` parser plugin to allow greater than/less as an alternative syntax for ranges:: count:>100 tag:<=zebra date:>='29 march 2001' Added the ability to define schemas declaratively, similar to Django models:: from whoosh import index from whoosh.fields import SchemaClass, ID, KEYWORD, STORED, TEXT class MySchema(SchemaClass): uuid = ID(stored=True, unique=True) path = STORED tags = KEYWORD(stored=True) content = TEXT index.create_in("indexdir", MySchema) Whoosh 1.6.2: Added :class:`whoosh.searching.TermTrackingCollector` which tracks which part of the query matched which documents in the final results. Replaced the unbounded cache in :class:`whoosh.analysis.StemFilter` with a bounded LRU (least recently used) cache. This will make stemming analysis slightly slower but prevent it from eating up too much memory over time. 
Added a simple :class:`whoosh.analysis.PyStemmerFilter` that works when the py-stemmer library is installed:: ana = RegexTokenizer() | PyStemmerFilter("spanish") The estimation of memory usage for the ``limitmb`` keyword argument to ``FileIndex.writer()`` is more accurate, which should help keep memory usage memory usage by the sorting pool closer to the limit. The ``whoosh.ramdb`` package was removed and replaced with a single ``whoosh.ramindex`` module. Miscellaneous bug fixes. Whoosh 1.5 ========== .. note:: Whoosh 1.5 is incompatible with previous indexes. You must recreate existing indexes with Whoosh 1.5. Fixed a bug where postings were not portable across different endian platforms. New generalized field cache system, using per-reader caches, for much faster sorting and faceting of search results, as well as much faster multi-term (e.g. prefix and wildcard) and range queries, especially for large indexes and/or indexes with multiple segments. Changed the faceting API. See :doc:`../facets`. Faster storage and retrieval of posting values. Added per-field ``multitoken_query`` attribute to control how the query parser deals with a "term" that when analyzed generates multiple tokens. The default value is `"first"` which throws away all but the first token (the previous behavior). Other possible values are `"and"`, `"or"`, or `"phrase"`. Added :class:`whoosh.analysis.DoubleMetaphoneFilter`, :class:`whoosh.analysis.SubstitutionFilter`, and :class:`whoosh.analysis.ShingleFilter`. Added :class:`whoosh.qparser.CopyFieldPlugin`. Added :class:`whoosh.query.Otherwise`. Generalized parsing of operators (such as OR, AND, NOT, etc.) in the query parser to make it easier to add new operators. In intend to add a better API for this in a future release. Switched NUMERIC and DATETIME fields to use more compact on-disk representations of numbers. Fixed a bug in the porter2 stemmer when stemming the string `"y"`. Added methods to :class:`whoosh.searching.Hit` to make it more like a `dict`. Short posting lists (by default, single postings) are inline in the term file instead of written to the posting file for faster retrieval and a small saving in disk space. Whoosh 1.3 ========== Whoosh 1.3 adds a more efficient DATETIME field based on the new tiered NUMERIC field, and the DateParserPlugin. See :doc:`../dates`. Whoosh 1.2 ========== Whoosh 1.2 adds tiered indexing for NUMERIC fields, resulting in much faster range queries on numeric fields. Whoosh 1.0 ========== Whoosh 1.0 is a major milestone release with vastly improved performance and several useful new features. *The index format of this version is not compatibile with indexes created by previous versions of Whoosh*. You will need to reindex your data to use this version. Orders of magnitude faster searches for common terms. Whoosh now uses optimizations similar to those in Xapian to skip reading low-scoring postings. Faster indexing and ability to use multiple processors (via ``multiprocessing`` module) to speed up indexing. Flexible Schema: you can now add and remove fields in an index with the :meth:`whoosh.writing.IndexWriter.add_field` and :meth:`whoosh.writing.IndexWriter.remove_field` methods. New hand-written query parser based on plug-ins. Less brittle, more robust, more flexible, and easier to fix/improve than the old pyparsing-based parser. On-disk formats now use 64-bit disk pointers allowing files larger than 4 GB. 
New :class:`whoosh.searching.Facets` class efficiently sorts results into facets based on any criteria that can be expressed as queries, for example tags or price ranges. New :class:`whoosh.writing.BatchWriter` class automatically batches up individual ``add_document`` and/or ``delete_document`` calls until a certain number of calls or a certain amount of time passes, then commits them all at once. New :class:`whoosh.analysis.BiWordFilter` lets you create bi-word indexed fields a possible alternative to phrase searching. Fixed bug where files could be deleted before a reader could open them in threaded situations. New :class:`whoosh.analysis.NgramFilter` filter, :class:`whoosh.analysis.NgramWordAnalyzer` analyzer, and :class:`whoosh.fields.NGRAMWORDS` field type allow producing n-grams from tokenized text. Errors in query parsing now raise a specific ``whoosh.qparse.QueryParserError`` exception instead of a generic exception. Previously, the query string ``*`` was optimized to a :class:`whoosh.query.Every` query which matched every document. Now the ``Every`` query only matches documents that actually have an indexed term from the given field, to better match the intuitive sense of what a query string like ``tag:*`` should do. New :meth:`whoosh.searching.Searcher.key_terms_from_text` method lets you extract key words from arbitrary text instead of documents in the index. Previously the :meth:`whoosh.searching.Searcher.key_terms` and :meth:`whoosh.searching.Results.key_terms` methods required that the given field store term vectors. They now also work if the given field is stored instead. They will analyze the stored string into a term vector on-the-fly. The field must still be indexed. User API changes ================ The default for the ``limit`` keyword argument to :meth:`whoosh.searching.Searcher.search` is now ``10``. To return all results in a single ``Results`` object, use ``limit=None``. The ``Index`` object no longer represents a snapshot of the index at the time the object was instantiated. Instead it always represents the index in the abstract. ``Searcher`` and ``IndexReader`` objects obtained from the ``Index`` object still represent the index as it was at the time they were created. Because the ``Index`` object no longer represents the index at a specific version, several methods such as ``up_to_date`` and ``refresh`` were removed from its interface. The Searcher object now has :meth:`~whoosh.searching.Searcher.last_modified`, :meth:`~whoosh.searching.Searcher.up_to_date`, and :meth:`~whoosh.searching.Searcher.refresh` methods similar to those that used to be on ``Index``. The document deletion and field add/remove methods on the ``Index`` object now create a writer behind the scenes to accomplish each call. This means they write to the index immediately, so you don't need to call ``commit`` on the ``Index``. 
Also, it will be much faster if you need to call them multiple times to create your own writer instead:: # Don't do this for id in my_list_of_ids_to_delete: myindex.delete_by_term("id", id) myindex.commit() # Instead do this writer = myindex.writer() for id in my_list_of_ids_to_delete: writer.delete_by_term("id", id) writer.commit() The ``postlimit`` argument to ``Index.writer()`` has been changed to ``postlimitmb`` and is now expressed in megabytes instead of bytes:: writer = myindex.writer(postlimitmb=128) Instead of having to import ``whoosh.filedb.filewriting.NO_MERGE`` or ``whoosh.filedb.filewriting.OPTIMIZE`` to use as arguments to ``commit()``, you can now simply do the following:: # Do not merge segments writer.commit(merge=False) # or # Merge all segments writer.commit(optimize=True) The ``whoosh.postings`` module is gone. The ``whoosh.matching`` module contains classes for posting list readers. Whoosh no longer maps field names to numbers for internal use or writing to disk. Any low-level method that accepted field numbers now accept field names instead. Custom Weighting implementations that use the ``final()`` method must now set the ``use_final`` attribute to ``True``:: from whoosh.scoring import BM25F class MyWeighting(BM25F): use_final = True def final(searcher, docnum, score): return score + docnum * 10 This disables the new optimizations, forcing Whoosh to score every matching document. :class:`whoosh.writing.AsyncWriter` now takes an :class:`whoosh.index.Index` object as its first argument, not a callable. Also, the keyword arguments to pass to the index's ``writer()`` method should now be passed as a dictionary using the ``writerargs`` keyword argument. Whoosh now stores per-document field length using an approximation rather than exactly. For low numbers the approximation is perfectly accurate, while high numbers will be approximated less accurately. The ``doc_field_length`` method on searchers and readers now takes a second argument representing the default to return if the given document and field do not have a length (i.e. the field is not scored or the field was not provided for the given document). The :class:`whoosh.analysis.StopFilter` now has a ``maxsize`` argument as well as a ``minsize`` argument to its initializer. Analyzers that use the ``StopFilter`` have the ``maxsize`` argument in their initializers now also. The interface of :class:`whoosh.writing.AsyncWriter` has changed. Misc ==== * Because the file backend now writes 64-bit disk pointers and field names instead of numbers, the size of an index on disk will grow compared to previous versions. * Unit tests should no longer leave directories and files behind. Whoosh-2.5.7/docs/build/html/_sources/releases/2_0.txt0000644000076500000240000003066212254366350022642 0ustar mattstaff00000000000000======================== Whoosh 2.x release notes ======================== Whoosh 2.5 ========== * Whoosh 2.5 will read existing indexes, but segments created by 2.5 will not be readable by older versions of Whoosh. * As a replacement for field caches to speed up sorting, Whoosh now supports adding a ``sortable=True`` keyword argument to fields. This makes Whoosh store a sortable representation of the field's values in a "column" format (which associates a "key" value with each document). This is more robust, efficient, and customizable than the old behavior. You should now specify ``sortable=True`` on fields that you plan on using to sort or group search results. 
(You can still sort/group on fields that don't have ``sortable=True``, however it will use more RAM and be slower as Whoosh caches the field values in memory.) Fields that use ``sortable=True`` can avoid specifying ``stored=True``. The field's value will still be available on ``Hit`` objects (the value will be retrieved from the column instead of from the stored fields). This may actually be faster for certain types of values. * Whoosh will now detect common types of OR queries and use optimized read-ahead matchers to speed them up by several times. * Whoosh now includes pure-Python implementations of the Snowball stemmers and stop word lists for various languages adapted from NLTK. These are available through the :class:`whoosh.analysis.LanguageAnalyzer` analyzer or through the ``lang=`` keyword argument to the :class:`~whoosh.fields.TEXT` field. * You can now use the :meth:`whoosh.filedb.filestore.Storage.create()` and :meth:`whoosh.filedb.filestore.Storage.destory()` methods as a consistent API to set up and tear down different types of storage. * Many bug fixes and speed improvements. * Switched unit tests to use ``py.test`` instead of ``nose``. * Removed obsolete ``SpellChecker`` class. Whoosh 2.4 ========== * By default, Whoosh now assembles the individual files of a segment into a single file when committing. This has a small performance penalty but solves a problem where Whoosh can keep too many files open. Whoosh is also now smarter about using mmap. * Added functionality to index and search hierarchical documents. See :doc:`/nested`. * Rewrote the Directed Acyclic Word Graph implementation (used in spell checking) to be faster and more space-efficient. Word graph files created by previous versions will be ignored, meaning that spell checking may become slower unless/until you replace the old segments (for example, by optimizing). * Rewrote multiprocessing indexing to be faster and simpler. You can now do ``myindex.writer(procs=n)`` to get a multiprocessing writer, or ``myindex.writer(procs=n, multisegment=True)`` to get a multiprocessing writer that leaves behind multiple segments, like the old MultiSegmentWriter. (``MultiSegmentWriter`` is still available as a function that returns the new class.) * When creating ``Term`` query objects for special fields (e.g. NUMERIC or BOOLEAN), you can now use the field's literal type instead of a string as the second argument, for example ``Term("num", 20)`` or ``Term("bool", True)``. (This change may cause problems interacting with functions that expect query objects to be pure textual, such as spell checking.) * All writing to and reading from on-disk indexes is now done through "codec" objects. This architecture should make it easier to add optional or experimental features, and maintain backwards compatibility. * Fixes issues #75, #137, #206, #213, #215, #219, #223, #226, #230, #233, #238, #239, #240, #241, #243, #244, #245, #252, #253, and other bugs. Thanks to Thomas Waldmann and Alexei Gousev for the help! Whoosh 2.3.2 ============ * Fixes bug in BM25F scoring function, leading to increased precision in search results. * Fixes issues #203, #205, #206, #208, #209, #212. Whoosh 2.3.1 ============ * Fixes issue #200. Whoosh 2.3 ========== * Added a :class:`whoosh.query.Regex` term query type, similar to :class:`whoosh.query.Wildcard`. The parser does not allow regex term queries by default. You need to add the :class:`whoosh.qparser.RegexPlugin` plugin. 
After you add the plugin, you can use ``r"expression"`` query syntax for regular expression term queries. For example, ``r"foo.*bar"``. * Added the :class:`whoosh.qparser.PseudoFieldPlugin` parser plugin. This plugin lets you create "pseudo-fields" that run a transform function on whatever query syntax the user applies the field to. This is fairly advanced functionality right now; I'm trying to think of ways to make its power easier to access. * The documents in the lists in the dictionary returned by ``Results.groups()`` by default are now in the same relative order as in the results. This makes it much easier to display the "top N" results in each category, for example. * The ``groupids`` keyword argument to ``Searcher.search`` has been removed. Instead you can now pass a :class:`whoosh.sorting.FacetMap` object to the ``Searcher.search`` method's ``maptype`` argument to control how faceted documents are grouped, and/or set the ``maptype`` argument on individual :class:`whoosh.sorting.FacetType`` objects to set custom grouping per facet. See :doc:`../facets` for more information. * Calling ``Searcher.documents()`` or ``Searcher.document_numbers()`` with no arguments now yields all documents/numbers. * Calling ``Writer.update_document()`` with no unique fields is now equivalent to calling ``Writer.add_document()`` with the same arguments. * Fixed a problem with keyword expansion where the code was building a cache that was fast on small indexes, but unacceptably slow on large indexes. * Added the hyphen (``-``) to the list of characters that match a "wildcard" token, to make parsing slightly more predictable. A true fix will have to wait for another parser rewrite. * Fixed an unused ``__future__`` import and use of ``float("nan")`` which were breaking under Python 2.5. * Fixed a bug where vectored fields with only one term stored an empty term vector. * Various other bug fixes. Whoosh 2.2 ========== * Fixes several bugs, including a bad bug in BM25F scoring. * Added ``allow_overlap`` option to :class:`whoosh.sorting.StoredFieldFacet`. * In :meth:`~whoosh.writing.IndexWriter.add_document`, You can now pass query-like strings for BOOLEAN and DATETIME fields (e.g ``boolfield="true"`` and ``dtfield="20101131-16:01"``) as an alternative to actual ``bool`` or ``datetime`` objects. The implementation of this is incomplete: it only works in the default ``filedb`` backend, and if the field is stored, the stored value will be the string, not the parsed object. * Added :class:`whoosh.analysis.CompoundWordFilter` and :class:`whoosh.analysis.TeeFilter`. Whoosh 2.1 ========== This release fixes several bugs, and contains speed improvments to highlighting. See :doc:`/highlight` for more information. Whoosh 2.0 ========== Improvements ------------ * Whoosh is now compatible with Python 3 (tested with Python 3.2). Special thanks to Vinay Sajip who did the work, and also Jordan Sherer who helped fix later issues. * Sorting and grouping (faceting) now use a new system of "facet" objects which are much more flexible than the previous field-based system. For example, to sort by first name and then score:: from whoosh import sorting mf = sorting.MultiFacet([sorting.FieldFacet("firstname"), sorting.ScoreFacet()]) results = searcher.search(myquery, sortedby=mf) In addition to the previously supported sorting/grouping by field contents and/or query results, you can now use numeric ranges, date ranges, score, and more. The new faceting system also supports overlapping groups. 
(The old "Sorter" API still works but is deprecated and may be removed in a future version.) See :doc:`/facets` for more information. * Completely revamped spell-checking to make it much faster, easier, and more flexible. You can enable generation of the graph files use by spell checking using the ``spelling=True`` argument to a field type:: schema = fields.Schema(text=fields.TEXT(spelling=True)) (Spelling suggestion methods will work on fields without ``spelling=True`` but will slower.) The spelling graph will be updated automatically as new documents are added -- it is no longer necessary to maintain a separate "spelling index". You can get suggestions for individual words using :meth:`whoosh.searching.Searcher.suggest`:: suglist = searcher.suggest("content", "werd", limit=3) Whoosh now includes convenience methods to spell-check and correct user queries, with optional highlighting of corrections using the ``whoosh.highlight`` module:: from whoosh import highlight, qparser # User query string qstring = request.get("q") # Parse into query object parser = qparser.QueryParser("content", myindex.schema) qobject = parser.parse(qstring) results = searcher.search(qobject) if not results: correction = searcher.correct_query(gobject, gstring) # correction.query = corrected query object # correction.string = corrected query string # Format the corrected query string with HTML highlighting cstring = correction.format_string(highlight.HtmlFormatter()) Spelling suggestions can come from field contents and/or lists of words. For stemmed fields the spelling suggestions automatically use the unstemmed forms of the words. There are APIs for spelling suggestions and query correction, so highly motivated users could conceivably replace the defaults with more sophisticated behaviors (for example, to take context into account). See :doc:`/spelling` for more information. * :class:`whoosh.query.FuzzyTerm` now uses the new word graph feature as well and so is much faster. * You can now set a boost factor for individual documents as you index them, to increase the score of terms in those documents in searches. See the documentation for the :meth:`~whoosh.writing.IndexWriter.add_document` for more information. * Added built-in recording of which terms matched in which documents. Use the ``terms=True`` argument to :meth:`whoosh.searching.Searcher.search` and use :meth:`whoosh.searching.Hit.matched_terms` and :meth:`whoosh.searching.Hit.contains_term` to check matched terms. * Whoosh now supports whole-term quality optimizations, so for example if the system knows that a UnionMatcher cannot possibly contribute to the "top N" results unless both sub-matchers match, it will replace the UnionMatcher with an IntersectionMatcher which is faster to compute. The performance improvement is not as dramatic as from block quality optimizations, but it can be noticeable. * Fixed a bug that prevented block quality optimizations in queries with words not in the index, which could severely degrade performance. * Block quality optimizations now use the actual scoring algorithm to calculate block quality instead of an approximation, which fixes issues where ordering of results could be different for searches with and without the optimizations. * the BOOLEAN field type now supports field boosts. * Re-architected the query parser to make the code easier to understand. Custom parser plugins from previous versions will probably break in Whoosh 2.0. * Various bug-fixes and performance improvements. 
* Removed the "read lock", which caused more problems than it solved. Now when opening a reader, if segments are deleted out from under the reader as it is opened, the code simply retries. Compatibility ------------- * The term quality optimizations required changes to the on-disk formats. Whoosh 2.0 if backwards-compatible with the old format. As you rewrite an index using Whoosh 2.0, by default it will use the new formats for new segments, making the index incompatible with older versions. To upgrade an existing index to use the new formats immediately, use ``Index.optimize()``. * Removed the experimental ``TermTrackingCollector`` since it is replaced by the new built-in term recording functionality. * Removed the experimental ``Searcher.define_facets`` feature until a future release when it will be replaced by a more robust and useful feature. * Reader iteration methods (``__iter__``, ``iter_from``, ``iter_field``, etc.) now yield :class:`whoosh.reading.TermInfo` objects. * The arguments to :class:`whoosh.query.FuzzyTerm` changed. Whoosh-2.5.7/docs/build/html/_sources/releases/index.txt0000644000076500000240000000014312254366350023360 0ustar mattstaff00000000000000============= Release notes ============= .. toctree:: :maxdepth: 2 2_0 1_0 0_3 Whoosh-2.5.7/docs/build/html/_sources/schema.txt0000644000076500000240000003551512254366350021721 0ustar mattstaff00000000000000================== Designing a schema ================== About schemas and fields ======================== The schema specifies the fields of documents in an index. Each document can have multiple fields, such as title, content, url, date, etc. Some fields can be indexed, and some fields can be stored with the document so the field value is available in search results. Some fields will be both indexed and stored. The schema is the set of all possible fields in a document. Each individual document might only use a subset of the available fields in the schema. For example, a simple schema for indexing emails might have fields like ``from_addr``, ``to_addr``, ``subject``, ``body``, and ``attachments``, where the ``attachments`` field lists the names of attachments to the email. For emails without attachments, you would omit the attachments field. Built-in field types ==================== Whoosh provides some useful predefined field types: :class:`whoosh.fields.TEXT` This type is for body text. It indexes (and optionally stores) the text and stores term positions to allow phrase searching. ``TEXT`` fields use :class:`~whoosh.analysis.StandardAnalyzer` by default. To specify a different analyzer, use the ``analyzer`` keyword argument to the constructor, e.g. ``TEXT(analyzer=analysis.StemmingAnalyzer())``. See :doc:`analysis`. By default, ``TEXT`` fields store position information for each indexed term, to allow you to search for phrases. If you don't need to be able to search for phrases in a text field, you can turn off storing term positions to save space. Use ``TEXT(phrase=False)``. By default, ``TEXT`` fields are not stored. Usually you will not want to store the body text in the search index. Usually you have the indexed documents themselves available to read or link to based on the search results, so you don't need to store their text in the search index. However, in some circumstances it can be useful (see :doc:`highlight`). Use ``TEXT(stored=True)`` to specify that the text should be stored in the index. :class:`whoosh.fields.KEYWORD` This field type is designed for space- or comma-separated keywords. 
This type is indexed and searchable (and optionally stored). To save space, it does not support phrase searching. To store the value of the field in the index, use ``stored=True`` in the constructor. To automatically lowercase the keywords before indexing them, use ``lowercase=True``. By default, the keywords are space separated. To separate the keywords by commas instead (to allow keywords containing spaces), use ``commas=True``. If your users will use the keyword field for searching, use ``scorable=True``. :class:`whoosh.fields.ID` The ``ID`` field type simply indexes (and optionally stores) the entire value of the field as a single unit (that is, it doesn't break it up into individual terms). This type of field does not store frequency information, so it's quite compact, but not very useful for scoring. Use ``ID`` for fields like url or path (the URL or file path of a document), date, category -- fields where the value must be treated as a whole, and each document only has one value for the field. By default, ``ID`` fields are not stored. Use ``ID(stored=True)`` to specify that the value of the field should be stored with the document for use in the search results. For example, you would want to store the value of a url field so you could provide links to the original in your search results. :class:`whoosh.fields.STORED` This field is stored with the document, but not indexed and not searchable. This is useful for document information you want to display to the user in the search results, but don't need to be able to search for. :class:`whoosh.fields.NUMERIC` This field stores int, long, or floating point numbers in a compact, sortable format. :class:`whoosh.fields.DATETIME` This field stores datetime objects in a compact, sortable format. :class:`whoosh.fields.BOOLEAN` This simple filed indexes boolean values and allows users to search for ``yes``, ``no``, ``true``, ``false``, ``1``, ``0``, ``t`` or ``f``. :class:`whoosh.fields.NGRAM` TBD. Expert users can create their own field types. Creating a Schema ================= To create a schema:: from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED from whoosh.analysis import StemmingAnalyzer schema = Schema(from_addr=ID(stored=True), to_addr=ID(stored=True), subject=TEXT(stored=True), body=TEXT(analyzer=StemmingAnalyzer()), tags=KEYWORD) If you aren't specifying any constructor keyword arguments to one of the predefined fields, you can leave off the brackets (e.g. ``fieldname=TEXT`` instead of ``fieldname=TEXT()``). Whoosh will instantiate the class for you. Alternatively you can create a schema declaratively using the ``SchemaClass`` base class:: from whoosh.fields import SchemaClass, TEXT, KEYWORD, ID, STORED class MySchema(SchemaClass): path = ID(stored=True) title = TEXT(stored=True) content = TEXT tags = KEYWORD You can pass a declarative class to :func:`~whoosh.index.create_in` or :meth:`~whoosh.store.Storage.create_index()` instead of a :class:`~whoosh.fields.Schema` instance. Modifying the schema after indexing =================================== After you have created an index, you can add or remove fields to the schema using the ``add_field()`` and ``remove_field()`` methods. These methods are on the ``Writer`` object:: writer = ix.writer() writer.add_field("fieldname", fields.TEXT(stored=True)) writer.remove_field("content") writer.commit() (If you're going to modify the schema *and* add documents using the same writer, you must call ``add_field()`` and/or ``remove_field`` *before* you add any documents.) 
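For example, a minimal sketch of the required ordering (the ``summary`` field and the document values here are only illustrative)::

    writer = ix.writer()

    # Change the schema first...
    writer.add_field("summary", fields.TEXT(stored=True))
    writer.remove_field("content")

    # ...then add any new documents (assumes the schema already has a "path" field)
    writer.add_document(path=u"/a", summary=u"A short summary")

    # A single commit covers both the schema change and the new documents
    writer.commit()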
These methods are also on the ``Index`` object as a convenience, but when you call them on an ``Index``, the Index object simply creates the writer, calls the corresponding method on it, and commits, so if you want to add or remove more than one field, it's much more efficient to create the writer yourself:: ix.add_field("fieldname", fields.KEYWORD) In the ``filedb`` backend, removing a field simply removes that field from the *schema* -- the index will not get smaller, data about that field will remain in the index until you optimize. Optimizing will compact the index, removing references to the deleted field as it goes:: writer = ix.writer() writer.add_field("uuid", fields.ID(stored=True)) writer.remove_field("path") writer.commit(optimize=True) Because data is stored on disk with the field name, *do not* add a new field with the same name as a deleted field without optimizing the index in between:: writer = ix.writer() writer.delete_field("path") # Don't do this!!! writer.add_field("path", fields.KEYWORD) (A future version of Whoosh may automatically prevent this error.) Dynamic fields ============== Dynamic fields let you associate a field type with any field name that matches a given "glob" (a name pattern containing ``*``, ``?``, and/or ``[abc]`` wildcards). You can add dynamic fields to a new schema using the ``add()`` method with the ``glob`` keyword set to True:: schema = fields.Schema(...) # Any name ending in "_d" will be treated as a stored # DATETIME field schema.add("*_d", fields.DATETIME(stored=True), glob=True) To set up a dynamic field on an existing index, use the same ``IndexWriter.add_field`` method as if you were adding a regular field, but with the ``glob`` keyword argument set to ``True``:: writer = ix.writer() writer.add_field("*_d", fields.DATETIME(stored=True), glob=True) writer.commit() To remove a dynamic field, use the ``IndexWriter.remove_field()`` method with the glob as the name:: writer = ix.writer() writer.remove_field("*_d") writer.commit() For example, to allow documents to contain any field name that ends in ``_id`` and associate it with the ``ID`` field type:: schema = fields.Schema(path=fields.ID) schema.add("*_id", fields.ID, glob=True) ix = index.create_in("myindex", schema) w = ix.writer() w.add_document(path=u"/a", test_id=u"alfa") w.add_document(path=u"/b", class_id=u"MyClass") # ... w.commit() qp = qparser.QueryParser("path", schema=schema) q = qp.parse(u"test_id:alfa") with ix.searcher() as s: results = s.search(q) Advanced schema setup ===================== Field boosts ------------ You can specify a field boost for a field. This is a multiplier applied to the score of any term found in the field. For example, to make terms found in the title field score twice as high as terms in the body field:: schema = Schema(title=TEXT(field_boost=2.0), body=TEXT) Field types ----------- The predefined field types listed above are subclasses of ``fields.FieldType``. ``FieldType`` is a pretty simple class. Its attributes contain information that define the behavior of a field. ============ =============== ====================================================== Attribute Type Description ============ =============== ====================================================== format fields.Format Defines what kind of information a field records about each term, and how the information is stored on disk. vector fields.Format Optional: if defined, the format in which to store per-document forward-index information for this field. 
scorable bool If True, the length of (number of terms in) the field in each document is stored in the index. Slightly misnamed, since field lengths are not required for all scoring. However, field lengths are required to get proper results from BM25F. stored bool If True, the value of this field is stored in the index. unique bool If True, the value of this field may be used to replace documents with the same value when the user calls :meth:`~whoosh.writing.IndexWriter.document_update` on an ``IndexWriter``. ============ =============== ====================================================== The constructors for most of the predefined field types have parameters that let you customize these parts. For example: * Most of the predefined field types take a stored keyword argument that sets FieldType.stored. * The ``TEXT()`` constructor takes an ``analyzer`` keyword argument that is passed on to the format object. Formats ------- A ``Format`` object defines what kind of information a field records about each term, and how the information is stored on disk. For example, the ``Existence`` format would store postings like this: ==== ==== Doc ==== ==== 10 20 30 ==== ==== Whereas the ``Positions`` format would store postings like this: ===== ============= Doc Positions ===== ============= 10 ``[1,5,23]`` 20 ``[45]`` 30 ``[7,12]`` ===== ============= The indexing code passes the unicode string for a field to the field's ``Format`` object. The ``Format`` object calls its analyzer (see text analysis) to break the string into tokens, then encodes information about each token. Whoosh ships with the following pre-defined formats. =============== ================================================================ Class name Description =============== ================================================================ Stored A "null" format for fields that are stored but not indexed. Existence Records only whether a term is in a document or not, i.e. it does not store term frequency. Useful for identifier fields (e.g. path or id) and "tag"-type fields, where the frequency is expected to always be 0 or 1. Frequency Stores the number of times each term appears in each document. Positions Stores the number of times each term appears in each document, and at what positions. =============== ================================================================ The ``STORED`` field type uses the ``Stored`` format (which does nothing, so ``STORED`` fields are not indexed). The ``ID`` type uses the ``Existence`` format. The ``KEYWORD`` type uses the ``Frequency`` format. The ``TEXT`` type uses the ``Positions`` format if it is instantiated with ``phrase=True`` (the default), or ``Frequency`` if ``phrase=False``. In addition, the following formats are implemented for the possible convenience of expert users, but are not currently used in Whoosh: ================= ================================================================ Class name Description ================= ================================================================ DocBoosts Like Existence, but also stores per-document boosts Characters Like Positions, but also stores the start and end character indices of each term PositionBoosts Like Positions, but also stores per-position boosts CharacterBoosts Like Positions, but also stores the start and end character indices of each term and per-position boosts ================= ================================================================ Vectors ------- The main index is an inverted index. 
It maps terms to the documents they appear in. It is also sometimes useful to store a forward index, also known as a term vector, that maps documents to the terms that appear in them. For example, imagine an inverted index like this for a field: ========== ========================================================= Term Postings ========== ========================================================= apple ``[(doc=1, freq=2), (doc=2, freq=5), (doc=3, freq=1)]`` bear ``[(doc=2, freq=7)]`` ========== ========================================================= The corresponding forward index, or term vector, would be: ========== ====================================================== Doc Postings ========== ====================================================== 1 ``[(text=apple, freq=2)]`` 2 ``[(text=apple, freq=5), (text='bear', freq=7)]`` 3 ``[(text=apple, freq=1)]`` ========== ====================================================== If you set ``FieldType.vector`` to a ``Format`` object, the indexing code will use the ``Format`` object to store information about the terms in each document. Currently by default Whoosh does not make use of term vectors at all, but they are available to expert users who want to implement their own field types. Whoosh-2.5.7/docs/build/html/_sources/searching.txt0000644000076500000240000003270412254366350022421 0ustar mattstaff00000000000000============= How to search ============= Once you've created an index and added documents to it, you can search for those documents. The ``Searcher`` object ======================= To get a :class:`whoosh.searching.Searcher` object, call ``searcher()`` on your ``Index`` object:: searcher = myindex.searcher() You'll usually want to open the searcher using a ``with`` statement so the searcher is automatically closed when you're done with it (searcher objects represent a number of open files, so if you don't explicitly close them and the system is slow to collect them, you can run out of file handles):: with ix.searcher() as searcher: ... This is of course equivalent to:: try: searcher = ix.searcher() ... finally: searcher.close() The ``Searcher`` object is the main high-level interface for reading the index. It has lots of useful methods for getting information about the index, such as ``lexicon(fieldname)``. :: >>> list(searcher.lexicon("content")) [u"document", u"index", u"whoosh"] However, the most important method on the ``Searcher`` object is :meth:`~whoosh.searching.Searcher.search`, which takes a :class:`whoosh.query.Query` object and returns a :class:`~whoosh.searching.Results` object:: from whoosh.qparser import QueryParser qp = QueryParser("content", schema=myindex.schema) q = qp.parse(u"hello world") with myindex.searcher() as s: results = s.search(q) By default the results contains at most the first 10 matching documents. To get more results, use the ``limit`` keyword:: results = s.search(q, limit=20) If you want all results, use ``limit=None``. However, setting the limit whenever possible makes searches faster because Whoosh doesn't need to examine and score every document. Since displaying a page of results at a time is a common pattern, the ``search_page`` method lets you conveniently retrieve only the results on a given page:: results = s.search_page(q, 1) The default page length is 10 hits. 
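For example, a small sketch of displaying one page of hits (this assumes the returned page object is iterable and exposes ``pagenum``, ``pagecount`` and ``total`` attributes, and that documents have a stored ``title`` field -- adjust to your schema)::

    page = s.search_page(q, 2)

    # Page position and total number of matching documents
    print("Page %d of %d (%d hits total)" % (page.pagenum, page.pagecount, page.total))

    # The hits on this page behave like normal Hit objects
    for hit in page:
        print(hit["title"])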
You can use the ``pagelen`` keyword argument to set a different page length::

    results = s.search_page(q, 5, pagelen=20)


Results object
==============

The :class:`~whoosh.searching.Results` object acts like a list of the matched
documents. You can use it to access the stored fields of each hit document, to
display to the user.

::

    >>> # Show the best hit's stored fields
    >>> results[0]
    {"title": u"Hello World in Python", "path": u"/a/b/c"}

    >>> results[0:2]
    [{"title": u"Hello World in Python", "path": u"/a/b/c"},
     {"title": u"Foo", "path": u"/bar"}]

By default, ``Searcher.search(myquery)`` limits the number of hits to 10, so
the number of scored hits in the ``Results`` object may be less than the
number of matching documents in the index.

::

    >>> # How many documents in the entire index would have matched?
    >>> len(results)
    27

    >>> # How many scored and sorted documents in this Results object?
    >>> # This will often be less than len() if the number of hits was limited
    >>> # (the default).
    >>> results.scored_length()
    10

Calling ``len(Results)`` runs a fast (unscored) version of the query again to
figure out the total number of matching documents. This is usually very fast
but for large indexes it can cause a noticeable delay. If you want to avoid
this delay on very large indexes, you can use the
:meth:`~whoosh.searching.Results.has_exact_length`,
:meth:`~whoosh.searching.Results.estimated_length`, and
:meth:`~whoosh.searching.Results.estimated_min_length` methods to estimate the
number of matching documents without calling ``len()``::

    found = results.scored_length()
    if results.has_exact_length():
        print("Scored", found, "of exactly", len(results), "documents")
    else:
        low = results.estimated_min_length()
        high = results.estimated_length()
        print("Scored", found, "of between", low, "and", high, "documents")


Scoring and sorting
===================

Scoring
-------

Normally the list of result documents is sorted by *score*. The
:mod:`whoosh.scoring` module contains implementations of various scoring
algorithms. The default is :class:`~whoosh.scoring.BM25F`.

You can set the scoring object to use when you create the searcher using the
``weighting`` keyword argument::

    from whoosh import scoring

    with myindex.searcher(weighting=scoring.TF_IDF()) as s:
        ...

A weighting model is a :class:`~whoosh.scoring.WeightingModel` subclass with a
``scorer()`` method that produces a "scorer" instance. This instance has a
method that takes the current matcher and returns a floating point score.

Sorting
-------

See :doc:`facets`.


Highlighting snippets and More Like This
========================================

See :doc:`highlight` and :doc:`keywords` for information on these topics.


Filtering results
=================

You can use the ``filter`` keyword argument to ``search()`` to specify a set
of documents to permit in the results. The argument can be a
:class:`whoosh.query.Query` object, a :class:`whoosh.searching.Results`
object, or a set-like object containing document numbers.

The searcher caches filters, so if, for example, you use the same query filter
with a searcher multiple times, the additional searches will be faster because
the searcher caches the results of running the filter query.

You can also specify a ``mask`` keyword argument to specify a set of documents
that are not permitted in the results.
::

    with myindex.searcher() as s:
        qp = qparser.QueryParser("content", myindex.schema)
        user_q = qp.parse(query_string)

        # Only show documents in the "rendering" chapter
        allow_q = query.Term("chapter", "rendering")
        # Don't show any documents where the "tag" field contains "todo"
        restrict_q = query.Term("tag", "todo")

        results = s.search(user_q, filter=allow_q, mask=restrict_q)

(If you specify both a ``filter`` and a ``mask``, and a matching document
appears in both, the ``mask`` "wins" and the document is not permitted.)

To find out how many results were filtered out of the results, use
``results.filtered_count`` (or ``resultspage.results.filtered_count``)::

    with myindex.searcher() as s:
        qp = qparser.QueryParser("content", myindex.schema)
        user_q = qp.parse(query_string)

        # Filter documents older than 7 days
        old_q = query.DateRange("created", None, datetime.now() - timedelta(days=7))

        results = s.search(user_q, mask=old_q)
        print("Filtered out %d older documents" % results.filtered_count)


Which terms from my query matched?
==================================

You can use the ``terms=True`` keyword argument to ``search()`` to have the
search record which terms in the query matched which documents::

    with myindex.searcher() as s:
        results = s.search(myquery, terms=True)

You can then get information about which terms matched from the
:class:`whoosh.searching.Results` and :class:`whoosh.searching.Hit` objects::

    # Was this results object created with terms=True?
    if results.has_matched_terms():
        # What terms matched in the results?
        print(results.matched_terms())

        # What terms matched in each hit?
        for hit in results:
            print(hit.matched_terms())


.. _collapsing:

Collapsing results
==================

Whoosh lets you eliminate all but the top N documents with the same facet key
from the results. This can be useful in a few situations:

* Eliminating duplicates at search time.

* Restricting the number of matches per source. For example, in a web search
  application, you might want to show at most three matches from any website.

Whether a document should be collapsed is determined by the value of a
"collapse facet". If a document has an empty collapse key, it will never be
collapsed, but otherwise only the top N documents with the same collapse key
will appear in the results.

See :doc:`/facets` for information on facets.

::

    with myindex.searcher() as s:
        # Set the facet to collapse on and the maximum number of documents per
        # facet value (default is 1)
        results = s.collector(collapse="hostname", collapse_limit=3)

        # Dictionary mapping collapse keys to the number of documents that
        # were filtered out by collapsing on that key
        print(results.collapsed_counts)

Collapsing works with both scored and sorted results. You can use any of the
facet types available in the :mod:`whoosh.sorting` module.

By default, Whoosh uses the results order (score or sort key) to determine the
documents to collapse. For example, in scored results, the best scoring
documents would be kept. You can optionally specify a ``collapse_order`` facet
to control which documents to keep when collapsing.
For example, in a product search you could display results sorted by
decreasing price, and eliminate all but the highest rated item of each product
type::

    from whoosh import sorting

    with myindex.searcher() as s:
        price_facet = sorting.FieldFacet("price", reverse=True)
        type_facet = sorting.FieldFacet("type")
        rating_facet = sorting.FieldFacet("rating", reverse=True)

        results = s.collector(sortedby=price_facet,  # Sort by reverse price
                              collapse=type_facet,  # Collapse on product type
                              collapse_order=rating_facet  # Collapse to highest rated
                              )

The collapsing happens during the search, so it is usually more efficient than
finding everything and post-processing the results. However, if the collapsing
eliminates a large number of documents, collapsed search can take longer
because the search has to consider more documents and remove many
already-collected documents.

Since this collector must sometimes go back and remove already-collected
documents, if you use it in combination with
:class:`~whoosh.collectors.TermsCollector` and/or
:class:`~whoosh.collectors.FacetCollector`, those collectors may contain
information about documents that were filtered out of the final results by
collapsing.


Time limited searches
=====================

To limit the amount of time a search can take::

    from whoosh.collectors import TimeLimitCollector, TimeLimit

    with myindex.searcher() as s:
        # Get a collector object
        c = s.collector(limit=None, sortedby="title_exact")

        # Wrap it in a TimeLimitCollector and set the time limit to 10 seconds
        tlc = TimeLimitCollector(c, timelimit=10.0)

        # Try searching
        try:
            s.search_with_collector(myquery, tlc)
        except TimeLimit:
            print("Search took too long, aborting!")

        # You can still get partial results from the collector
        results = tlc.results()


Convenience methods
===================

The :meth:`~whoosh.searching.Searcher.document` and
:meth:`~whoosh.searching.Searcher.documents` methods on the ``Searcher``
object let you retrieve the stored fields of documents matching terms you pass
in keyword arguments. This is especially useful for fields such as
dates/times, identifiers, paths, and so on.

::

    >>> list(searcher.documents(indexeddate=u"20051225"))
    [{"title": u"Christmas presents"}, {"title": u"Turkey dinner report"}]

    >>> print searcher.document(path=u"/a/b/c")
    {"title": "Document C"}

These methods have some limitations:

* The results are not scored.
* Multiple keywords are always AND-ed together.
* The entire value of each keyword argument is considered a single term; you
  can't search for multiple terms in the same field.


Combining Results objects
=========================

It is sometimes useful to use the results of another query to influence the
order of a :class:`whoosh.searching.Results` object.

For example, you might have a "best bet" field. This field contains
hand-picked keywords for documents. When the user searches for those keywords,
you want those documents to be placed at the top of the results list. You
could try to do this by boosting the "bestbet" field tremendously, but that
can have unpredictable effects on scoring.
It's much easier to simply run the query twice and combine the results:: # Parse the user query userquery = queryparser.parse(querystring) # Get the terms searched for termset = set() userquery.existing_terms(termset) # Formulate a "best bet" query for the terms the user # searched for in the "content" field bbq = Or([Term("bestbet", text) for fieldname, text in termset if fieldname == "content"]) # Find documents matching the searched for terms results = s.search(bbq, limit=5) # Find documents that match the original query allresults = s.search(userquery, limit=10) # Add the user query results on to the end of the "best bet" # results. If documents appear in both result sets, push them # to the top of the combined results. results.upgrade_and_extend(allresults) The ``Results`` object supports the following methods: ``Results.extend(results)`` Adds the documents in 'results' on to the end of the list of result documents. ``Results.filter(results)`` Removes the documents in 'results' from the list of result documents. ``Results.upgrade(results)`` Any result documents that also appear in 'results' are moved to the top of the list of result documents. ``Results.upgrade_and_extend(results)`` Any result documents that also appear in 'results' are moved to the top of the list of result documents. Then any other documents in 'results' are added on to the list of result documents. Whoosh-2.5.7/docs/build/html/_sources/spelling.txt0000644000076500000240000001161112254366350022265 0ustar mattstaff00000000000000===================================================== "Did you mean... ?" Correcting errors in user queries ===================================================== .. note:: In Whoosh 1.9 the old spelling system based on a separate N-gram index was replaced with this significantly more convenient and powerful implementation. Overview ======== Whoosh can quickly suggest replacements for mis-typed words by returning a list of words from the index (or a dictionary) that are close to the mis-typed word:: with ix.searcher() as s: corrector = s.corrector("text") for mistyped_word in mistyped_words: print corrector.suggest(mistyped_word, limit=3) See the :meth:`whoosh.spelling.Corrector.suggest` method documentation for information on the arguments. Currently the suggestion engine is more like a "typo corrector" than a real "spell checker" since it doesn't do the kind of sophisticated phonetic matching or semantic/contextual analysis a good spell checker might. However, it is still very useful. There are two main strategies for correcting words: * Use the terms from an index field. * Use words from a word list file. Pulling suggestions from an indexed field ========================================= To enable spell checking on the contents of a field, use the ``spelling=True`` keyword argument on the field in the schema definition:: schema = Schema(text=TEXT(spelling=True)) (If you have an existing index you want to enable spelling for, you can alter the schema in-place using the :func:`whoosh.writing.add_spelling` function to create the missing word graph files.) .. tip:: You can get suggestions for fields without the ``spelling`` attribute, but calculating the suggestions will be slower. You can then use the :meth:`whoosh.searching.Searcher.corrector` method to get a corrector for a field:: corrector = searcher.corrector("content") The advantage of using the contents of an index field is that when you are spell checking queries on that index, the suggestions are tailored to the contents of the index. 
The disadvantage is that if the indexed documents contain spelling errors, then the spelling suggestions will also be erroneous. Pulling suggestions from a word list ==================================== There are plenty of word lists available on the internet you can use to populate the spelling dictionary. (In the following examples, ``word_list`` can be a list of unicode strings, or a file object with one word on each line.) To create a :class:`whoosh.spelling.Corrector` object from a word list:: from whoosh.spelling import GraphCorrector corrector = GraphCorrector.from_word_list(word_list) Creating a corrector directly from a word list can be slow for large word lists, so you can save a corrector's graph to a more efficient on-disk form like this:: graphfile = myindex.storage.create_file("words.graph") # to_file() automatically closes the file when it's finished corrector.to_file(graphfile) To open the graph file again very quickly:: graphfile = myindex.storage.open_file("words.graph") corrector = GraphCorrector.from_graph_file(graphfile) Merging two or more correctors ============================== You can combine suggestions from two sources (for example, the contents of an index field and a word list) using a :class:`whoosh.spelling.MultiCorrector`:: c1 = searcher.corrector("content") c2 = GraphCorrector.from_graph_file(wordfile) corrector = MultiCorrector([c1, c2]) Correcting user queries ======================= You can spell-check a user query using the :meth:`whoosh.searching.Searcher.correct_query` method:: from whoosh import qparser # Parse the user query string qp = qparser.QueryParser("content", myindex.schema) q = qp.parse(qstring) # Try correcting the query with myindex.searcher() as s: corrected = s.correct_query(q, qstring) if corrected.query != q: print("Did you mean:", corrected.string) The ``correct_query`` method returns an object with the following attributes: ``query`` A corrected :class:`whoosh.query.Query` tree. You can test whether this is equal (``==``) to the original parsed query to check if the corrector actually changed anything. ``string`` A corrected version of the user's query string. ``tokens`` A list of corrected token objects representing the corrected terms. You can use this to reformat the user query (see below). You can use a :class:`whoosh.highlight.Formatter` object to format the corrected query string. For example, use the :class:`~whoosh.highlight.HtmlFormatter` to format the corrected string as HTML:: from whoosh import highlight hf = highlight.HtmlFormatter() corrected = s.correct_query(q, qstring, formatter=hf) See the documentation for :meth:`whoosh.searching.Searcher.correct_query` for information on the defaults and arguments. Whoosh-2.5.7/docs/build/html/_sources/stemming.txt0000644000076500000240000002013712254366350022276 0ustar mattstaff00000000000000======================================== Stemming, variations, and accent folding ======================================== The problem =========== The indexed text will often contain words in different form than the one the user searches for. For example, if the user searches for ``render``, we would like the search to match not only documents that contain the ``render``, but also ``renders``, ``rendering``, ``rendered``, etc. A related problem is one of accents. Names and loan words may contain accents in the original text but not in the user's query, or vice versa. For example, we want the user to be able to search for ``cafe`` and find documents containing ``café``. 
The default analyzer for the :class:`whoosh.fields.TEXT` field does not do stemming or accent folding. Stemming ======== Stemming is a heuristic process of removing suffixes (and sometimes prefixes) from words to arrive (hopefully, most of the time) at the base word. Whoosh includes several stemming algorithms such as Porter and Porter2, Paice Husk, and Lovins. :: >>> from whoosh.lang.porter import stem >>> stem("rendering") 'render' The stemming filter applies the stemming function to the terms it indexes, and to words in user queries. So in theory all variations of a root word ("render", "rendered", "renders", "rendering", etc.) are reduced to a single term in the index, saving space. And all possible variations users might use in a query are reduced to the root, so stemming enhances "recall". The :class:`whoosh.analysis.StemFilter` lets you add a stemming filter to an analyzer chain. :: >>> rext = RegexTokenizer() >>> stream = rext(u"fundamentally willows") >>> stemmer = StemFilter() >>> [token.text for token in stemmer(stream)] [u"fundament", u"willow"] The :func:`whoosh.analysis.StemmingAnalyzer` is a pre-packaged analyzer that combines a tokenizer, lower-case filter, optional stop filter, and stem filter:: from whoosh import fields from whoosh.analysis import StemmingAnalyzer stem_ana = StemmingAnalyzer() schema = fields.Schema(title=TEXT(analyzer=stem_ana, stored=True), content=TEXT(analyzer=stem_ana)) Stemming has pros and cons. * It allows the user to find documents without worrying about word forms. * It reduces the size of the index, since it reduces the number of separate terms indexed by "collapsing" multiple word forms into a single base word. * It's faster than using variations (see below) * The stemming algorithm can sometimes incorrectly conflate words or change the meaning of a word by removing suffixes. * The stemmed forms are often not proper words, so the terms in the field are not useful for things like creating a spelling dictionary. Variations ========== Whereas stemming encodes the words in the index in a base form, when you use variations you instead index words "as is" and *at query time* expand words in the user query using a heuristic algorithm to generate morphological variations of the word. :: >>> from whoosh.lang.morph_en import variations >>> variations("rendered") set(['rendered', 'rendernesses', 'render', 'renderless', 'rendering', 'renderness', 'renderes', 'renderer', 'renderements', 'rendereless', 'renderenesses', 'rendere', 'renderment', 'renderest', 'renderement', 'rendereful', 'renderers', 'renderful', 'renderings', 'renders', 'renderly', 'renderely', 'rendereness', 'renderments']) Many of the generated variations for a given word will not be valid words, but it's fairly fast for Whoosh to check which variations are actually in the index and only search for those. The :class:`whoosh.query.Variations` query object lets you search for variations of a word. Whereas the normal :class:`whoosh.query.Term` object only searches for the given term, the ``Variations`` query acts like an ``Or`` query for the variations of the given word in the index. 
For example, the query:: query.Variations("content", "rendered") ...might act like this (depending on what words are in the index):: query.Or([query.Term("content", "render"), query.Term("content", "rendered"), query.Term("content", "renders"), query.Term("content", "rendering")]) To have the query parser use :class:`whoosh.query.Variations` instead of :class:`whoosh.query.Term` for individual terms, use the ``termclass`` keyword argument to the parser initialization method:: from whoosh import qparser, query qp = qparser.QueryParser("content", termclass=query.Variations) Variations has pros and cons. * It allows the user to find documents without worrying about word forms. * The terms in the field are actual words, not stems, so you can use the field's contents for other purposes such as spell checking queries. * It increases the size of the index relative to stemming, because different word forms are indexed separately. * It acts like an ``Or`` search for all the variations, which is slower than searching for a single term. Lemmatization ============= Whereas stemming is a somewhat "brute force", mechanical attempt at reducing words to their base form using simple rules, lemmatization usually refers to more sophisticated methods of finding the base form ("lemma") of a word using language models, often involving analysis of the surrounding context and part-of-speech tagging. Whoosh does not include any lemmatization functions, but if you have separate lemmatizing code you could write a custom :class:`whoosh.analysis.Filter` to integrate it into a Whoosh analyzer. Character folding ================= You can set up an analyzer to treat, for example, ``á``, ``a``, ``å``, and ``â`` as equivalent to improve recall. This is often very useful, allowing the user to, for example, type ``cafe`` or ``resume`` and find documents containing ``café`` and ``resumé``. Character folding is especially useful for unicode characters that may appear in Asian language texts that should be treated as equivalent to their ASCII equivalent, such as "half-width" characters. Character folding is not always a panacea. See this article for caveats on where accent folding can break down. http://www.alistapart.com/articles/accent-folding-for-auto-complete/ Whoosh includes several mechanisms for adding character folding to an analyzer. The :class:`whoosh.analysis.CharsetFilter` applies a character map to token text. For example, it will filter the tokens ``u'café', u'resumé', ...`` to ``u'cafe', u'resume', ...``. This is usually the method you'll want to use unless you need to use a charset to tokenize terms:: from whoosh.analysis import CharsetFilter, StemmingAnalyzer from whoosh import fields from whoosh.support.charset import accent_map # For example, to add an accent-folding filter to a stemming analyzer: my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) # To use this analyzer in your schema: my_schema = fields.Schema(content=fields.TEXT(analyzer=my_analyzer)) The :class:`whoosh.analysis.CharsetTokenizer` uses a Sphinx charset table to both separate terms and perform character folding. This tokenizer is slower than the :class:`whoosh.analysis.RegexTokenizer` because it loops over each character in Python. If the language(s) you're indexing can be tokenized using regular expressions, it will be much faster to use ``RegexTokenizer`` and ``CharsetFilter`` in combination instead of using ``CharsetTokenizer``. 
The :mod:`whoosh.support.charset` module contains an accent folding map useful for most Western languages, as well as a much more extensive Sphinx charset table and a function to convert Sphinx charset tables into the character maps required by ``CharsetTokenizer`` and ``CharsetFilter``:: # To create a filter using an enourmous character map for most languages # generated from a Sphinx charset table from whoosh.analysis import CharsetFilter from whoosh.support.charset import default_charset, charset_table_to_dict charmap = charset_table_to_dict(default_charset) my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap) (The Sphinx charset table format is described at http://www.sphinxsearch.com/docs/current.html#conf-charset-table ) Whoosh-2.5.7/docs/build/html/_sources/tech/0000755000076500000240000000000012277504634020636 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/build/html/_sources/tech/backend.txt0000644000076500000240000001110712254366350022762 0ustar mattstaff00000000000000============================== How to implement a new backend ============================== Index ===== * Subclass :class:`whoosh.index.Index`. * Indexes must implement the following methods. * :meth:`whoosh.index.Index.is_empty` * :meth:`whoosh.index.Index.doc_count` * :meth:`whoosh.index.Index.reader` * :meth:`whoosh.index.Index.writer` * Indexes that require/support locking must implement the following methods. * :meth:`whoosh.index.Index.lock` * :meth:`whoosh.index.Index.unlock` * Indexes that support deletion must implement the following methods. * :meth:`whoosh.index.Index.delete_document` * :meth:`whoosh.index.Index.doc_count_all` -- if the backend has delayed deletion. * Indexes that require/support versioning/transactions *may* implement the following methods. * :meth:`whoosh.index.Index.latest_generation` * :meth:`whoosh.index.Index.up_to_date` * :meth:`whoosh.index.Index.last_modified` * Index *may* implement the following methods (the base class's versions are no-ops). * :meth:`whoosh.index.Index.optimize` * :meth:`whoosh.index.Index.close` IndexWriter =========== * Subclass :class:`whoosh.writing.IndexWriter`. * IndexWriters must implement the following methods. * :meth:`whoosh.writing.IndexWriter.add_document` * :meth:`whoosh.writing.IndexWriter.add_reader` * Backends that support deletion must implement the following methods. * :meth:`whoosh.writing.IndexWriter.delete_document` * IndexWriters that work as transactions must implement the following methods. * :meth:`whoosh.reading.IndexWriter.commit` -- Save the additions/deletions done with this IndexWriter to the main index, and release any resources used by the IndexWriter. * :meth:`whoosh.reading.IndexWriter.cancel` -- Throw away any additions/deletions done with this IndexWriter, and release any resources used by the IndexWriter. IndexReader =========== * Subclass :class:`whoosh.reading.IndexReader`. * IndexReaders must implement the following methods. 
* :meth:`whoosh.reading.IndexReader.__contains__` * :meth:`whoosh.reading.IndexReader.__iter__` * :meth:`whoosh.reading.IndexReader.iter_from` * :meth:`whoosh.reading.IndexReader.stored_fields` * :meth:`whoosh.reading.IndexReader.doc_count_all` * :meth:`whoosh.reading.IndexReader.doc_count` * :meth:`whoosh.reading.IndexReader.doc_field_length` * :meth:`whoosh.reading.IndexReader.field_length` * :meth:`whoosh.reading.IndexReader.max_field_length` * :meth:`whoosh.reading.IndexReader.postings` * :meth:`whoosh.reading.IndexReader.has_vector` * :meth:`whoosh.reading.IndexReader.vector` * :meth:`whoosh.reading.IndexReader.doc_frequency` * :meth:`whoosh.reading.IndexReader.frequency` * Backends that support deleting documents should implement the following methods. * :meth:`whoosh.reading.IndexReader.has_deletions` * :meth:`whoosh.reading.IndexReader.is_deleted` * Backends that support versioning should implement the following methods. * :meth:`whoosh.reading.IndexReader.generation` * If the IndexReader object does not keep the schema in the ``self.schema`` attribute, it needs to override the following methods. * :meth:`whoosh.reading.IndexReader.field` * :meth:`whoosh.reading.IndexReader.field_names` * :meth:`whoosh.reading.IndexReader.scorable_names` * :meth:`whoosh.reading.IndexReader.vector_names` * IndexReaders *may* implement the following methods. * :meth:`whoosh.reading.DocReader.close` -- closes any open resources associated with the reader. Matcher ======= The :meth:`whoosh.reading.IndexReader.postings` method returns a :class:`whoosh.matching.Matcher` object. You will probably need to implement a custom Matcher class for reading from your posting lists. * Subclass :class:`whoosh.matching.Matcher`. * Implement the following methods at minimum. * :meth:`whoosh.matching.Matcher.is_active` * :meth:`whoosh.matching.Matcher.copy` * :meth:`whoosh.matching.Matcher.id` * :meth:`whoosh.matching.Matcher.next` * :meth:`whoosh.matching.Matcher.value` * :meth:`whoosh.matching.Matcher.value_as` * :meth:`whoosh.matching.Matcher.score` * Depending on the implementation, you *may* implement the following methods more efficiently. * :meth:`whoosh.matching.Matcher.skip_to` * :meth:`whoosh.matching.Matcher.weight` * If the implementation supports quality, you should implement the following methods. * :meth:`whoosh.matching.Matcher.supports_quality` * :meth:`whoosh.matching.Matcher.quality` * :meth:`whoosh.matching.Matcher.block_quality` * :meth:`whoosh.matching.Matcher.skip_to_quality` Whoosh-2.5.7/docs/build/html/_sources/tech/filedb.txt0000644000076500000240000000267612254366350022633 0ustar mattstaff00000000000000============ filedb notes ============ TBD. Files created ============= .toc The "master" file containing information about the index and its segments. The index directory will contain a set of files for each segment. A segment is like a mini-index -- when you add documents to the index, whoosh creates a new segment and then searches the old segment(s) and the new segment to avoid having to do a big merge every time you add a document. When you get enough small segments whoosh will merge them into larger segments or a single segment. .dci Contains per-document information (e.g. field lengths). This will grow linearly with the number of documents. .dcz Contains the stored fields for each document. .tiz Contains per-term information. The size of file will vary based on the number of unique terms. .pst Contains per-term postings. 
The size of this file depends on the size of the collection and the formats used for each field (e.g. storing term positions takes more space than storing frequency only). .fvz contains term vectors (forward indexes) for each document. This file is only created if at least one field in the schema stores term vectors. The size will vary based on the number of documents, field length, the formats used for each vector (e.g. storing term positions takes more space than storing frequency only), etc. Whoosh-2.5.7/docs/build/html/_sources/tech/index.txt0000644000076500000240000000014112254366350022476 0ustar mattstaff00000000000000=============== Technical notes =============== .. toctree:: :glob: :maxdepth: 2 * Whoosh-2.5.7/docs/build/html/_sources/threads.txt0000644000076500000240000000531512254366350022106 0ustar mattstaff00000000000000==================================== Concurrency, locking, and versioning ==================================== Concurrency =========== The ``FileIndex`` object is "stateless" and should be share-able between threads. A ``Reader`` object (which underlies the ``Searcher`` object) wraps open files and often individual methods rely on consistent file cursor positions (e.g. they do two ``file.read()``\ s in a row, so if another thread moves the cursor between the two read calls Bad Things would happen). You should use one Reader/Searcher per thread in your code. Readers/Searchers tend to cache information (such as field caches for sorting), so if you can share one across multiple search requests, it's a big performance win. Locking ======= Only one thread/process can write to an index at a time. When you open a writer, it locks the index. If you try to open a writer on the same index in another thread/process, it will raise ``whoosh.store.LockError``. In a multi-threaded or multi-process environment your code needs to be aware that opening a writer may raise this exception if a writer is already open. Whoosh includes a couple of example implementations (:class:`whoosh.writing.AsyncWriter` and :class:`whoosh.writing.BufferedWriter`) of ways to work around the write lock. While the writer is open and during the commit, **the index is still available for reading**. Existing readers are unaffected and new readers can open the current index normally. Lock files ---------- Locking the index is accomplished by acquiring an exclusive file lock on the ``_WRITELOCK`` file in the index directory. The file is not deleted after the file lock is released, so the fact that the file exists **does not** mean the index is locked. Versioning ========== When you open a reader/searcher, the reader represents a view of the **current version** of the index. If someone writes changes to the index, any readers that are already open **will not** pick up the changes automatically. A reader always sees the index as it existed when the reader was opened. If you are re-using a Searcher across multiple search requests, you can check whether the Searcher is a view of the latest version of the index using :meth:`whoosh.searching.Searcher.up_to_date`. If the searcher is not up to date, you can get an up-to-date copy of the searcher using :meth:`whoosh.searching.Searcher.refresh`:: # If 'searcher' is not up-to-date, replace it searcher = searcher.refresh() (If the searcher has the latest version of the index, ``refresh()`` simply returns it.) 
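A slightly fuller sketch of this pattern, for code that re-uses a single searcher across many requests, might look like this (the ``run_query`` function and its body are purely illustrative)::

    searcher = ix.searcher()

    def run_query(query):
        global searcher
        # Refresh only if the index has changed since the searcher was opened
        if not searcher.up_to_date():
            searcher = searcher.refresh()
        return searcher.search(query, limit=20)
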
Calling ``Searcher.refresh()`` is more efficient that closing the searcher and opening a new one, since it will re-use any underlying readers and caches that haven't changed. Whoosh-2.5.7/docs/source/0000755000076500000240000000000012277504634015326 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/source/analysis.rst0000644000076500000240000003200212254366350017674 0ustar mattstaff00000000000000=============== About analyzers =============== Overview ======== An analyzer is a function or callable class (a class with a ``__call__`` method) that takes a unicode string and returns a generator of tokens. Usually a "token" is a word, for example the string "Mary had a little lamb" might yield the tokens "Mary", "had", "a", "little", and "lamb". However, tokens do not necessarily correspond to words. For example, you might tokenize Chinese text into individual characters or bi-grams. Tokens are the units of indexing, that is, they are what you are able to look up in the index. An analyzer is basically just a wrapper for a tokenizer and zero or more filters. The analyzer's ``__call__`` method will pass its parameters to a tokenizer, and the tokenizer will usually be wrapped in a few filters. A tokenizer is a callable that takes a unicode string and yields a series of ``analysis.Token`` objects. For example, the provided :class:`whoosh.analysis.RegexTokenizer` class implements a customizable, regular-expression-based tokenizer that extracts words and ignores whitespace and punctuation. :: >>> from whoosh.analysis import RegexTokenizer >>> tokenizer = RegexTokenizer() >>> for token in tokenizer(u"Hello there my friend!"): ... print repr(token.text) u'Hello' u'there' u'my' u'friend' A filter is a callable that takes a generator of Tokens (either a tokenizer or another filter) and in turn yields a series of Tokens. For example, the provided :meth:`whoosh.analysis.LowercaseFilter` filters tokens by converting their text to lowercase. The implementation is very simple:: def LowercaseFilter(tokens): """Uses lower() to lowercase token text. For example, tokens "This","is","a","TEST" become "this","is","a","test". """ for t in tokens: t.text = t.text.lower() yield t You can wrap the filter around a tokenizer to see it in operation:: >>> from whoosh.analysis import LowercaseFilter >>> for token in LowercaseFilter(tokenizer(u"These ARE the things I want!")): ... print repr(token.text) u'these' u'are' u'the' u'things' u'i' u'want' An analyzer is just a means of combining a tokenizer and some filters into a single package. You can implement an analyzer as a custom class or function, or compose tokenizers and filters together using the ``|`` character:: my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() The first item must be a tokenizer and the rest must be filters (you can't put a filter first or a tokenizer after the first item). Note that this only works if at least the tokenizer is a subclass of ``whoosh.analysis.Composable``, as all the tokenizers and filters that ship with Whoosh are. See the :mod:`whoosh.analysis` module for information on the available analyzers, tokenizers, and filters shipped with Whoosh. Using analyzers =============== When you create a field in a schema, you can specify your analyzer as a keyword argument to the field object:: schema = Schema(content=TEXT(analyzer=StemmingAnalyzer())) Advanced Analysis ================= Token objects ------------- The ``Token`` class has no methods. It is merely a place to record certain attributes. 
A ``Token`` object actually has two kinds of attributes: *settings* that record what kind of information the ``Token`` object does or should contain, and *information* about the current token. Token setting attributes ------------------------ A ``Token`` object should always have the following attributes. A tokenizer or filter can check these attributes to see what kind of information is available and/or what kind of information they should be setting on the ``Token`` object. These attributes are set by the tokenizer when it creates the Token(s), based on the parameters passed to it from the Analyzer. Filters **should not** change the values of these attributes. ====== ================ =================================================== ========= Type Attribute name Description Default ====== ================ =================================================== ========= str mode The mode in which the analyzer is being called, '' e.g. 'index' during indexing or 'query' during query parsing bool positions Whether term positions are recorded in the token False bool chars Whether term start and end character indices are False recorded in the token bool boosts Whether per-term boosts are recorded in the token False bool removestops Whether stop-words should be removed from the True token stream ====== ================ =================================================== ========= Token information attributes ---------------------------- A ``Token`` object may have any of the following attributes. The ``text`` attribute should always be present. The original attribute may be set by a tokenizer. All other attributes should only be accessed or set based on the values of the "settings" attributes above. ======== ========== ================================================================= Type Name Description ======== ========== ================================================================= unicode text The text of the token (this should always be present) unicode original The original (pre-filtered) text of the token. The tokenizer may record this, and filters are expected not to modify it. int pos The position of the token in the stream, starting at 0 (only set if positions is True) int startchar The character index of the start of the token in the original string (only set if chars is True) int endchar The character index of the end of the token in the original string (only set if chars is True) float boost The boost for this token (only set if boosts is True) bool stopped Whether this token is a "stop" word (only set if removestops is False) ======== ========== ================================================================= So why are most of the information attributes optional? Different field formats require different levels of information about each token. For example, the ``Frequency`` format only needs the token text. The ``Positions`` format records term positions, so it needs them on the ``Token``. The ``Characters`` format records term positions and the start and end character indices of each term, so it needs them on the token, and so on. The ``Format`` object that represents the format of each field calls the analyzer for the field, and passes it parameters corresponding to the types of information it needs, e.g.:: analyzer(unicode_string, positions=True) The analyzer can then pass that information to a tokenizer so the tokenizer initializes the required attributes on the ``Token`` object(s) it produces. 
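For example, you can see these attributes being filled in by calling an analyzer directly and asking for positional and character information (a rough sketch; the exact tokens produced depend on the analyzer's tokenizer and filters)::

    from whoosh.analysis import StandardAnalyzer

    ana = StandardAnalyzer()
    # Ask for term positions and character offsets to be recorded
    for t in ana(u"Mary had a little lamb", positions=True, chars=True):
        print t.text, t.pos, t.startchar, t.endchar
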
Performing different analysis for indexing and query parsing ------------------------------------------------------------ Whoosh sets the ``mode`` setting attribute to indicate whether the analyzer is being called by the indexer (``mode='index'``) or the query parser (``mode='query'``). This is useful if there's a transformation that you only want to apply at indexing or query parsing:: class MyFilter(Filter): def __call__(self, tokens): for t in tokens: if t.mode == 'query': ... else: ... The :class:`whoosh.analysis.MultiFilter` filter class lets you specify different filters to use based on the mode setting:: intraword = MultiFilter(index=IntraWordFilter(mergewords=True, mergenums=True), query=IntraWordFilter(mergewords=False, mergenums=False)) Stop words ---------- "Stop" words are words that are so common it's often counter-productive to index them, such as "and", "or", "if", etc. The provided ``analysis.StopFilter`` lets you filter out stop words, and includes a default list of common stop words. :: >>> from whoosh.analysis import StopFilter >>> stopper = StopFilter() >>> for token in stopper(LowercaseFilter(tokenizer(u"These ARE the things I want!"))): ... print repr(token.text) u'these' u'things' u'want' However, this seemingly simple filter idea raises a couple of minor but slightly thorny issues: renumbering term positions and keeping or removing stopped words. Renumbering term positions -------------------------- Remember that analyzers are sometimes asked to record the position of each token in the token stream: ============= ========== ========== ========== ========== Token.text u'Mary' u'had' u'a' u'lamb' Token.pos 0 1 2 3 ============= ========== ========== ========== ========== So what happens to the ``pos`` attribute of the tokens if ``StopFilter`` removes the words ``had`` and ``a`` from the stream? Should it renumber the positions to pretend the "stopped" words never existed? I.e.: ============= ========== ========== Token.text u'Mary' u'lamb' Token.pos 0 1 ============= ========== ========== or should it preserve the original positions of the words? I.e: ============= ========== ========== Token.text u'Mary' u'lamb' Token.pos 0 3 ============= ========== ========== It turns out that different situations call for different solutions, so the provided ``StopFilter`` class supports both of the above behaviors. Renumbering is the default, since that is usually the most useful and is necessary to support phrase searching. However, you can set a parameter in StopFilter's constructor to tell it not to renumber positions:: stopper = StopFilter(renumber=False) Removing or leaving stop words ------------------------------ The point of using ``StopFilter`` is to remove stop words, right? Well, there are actually some situations where you might want to mark tokens as "stopped" but not remove them from the token stream. For example, if you were writing your own query parser, you could run the user's query through a field's analyzer to break it into tokens. In that case, you might want to know which words were "stopped" so you can provide helpful feedback to the end user (e.g. "The following words are too common to search for:"). In other cases, you might want to leave stopped words in the stream for certain filtering steps (for example, you might have a step that looks at previous tokens, and want the stopped tokens to be part of the process), but then remove them later. The ``analysis`` module provides a couple of tools for keeping and removing stop-words in the stream. 
The ``removestops`` parameter passed to the analyzer's ``__call__`` method (and copied to the ``Token`` object as an attribute) specifies whether stop words should be removed from the stream or left in. :: >>> from whoosh.analysis import StandardAnalyzer >>> analyzer = StandardAnalyzer() >>> [(t.text, t.stopped) for t in analyzer(u"This is a test")] [(u'test', False)] >>> [(t.text, t.stopped) for t in analyzer(u"This is a test", removestops=False)] [(u'this', True), (u'is', True), (u'a', True), (u'test', False)] The ``analysis.unstopped()`` filter function takes a token generator and yields only the tokens whose ``stopped`` attribute is ``False``. .. note:: Even if you leave stopped words in the stream in an analyzer you use for indexing, the indexer will ignore any tokens where the ``stopped`` attribute is ``True``. Implementation notes -------------------- Because object creation is slow in Python, the stock tokenizers do not create a new ``analysis.Token`` object for each token. Instead, they create one ``Token`` object and yield it over and over. This is a nice performance shortcut but can lead to strange behavior if your code tries to remember tokens between loops of the generator. Because the analyzer only has one ``Token`` object, of which it keeps changing the attributes, if you keep a copy of the Token you get from a loop of the generator, it will be changed from under you. For example:: >>> list(tokenizer(u"Hello there my friend")) [Token(u"friend"), Token(u"friend"), Token(u"friend"), Token(u"friend")] Instead, do this:: >>> [t.text for t in tokenizer(u"Hello there my friend")] That is, save the attributes, not the token object itself. If you implement your own tokenizer, filter, or analyzer as a class, you should implement an ``__eq__`` method. This is important to allow comparison of ``Schema`` objects. The mixing of persistent "setting" and transient "information" attributes on the ``Token`` object is not especially elegant. If I ever have a better idea I might change it. ;) Nothing requires that an Analyzer be implemented by calling a tokenizer and filters. Tokenizers and filters are simply a convenient way to structure the code. You're free to write an analyzer any way you want, as long as it implements ``__call__``. Whoosh-2.5.7/docs/source/api/0000755000076500000240000000000012277504634016077 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/source/api/analysis.rst0000644000076500000240000000247012254366350020453 0ustar mattstaff00000000000000=================== ``analysis`` module =================== .. automodule:: whoosh.analysis Analyzers ========= .. autoclass:: IDAnalyzer .. autoclass:: KeywordAnalyzer .. autoclass:: RegexAnalyzer .. autoclass:: SimpleAnalyzer .. autoclass:: StandardAnalyzer .. autoclass:: StemmingAnalyzer .. autoclass:: FancyAnalyzer .. autoclass:: NgramAnalyzer .. autoclass:: NgramWordAnalyzer .. autoclass:: LanguageAnalyzer Tokenizers ========== .. autoclass:: IDTokenizer .. autoclass:: RegexTokenizer .. autoclass:: CharsetTokenizer .. autoclass:: SpaceSeparatedTokenizer .. autoclass:: CommaSeparatedTokenizer .. autoclass:: NgramTokenizer .. autoclass:: PathTokenizer Filters ======= .. autoclass:: PassFilter .. autoclass:: LoggingFilter .. autoclass:: MultiFilter .. autoclass:: TeeFilter .. autoclass:: ReverseTextFilter .. autoclass:: LowercaseFilter .. autoclass:: StripFilter .. autoclass:: StopFilter .. autoclass:: StemFilter .. autoclass:: CharsetFilter .. autoclass:: NgramFilter .. autoclass:: IntraWordFilter .. 
autoclass:: CompoundWordFilter .. autoclass:: BiWordFilter .. autoclass:: ShingleFilter .. autoclass:: DelimitedAttributeFilter .. autoclass:: DoubleMetaphoneFilter .. autoclass:: SubstitutionFilter Token classes and functions =========================== .. autoclass:: Token .. autofunction:: unstopped Whoosh-2.5.7/docs/source/api/api.rst0000644000076500000240000000012312254366350017372 0ustar mattstaff00000000000000========== Whoosh API ========== .. toctree:: :glob: :maxdepth: 1 ** Whoosh-2.5.7/docs/source/api/codec/0000755000076500000240000000000012277504634017154 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/source/api/codec/base.rst0000644000076500000240000000063512254366350020620 0ustar mattstaff00000000000000===================== ``codec.base`` module ===================== .. automodule:: whoosh.codec.base Classes ======= .. autoclass:: Codec :members: .. autoclass:: PerDocumentWriter :members: .. autoclass:: FieldWriter :members: .. autoclass:: PostingsWriter :members: .. autoclass:: TermsReader :members: .. autoclass:: PerDocumentReader :members: .. autoclass:: Segment :members: Whoosh-2.5.7/docs/source/api/collectors.rst0000644000076500000240000000111012254366350020767 0ustar mattstaff00000000000000===================== ``collectors`` module ===================== .. automodule:: whoosh.collectors Base classes ============ .. autoclass:: Collector :members: .. autoclass:: ScoredCollector :members: .. autoclass:: WrappingCollector :members: Basic collectors ================ .. autoclass:: TopCollector .. autoclass:: UnlimitedCollector .. autoclass:: SortingCollector Wrappers ======== .. autoclass:: FilterCollector .. autoclass:: FacetCollector .. autoclass:: CollapseCollector .. autoclass:: TimeLimitCollector .. autoclass:: TermsCollector Whoosh-2.5.7/docs/source/api/columns.rst0000644000076500000240000000120312254366350020301 0ustar mattstaff00000000000000===================== ``columns`` module ===================== .. automodule:: whoosh.columns Base classes ============ .. autoclass:: Column :members: .. autoclass:: ColumnWriter :members: .. autoclass:: ColumnReader :members: Basic columns ============= .. autoclass:: VarBytesColumn .. autoclass:: FixedBytesColumn .. autoclass:: RefBytesColumn .. autoclass:: NumericColumn Technical columns ================= .. autoclass:: BitColumn .. autoclass:: CompressedBytesColumn .. autoclass:: StructColumn .. autoclass:: PickleColumn Experimental columns ==================== .. autoclass:: ClampedNumericColumn Whoosh-2.5.7/docs/source/api/fields.rst0000644000076500000240000000117012254366350020072 0ustar mattstaff00000000000000================= ``fields`` module ================= .. automodule:: whoosh.fields Schema class ============ .. autoclass:: Schema :members: .. autoclass:: SchemaClass FieldType base class ==================== .. autoclass:: FieldType :members: Pre-made field types ==================== .. autoclass:: ID .. autoclass:: IDLIST .. autoclass:: STORED .. autoclass:: KEYWORD .. autoclass:: TEXT .. autoclass:: NUMERIC .. autoclass:: DATETIME .. autoclass:: BOOLEAN .. autoclass:: NGRAM .. autoclass:: NGRAMWORDS Exceptions ========== .. autoexception:: FieldConfigurationError .. 
autoexception:: UnknownFieldError Whoosh-2.5.7/docs/source/api/filedb/0000755000076500000240000000000012277504634017324 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/source/api/filedb/filestore.rst0000644000076500000240000000073312254366350022051 0ustar mattstaff00000000000000=========================== ``filedb.filestore`` module =========================== .. automodule:: whoosh.filedb.filestore Base class ========== .. autoclass:: Storage :members: Implementation classes ====================== .. autoclass:: FileStorage .. autoclass:: RamStorage Helper functions ================ .. autofunction:: copy_storage .. autofunction:: copy_to_ram Exceptions ========== .. autoexception:: ReadOnlyError Whoosh-2.5.7/docs/source/api/filedb/filetables.rst0000644000076500000240000000055012254366350022164 0ustar mattstaff00000000000000============================ ``filedb.filetables`` module ============================ .. automodule:: whoosh.filedb.filetables Hash file ========= .. autoclass:: HashWriter :members: .. autoclass:: HashReader :members: Ordered Hash file ================= .. autoclass:: OrderedHashWriter .. autoclass:: OrderedHashReader Whoosh-2.5.7/docs/source/api/filedb/structfile.rst0000644000076500000240000000040012254366350022230 0ustar mattstaff00000000000000============================ ``filedb.structfile`` module ============================ .. automodule:: whoosh.filedb.structfile Classes ======= .. autoclass:: StructFile :members: .. autoclass:: BufferFile .. autoclass:: ChecksumFile Whoosh-2.5.7/docs/source/api/formats.rst0000644000076500000240000000051412254366350020300 0ustar mattstaff00000000000000================== ``formats`` module ================== .. automodule:: whoosh.formats Base class ========== .. autoclass:: Format :members: Formats ======= .. autoclass:: Existence .. autoclass:: Frequency .. autoclass:: Positions .. autoclass:: Characters .. autoclass:: PositionBoosts .. autoclass:: CharacterBoosts Whoosh-2.5.7/docs/source/api/highlight.rst0000644000076500000240000000135712254366350020602 0ustar mattstaff00000000000000==================== ``highlight`` module ==================== .. automodule:: whoosh.highlight See :doc:`how to highlight terms in search results `. Manual highlighting =================== .. autoclass:: Highlighter :members: .. autofunction:: highlight Fragmenters =========== .. autoclass:: Fragmenter :members: .. autoclass:: WholeFragmenter .. autoclass:: SentenceFragmenter .. autoclass:: ContextFragmenter .. autoclass:: PinpointFragmenter Scorers ======= .. autoclass:: FragmentScorer .. autoclass:: BasicFragmentScorer Formatters ========== .. autoclass:: UppercaseFormatter .. autoclass:: HtmlFormatter .. autoclass:: GenshiFormatter Utility classes =============== .. autoclass:: Fragment :members: Whoosh-2.5.7/docs/source/api/idsets.rst0000644000076500000240000000055512254366350020125 0ustar mattstaff00000000000000============================ ``support.bitvector`` module ============================ .. automodule:: whoosh.idsets Base classes ============ .. autoclass:: DocIdSet :members: .. autoclass:: BaseBitSet Implementation classes ====================== .. autoclass:: BitSet .. autoclass:: OnDiskBitSet .. autoclass:: SortedIntSet .. autoclass:: MultiIdSet Whoosh-2.5.7/docs/source/api/index.rst0000644000076500000240000000107712254366350017741 0ustar mattstaff00000000000000================ ``index`` module ================ .. automodule:: whoosh.index Functions ========= .. autofunction:: create_in .. autofunction:: open_dir .. 
autofunction:: exists_in .. autofunction:: exists .. autofunction:: version_in .. autofunction:: version Base class ========== .. autoclass:: Index :members: Implementation ============== .. autoclass:: FileIndex Exceptions ========== .. autoexception:: LockError .. autoexception:: IndexError .. autoexception:: IndexVersionError .. autoexception:: OutOfDateError .. autoexception:: EmptyIndexError Whoosh-2.5.7/docs/source/api/lang/0000755000076500000240000000000012277504634017020 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/source/api/lang/morph_en.rst0000644000076500000240000000021712254366350021355 0ustar mattstaff00000000000000======================== ``lang.morph_en`` module ======================== .. automodule:: whoosh.lang.morph_en .. autofunction:: variations Whoosh-2.5.7/docs/source/api/lang/porter.rst0000644000076500000240000000020112254366350021052 0ustar mattstaff00000000000000====================== ``lang.porter`` module ====================== .. automodule:: whoosh.lang.porter .. autofunction:: stem Whoosh-2.5.7/docs/source/api/lang/wordnet.rst0000644000076500000240000000045512254366350021234 0ustar mattstaff00000000000000======================== ``lang.wordnet`` module ======================== .. automodule:: whoosh.lang.wordnet Thesaurus ========= .. autoclass:: Thesaurus :members: Low-level functions =================== .. autofunction:: parse_file .. autofunction:: synonyms .. autofunction:: make_index Whoosh-2.5.7/docs/source/api/matching.rst0000644000076500000240000000127412254366350020423 0ustar mattstaff00000000000000=================== ``matching`` module =================== .. automodule:: whoosh.matching Matchers ======== .. autoclass:: Matcher :members: .. autoclass:: NullMatcher .. autoclass:: ListMatcher .. autoclass:: WrappingMatcher .. autoclass:: MultiMatcher .. autoclass:: FilterMatcher .. autoclass:: BiMatcher .. autoclass:: AdditiveBiMatcher .. autoclass:: UnionMatcher .. autoclass:: DisjunctionMaxMatcher .. autoclass:: IntersectionMatcher .. autoclass:: AndNotMatcher .. autoclass:: InverseMatcher .. autoclass:: RequireMatcher .. autoclass:: AndMaybeMatcher .. autoclass:: ConstantScoreMatcher Exceptions ========== .. autoexception:: ReadTooFar .. autoexception:: NoQualityAvailable Whoosh-2.5.7/docs/source/api/qparser.rst0000644000076500000240000000305312254366350020303 0ustar mattstaff00000000000000================== ``qparser`` module ================== .. automodule:: whoosh.qparser Parser object ============= .. autoclass:: QueryParser :members: Pre-made configurations ----------------------- The following functions return pre-configured QueryParser objects. .. autofunction:: MultifieldParser .. autofunction:: SimpleParser .. autofunction:: DisMaxParser Plug-ins ======== .. autoclass:: Plugin :members: .. autoclass:: SingleQuotePlugin .. autoclass:: PrefixPlugin .. autoclass:: WildcardPlugin .. autoclass:: RegexPlugin .. autoclass:: BoostPlugin .. autoclass:: GroupPlugin .. autoclass:: EveryPlugin .. autoclass:: FieldsPlugin .. autoclass:: PhrasePlugin .. autoclass:: RangePlugin .. autoclass:: OperatorsPlugin .. autoclass:: PlusMinusPlugin .. autoclass:: GtLtPlugin .. autoclass:: MultifieldPlugin .. autoclass:: FieldAliasPlugin .. autoclass:: CopyFieldPlugin Syntax node objects =================== Base nodes ---------- .. autoclass:: SyntaxNode :members: Nodes ----- .. autoclass:: FieldnameNode .. autoclass:: TextNode .. autoclass:: WordNode .. autoclass:: RangeNode .. autoclass:: MarkerNode Group nodes ----------- .. autoclass:: GroupNode .. 
autoclass:: BinaryGroup .. autoclass:: ErrorNode .. autoclass:: AndGroup .. autoclass:: OrGroup .. autoclass:: AndNotGroup .. autoclass:: AndMaybeGroup .. autoclass:: DisMaxGroup .. autoclass:: RequireGroup .. autoclass:: NotGroup Operators --------- .. autoclass:: Operator .. autoclass:: PrefixOperator .. autoclass:: PostfixOperator .. autoclass:: InfixOperator Whoosh-2.5.7/docs/source/api/query.rst0000644000076500000240000000264612254366350020002 0ustar mattstaff00000000000000================ ``query`` module ================ .. automodule:: whoosh.query See also :mod:`whoosh.qparser` which contains code for parsing user queries into query objects. Base classes ============ The following abstract base classes are subclassed to create the "real" query operations. .. autoclass:: Query :members: .. autoclass:: CompoundQuery .. autoclass:: MultiTerm .. autoclass:: ExpandingTerm .. autoclass:: WrappingQuery Query classes ============= .. autoclass:: Term .. autoclass:: Variations .. autoclass:: FuzzyTerm .. autoclass:: Phrase .. autoclass:: And .. autoclass:: Or .. autoclass:: DisjunctionMax .. autoclass:: Not .. autoclass:: Prefix .. autoclass:: Wildcard .. autoclass:: Regex .. autoclass:: TermRange .. autoclass:: NumericRange .. autoclass:: DateRange .. autoclass:: Every .. autoclass:: NullQuery Binary queries ============== .. autoclass:: Require .. autoclass:: AndMaybe .. autoclass:: AndNot .. autoclass:: Otherwise Span queries ============ .. autoclass:: Span :members: .. autoclass:: SpanQuery .. autoclass:: SpanFirst .. autoclass:: SpanNear .. autoclass:: SpanNear2 .. autoclass:: SpanNot .. autoclass:: SpanOr .. autoclass:: SpanContains .. autoclass:: SpanBefore .. autoclass:: SpanCondition Special queries =============== .. autoclass:: NestedParent .. autoclass:: NestedChildren .. autoclass:: ConstantScoreQuery Exceptions ========== .. autoexception:: QueryError Whoosh-2.5.7/docs/source/api/reading.rst0000644000076500000240000000041712254366350020240 0ustar mattstaff00000000000000================== ``reading`` module ================== .. automodule:: whoosh.reading Classes ======= .. autoclass:: IndexReader :members: .. autoclass:: MultiReader .. autoclass:: TermInfo :members: Exceptions ========== .. autoexception:: TermNotFound Whoosh-2.5.7/docs/source/api/scoring.rst0000644000076500000240000000103712254366350020272 0ustar mattstaff00000000000000================== ``scoring`` module ================== .. automodule:: whoosh.scoring Base classes ============ .. autoclass:: WeightingModel :members: .. autoclass:: BaseScorer :members: .. autoclass:: WeightScorer .. autoclass:: WeightLengthScorer Scoring algorithm classes ========================= .. autoclass:: BM25F .. autoclass:: TF_IDF .. autoclass:: Frequency Scoring utility classes ======================= .. autoclass:: FunctionWeighting .. autoclass:: MultiWeighting .. autoclass:: ReverseWeighting Whoosh-2.5.7/docs/source/api/searching.rst0000644000076500000240000000063512254366350020574 0ustar mattstaff00000000000000==================== ``searching`` module ==================== .. automodule:: whoosh.searching Searching classes ================= .. autoclass:: Searcher :members: Results classes =============== .. autoclass:: Results :members: .. autoclass:: Hit :members: .. autoclass:: ResultsPage :members: Exceptions ========== .. autoexception:: NoTermsException .. 
autoexception:: TimeLimit Whoosh-2.5.7/docs/source/api/sorting.rst0000644000076500000240000000125612254366350020316 0ustar mattstaff00000000000000================== ``sorting`` module ================== .. automodule:: whoosh.sorting Base types ========== .. autoclass:: FacetType :members: .. autoclass:: Categorizer :members: Facet types =========== .. autoclass:: FieldFacet .. autoclass:: QueryFacet .. autoclass:: RangeFacet .. autoclass:: DateRangeFacet .. autoclass:: ScoreFacet .. autoclass:: FunctionFacet .. autoclass:: MultiFacet .. autoclass:: StoredFieldFacet Facets object ============= .. autoclass:: Facets :members: FacetType objects ================= .. autoclass:: FacetMap :members: .. autoclass:: OrderedList .. autoclass:: UnorderedList .. autoclass:: Count .. autoclass:: Best Whoosh-2.5.7/docs/source/api/spelling.rst0000644000076500000240000000076312254366350020450 0ustar mattstaff00000000000000=================== ``spelling`` module =================== See :doc:`correcting errors in user queries <../spelling>`. .. automodule:: whoosh.spelling Corrector objects ================= .. autoclass:: Corrector :members: .. autoclass:: ReaderCorrector .. autoclass:: GraphCorrector :members: .. autoclass:: MultiCorrector QueryCorrector objects ====================== .. autoclass:: QueryCorrector :members: .. autoclass:: SimpleQueryCorrector .. autoclass:: Correction Whoosh-2.5.7/docs/source/api/support/0000755000076500000240000000000012277504634017613 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/source/api/support/charset.rst0000644000076500000240000000045312254366350021774 0ustar mattstaff00000000000000========================== ``support.charset`` module ========================== .. automodule:: whoosh.support.charset .. data:: default_charset An extensive case- and accent folding charset table. Taken from http://speeple.com/unicode-maps.txt .. autofunction:: charset_table_to_dict Whoosh-2.5.7/docs/source/api/support/levenshtein.rst0000644000076500000240000000030212254366350022660 0ustar mattstaff00000000000000============================== ``support.levenshtein`` module ============================== .. automodule:: whoosh.support.levenshtein .. autofunction:: relative .. autofunction:: distance Whoosh-2.5.7/docs/source/api/util.rst0000644000076500000240000000013412254366350017600 0ustar mattstaff00000000000000=============== ``util`` module =============== .. automodule:: whoosh.util :members: Whoosh-2.5.7/docs/source/api/writing.rst0000644000076500000240000000051012254366350020304 0ustar mattstaff00000000000000================== ``writing`` module ================== .. automodule:: whoosh.writing Writer ====== .. autoclass:: IndexWriter :members: Utility writers =============== .. autoclass:: BufferedWriter :members: .. autoclass:: AsyncWriter :members: Exceptions ========== .. autoexception:: IndexingError Whoosh-2.5.7/docs/source/batch.rst0000644000076500000240000000750712254366350017146 0ustar mattstaff00000000000000=================================== Tips for speeding up batch indexing =================================== Overview ======== Indexing documents tends to fall into two general patterns: adding documents one at a time as they are created (as in a web application), and adding a bunch of documents at once (batch indexing). The following settings and alternate workflows can make batch indexing faster. 
StemmingAnalyzer cache ====================== The stemming analyzer by default uses a least-recently-used (LRU) cache to limit the amount of memory it uses, to prevent the cache from growing very large if the analyzer is reused for a long period of time. However, the LRU cache can slow down indexing by almost 200% compared to a stemming analyzer with an "unbounded" cache. When you're indexing in large batches with a one-shot instance of the analyzer, consider using an unbounded cache:: w = myindex.writer() # Get the analyzer object from a text field stem_ana = w.schema["content"].format.analyzer # Set the cachesize to -1 to indicate unbounded caching stem_ana.cachesize = -1 # Reset the analyzer to pick up the changed attribute stem_ana.clear() # Use the writer to index documents... The ``limitmb`` parameter ========================= The ``limitmb`` parameter to :meth:`whoosh.index.Index.writer` controls the *maximum* memory (in megabytes) the writer will use for the indexing pool. The higher the number, the faster indexing will be. The default value of ``128`` is actually somewhat low, considering many people have multiple gigabytes of RAM these days. Setting it higher can speed up indexing considerably:: from whoosh import index ix = index.open_dir("indexdir") writer = ix.writer(limitmb=256) .. note:: The actual memory used will be higher than this value because of interpreter overhead (up to twice as much!). It is very useful as a tuning parameter, but not for trying to exactly control the memory usage of Whoosh. The ``procs`` parameter ======================= The ``procs`` parameter to :meth:`whoosh.index.Index.writer` controls the number of processors the writer will use for indexing (via the ``multiprocessing`` module):: from whoosh import index ix = index.open_dir("indexdir") writer = ix.writer(procs=4) Note that when you use multiprocessing, the ``limitmb`` parameter controls the amount of memory used by *each process*, so the actual memory used will be ``limitmb * procs``:: # Each process will use a limit of 128, for a total of 512 writer = ix.writer(procs=4, limitmb=128) The ``multisegment`` parameter ============================== The ``procs`` parameter causes the default writer to use multiple processors to do much of the indexing, but then still uses a single process to merge the pool of each sub-writer into a single segment. You can get much better indexing speed by also using the ``multisegment=True`` keyword argument, which instead of merging the results of each sub-writer, simply has them each just write out a new segment:: from whoosh import index ix = index.open_dir("indexdir") writer = ix.writer(procs=4, multisegment=True) The drawback is that instead of creating a single new segment, this option creates a number of new segments **at least** equal to the number of processes you use. For example, if you use ``procs=4``, the writer will create four new segments. (If you merge old segments or call ``add_reader`` on the parent writer, the parent writer will also write a segment, meaning you'll get five new segments.) So, while ``multisegment=True`` is much faster than a normal writer, you should only use it for large batch indexing jobs (or perhaps only for indexing from scratch). It should not be the only method you use for indexing, because otherwise the number of segments will tend to increase forever! 
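Putting these options together, a batch indexing run might look something like the following sketch (the field name, document source and parameter values are only examples; tune them for your own hardware and collection)::

    from whoosh import index

    ix = index.open_dir("indexdir")
    writer = ix.writer(procs=4, limitmb=256, multisegment=True)

    # Use an unbounded cache on the stemming analyzer for the duration
    # of this one-shot batch job
    stem_ana = writer.schema["content"].format.analyzer
    stem_ana.cachesize = -1
    stem_ana.clear()

    # iter_documents() is a hypothetical generator of field dicts
    for doc in iter_documents():
        writer.add_document(**doc)

    writer.commit()
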
Whoosh-2.5.7/docs/source/conf.py0000644000076500000240000001437412254366350016632 0ustar mattstaff00000000000000 import sys, os, os.path sys.path.append(os.path.abspath("../../src")) import whoosh # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.append(os.path.abspath('.')) # -- General configuration ----------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.ifconfig'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. #source_encoding = 'utf-8' # The master toctree document. master_doc = 'index' # General information about the project. project = u'Whoosh' copyright = u'2007-2012 Matt Chaput' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = whoosh.versionstring(build=False) # The full version, including alpha/beta/rc tags. release = whoosh.versionstring() # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. #language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. #today_fmt = '%B %d, %Y' # List of documents that shouldn't be included in the build. #unused_docs = [] # List of directories, relative to source directory, that shouldn't be searched # for source files. exclude_trees = [] # The reST default role (used for this markup: `text`) to use for all documents. #default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. #add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). #add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. #show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. #modindex_common_prefix = [] # -- Options for HTML output --------------------------------------------------- # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. html_theme = 'default' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. html_theme_options = { "codebgcolor": "#CCC", } # Add any paths that contain custom themes here, relative to this directory. #html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". #html_title = None # A shorter title for the navigation bar. Default is the same as html_title. 
#html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. #html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. #html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". #html_static_path = ['_static'] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. #html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. #html_use_smartypants = True # Custom sidebar templates, maps document names to template names. #html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. #html_additional_pages = {} # If false, no module index is generated. #html_use_modindex = True # If false, no index is generated. #html_use_index = True # If true, the index is split into individual pages for each letter. #html_split_index = False # If true, links to the reST sources are added to the pages. #html_show_sourcelink = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. #html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = 'Whooshdoc' # -- Options for LaTeX output -------------------------------------------------- # The paper size ('letter' or 'a4'). #latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). #latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ ('index', 'Whoosh.tex', u'Whoosh Documentation', u'Matt Chaput', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. #latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. #latex_use_parts = False # Additional stuff for the LaTeX preamble. #latex_preamble = '' # Documents to append as an appendix to all manuals. #latex_appendices = [] # If false, no module index is generated. #latex_use_modindex = True # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = {'http://docs.python.org/': None} # Autodoc config autoclass_content = "both" Whoosh-2.5.7/docs/source/dates.rst0000644000076500000240000001461212254366350017160 0ustar mattstaff00000000000000================================ Indexing and parsing dates/times ================================ Indexing dates ============== Whoosh lets you index and search dates/times using the :class:`whoosh.fields.DATETIME` field type. 
Instead of passing text for the field in ``add_document()``, you use a Python
``datetime.datetime`` object::

    from datetime import datetime, timedelta
    from whoosh import fields, index

    schema = fields.Schema(title=fields.TEXT, content=fields.TEXT,
                           date=fields.DATETIME)
    ix = index.create_in("indexdir", schema)

    w = ix.writer()
    w.add_document(title="Document 1", content="Rendering images from the command line",
                   date=datetime.utcnow())
    w.add_document(title="Document 2", content="Creating shaders using a node network",
                   date=datetime.utcnow() + timedelta(days=1))
    w.commit()


Parsing date queries
====================

Once you have an indexed ``DATETIME`` field, you can search it using a rich
date parser contained in the
:class:`whoosh.qparser.dateparse.DateParserPlugin`::

    from whoosh import index
    from whoosh.qparser import QueryParser
    from whoosh.qparser.dateparse import DateParserPlugin

    ix = index.open_dir("indexdir")

    # Instantiate a query parser
    qp = QueryParser("content", ix.schema)

    # Add the DateParserPlugin to the parser
    qp.add_plugin(DateParserPlugin())

With the ``DateParserPlugin``, users can use date queries such as::

    20050912
    2005 sept 12th
    june 23 1978
    23 mar 2005
    july 1985
    sep 12
    today
    yesterday
    tomorrow
    now
    next friday
    last tuesday
    5am
    10:25:54
    23:12
    8 PM
    4:46 am
    oct 31 2010
    last tuesday to today
    today to next friday
    jan 2005 to feb 2008
    -1 week to now
    now to +2h
    -1y6mo to +2 yrs 23d

Normally, as with other types of queries containing spaces, the users need to
quote date queries containing spaces using single quotes::

    render date:'last tuesday' command
    date:['last tuesday' to 'next friday']

If you use the ``free`` argument to the ``DateParserPlugin``, the plugin will
try to parse dates from unquoted text following a date field prefix::

    qp.add_plugin(DateParserPlugin(free=True))

This allows the user to type a date query with spaces and special characters
following the name of the date field and a colon. The date query can be mixed
with other types of queries without quotes::

    date:last tuesday
    render date:oct 15th 2001 5:20am command

If you don't use the ``DateParserPlugin``, users can still search DATETIME
fields using a simple numeric form ``YYYY[MM[DD[hh[mm[ss]]]]]`` that is built
into the ``DATETIME`` field::

    from whoosh import index
    from whoosh.qparser import QueryParser

    ix = index.open_dir("indexdir")
    qp = QueryParser("content", schema=ix.schema)

    # Find all datetimes in 2005
    q = qp.parse(u"date:2005")

    # Find all datetimes on June 24, 2005
    q = qp.parse(u"date:20050624")

    # Find all datetimes from 1am-2am on June 24, 2005
    q = qp.parse(u"date:2005062401")

    # Find all datetimes from Jan 1, 2005 to June 2, 2010
    q = qp.parse(u"date:[20050101 to 20100602]")


About time zones and basetime
=============================

The best way to deal with time zones is to always index ``datetime``\ s in
native UTC form. Any ``tzinfo`` attribute on the ``datetime`` object is
*ignored* by the indexer. If you are working with local datetimes, you should
convert them to native UTC datetimes before indexing.


Date parser notes
=================

Please note that the date parser is still somewhat experimental.


Setting the base datetime
-------------------------

When you create the ``DateParserPlugin`` you can pass a ``datetime`` object to
the ``basedate`` argument to set the datetime against which relative queries
(such as ``last tuesday`` and ``-2 hours``) are measured.
By default, the basedate is ``datetime.utcnow()`` at the moment the plugin is instantiated:: qp.add_plugin(DateParserPlugin(basedate=my_datetime)) Registering an error callback ----------------------------- To avoid user queries causing exceptions in your application, the date parser attempts to fail silently when it can't parse a date query. However, you can register a callback function to be notified of parsing failures so you can display feedback to the user. The argument to the callback function is the date text that could not be parsed (this is an experimental feature and may change in future versions):: errors = [] def add_error(msg): errors.append(msg) qp.add_plugin(DateParserPlug(callback=add_error)) q = qp.parse(u"date:blarg") # errors == [u"blarg"] Using free parsing ------------------ While the ``free`` option is easier for users, it may result in ambiguities. As one example, if you want to find documents containing reference to a march and the number 2 in documents from the year 2005, you might type:: date:2005 march 2 This query would be interpreted correctly as a date query and two term queries when ``free=False``, but as a single date query when ``free=True``. In this case the user could limit the scope of the date parser with single quotes:: date:'2005' march 2 Parsable formats ---------------- The date parser supports a wide array of date and time formats, however it is not my intention to try to support *all* types of human-readable dates (for example ``ten to five the friday after next``). The best idea might be to pick a date format that works and try to train users on it, and if they use one of the other formats that also works consider it a happy accident. Limitations =========== * Since it's based on Python's ``datetime.datetime`` object, the ``DATETIME`` field shares all the limitations of that class, such as no support for dates before year 1 on the proleptic Gregorian calendar. The ``DATETIME`` field supports practically unlimited dates, so if the ``datetime`` object is every improved it could support it. An alternative possibility might be to add support for ``mxDateTime`` objects someday. * The ``DateParserPlugin`` currently only has support for English dates. The architecture supports creation of parsers for other languages, and I hope to add examples for other languages soon. * ``DATETIME`` fields do not currently support open-ended ranges. You can simulate an open ended range by using an endpoint far in the past or future. Whoosh-2.5.7/docs/source/facets.rst0000644000076500000240000006722712254366350017337 0ustar mattstaff00000000000000==================== Sorting and faceting ==================== .. note:: The API for sorting and faceting changed in Whoosh 3.0. Overview ======== Sorting and faceting search results in Whoosh is based on **facets**. Each facet associates a value with each document in the search results, allowing you to sort by the keys or use them to group the documents. Whoosh includes a variety of **facet types** you can use for sorting and grouping (see below). Sorting ======= By default, the results of a search are sorted with the highest-scoring documents first. You can use the ``sortedby`` keyword argument to order the results by some other criteria instead, such as the value of a field. 
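As a quick preview (each piece is explained in the sections below), sorting by a field named ``price`` would look something like this::

    # Assumes the "price" field was created with sortable=True (see below)
    results = searcher.search(myquery, sortedby="price")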
Making fields sortable ---------------------- In order to sort on a field, you should create the field using the ``sortable=True`` keyword argument:: schema = fields.Schema(title=fields.TEXT(sortable=True), content=fields.TEXT, modified=fields.DATETIME(sortable=True) ) It's possible to sort on a field that doesn't have ``sortable=True``, but this requires Whoosh to load the unique terms in the field into memory. Using ``sortable`` is much more efficient. About column types ------------------ When you create a field using ``sortable=True``, you are telling Whoosh to store per-document values for that field in a *column*. A column object specifies the format to use to store the per-document values on disk. The :mod:`whoosh.columns` module contains several different column object implementations. Each field type specifies a reasonable default column type (for example, the default for text fields is :class:`whoosh.columns.VarBytesColumn`, the default for numeric fields is :class:`whoosh.columns.NumericColumn`). However, if you want maximum efficiency you may want to use a different column type for a field. For example, if all document values in a field are a fixed length, you can use a :class:`whoosh.columns.FixedBytesColumn`. If you have a field where many documents share a relatively small number of possible values (an example might be a "category" field, or "month" or other enumeration type fields), you might want to use :class:`whoosh.columns.RefBytesColumn` (which can handle both variable and fixed-length values). There are column types for storing per-document bit values, structs, pickled objects, and compressed byte values. To specify a custom column object for a field, pass it as the ``sortable`` keyword argument instead of ``True``:: from whoosh import columns, fields category_col = columns.RefBytesColumn() schema = fields.Schema(title=fields.TEXT(sortable=True), category=fields.KEYWORD(sortable=category_col) Using a COLUMN field for custom sort keys ----------------------------------------- When you add a document with a sortable field, Whoosh uses the value you pass for the field as the sortable value. For example, if "title" is a sortable field, and you add this document:: writer.add_document(title="Mr. Palomar") ...then ``Mr. Palomar`` is stored in the field column as the sorting key for the document. This is usually good, but sometimes you need to "massage" the sortable key so it's different from the value the user searches and/or sees in the interface. For example, if you allow the user to sort by title, you might want to use different values for the visible title and the value used for sorting:: # Visible title title = "The Unbearable Lightness of Being" # Sortable title: converted to lowercase (to prevent different ordering # depending on uppercase/lowercase), with initial article moved to the end sort_title = "unbearable lightness of being, the" The best way to do this is to use an additional field just for sorting. You can use the :class:`whoosh.fields.COLUMN` field type to create a field that is not indexed or stored, it only holds per-document column values:: schema = fields.Schema(title=fields.TEXT(stored=True), sort_title=fields.COLUMN(columns.VarBytesColumn()) ) The single argument to the :class:`whoosh.fields.COLUMN` initializer is a :class:`whoosh.columns.ColumnType` object. You can use any of the various column types in the :mod:`whoosh.columns` module. 
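To round out this example, here is a sketch (reusing the ``title``/``sort_title`` schema above, plus the "massaged" sort key suggested earlier) of how the two fields might be populated and then used when searching::

    writer = ix.writer()
    writer.add_document(title="The Unbearable Lightness of Being",
                        sort_title="unbearable lightness of being, the")
    writer.commit()

    with ix.searcher() as s:
        # Sort on the hidden column field, but display the stored title
        results = s.search(myquery, sortedby="sort_title")
        for hit in results:
            print(hit["title"])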
As another example, say you are indexing documents that have a custom sorting order associated with each document, such as a "priority" number:: name=Big Wheel price=100 priority=1 name=Toss Across price=40 priority=3 name=Slinky price=25 priority=2 ... You can use a column field with a numeric column object to hold the "priority" and use it for sorting:: schema = fields.Schema(name=fields.TEXT(stored=True), price=fields.NUMERIC(stored=True), priority=fields.COLUMN(columns.NumericColumn("i"), ) (Note that :class:`columns.NumericColumn` takes a type code character like the codes used by Python's ``struct`` and ``array`` modules.) Making existing fields sortable ------------------------------- If you have an existing index from before the ``sortable`` argument was added in Whoosh 3.0, or you didn't think you needed a field to be sortable but now you find that you need to sort it, you can add "sortability" to an existing index using the :func:`whoosh.sorting.add_sortable` utility function:: from whoosh import columns, fields, index, sorting # Say we have an existing index with this schema schema = fields.Schema(title=fields.TEXT, price=fields.NUMERIC) # To use add_sortable, first open a writer for the index ix = index.open_dir("indexdir") with ix.writer() as w: # Add sortable=True to the "price" field using field terms as the # sortable values sorting.add_sortable(w, "price", sorting.FieldFacet("price")) # Add sortable=True to the "title" field using the # stored field values as the sortable value sorting.add_sortable(w, "title", sorting.StoredFieldFacet("title")) You can specify a custom column type when you call ``add_sortable`` using the ``column`` keyword argument:: add_sortable(w, "chapter", sorting.FieldFacet("chapter"), column=columns.RefBytesColumn()) See the documentation for :func:`~whoosh.sorting.add_sortable` for more information. Sorting search results ---------------------- When you tell Whoosh to sort by a field (or fields), it uses the per-document values in the field's column as sorting keys for the documents. Normally search results are sorted by descending relevance score. You can tell Whoosh to use a different ordering by passing the ``sortedby`` keyword argument to the :meth:`~whoosh.searching.Searcher.search` method:: from whoosh import fields, index, qparser schema = fields.Schema(title=fields.TEXT(stored=True), price=fields.NUMERIC(sortable=True)) ix = index.create_in("indexdir", schema) with ix.writer() as w: w.add_document(title="Big Deal", price=20) w.add_document(title="Mr. Big", price=10) w.add_document(title="Big Top", price=15) with ix.searcher() as s: qp = qparser.QueryParser("big", ix.schema) q = qp.parse(user_query_string) # Sort search results from lowest to highest price results = s.search(q, sortedby="price") for hit in results: print(hit["title"]) You can use any of the following objects as ``sortedby`` values: A ``FacetType`` object Uses this object to sort the documents. See below for the available facet types. A field name string Converts the field name into a ``FieldFacet`` (see below) and uses it to sort the documents. A list of ``FacetType`` objects and/or field name strings Bundles the facets together into a ``MultiFacet`` so you can sort by multiple keys. Note that this shortcut does not allow you to reverse the sort direction of individual facets. To do that, you need to construct the ``MultiFacet`` object yourself. .. 
note:: You can use the ``reverse=True`` keyword argument to the ``Searcher.search()`` method to reverse the overall sort direction. This is more efficient than reversing each individual facet. Examples -------- Sort by the value of the size field:: results = searcher.search(myquery, sortedby="size") Sort by the reverse (highest-to-lowest) order of the "price" field:: facet = sorting.FieldFacet("price", reverse=True) results = searcher.search(myquery, sortedby=facet) Sort by ascending size and then descending price:: mf = sorting.MultiFacet() mf.add_field("size") mf.add_field("price", reverse=True) results = searcher.search(myquery, sortedby=mf) # or... sizes = sorting.FieldFacet("size") prices = sorting.FieldFacet("price", reverse=True) results = searcher.search(myquery, sortedby=[sizes, prices]) Sort by the "category" field, then by the document's score:: cats = sorting.FieldFacet("category") scores = sorting.ScoreFacet() results = searcher.search(myquery, sortedby=[cats, scores]) Accessing column values ----------------------- Per-document column values are available in :class:`~whoosh.searching.Hit` objects just like stored field values:: schema = fields.Schema(title=fields.TEXT(stored=True), price=fields.NUMERIC(sortable=True)) ... results = searcher.search(myquery) for hit in results: print(hit["title"], hit["price"]) ADVANCED: if you want to access abitrary per-document values quickly you can get a column reader object:: with ix.searcher() as s: reader = s.reader() colreader = s.reader().column_reader("price") for docnum in reader.all_doc_ids(): print(colreader[docnum]) Grouping ======== It is often very useful to present "faceted" search results to the user. Faceting is dynamic grouping of search results into categories. The categories let users view a slice of the total results based on the categories they're interested in. For example, if you are programming a shopping website, you might want to display categories with the search results such as the manufacturers and price ranges. ==================== ================= Manufacturer Price -------------------- ----------------- Apple (5) $0 - $100 (2) Sanyo (1) $101 - $500 (10) Sony (2) $501 - $1000 (1) Toshiba (5) ==================== ================= You can let your users click the different facet values to only show results in the given categories. Another useful UI pattern is to show, say, the top 5 results for different types of found documents, and let the user click to see more results from a category they're interested in, similarly to how the Spotlight quick results work on Mac OS X. The ``groupedby`` keyword argument ---------------------------------- You can use the following objects as ``groupedby`` values: A ``FacetType`` object Uses this object to group the documents. See below for the available facet types. A field name string Converts the field name into a ``FieldFacet`` (see below) and uses it to sort the documents. The name of the field is used as the facet name. A list or tuple of field name strings Sets up multiple field grouping criteria. A dictionary mapping facet names to ``FacetType`` objects Sets up multiple grouping criteria. A ``Facets`` object This object is a lot like using a dictionary, but has some convenience methods to make setting up multiple groupings a little easier. 
Examples -------- Group by the value of the "category" field:: results = searcher.search(myquery, groupedby="category") Group by the value of the "category" field and also by the value of the "tags" field and a date range:: cats = sorting.FieldFacet("category") tags = sorting.FieldFacet("tags", allow_overlap=True) results = searcher.search(myquery, groupedby={"category": cats, "tags": tags}) # ...or, using a Facets object has a little less duplication facets = sorting.Facets() facets.add_field("category") facets.add_field("tags", allow_overlap=True) results = searcher.search(myquery, groupedby=facets) To group results by the *intersected values of multiple fields*, use a ``MultiFacet`` object (see below). For example, if you have two fields named ``tag`` and ``size``, you could group the results by all combinations of the ``tag`` and ``size`` field, such as ``('tag1', 'small')``, ``('tag2', 'small')``, ``('tag1', 'medium')``, and so on:: # Generate a grouping from the combination of the "tag" and "size" fields mf = MultiFacet("tag", "size") results = searcher.search(myquery, groupedby={"tag/size": mf}) Getting the faceted groups -------------------------- The ``Results.groups("facetname")`` method returns a dictionary mapping category names to lists of **document IDs**:: myfacets = sorting.Facets().add_field("size").add_field("tag") results = mysearcher.search(myquery, groupedby=myfacets) results.groups("size") # {"small": [8, 5, 1, 2, 4], "medium": [3, 0, 6], "large": [7, 9]} If there is only one facet, you can just use ``Results.groups()`` with no argument to access its groups:: results = mysearcher.search(myquery, groupedby=myfunctionfacet) results.groups() By default, the values in the dictionary returned by ``groups()`` are lists of document numbers in the same relative order as in the results. 
You can use the ``Searcher`` object's ``stored_fields()`` method to take a
document number and return the document's stored fields as a dictionary::

    for category_name in categories:
        print("Top 5 documents in the %s category" % category_name)
        doclist = categories[category_name]
        for docnum in doclist[:5]:
            print("  ", searcher.stored_fields(docnum))
        if len(doclist) > 5:
            print("  (%s more)" % (len(doclist) - 5))

If you want different information about the groups, for example just the count
of documents in each group, or you don't need the groups to be ordered, you can
specify a :class:`whoosh.sorting.FacetMap` type or instance with the
``maptype`` keyword argument when creating the ``FacetType``::

    # This is the same as the default
    myfacet = FieldFacet("size", maptype=sorting.OrderedList)
    results = mysearcher.search(myquery, groupedby=myfacet)
    results.groups()
    # {"small": [8, 5, 1, 2, 4], "medium": [3, 0, 6], "large": [7, 9]}

    # Don't sort the groups to match the order of documents in the results
    # (faster)
    myfacet = FieldFacet("size", maptype=sorting.UnorderedList)
    results = mysearcher.search(myquery, groupedby=myfacet)
    results.groups()
    # {"small": [1, 2, 4, 5, 8], "medium": [0, 3, 6], "large": [7, 9]}

    # Only count the documents in each group
    myfacet = FieldFacet("size", maptype=sorting.Count)
    results = mysearcher.search(myquery, groupedby=myfacet)
    results.groups()
    # {"small": 5, "medium": 3, "large": 2}

    # Only remember the "best" document in each group
    myfacet = FieldFacet("size", maptype=sorting.Best)
    results = mysearcher.search(myquery, groupedby=myfacet)
    results.groups()
    # {"small": 8, "medium": 3, "large": 7}

Alternatively you can specify a ``maptype`` argument in the
``Searcher.search()`` method call which applies to all facets::

    results = mysearcher.search(myquery, groupedby=["size", "tag"],
                                maptype=sorting.Count)

(You can override this overall ``maptype`` argument on individual facets by
specifying the ``maptype`` argument for them as well.)


Facet types
===========

FieldFacet
----------

This is the most common facet type. It sorts or groups based on the value in a
certain field in each document. This generally works best (or at all) if each
document has only one term in the field (e.g. an ID field)::

    # Sort search results by the value of the "path" field
    facet = sorting.FieldFacet("path")
    results = searcher.search(myquery, sortedby=facet)

    # Group search results by the value of the "parent" field
    facet = sorting.FieldFacet("parent")
    results = searcher.search(myquery, groupedby=facet)
    parent_groups = results.groups("parent")

By default, ``FieldFacet`` only supports **non-overlapping** grouping, where a
document cannot belong to multiple facets at the same time (each document will
be sorted into one category arbitrarily). To get overlapping groups with
multi-valued fields, use the ``allow_overlap=True`` keyword argument::

    facet = sorting.FieldFacet(fieldname, allow_overlap=True)

This supports overlapping group membership where documents have more than one
term in a field (e.g. KEYWORD fields). If you don't need overlapping, don't
use ``allow_overlap`` because it's *much* slower and uses more memory (see the
section on ``allow_overlap`` below).


QueryFacet
----------

You can set up categories defined by arbitrary queries.
For example, you can group names using prefix queries:: # Use queries to define each category # (Here I'll assume "price" is a NUMERIC field, so I'll use # NumericRange) qdict = {} qdict["A-D"] = query.TermRange("name", "a", "d") qdict["E-H"] = query.TermRange("name", "e", "h") qdict["I-L"] = query.TermRange("name", "i", "l") # ... qfacet = sorting.QueryFacet(qdict) r = searcher.search(myquery, groupedby={"firstltr": qfacet}) By default, ``QueryFacet`` only supports **non-overlapping** grouping, where a document cannot belong to multiple facets at the same time (each document will be sorted into one category arbitrarily). To get overlapping groups with multi-valued fields, use the ``allow_overlap=True`` keyword argument:: facet = sorting.QueryFacet(querydict, allow_overlap=True) RangeFacet ---------- The ``RangeFacet`` is for NUMERIC field types. It divides a range of possible values into groups. For example, to group documents based on price into buckets $100 "wide":: pricefacet = sorting.RangeFacet("price", 0, 1000, 100) The first argument is the name of the field. The next two arguments are the full range to be divided. Value outside this range (in this example, values below 0 and above 1000) will be sorted into the "missing" (None) group. The fourth argument is the "gap size", the size of the divisions in the range. The "gap" can be a list instead of a single value. In that case, the values in the list will be used to set the size of the initial divisions, with the last value in the list being the size for all subsequent divisions. For example:: pricefacet = sorting.RangeFacet("price", 0, 1000, [5, 10, 35, 50]) ...will set up divisions of 0-5, 5-15, 15-50, 50-100, and then use 50 as the size for all subsequent divisions (i.e. 100-150, 150-200, and so on). The ``hardend`` keyword argument controls whether the last division is clamped to the end of the range or allowed to go past the end of the range. For example, this:: facet = sorting.RangeFacet("num", 0, 10, 4, hardend=False) ...gives divisions 0-4, 4-8, and 8-12, while this:: facet = sorting.RangeFacet("num", 0, 10, 4, hardend=True) ...gives divisions 0-4, 4-8, and 8-10. (The default is ``hardend=False``.) .. note:: The ranges/buckets are always **inclusive** at the start and **exclusive** at the end. DateRangeFacet -------------- This is like ``RangeFacet`` but for DATETIME fields. The start and end values must be ``datetime.datetime`` objects, and the gap(s) is/are ``datetime.timedelta`` objects. For example:: from datetime import datetime, timedelta start = datetime(2000, 1, 1) end = datetime.now() gap = timedelta(days=365) bdayfacet = sorting.DateRangeFacet("birthday", start, end, gap) As with ``RangeFacet``, you can use a list of gaps and the ``hardend`` keyword argument. ScoreFacet ---------- This facet is sometimes useful for sorting. For example, to sort by the "category" field, then for documents with the same category, sort by the document's score:: cats = sorting.FieldFacet("category") scores = sorting.ScoreFacet() results = searcher.search(myquery, sortedby=[cats, scores]) The ``ScoreFacet`` always sorts higher scores before lower scores. .. note:: While using ``sortedby=ScoreFacet()`` should give the same results as using the default scored ordering (``sortedby=None``), using the facet will be slower because Whoosh automatically turns off many optimizations when sorting. FunctionFacet ------------- This facet lets you pass a custom function to compute the sorting/grouping key for documents. 
(Using this facet type may be easier than subclassing FacetType and Categorizer to set up some custom behavior.) The function will be called with the index searcher and index document ID as arguments. For example, if you have an index with term vectors:: schema = fields.Schema(id=fields.STORED, text=fields.TEXT(stored=True, vector=True)) ix = RamStorage().create_index(schema) ...you could use a function to sort documents higher the closer they are to having equal occurances of two terms:: def fn(searcher, docnum): v = dict(searcher.vector_as("frequency", docnum, "text")) # Sort documents that have equal number of "alfa" and "bravo" first return 0 - (1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0)) facet = sorting.FunctionFacet(fn) results = searcher.search(myquery, sortedby=facet) StoredFieldFacet ---------------- This facet lets you use stored field values as the sorting/grouping key for documents. This is usually slower than using an indexed field, but when using ``allow_overlap`` it can actually be faster for large indexes just because it avoids the overhead of reading posting lists. :class:`~whoosh.sorting.StoredFieldFacet` supports ``allow_overlap`` by splitting the stored value into separate keys. By default it calls the value's ``split()`` method (since most stored values are strings), but you can supply a custom split function. See the section on ``allow_overlap`` below. MultiFacet ========== This facet type returns a composite of the keys returned by two or more sub-facets, allowing you to sort/group by the intersected values of multiple facets. ``MultiFacet`` has methods for adding facets:: myfacet = sorting.RangeFacet(0, 1000, 10) mf = sorting.MultiFacet() mf.add_field("category") mf.add_field("price", reverse=True) mf.add_facet(myfacet) mf.add_score() You can also pass a list of field names and/or ``FacetType`` objects to the initializer:: prices = sorting.FieldFacet("price", reverse=True) scores = sorting.ScoreFacet() mf = sorting.MultiFacet("category", prices, myfacet, scores) Missing values ============== * When sorting, documents without any terms in a given field, or whatever else constitutes "missing" for different facet types, will always sort to the end. * When grouping, "missing" documents will appear in a group with the key ``None``. Using overlapping groups ======================== The common supported workflow for grouping and sorting is where the given field has *one value for document*, for example a ``path`` field containing the file path of the original document. By default, facets are set up to support this single-value approach. Of course, there are situations where you want documents to be sorted into multiple groups based on a field with multiple terms per document. The most common example would be a ``tags`` field. The ``allow_overlap`` keyword argument to the :class:`~whoosh.sorting.FieldFacet`, :class:`~whoosh.sorting.QueryFacet`, and :class:`~whoosh.sorting.StoredFieldFacet` allows this multi-value approach. However, there is an important caveat: using ``allow_overlap=True`` is slower than the default, potentially *much* slower for very large result sets. This is because Whoosh must read every posting of every term in the field to create a temporary "forward index" mapping documents to terms. If a field is indexed with *term vectors*, ``FieldFacet`` will use them to speed up ``allow_overlap`` faceting for small result sets, but for large result sets, where Whoosh has to open the vector list for every matched document, this can still be very slow. 
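Before turning to the stored-field alternative described next, here is a sketch of the term-vector approach for a multi-valued field. This assumes the ``KEYWORD`` field type accepts the same ``vector=True`` argument used with ``TEXT`` in the ``FunctionFacet`` example above; adjust the field type to fit your schema::

    from whoosh import fields, sorting

    # Storing term vectors lets FieldFacet read each document's terms
    # directly instead of scanning posting lists for the whole field
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           tags=fields.KEYWORD(vector=True))

    # ... index documents, then at search time:
    facet = sorting.FieldFacet("tags", allow_overlap=True)
    results = searcher.search(myquery, groupedby=facet)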
For very large indexes and result sets, if a field is stored, you can get
faster overlapped faceting using :class:`~whoosh.sorting.StoredFieldFacet`
instead of ``FieldFacet``. While reading stored values is usually slower than
using the index, in this case avoiding the overhead of opening large numbers of
posting readers can make it worthwhile.

``StoredFieldFacet`` supports ``allow_overlap`` by loading the stored value for
the given field and splitting it into multiple values. The default is to call
the value's ``split()`` method. For example, if you've stored the ``tags``
field as a string like ``"tag1 tag2 tag3"``::

    schema = fields.Schema(name=fields.TEXT(stored=True),
                           tags=fields.KEYWORD(stored=True))
    ix = index.create_in("indexdir")
    with ix.writer() as w:
        w.add_document(name="A Midsummer Night's Dream", tags="comedy fairies")
        w.add_document(name="Hamlet", tags="tragedy denmark")
        # etc.

...Then you can use a ``StoredFieldFacet`` like this::

    ix = index.open_dir("indexdir")
    with ix.searcher() as s:
        sff = sorting.StoredFieldFacet("tags", allow_overlap=True)
        results = s.search(myquery, groupedby={"tags": sff})

For stored Python objects other than strings, you can supply a split function
(using the ``split_fn`` keyword argument to ``StoredFieldFacet``). The function
should accept a single argument (the stored value) and return a list or tuple
of grouping keys.


Using a custom sort order
=========================

It is sometimes useful to have a custom sort order per-search. For example,
different languages use different sort orders. If you have a function to return
the sorting order you want for a given field value, such as an implementation
of the Unicode Collation Algorithm (UCA), you can customize the sort order for
the user's language.

The :class:`whoosh.sorting.TranslateFacet` lets you apply a function to the
value of another facet. This lets you "translate" a field value into an
arbitrary sort key, such as with UCA::

    from pyuca import Collator

    # The Collator object has a sort_key() method which takes a unicode
    # string and returns a sort key
    c = Collator("allkeys.txt")

    # Make a facet object for the field you want to sort on
    nf = sorting.FieldFacet("name")

    # Wrap the facet in a TranslateFacet with the translation function
    # (the Collator object's sort_key method)
    tf = sorting.TranslateFacet(c.sort_key, nf)

    # Use the facet to sort the search results
    results = searcher.search(myquery, sortedby=tf)

(You can pass multiple "wrapped" facets to the ``TranslateFacet``, and it will
call the function with the values of the facets as multiple arguments.)

The TranslateFacet can also be very useful with numeric fields to sort on the
output of some formula::

    # Sort based on the average of two numeric fields
    def average(a, b):
        return (a + b) / 2.0

    # Create two facets for the fields and pass them with the function to
    # TranslateFacet
    af = sorting.FieldFacet("age")
    wf = sorting.FieldFacet("weight")
    facet = sorting.TranslateFacet(average, af, wf)

    results = searcher.search(myquery, sortedby=facet)

Remember that you can still sort by multiple facets. For example, you could
sort by a numeric value transformed by a quantizing function first, and then if
that is equal sort by the value of another field::

    # Sort by a quantized size first, then by name
    tf = sorting.TranslateFacet(quantize, sorting.FieldFacet("size"))
    results = searcher.search(myquery, sortedby=[tf, "name"])


Expert: writing your own facet
==============================

TBD.
Whoosh-2.5.7/docs/source/fieldcaches.rst0000644000076500000240000000326212254366350020311 0ustar mattstaff00000000000000============ Field caches ============ The default (``filedb``) backend uses *field caches* in certain circumstances. The field cache basically pre-computes the order of documents in the index to speed up sorting and faceting. Generating field caches can take time the first time you sort/facet on a large index. The field cache is kept in memory (and by default written to disk when it is generated) so subsequent sorted/faceted searches should be faster. The default caching policy never expires field caches, so reused searchers and/or sorting a lot of different fields could use up quite a bit of memory with large indexes. Customizing cache behaviour =========================== (The following API examples refer to the default ``filedb`` backend.) *By default*, Whoosh saves field caches to disk. To prevent a reader or searcher from writing out field caches, do this before you start using it:: searcher.set_caching_policy(save=False) By default, if caches are written to disk they are saved in the index directory. To tell a reader or searcher to save cache files to a different location, create a storage object and pass it to the ``storage`` keyword argument:: from whoosh.filedb.filestore import FileStorage mystorage = FileStorage("path/to/cachedir") reader.set_caching_policy(storage=mystorage) Creating a custom caching policy ================================ Expert users who want to implement a custom caching policy (for example, to add cache expiration) should subclass :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`. Then you can pass an instance of your policy object to the ``set_caching_policy`` method:: searcher.set_caching_policy(MyPolicy()) Whoosh-2.5.7/docs/source/glossary.rst0000644000076500000240000000563512254366350017730 0ustar mattstaff00000000000000.. _glossary: ======== Glossary ======== .. glossary:: Analysis The process of breaking the text of a field into individual *terms* to be indexed. This consists of tokenizing the text into terms, and then optionally filtering the tokenized terms (for example, lowercasing and removing *stop words*). Whoosh includes several different analyzers. Corpus The set of documents you are indexing. Documents The individual pieces of content you want to make searchable. The word "documents" might imply files, but the data source could really be anything -- articles in a content management system, blog posts in a blogging system, chunks of a very large file, rows returned from an SQL query, individual email messages from a mailbox file, or whatever. When you get search results from Whoosh, the results are a list of documents, whatever "documents" means in your search engine. Fields Each document contains a set of fields. Typical fields might be "title", "content", "url", "keywords", "status", "date", etc. Fields can be indexed (so they're searchable) and/or stored with the document. Storing the field makes it available in search results. For example, you typically want to store the "title" field so your search results can display it. Forward index A table listing every document and the words that appear in the document. Whoosh lets you store *term vectors* that are a kind of forward index. Indexing The process of examining documents in the corpus and adding them to the *reverse index*. 
Postings The *reverse index* lists every word in the corpus, and for each word, a list of documents in which that word appears, along with some optional information (such as the number of times the word appears in that document). These items in the list, containing a document number and any extra information, are called *postings*. In Whoosh the information stored in postings is customizable for each *field*. Reverse index Basically a table listing every word in the corpus, and for each word, the list of documents in which it appears. It can be more complicated (the index can also list how many times the word appears in each document, the positions at which it appears, etc.) but that's how it basically works. Schema Whoosh requires that you specify the *fields* of the index before you begin indexing. The Schema associates field names with metadata about the field, such as the format of the *postings* and whether the contents of the field are stored in the index. Term vector A *forward index* for a certain field in a certain document. You can specify in the Schema that a given field should store term vectors. Whoosh-2.5.7/docs/source/highlight.rst0000644000076500000240000003246212254366350020032 0ustar mattstaff00000000000000================================================ How to create highlighted search result excerpts ================================================ Overview ======== The highlighting system works as a pipeline, with four component types. * **Fragmenters** chop up the original text into __fragments__, based on the locations of matched terms in the text. * **Scorers** assign a score to each fragment, allowing the system to rank the best fragments by whatever criterion. * **Order functions** control in what order the top-scoring fragments are presented to the user. For example, you can show the fragments in the order they appear in the document (FIRST) or show higher-scoring fragments first (SCORE) * **Formatters** turn the fragment objects into human-readable output, such as an HTML string. Requirements ============ Highlighting requires that you have the text of the indexed document available. You can keep the text in a stored field, or if the original text is available in a file, database column, etc, just reload it on the fly. Note that you might need to process the text to remove e.g. HTML tags, wiki markup, etc. How to ====== Get search results:: results = mysearcher.search(myquery) for hit in results: print(hit["title"]) You can use the :meth:`~whoosh.searching.Hit.highlights` method on the :class:`whoosh.searching.Hit` object to get highlighted snippets from the document containing the search terms. The first argument is the name of the field to highlight. If the field is stored, this is the only argument you need to supply:: results = mysearcher.search(myquery) for hit in results: print(hit["title"]) # Assume "content" field is stored print(hit.highlights("content")) If the field is not stored, you need to retrieve the text of the field some other way. For example, reading it from the original file or a database. Then you can supply the text to highlight with the ``text`` argument:: results = mysearcher.search(myquery) for hit in results: print(hit["title"]) # Assume the "path" stored field contains a path to the original file with open(hit["path"]) as fileobj: filecontents = fileobj.read() print(hit.highlights("content", text=filecontents)) The character limit =================== By default, Whoosh only pulls fragments from the first 32K characters of the text. 
This prevents very long texts from bogging down the highlighting process too much, and is usually justified since important/summary information is usually at the start of a document. However, if you find the highlights are missing information (for example, very long encyclopedia articles where the terms appear in a later section), you can increase the fragmenter's character limit. You can change the character limit on the results object like this:: results = mysearcher.search(myquery) results.fragmenter.charlimit = 100000 To turn off the character limit:: results.fragmenter.charlimit = None If you instantiate a custom fragmenter, you can set the character limit on it directly:: sf = highlight.SentenceFragmenter(charlimit=100000) results.fragmenter = sf See below for information on customizing the highlights. If you increase or disable the character limit to highlight long documents, you may need to use the tips in the "speeding up highlighting" section below to make highlighting faster. Customizing the highlights ========================== Number of fragments ------------------- You can use the ``top`` keyword argument to control the number of fragments returned in each snippet:: # Show a maximum of 5 fragments from the document print hit.highlights("content", top=5) Fragment size ------------- The default fragmenter has a ``maxchars`` attribute (default 200) controlling the maximum length of a fragment, and a ``surround`` attribute (default 20) controlling the maximum number of characters of context to add at the beginning and end of a fragment:: # Allow larger fragments results.fragmenter.maxchars = 300 # Show more context before and after results.fragmenter.surround = 50 Fragmenter ---------- A fragmenter controls how to extract excerpts from the original text. The ``highlight`` module has the following pre-made fragmenters: :class:`whoosh.highlight.ContextFragmenter` (the default) This is a "smart" fragmenter that finds matched terms and then pulls in surround text to form fragments. This fragmenter only yields fragments that contain matched terms. :class:`whoosh.highlight.SentenceFragmenter` Tries to break the text into fragments based on sentence punctuation (".", "!", and "?"). This object works by looking in the original text for a sentence end as the next character after each token's 'endchar'. Can be fooled by e.g. source code, decimals, etc. :class:`whoosh.highlight.WholeFragmenter` Returns the entire text as one "fragment". This can be useful if you are highlighting a short bit of text and don't need to fragment it. The different fragmenters have different options. For example, the default :class:`~whoosh.highlight.ContextFragmenter` lets you set the maximum fragment size and the size of the context to add on either side:: my_cf = highlight.ContextFragmenter(maxchars=100, surround=30) See the :mod:`whoosh.highlight` docs for more information. To use a different fragmenter:: results.fragmenter = my_cf Scorer ------ A scorer is a callable that takes a :class:`whoosh.highlight.Fragment` object and returns a sortable value (where higher values represent better fragments). The default scorer adds up the number of matched terms in the fragment, and adds a "bonus" for the number of __different__ matched terms. The highlighting system uses this score to select the best fragments to show to the user. 
As an example of a custom scorer, to rank fragments by lowest standard deviation of the positions of matched terms in the fragment:: def StandardDeviationScorer(fragment): """Gives higher scores to fragments where the matched terms are close together. """ # Since lower values are better in this case, we need to negate the # value return 0 - stddev([t.pos for t in fragment.matched]) To use a different scorer:: results.scorer = StandardDeviationScorer Order ----- The order is a function that takes a fragment and returns a sortable value used to sort the highest-scoring fragments before presenting them to the user (where fragments with lower values appear before fragments with higher values). The ``highlight`` module has the following order functions. ``FIRST`` (the default) Show fragments in the order they appear in the document. ``SCORE`` Show highest scoring fragments first. The ``highlight`` module also includes ``LONGER`` (longer fragments first) and ``SHORTER`` (shorter fragments first), but they probably aren't as generally useful. To use a different order:: results.order = highlight.SCORE Formatter --------- A formatter contols how the highest scoring fragments are turned into a formatted bit of text for display to the user. It can return anything (e.g. plain text, HTML, a Genshi event stream, a SAX event generator, or anything else useful to the calling system). The ``highlight`` module contains the following pre-made formatters. :class:`whoosh.highlight.HtmlFormatter` Outputs a string containing HTML tags (with a class attribute) around the matched terms. :class:`whoosh.highlight.UppercaseFormatter` Converts the matched terms to UPPERCASE. :class:`whoosh.highlight.GenshiFormatter` Outputs a Genshi event stream, with the matched terms wrapped in a configurable element. The easiest way to create a custom formatter is to subclass ``highlight.Formatter`` and override the ``format_token`` method:: class BracketFormatter(highlight.Formatter): """Puts square brackets around the matched terms. """ def format_token(self, text, token, replace=False): # Use the get_text function to get the text corresponding to the # token tokentext = highlight.get_text(text, token) # Return the text as you want it to appear in the highlighted # string return "[%s]" % tokentext To use a different formatter:: brf = BracketFormatter() results.formatter = brf If you need more control over the formatting (or want to output something other than strings), you will need to override other methods. See the documentation for the :class:`whoosh.highlight.Formatter` class. Highlighter object ================== Rather than setting attributes on the results object, you can create a reusable :class:`whoosh.highlight.Highlighter` object. Keyword arguments let you change the ``fragmenter``, ``scorer``, ``order``, and/or ``formatter``:: hi = highlight.Highlighter(fragmenter=my_cf, scorer=sds) You can then use the :meth:`whoosh.highlight.Highlighter.highlight_hit` method to get highlights for a ``Hit`` object:: for hit in results: print(hit["title"]) print(hi.highlight_hit(hit)) (When you assign to a ``Results`` object's ``fragmenter``, ``scorer``, ``order``, or ``formatter`` attributes, you're actually changing the values on the results object's default ``Highlighter`` object.) 
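Putting the pieces from the preceding sections together, a reusable highlighter that combines a custom fragmenter, ordering, and formatter might look like this: a sketch that reuses the ``BracketFormatter`` defined above and the example ``content`` field, with the keyword arguments listed earlier::

    from whoosh import highlight

    hi = highlight.Highlighter(
        fragmenter=highlight.ContextFragmenter(maxchars=300, surround=40),
        order=highlight.SCORE,
        formatter=BracketFormatter())

    for hit in results:
        print(hit["title"])
        print(hi.highlight_hit(hit, "content"))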
Speeding up highlighting ======================== Recording which terms matched in which documents during the search may make highlighting faster, since it will skip documents it knows don't contain any matching terms in the given field:: # Record per-document term matches results = searcher.search(myquery, terms=True) PinpointFragmenter ------------------ Usually the highlighting system uses the field's analyzer to re-tokenize the document's text to find the matching terms in context. If you have long documents and have increased/disabled the character limit, and/or if the field has a very complex analyzer, re-tokenizing may be slow. Instead of retokenizing, Whoosh can look up the character positions of the matched terms in the index. Looking up the character positions is not instantaneous, but is usually faster than analyzing large amounts of text. To use :class:`whoosh.highlight.PinpointFragmenter` and avoid re-tokenizing the document text, you must do all of the following: Index the field with character information (this will require re-indexing an existing index):: # Index the start and end chars of each term schema = fields.Schema(content=fields.TEXT(stored=True, chars=True)) Record per-document term matches in the results:: # Record per-document term matches results = searcher.search(myquery, terms=True) Set a :class:`whoosh.highlight.PinpointFragmenter` as the fragmenter:: results.fragmenter = highlight.PinpointFragmenter() PinpointFragmenter limitations ------------------------------ When the highlighting system does not re-tokenize the text, it doesn't know where any other words are in the text except the matched terms it looked up in the index. Therefore when the fragmenter adds surrounding context, it just adds or a certain number of characters blindly, and so doesn't distinguish between content and whitespace, or break on word boundaries, for example:: >>> hit.highlights("content") 're when the fragmenter\n ad' (This can be embarassing when the word fragments form dirty words!) One way to avoid this is to not show any surrounding context, but then fragments containing one matched term will contain ONLY that matched term:: >>> hit.highlights("content") 'fragmenter' Alternatively, you can normalize whitespace in the text before passing it to the highlighting system:: >>> text = searcher.stored_ >>> re.sub("[\t\r\n ]+", " ", text) >>> hit.highlights("content", text=text) ...and use the ``autotrim`` option of ``PinpointFragmenter`` to automatically strip text before the first space and after the last space in the fragments:: >>> results.fragmenter = highlight.PinpointFragmenter(autotrim=True) >>> hit.highlights("content") 'when the fragmenter' Using the low-level API ======================= Usage ----- The following function lets you retokenize and highlight a piece of text using an analyzer:: from whoosh.highlight import highlight excerpts = highlight(text, terms, analyzer, fragmenter, formatter, top=3, scorer=BasicFragmentScorer, minscore=1, order=FIRST) ``text`` The original text of the document. ``terms`` A sequence or set containing the query words to match, e.g. ("render", "shader"). ``analyzer`` The analyzer to use to break the document text into tokens for matching against the query terms. This is usually the analyzer for the field the query terms are in. ``fragmenter`` A :class:`whoosh.highlight.Fragmenter` object, see below. ``formatter`` A :class:`whoosh.highlight.Formatter` object, see below. ``top`` The number of fragments to include in the output. 
``scorer`` A :class:`whoosh.highlight.FragmentScorer` object. The only scorer currently included with Whoosh is :class:`~whoosh.highlight.BasicFragmentScorer`, the default. ``minscore`` The minimum score a fragment must have to be considered for inclusion. ``order`` An ordering function that determines the order of the "top" fragments in the output text. Whoosh-2.5.7/docs/source/index.rst0000644000076500000240000000144512254366350017167 0ustar mattstaff00000000000000============================== Whoosh |release| documentation ============================== Whoosh was created by `Matt Chaput `_. You can view outstanding issues on the `Whoosh Bitbucket page `_ and get help on the `Whoosh mailing list `_. Contents ======== .. toctree:: :maxdepth: 2 releases/index quickstart intro glossary schema indexing searching parsing querylang dates query analysis stemming ngrams facets highlight keywords spelling fieldcaches batch threads nested recipes api/api tech/index Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` Whoosh-2.5.7/docs/source/indexing.rst0000644000076500000240000003701012254366350017662 0ustar mattstaff00000000000000====================== How to index documents ====================== Creating an Index object ======================== To create an index in a directory, use ``index.create_in``:: import os, os.path from whoosh import index if not os.path.exists("indexdir"): os.mkdir("indexdir") ix = index.create_in("indexdir", schema) To open an existing index in a directory, use ``index.open_dir``:: import whoosh.index as index ix = index.open_dir("indexdir") These are convenience methods for:: from whoosh.filedb.filestore import FileStorage storage = FileStorage("indexdir") # Create an index ix = storage.create_index(schema) # Open an existing index storage.open_index() The schema you created the index with is pickled and stored with the index. You can keep multiple indexes in the same directory using the indexname keyword argument:: # Using the convenience functions ix = index.create_in("indexdir", schema=schema, indexname="usages") ix = index.open_dir("indexdir", indexname="usages") # Using the Storage object ix = storage.create_index(schema, indexname="usages") ix = storage.open_index(indexname="usages") Clearing the index ================== Calling ``index.create_in`` on a directory with an existing index will clear the current contents of the index. To test whether a directory currently contains a valid index, use ``index.exists_in``:: exists = index.exists_in("indexdir") usages_exists = index.exists_in("indexdir", indexname="usages") (Alternatively you can simply delete the index's files from the directory, e.g. if you only have one index in the directory, use ``shutil.rmtree`` to remove the directory and then recreate it.) Indexing documents ================== Once you've created an ``Index`` object, you can add documents to the index with an ``IndexWriter`` object. The easiest way to get the ``IndexWriter`` is to call ``Index.writer()``:: ix = index.open_dir("index") writer = ix.writer() Creating a writer locks the index for writing, so only one thread/process at a time can have a writer open. .. note:: Because opening a writer locks the index for writing, in a multi-threaded or multi-process environment your code needs to be aware that opening a writer may raise an exception (``whoosh.store.LockError``) if a writer is already open. 
Whoosh includes a couple of example implementations (:class:`whoosh.writing.AsyncWriter` and :class:`whoosh.writing.BufferedWriter`) of ways to work around the write lock. .. note:: While the writer is open and during the commit, the index is still available for reading. Existing readers are unaffected and new readers can open the current index normally. Once the commit is finished, existing readers continue to see the previous version of the index (that is, they do not automatically see the newly committed changes). New readers will see the updated index. The IndexWriter's ``add_document(**kwargs)`` method accepts keyword arguments where the field name is mapped to a value:: writer = ix.writer() writer.add_document(title=u"My document", content=u"This is my document!", path=u"/a", tags=u"first short", icon=u"/icons/star.png") writer.add_document(title=u"Second try", content=u"This is the second example.", path=u"/b", tags=u"second short", icon=u"/icons/sheep.png") writer.add_document(title=u"Third time's the charm", content=u"Examples are many.", path=u"/c", tags=u"short", icon=u"/icons/book.png") writer.commit() You don't have to fill in a value for every field. Whoosh doesn't care if you leave out a field from a document. Indexed fields must be passed a unicode value. Fields that are stored but not indexed (i.e. the ``STORED`` field type) can be passed any pickle-able object. Whoosh will happily allow you to add documents with identical values, which can be useful or annoying depending on what you're using the library for:: writer.add_document(path=u"/a", title=u"A", content=u"Hello there") writer.add_document(path=u"/a", title=u"A", content=u"Deja vu!") This adds two documents to the index with identical path and title fields. See "updating documents" below for information on the ``update_document`` method, which uses "unique" fields to replace old documents instead of appending. Indexing and storing different values for the same field -------------------------------------------------------- If you have a field that is both indexed and stored, you can index a unicode value but store a different object if necessary (it's usually not, but sometimes this is really useful) using a "special" keyword argument ``_stored_``. The normal value will be analyzed and indexed, but the "stored" value will show up in the results:: writer.add_document(title=u"Title to be indexed", _stored_title=u"Stored title") Finishing adding documents -------------------------- An ``IndexWriter`` object is kind of like a database transaction. You specify a bunch of changes to the index, and then "commit" them all at once. Calling ``commit()`` on the ``IndexWriter`` saves the added documents to the index:: writer.commit() Once your documents are in the index, you can search for them. If you want to close the writer without committing the changes, call ``cancel()`` instead of ``commit()``:: writer.cancel() Keep in mind that while you have a writer open (including a writer you opened and is still in scope), no other thread or process can get a writer or modify the index. A writer also keeps several open files. So you should always remember to call either ``commit()`` or ``cancel()`` when you're done with a writer object. Merging segments ================ A Whoosh ``filedb`` index is really a container for one or more "sub-indexes" called segments. 
When you add documents to an index, instead of integrating the new documents with the existing documents (which could potentially be very expensive, since it involves resorting all the indexed terms on disk), Whoosh creates a new segment next to the existing segment. Then when you search the index, Whoosh searches both segments individually and merges the results so the segments appear to be one unified index. (This smart design is copied from Lucene.) So, having a few segments is more efficient than rewriting the entire index every time you add some documents. But searching multiple segments does slow down searching somewhat, and the more segments you have, the slower it gets. So Whoosh has an algorithm that runs when you call ``commit()`` that looks for small segments it can merge together to make fewer, bigger segments. To prevent Whoosh from merging segments during a commit, use the ``merge`` keyword argument:: writer.commit(merge=False) To merge all segments together, optimizing the index into a single segment, use the ``optimize`` keyword argument:: writer.commit(optimize=True) Since optimizing rewrites all the information in the index, it can be slow on a large index. It's generally better to rely on Whoosh's merging algorithm than to optimize all the time. (The ``Index`` object also has an ``optimize()`` method that lets you optimize the index (merge all the segments together). It simply creates a writer and calls ``commit(optimize=True)`` on it.) For more control over segment merging, you can write your own merge policy function and use it as an argument to the ``commit()`` method. See the implementation of the ``NO_MERGE``, ``MERGE_SMALL``, and ``OPTIMIZE`` functions in the ``whoosh.writing`` module. Deleting documents ================== You can delete documents using the following methods on an ``IndexWriter`` object. You then need to call ``commit()`` on the writer to save the deletions to disk. ``delete_document(docnum)`` Low-level method to delete a document by its internal document number. ``is_deleted(docnum)`` Low-level method, returns ``True`` if the document with the given internal number is deleted. ``delete_by_term(fieldname, termtext)`` Deletes any documents where the given (indexed) field contains the given term. This is mostly useful for ``ID`` or ``KEYWORD`` fields. ``delete_by_query(query)`` Deletes any documents that match the given query. :: # Delete document by its path -- this field must be indexed ix.delete_by_term('path', u'/a/b/c') # Save the deletion to disk ix.commit() In the ``filedb`` backend, "deleting" a document simply adds the document number to a list of deleted documents stored with the index. When you search the index, it knows not to return deleted documents in the results. However, the document's contents are still stored in the index, and certain statistics (such as term document frequencies) are not updated, until you merge the segments containing deleted documents (see merging above). (This is because removing the information immediately from the index would essentially involving rewriting the entire index on disk, which would be very inefficient.) Updating documents ================== If you want to "replace" (re-index) a document, you can delete the old document using one of the ``delete_*`` methods on ``Index`` or ``IndexWriter``, then use ``IndexWriter.add_document`` to add the new version. Or, you can use ``IndexWriter.update_document`` to do this in one step. 
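For illustration, the manual route looks something like this (a minimal sketch, assuming the schema has an indexed ``path`` field that identifies the document)::

    writer = ix.writer()

    # Remove the old version of the document...
    writer.delete_by_term('path', u'/a')

    # ...and add the replacement in the same transaction
    writer.add_document(path=u"/a", content=u"Replacement for the first document")

    writer.commit()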
For ``update_document`` to work, you must have marked at least one of the fields in the schema as "unique". Whoosh will then use the contents of the "unique" field(s) to search for documents to delete:: from whoosh.fields import Schema, ID, TEXT schema = Schema(path = ID(unique=True), content=TEXT) ix = index.create_in("index") writer = ix.writer() writer.add_document(path=u"/a", content=u"The first document") writer.add_document(path=u"/b", content=u"The second document") writer.commit() writer = ix.writer() # Because "path" is marked as unique, calling update_document with path="/a" # will delete any existing documents where the "path" field contains "/a". writer.update_document(path=u"/a", content="Replacement for the first document") writer.commit() The "unique" field(s) must be indexed. If no existing document matches the unique fields of the document you're updating, ``update_document`` acts just like ``add_document``. "Unique" fields and ``update_document`` are simply convenient shortcuts for deleting and adding. Whoosh has no inherent concept of a unique identifier, and in no way enforces uniqueness when you use ``add_document``. Incremental indexing ==================== When you're indexing a collection of documents, you'll often want two code paths: one to index all the documents from scratch, and one to only update the documents that have changed (leaving aside web applications where you need to add/update documents according to user actions). Indexing everything from scratch is pretty easy. Here's a simple example:: import os.path from whoosh import index from whoosh.fields import Schema, ID, TEXT def clean_index(dirname): # Always create the index from scratch ix = index.create_in(dirname, schema=get_schema()) writer = ix.writer() # Assume we have a function that gathers the filenames of the # documents to be indexed for path in my_docs(): add_doc(writer, path) writer.commit() def get_schema() return Schema(path=ID(unique=True, stored=True), content=TEXT) def add_doc(writer, path): fileobj = open(path, "rb") content = fileobj.read() fileobj.close() writer.add_document(path=path, content=content) Now, for a small collection of documents, indexing from scratch every time might actually be fast enough. But for large collections, you'll want to have the script only re-index the documents that have changed. To start we'll need to store each document's last-modified time, so we can check if the file has changed. 
In this example, we'll just use the mtime for simplicity:: def get_schema() return Schema(path=ID(unique=True, stored=True), time=STORED, content=TEXT) def add_doc(writer, path): fileobj = open(path, "rb") content = fileobj.read() fileobj.close() modtime = os.path.getmtime(path) writer.add_document(path=path, content=content, time=modtime) Now we can modify the script to allow either "clean" (from scratch) or incremental indexing:: def index_my_docs(dirname, clean=False): if clean: clean_index(dirname) else: incremental_index(dirname) def incremental_index(dirname) ix = index.open_dir(dirname) # The set of all paths in the index indexed_paths = set() # The set of all paths we need to re-index to_index = set() with ix.searcher() as searcher: writer = ix.writer() # Loop over the stored fields in the index for fields in searcher.all_stored_fields(): indexed_path = fields['path'] indexed_paths.add(indexed_path) if not os.path.exists(indexed_path): # This file was deleted since it was indexed writer.delete_by_term('path', indexed_path) else: # Check if this file was changed since it # was indexed indexed_time = fields['time'] mtime = os.path.getmtime(indexed_path) if mtime > indexed_time: # The file has changed, delete it and add it to the list of # files to reindex writer.delete_by_term('path', indexed_path) to_index.add(indexed_path) # Loop over the files in the filesystem # Assume we have a function that gathers the filenames of the # documents to be indexed for path in my_docs(): if path in to_index or path not in indexed_paths: # This is either a file that's changed, or a new file # that wasn't indexed before. So index it! add_doc(writer, path) writer.commit() The ``incremental_index`` function: * Loops through all the paths that are currently indexed. * If any of the files no longer exist, delete the corresponding document from the index. * If the file still exists, but has been modified, add it to the list of paths to be re-indexed. * If the file exists, whether it's been modified or not, add it to the list of all indexed paths. * Loops through all the paths of the files on disk. * If a path is not in the set of all indexed paths, the file is new and we need to index it. * If a path is in the set of paths to re-index, we need to index it. * Otherwise, we can skip indexing the file. Clearing the index ================== In some cases you may want to re-index from scratch. To clear the index without disrupting any existing readers:: from whoosh import writing with myindex.writer() as mywriter: # You can optionally add documents to the writer here # e.g. mywriter.add_document(...) # Using mergetype=CLEAR clears all existing segments so the index will # only have any documents you've added to this writer mywriter.mergetype = writing.CLEAR Or, if you don't use the writer as a context manager and call ``commit()`` directly, do it like this:: mywriter = myindex.writer() # ... mywriter.commit(mergetype=writing.CLEAR) .. note:: If you don't need to worry about existing readers, a more efficient method is to simply delete the contents of the index directory and start over. Whoosh-2.5.7/docs/source/intro.rst0000644000076500000240000000421712254366350017213 0ustar mattstaff00000000000000====================== Introduction to Whoosh ====================== About Whoosh ------------ Whoosh was created by `Matt Chaput `_. It started as a quick and dirty search server for the online documentation of the `Houdini `_ 3D animation software package. 
Side Effects Software generously allowed Matt to open source the code in case it might be useful to anyone else who needs a very flexible or pure-Python search engine (or both!). * Whoosh is fast, but uses only pure Python, so it will run anywhere Python runs, without requiring a compiler. * By default, Whoosh uses the `Okapi BM25F `_ ranking function, but like most things the ranking function can be easily customized. * Whoosh creates fairly small indexes compared to many other search libraries. * All indexed text in Whoosh must be *unicode*. * Whoosh lets you store arbitrary Python objects with indexed documents. What is Whoosh? --------------- Whoosh is a fast, pure Python search engine library. The primary design impetus of Whoosh is that it is pure Python. You should be able to use Whoosh anywhere you can use Python, no compiler or Java required. Like one of its ancestors, Lucene, Whoosh is not really a search engine, it's a programmer library for creating a search engine [1]_. Practically no important behavior of Whoosh is hard-coded. Indexing of text, the level of information stored for each term in each field, parsing of search queries, the types of queries allowed, scoring algorithms, etc. are all customizable, replaceable, and extensible. .. [1] It would of course be possible to build a turnkey search engine on top of Whoosh, like Nutch and Solr use Lucene. What can Whoosh do for you? --------------------------- Whoosh lets you index free-form or structured text and then quickly find matching documents based on simple or complex search criteria. Getting help with Whoosh ------------------------ You can view outstanding issues on the `Whoosh Bitbucket page `_ and get help on the `Whoosh mailing list `_. Whoosh-2.5.7/docs/source/keywords.rst0000644000076500000240000000732412254366350017731 0ustar mattstaff00000000000000======================================= Query expansion and Key word extraction ======================================= Overview ======== Whoosh provides methods for computing the "key terms" of a set of documents. For these methods, "key terms" basically means terms that are frequent in the given documents, but relatively infrequent in the indexed collection as a whole. Because this is a purely statistical operation, not a natural language processing or AI function, the quality of the results will vary based on the content, the size of the document collection, and the number of documents for which you extract keywords. These methods can be useful for providing the following features to users: * Search term expansion. You can extract key terms for the top N results from a query and suggest them to the user as additional/alternate query terms to try. * Tag suggestion. Extracting the key terms for a single document may yield useful suggestions for tagging the document. * "More like this". You can extract key terms for the top ten or so results from a query (and removing the original query terms), and use those key words as the basis for another query that may find more documents using terms the user didn't think of. Usage ===== * Get more documents like a certain search hit. *This requires that the field you want to match on is vectored or stored, or that you have access to the original text (such as from a database)*. Use :meth:`~whoosh.searching.Hit.more_like_this`:: results = mysearcher.search(myquery) first_hit = results[0] more_results = first_hit.more_like_this("content") * Extract keywords for the top N documents in a :class:`whoosh.searching.Results` object. 
*This requires that the field is either vectored or stored*. Use the :meth:`~whoosh.searching.Results.key_terms` method of the :class:`whoosh.searching.Results` object to extract keywords from the top N documents of the result set. For example, to extract *five* key terms from the ``content`` field of the top *ten* documents of a results object:: keywords = [keyword for keyword, score in results.key_terms("content", docs=10, numterms=5) * Extract keywords for an arbitrary set of documents. *This requires that the field is either vectored or stored*. Use the :meth:`~whoosh.searching.Searcher.document_number` or :meth:`~whoosh.searching.Searcher.document_numbers` methods of the :class:`whoosh.searching.Searcher` object to get the document numbers for the document(s) you want to extract keywords from. Use the :meth:`~whoosh.searching.Searcher.key_terms` method of a :class:`whoosh.searching.Searcher` to extract the keywords, given the list of document numbers. For example, let's say you have an index of emails. To extract key terms from the ``content`` field of emails whose ``emailto`` field contains ``matt@whoosh.ca``:: with email_index.searcher() as s: docnums = s.document_numbers(emailto=u"matt@whoosh.ca") keywords = [keyword for keyword, score in s.key_terms(docnums, "body")] * Extract keywords from arbitrary text not in the index. Use the :meth:`~whoosh.searching.Searcher.key_terms_from_text` method of a :class:`whoosh.searching.Searcher` to extract the keywords, given the text:: with email_index.searcher() as s: keywords = [keyword for keyword, score in s.key_terms_from_text("body", mytext)] Expansion models ================ The ``ExpansionModel`` subclasses in the :mod:`whoosh.classify` module implement different weighting functions for key words. These models are translated into Python from original Java implementations in Terrier. Whoosh-2.5.7/docs/source/nested.rst0000644000076500000240000002345712254366350017351 0ustar mattstaff00000000000000=========================================== Indexing and searching document hierarchies =========================================== Overview ======== Whoosh's full-text index is essentially a flat database of documents. However, Whoosh supports two techniques for simulating the indexing and querying of hierarchical documents, that is, sets of documents that form a parent-child hierarchy, such as "Chapter - Section - Paragraph" or "Module - Class - Method". You can specify parent-child relationships *at indexing time*, by grouping documents in the same hierarchy, and then use the :class:`whoosh.query.NestedParent` and/or :class:`whoosh.query.NestedChildren` to find parents based on their children or vice-versa. Alternatively, you can use *query time joins*, essentially like external key joins in a database, where you perform one search to find a relevant document, then use a stored value on that document (for example, a ``parent`` field) to look up another document. Both methods have pros and cons. Using nested document indexing ============================== Indexing -------- This method works by indexing a "parent" document and all its "child" documents *as a "group"* so they are guaranteed to end up in the same segment. 
You can use the context manager returned by ``IndexWriter.group()`` to group documents:: with ix.writer() as w: with w.group(): w.add_document(kind="class", name="Index") w.add_document(kind="method", name="add document") w.add_document(kind="method", name="add reader") w.add_document(kind="method", name="close") with w.group(): w.add_document(kind="class", name="Accumulator") w.add_document(kind="method", name="add") w.add_document(kind="method", name="get result") with w.group(): w.add_document(kind="class", name="Calculator") w.add_document(kind="method", name="add") w.add_document(kind="method", name="add all") w.add_document(kind="method", name="add some") w.add_document(kind="method", name="multiply") w.add_document(kind="method", name="close") with w.group(): w.add_document(kind="class", name="Deleter") w.add_document(kind="method", name="add") w.add_document(kind="method", name="delete") Alternatively you can use the ``start_group()`` and ``end_group()`` methods:: with ix.writer() as w: w.start_group() w.add_document(kind="class", name="Index") w.add_document(kind="method", name="add document") w.add_document(kind="method", name="add reader") w.add_document(kind="method", name="close") w.end_group() Each level of the hierarchy should have a query that distinguishes it from other levels (for example, in the above index, you can use ``kind:class`` or ``kind:method`` to match different levels of the hierarchy). Once you've indexed the hierarchy of documents, you can use two query types to find parents based on children or vice-versa. (There is currently no support in the default query parser for nested queries.) NestedParent query ------------------ The :class:`whoosh.query.NestedParent` query type lets you specify a query for child documents, but have the query return an "ancestor" document from higher in the hierarchy:: # First, we need a query that matches all the documents in the "parent" # level we want of the hierarchy all_parents = query.Term("kind", "class") # Then, we need a query that matches the children we want to find wanted_kids = query.Term("name", "close") # Now we can make a query that will match documents where "name" is # "close", but the query will return the "parent" documents of the matching # children q = query.NestedParent(all_parents, wanted_kids) # results = Index, Calculator Note that in a hierarchy with more than two levels, you can specify a "parents" query that matches any level of the hierarchy, so you can return the top-level ancestors of the matching children, or the second level, third level, etc. The query works by first building a bit vector representing which documents are "parents":: Index | Calculator | | 1000100100000100 | | | Deleter Accumulator Then for each match of the "child" query, it calculates the previous parent from the bit vector and returns it as a match (it only returns each parent once no matter how many children match). This parent lookup is very efficient:: 1000100100000100 | |<-+ close NestedChildren query -------------------- The opposite of ``NestedParent`` is :class:`whoosh.query.NestedChildren`. This query lets you match parents but return their children. 
This is useful, for example, to search for an album title and return the songs in the album:: # Query that matches all documents in the "parent" level we want to match # at all_parents = query.Term("kind", "album") # Parent documents we want to match wanted_parents = query.Term("album_title", "heaven") # Now we can make a query that will match parent documents where "album_title" # contains "heaven", but the query will return the "child" documents of the # matching parents q1 = query.NestedChildren(all_parents, wanted_parents) You can then combine that query with an ``AND`` clause, for example to find songs with "hell" in the song title that occur on albums with "heaven" in the album title:: q2 = query.And([q1, query.Term("song_title", "hell")]) Deleting and updating hierarchical documents -------------------------------------------- The drawback of the index-time method is *updating and deleting*. Because the implementation of the queries depends on the parent and child documents being contiguous in the segment, you can't update/delete just one child document. You can only update/delete an entire top-level document at once (for example, if your hierarchy is "Chapter - Section - Paragraph", you can only update or delete entire chapters, not a section or paragraph). If the top-level of the hierarchy represents very large blocks of text, this can involve a lot of deleting and reindexing. Currently ``Writer.update_document()`` does not automatically work with nested documents. You must manually delete and re-add document groups to update them. To delete nested document groups, use the ``Writer.delete_by_query()`` method with a ``NestedParent`` query:: # Delete the "Accumulator" class all_parents = query.Term("kind", "class") to_delete = query.Term("name", "Accumulator") q = query.NestedParent(all_parents, to_delete) with myindex.writer() as w: w.delete_by_query(q) Using query-time joins ====================== A second technique for simulating hierarchical documents in Whoosh involves using a stored field on each document to point to its parent, and then using the value of that field at query time to find parents and children. 
For example, if we index a hierarchy of classes and methods using pointers to parents instead of nesting:: # Store a pointer to the parent on each "method" document with ix.writer() as w: w.add_document(kind="class", c_name="Index", docstring="...") w.add_document(kind="method", m_name="add document", parent="Index") w.add_document(kind="method", m_name="add reader", parent="Index") w.add_document(kind="method", m_name="close", parent="Index") w.add_document(kind="class", c_name="Accumulator", docstring="...") w.add_document(kind="method", m_name="add", parent="Accumulator") w.add_document(kind="method", m_name="get result", parent="Accumulator") w.add_document(kind="class", c_name="Calculator", docstring="...") w.add_document(kind="method", m_name="add", parent="Calculator") w.add_document(kind="method", m_name="add all", parent="Calculator") w.add_document(kind="method", m_name="add some", parent="Calculator") w.add_document(kind="method", m_name="multiply", parent="Calculator") w.add_document(kind="method", m_name="close", parent="Calculator") w.add_document(kind="class", c_name="Deleter", docstring="...") w.add_document(kind="method", m_name="add", parent="Deleter") w.add_document(kind="method", m_name="delete", parent="Deleter") # Now do manual joins at query time with ix.searcher() as s: # Tip: Searcher.document() and Searcher.documents() let you look up # documents by field values more easily than using Searcher.search() # Children to parents: # Print the docstrings of classes on which "close" methods occur for child_doc in s.documents(m_name="close"): # Use the stored value of the "parent" field to look up the parent # document parent_doc = s.document(c_name=child_doc["parent"]) # Print the parent document's stored docstring field print(parent_doc["docstring"]) # Parents to children: # Find classes with "big" in the docstring and print their methods q = query.Term("kind", "class") & query.Term("docstring", "big") for hit in s.search(q, limit=None): print("Class name=", hit["c_name"], "methods:") for child_doc in s.documents(parent=hit["c_name"]): print(" Method name=", child_doc["m_name"]) This technique is more flexible than index-time nesting in that you can delete/update individual documents in the hierarchy piece by piece, although it doesn't support finding different parent levels as easily. It is also slower than index-time nesting (potentially much slower), since you must perform additional searches for each found document. Future versions of Whoosh may include "join" queries to make this process more efficient (or at least more automatic). Whoosh-2.5.7/docs/source/ngrams.rst0000644000076500000240000000375612254366350017356 0ustar mattstaff00000000000000============================== Indexing and searching N-grams ============================== Overview ======== N-gram indexing is a powerful method for getting fast, "search as you type" functionality like iTunes. It is also useful for quick and effective indexing of languages such as Chinese and Japanese without word breaks. N-grams refers to groups of N characters... bigrams are groups of two characters, trigrams are groups of three characters, and so on. Whoosh includes two methods for analyzing N-gram fields: an N-gram tokenizer, and a filter that breaks tokens into N-grams. :class:`whoosh.analysis.NgramTokenizer` tokenizes the entire field into N-grams. This is more useful for Chinese/Japanese/Korean languages, where it's useful to index bigrams of characters rather than individual characters. 
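For example, a field meant for CJK text might apply the tokenizer directly as its analyzer (a sketch; the field name ``body`` and the bigram sizes are just for illustration)::

    from whoosh import fields
    from whoosh.analysis import NgramTokenizer

    # Index overlapping bigrams of the raw text instead of whole words
    cjk_text = fields.TEXT(analyzer=NgramTokenizer(minsize=2, maxsize=2),
                           phrase=False)
    schema = fields.Schema(body=cjk_text)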
Using this tokenizer with roman languages leads to spaces in the tokens. :: >>> ngt = NgramTokenizer(minsize=2, maxsize=4) >>> [token.text for token in ngt(u"hi there")] [u'hi', u'hi ', u'hi t',u'i ', u'i t', u'i th', u' t', u' th', u' the', u'th', u'the', u'ther', u'he', u'her', u'here', u'er', u'ere', u're'] :class:`whoosh.analysis.NgramFilter` breaks individual tokens into N-grams as part of an analysis pipeline. This is more useful for languages with word separation. :: >>> my_analyzer = StandardAnalyzer() | NgramFilter(minsize=2, maxsize=4) >>> [token.text for token in my_analyzer(u"rendering shaders")] [u'ren', u'rend', u'end', u'ende', u'nde', u'nder', u'der', u'deri', u'eri', u'erin', u'rin', u'ring', u'ing', u'sha', u'shad', u'had', u'hade', u'ade', u'ader', u'der', u'ders', u'ers'] Whoosh includes two pre-configured field types for N-grams: :class:`whoosh.fields.NGRAM` and :class:`whoosh.fields.NGRAMWORDS`. The only difference is that ``NGRAM`` runs all text through the N-gram filter, including whitespace and punctuation, while ``NGRAMWORDS`` extracts words from the text using a tokenizer, then runs each word through the N-gram filter. TBD. Whoosh-2.5.7/docs/source/parsing.rst0000644000076500000240000003525312254366350017527 0ustar mattstaff00000000000000==================== Parsing user queries ==================== Overview ======== The job of a query parser is to convert a *query string* submitted by a user into *query objects* (objects from the :mod:`whoosh.query` module). For example, the user query: .. code-block:: none rendering shading might be parsed into query objects like this:: And([Term("content", u"rendering"), Term("content", u"shading")]) Whoosh includes a powerful, modular parser for user queries in the :mod:`whoosh.qparser` module. The default parser implements a query language similar to the one that ships with Lucene. However, by changing plugins or using functions such as :func:`whoosh.qparser.MultifieldParser`, :func:`whoosh.qparser.SimpleParser` or :func:`whoosh.qparser.DisMaxParser`, you can change how the parser works, get a simpler parser or change the query language syntax. (In previous versions of Whoosh, the query parser was based on ``pyparsing``. The new hand-written parser is less brittle and more flexible.) .. note:: Remember that you can directly create query objects programmatically using the objects in the :mod:`whoosh.query` module. If you are not processing actual user queries, this is preferable to building a query string just to parse it. Using the default parser ======================== To create a :class:`whoosh.qparser.QueryParser` object, pass it the name of the *default field* to search and the schema of the index you'll be searching. :: from whoosh.qparser import QueryParser parser = QueryParser("content", schema=myindex.schema) .. tip:: You can instantiate a ``QueryParser`` object without specifying a schema, however the parser will not process the text of the user query. This is useful for debugging, when you want to see how QueryParser will build a query, but don't want to make up a schema just for testing. Once you have a ``QueryParser`` object, you can call ``parse()`` on it to parse a query string into a query object:: >>> parser.parse(u"alpha OR beta gamma") Or([Term("content", u"alpha"), Term("content", "beta")]) See the :doc:`query language reference ` for the features and syntax of the default parser's query language. 
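Putting the pieces together, a typical round trip from query string to results looks something like this (a minimal sketch; the directory and field names follow the earlier examples)::

    from whoosh import index
    from whoosh.qparser import QueryParser

    ix = index.open_dir("indexdir")
    parser = QueryParser("content", schema=ix.schema)
    query = parser.parse(u"alpha OR beta gamma")

    with ix.searcher() as searcher:
        results = searcher.search(query)
        for hit in results:
            print(hit)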
Common customizations ===================== Searching for any terms instead of all terms by default ------------------------------------------------------- If the user doesn't explicitly specify ``AND`` or ``OR`` clauses:: physically based rendering ...by default, the parser treats the words as if they were connected by ``AND``, meaning all the terms must be present for a document to match:: physically AND based AND rendering To change the parser to use ``OR`` instead, so that any of the terms may be present for a document to match, i.e.:: physically OR based OR rendering ...configure the QueryParser using the ``group`` keyword argument like this:: from whoosh import qparser parser = qparser.QueryParser(fieldname, schema=myindex.schema, group=qparser.OrGroup) The Or query lets you specify that documents that contain more of the query terms score higher. For example, if the user searches for ``foo bar``, a document with four occurances of ``foo`` would normally outscore a document that contained one occurance each of ``foo`` and ``bar``. However, users usually expect documents that contain more of the words they searched for to score higher. To configure the parser to produce Or groups with this behavior, use the ``factory()`` class method of ``OrGroup``:: og = qparser.OrGroup.factory(0.9) parser = qparser.QueryParser(fieldname, schema, group=og) where the argument to ``factory()`` is a scaling factor on the bonus (between 0 and 1). Letting the user search multiple fields by default -------------------------------------------------- The default QueryParser configuration takes terms without explicit fields and assigns them to the default field you specified when you created the object, so for example if you created the object with:: parser = QueryParser("content", schema=myschema) And the user entered the query: .. code-block:: none three blind mice The parser would treat it as: .. code-block:: none content:three content:blind content:mice However, you might want to let the user search *multiple* fields by default. For example, you might want "unfielded" terms to search both the ``title`` and ``content`` fields. In that case, you can use a :class:`whoosh.qparser.MultifieldParser`. This is just like the normal QueryParser, but instead of a default field name string, it takes a *sequence* of field names:: from whoosh.qparser import MultifieldParser mparser = MultifieldParser(["title", "content"], schema=myschema) When this MultifieldParser instance parses ``three blind mice``, it treats it as: .. code-block:: none (title:three OR content:three) (title:blind OR content:blind) (title:mice OR content:mice) Simplifying the query language ------------------------------ Once you have a parser:: parser = qparser.QueryParser("content", schema=myschema) you can remove features from it using the :meth:`~whoosh.qparser.QueryParser.remove_plugin_class` method. For example, to remove the ability of the user to specify fields to search:: parser.remove_plugin_class(qparser.FieldsPlugin) To remove the ability to search for wildcards, which can be harmful to query performance:: parser.remove_plugin_class(qparser.WildcardPlugin) See :doc:`/api/qparser` for information about the plugins included with Whoosh's query parser. 
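For instance, to strip several features at once (a sketch; which plugins you remove depends on your application, and ``myschema`` is assumed to be defined as above)::

    from whoosh import qparser

    parser = qparser.QueryParser("content", schema=myschema)

    # Disable explicit field prefixes and wildcard queries
    for plugin_class in (qparser.FieldsPlugin, qparser.WildcardPlugin):
        parser.remove_plugin_class(plugin_class)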
Changing the AND, OR, ANDNOT, ANDMAYBE, and NOT syntax ------------------------------------------------------ The default parser uses English keywords for the AND, OR, ANDNOT, ANDMAYBE, and NOT functions:: parser = qparser.QueryParser("content", schema=myschema) You can replace the default ``CompoundsPlugin`` and ``NotPlugin`` objects to replace the default English tokens with your own regular expressions. The :class:`whoosh.qparser.CompoundsPlugin` implements the ability to use AND, OR, ANDNOT, and ANDMAYBE clauses in queries. You can instantiate a new ``CompoundsPlugin`` and use the ``And``, ``Or``, ``AndNot``, and ``AndMaybe`` keyword arguments to change the token patterns:: # Use Spanish equivalents instead of AND and OR cp = qparser.CompoundsPlugin(And=" Y ", Or=" O ") parser.replace_plugin(cp) The :class:`whoosh.qparser.NotPlugin` implements the ability to logically NOT subqueries. You can instantiate a new ``NotPlugin`` object with a different token:: np = qparser.NotPlugin("NO ") parser.replace_plugin(np) The arguments can be pattern strings or precompiled regular expression objects. For example, to change the default parser to use typographic symbols instead of words for the AND, OR, ANDNOT, ANDMAYBE, and NOT functions:: parser = qparser.QueryParser("content", schema=myschema) # These are regular expressions, so we have to escape the vertical bar cp = qparser.CompoundsPlugin(And="&", Or="\\|", AndNot="&!", AndMaybe="&~") parser.replace_plugin(cp) parser.replace_plugin(qparser.NotPlugin("!")) Adding less-than, greater-than, etc. ------------------------------------ Normally, the way you match all terms in a field greater than "apple" is with an open ended range:: field:{apple to] The :class:`whoosh.qparser.GtLtPlugin` lets you specify the same search like this:: field:>apple The plugin lets you use ``>``, ``<``, ``>=``, ``<=``, ``=>``, or ``=<`` after a field specifier, and translates the expression into the equivalent range:: date:>='31 march 2001' date:[31 march 2001 to] Adding fuzzy term queries ------------------------- Fuzzy queries are good for catching misspellings and similar words. The :class:`whoosh.qparser.FuzzyTermPlugin` lets you search for "fuzzy" terms, that is, terms that don't have to match exactly. The fuzzy term will match any similar term within a certain number of "edits" (character insertions, deletions, and/or transpositions -- this is called the "Damerau-Levenshtein edit distance"). To add the fuzzy plugin:: parser = qparser.QueryParser("fieldname", my_index.schema) parser.add_plugin(qparser.FuzzyTermPlugin()) Once you add the fuzzy plugin to the parser, you can specify a fuzzy term by adding a ``~`` followed by an optional maximum edit distance. If you don't specify an edit distance, the default is ``1``. For example, the following "fuzzy" term query:: cat~ would match ``cat`` and all terms in the index within one "edit" of cat, for example ``cast`` (insert ``s``), ``at`` (delete ``c``), and ``act`` (transpose ``c`` and ``a``). If you wanted ``cat`` to match ``bat``, it requires two edits (delete ``c`` and insert ``b``) so you would need to set the maximum edit distance to ``2``:: cat~2 Because each additional edit you allow increases the number of possibilities that must be checked, edit distances greater than ``2`` can be very slow. It is often useful to require that the first few characters of a fuzzy term match exactly. This is called a prefix. You can set the length of the prefix by adding a slash and a number after the edit distance. 
For example, to use a maximum edit distance of ``2`` and a prefix length of ``3``:: johannson~2/3 You can specify a prefix without specifying an edit distance:: johannson~/3 The default prefix distance is ``0``. Allowing complex phrase queries ------------------------------- The default parser setup allows phrase (proximity) queries such as:: "whoosh search library" The default phrase query tokenizes the text between the quotes and creates a search for those terms in proximity. If you want to do more complex proximity searches, you can replace the phrase plugin with the :class:`whoosh.qparser.SequencePlugin`, which allows any query between the quotes. For example:: "(john OR jon OR jonathan~) peters*" The sequence syntax lets you add a "slop" factor just like the regular phrase:: "(john OR jon OR jonathan~) peters*"~2 To replace the default phrase plugin with the sequence plugin:: parser = qparser.QueryParser("fieldname", my_index.schema) parser.remove_plugin_class(qparser.PhrasePlugin) parser.add_plugin(qparser.SequencePlugin()) Alternatively, you could keep the default phrase plugin and give the sequence plugin different syntax by specifying a regular expression for the start/end marker when you create the sequence plugin. The regular expression should have a named group ``slop`` for the slop factor. For example:: parser = qparser.QueryParser("fieldname", my_index.schema) parser.add_plugin(qparser.SequencePlugin("!(~(?P[1-9][0-9]*))?")) This would allow you to use regular phrase queries and sequence queries at the same time:: "regular phrase" AND !sequence query~2! Advanced customization ====================== QueryParser arguments --------------------- QueryParser supports two extra keyword arguments: ``group`` The query class to use to join sub-queries when the user doesn't explicitly specify a boolean operator, such as ``AND`` or ``OR``. This lets you change the default operator from ``AND`` to ``OR``. This will be the :class:`whoosh.qparser.AndGroup` or :class:`whoosh.qparser.OrGroup` class (*not* an instantiated object) unless you've written your own custom grouping syntax you want to use. ``termclass`` The query class to use to wrap single terms. This must be a :class:`whoosh.query.Query` subclass (*not* an instantiated object) that accepts a fieldname string and term text unicode string in its ``__init__`` method. The default is :class:`whoosh.query.Term`. This is useful if you want to change the default term class to :class:`whoosh.query.Variations`, or if you've written a custom term class you want the parser to use instead of the ones shipped with Whoosh. :: >>> from whoosh.qparser import QueryParser, OrGroup >>> orparser = QueryParser("content", schema=myschema, group=OrGroup) Configuring plugins ------------------- The query parser's functionality is provided by a set of plugins. You can remove plugins to remove functionality, add plugins to add functionality, or replace default plugins with re-configured or rewritten versions. The :meth:`whoosh.qparser.QueryParser.add_plugin`, :meth:`whoosh.qparser.QueryParser.remove_plugin_class`, and :meth:`whoosh.qparser.QueryParser.replace_plugin` methods let you manipulate the plugins in a ``QueryParser`` object. See :doc:`/api/qparser` for information about the available plugins. .. _custom-op: Creating custom operators ------------------------- * Decide whether you want a ``PrefixOperator``, ``PostfixOperator``, or ``InfixOperator``. 
* Create a new :class:`whoosh.qparser.syntax.GroupNode` subclass to hold nodes affected by your operator. This object is responsible for generating a :class:`whoosh.query.Query` object corresponding to the syntax. * Create a regular expression pattern for the operator's query syntax. * Create an ``OperatorsPlugin.OpTagger`` object from the above information. * Create a new ``OperatorsPlugin`` instance configured with your custom operator(s). * Replace the default ``OperatorsPlugin`` in your parser with your new instance. For example, if you were creating a ``BEFORE`` operator:: from whoosh import qparser, query optype = qparser.InfixOperator pattern = " BEFORE " class BeforeGroup(qparser.GroupNode): merging = True qclass = query.Ordered Create an OpTagger for your operator:: btagger = qparser.OperatorPlugin.OpTagger(pattern, BeforeGroup, qparser.InfixOperator) By default, infix operators are left-associative. To make a right-associative infix operator, do this:: btagger = qparser.OperatorPlugin.OpTagger(pattern, BeforeGroup, qparser.InfixOperator, leftassoc=False) Create an :class:`~whoosh.qparser.plugins.OperatorsPlugin` instance with your new operator, and replace the default operators plugin in your query parser:: qp = qparser.QueryParser("text", myschema) my_op_plugin = qparser.OperatorsPlugin([(btagger, 0)]) qp.replace_plugin(my_op_plugin) Note that the list of operators you specify with the first argument is IN ADDITION TO the default operators (AND, OR, etc.). To turn off one of the default operators, you can pass None to the corresponding keyword argument:: cp = qparser.OperatorsPlugin([(optagger, 0)], And=None) If you want ONLY your list of operators and none of the default operators, use the ``clean`` keyword argument:: cp = qparser.OperatorsPlugin([(optagger, 0)], clean=True) Operators earlier in the list bind more closely than operators later in the list. Whoosh-2.5.7/docs/source/query.rst0000644000076500000240000000033212254366350017217 0ustar mattstaff00000000000000============= Query objects ============= The classes in the :mod:`whoosh.query` module implement *queries* you can run against the index. TBD. See :doc:`searching` for how to search the index using query objects. Whoosh-2.5.7/docs/source/querylang.rst0000644000076500000240000001145712254366350020073 0ustar mattstaff00000000000000========================== The default query language ========================== .. highlight:: none Overview ======== A query consists of *terms* and *operators*. There are two types of terms: single terms and *phrases*. Multiple terms can be combined with operators such as *AND* and *OR*. Whoosh supports indexing text in different *fields*. You must specify the *default field* when you create the :class:`whoosh.qparser.QueryParser` object. This is the field in which any terms the user does not explicitly specify a field for will be searched. Whoosh's query parser is capable of parsing different and/or additional syntax through the use of plug-ins. See :doc:`parsing`. Individual terms and phrases ============================ Find documents containing the term ``render``:: render Find documents containing the phrase ``all was well``:: "all was well" Note that a field must store Position information for phrase searching to work in that field. Normally when you specify a phrase, the maximum difference in position between each word in the phrase is 1 (that is, the words must be right next to each other in the document). 
For example, the following matches if a document has ``library`` within 5 words after ``whoosh``:: "whoosh library"~5 Boolean operators ================= Find documents containing ``render`` *and* ``shading``:: render AND shading Note that AND is the default relation between terms, so this is the same as:: render shading Find documents containing ``render``, *and* also either ``shading`` *or* ``modeling``:: render AND shading OR modeling Find documents containing ``render`` but *not* modeling:: render NOT modeling Find documents containing ``alpha`` but not either ``beta`` or ``gamma``:: alpha NOT (beta OR gamma) Note that when no boolean operator is specified between terms, the parser will insert one, by default AND. So this query:: render shading modeling is equivalent (by default) to:: render AND shading AND modeling See :doc:`customizing the default parser ` for information on how to change the default operator to OR. Group operators together with parentheses. For example to find documents that contain both ``render`` and ``shading``, or contain ``modeling``:: (render AND shading) OR modeling Fields ====== Find the term ``ivan`` in the ``name`` field:: name:ivan The ``field:`` prefix only sets the field for the term it directly precedes, so the query:: title:open sesame Will search for ``open`` in the ``title`` field and ``sesame`` in the *default* field. To apply a field prefix to multiple terms, group them with parentheses:: title:(open sesame) This is the same as:: title:open title:sesame Of course you can specify a field for phrases too:: title:"open sesame" Inexact terms ============= Use "globs" (wildcard expressions using ``?`` to represent a single character and ``*`` to represent any number of characters) to match terms:: te?t test* *b?g* Note that a wildcard starting with ``?`` or ``*`` is very slow. Note also that these wildcards only match *individual terms*. For example, the query:: my*life will **not** match an indexed phrase like:: my so called life because those are four separate terms. Ranges ====== You can match a range of terms. For example, the following query will match documents containing terms in the lexical range from ``apple`` to ``bear`` *inclusive*. For example, it will match documents containing ``azores`` and ``be`` but not ``blur``:: [apple TO bear] This is very useful when you've stored, for example, dates in a lexically sorted format (i.e. YYYYMMDD):: date:[20050101 TO 20090715] The range is normally *inclusive* (that is, the range will match all terms between the start and end term, *as well as* the start and end terms themselves). You can specify that one or both ends of the range are *exclusive* by using the ``{`` and/or ``}`` characters:: [0000 TO 0025} {prefix TO suffix} You can also specify *open-ended* ranges by leaving out the start or end term:: [0025 TO] {TO suffix} Boosting query elements ======================= You can specify that certain parts of a query are more important for calculating the score of a matched document than others. 
For example, to specify that ``ninja`` is twice as important as other words, and ``bear`` is half as important:: ninja^2 cowboy bear^0.5 You can apply a boost to several terms using grouping parentheses:: (open sesame)^2.5 roc Making a term from literal text =============================== If you need to include characters in a term that are normally treated specially by the parser, such as spaces, colons, or brackets, you can enclose the term in single quotes:: path:'MacHD:My Documents' 'term with spaces' title:'function()' Whoosh-2.5.7/docs/source/quickstart.rst0000644000076500000240000002242212254366350020250 0ustar mattstaff00000000000000=========== Quick start =========== Whoosh is a library of classes and functions for indexing text and then searching the index. It allows you to develop custom search engines for your content. For example, if you were creating blogging software, you could use Whoosh to add a search function to allow users to search blog entries. A quick introduction ==================== :: >>> from whoosh.index import create_in >>> from whoosh.fields import * >>> schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT) >>> ix = create_in("indexdir", schema) >>> writer = ix.writer() >>> writer.add_document(title=u"First document", path=u"/a", ... content=u"This is the first document we've added!") >>> writer.add_document(title=u"Second document", path=u"/b", ... content=u"The second one is even more interesting!") >>> writer.commit() >>> from whoosh.qparser import QueryParser >>> with ix.searcher() as searcher: ... query = QueryParser("content", ix.schema).parse("first") ... results = searcher.search(query) ... results[0] ... {"title": u"First document", "path": u"/a"} The ``Index`` and ``Schema`` objects ==================================== To begin using Whoosh, you need an *index object*. The first time you create an index, you must define the index's *schema*. The schema lists the *fields* in the index. A field is a piece of information for each document in the index, such as its title or text content. A field can be *indexed* (meaning it can be searched) and/or *stored* (meaning the value that gets indexed is returned with the results; this is useful for fields such as the title). This schema has two fields, "title" and "content":: from whoosh.fields import Schema, TEXT schema = Schema(title=TEXT, content=TEXT) You only need to do create the schema once, when you create the index. The schema is pickled and stored with the index. When you create the ``Schema`` object, you use keyword arguments to map field names to field types. The list of fields and their types defines what you are indexing and what's searchable. Whoosh comes with some very useful predefined field types, and you can easily create your own. :class:`whoosh.fields.ID` This type simply indexes (and optionally stores) the entire value of the field as a single unit (that is, it doesn't break it up into individual words). This is useful for fields such as a file path, URL, date, category, etc. :class:`whoosh.fields.STORED` This field is stored with the document, but not indexed. This field type is not indexed and not searchable. This is useful for document information you want to display to the user in the search results. :class:`whoosh.fields.KEYWORD` This type is designed for space- or comma-separated keywords. This type is indexed and searchable (and optionally stored). To save space, it does not support phrase searching. :class:`whoosh.fields.TEXT` This type is for body text. 
It indexes (and optionally stores) the text and stores term positions to allow phrase searching. :class:`whoosh.fields.NUMERIC` This type is for numbers. You can store integers or floating point numbers. :class:`whoosh.fields.BOOLEAN` This type is for boolean (true/false) values. :class:`whoosh.fields.DATETIME` This type is for ``datetime`` objects. See :doc:`dates` for more information. :class:`whoosh.fields.NGRAM` and :class:`whoosh.fields.NGRAMWORDS` These types break the field text or individual terms into N-grams. See :doc:`ngrams` for more information. (As a shortcut, if you don't need to pass any arguments to the field type, you can just give the class name and Whoosh will instantiate the object for you.) :: from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT schema = Schema(title=TEXT(stored=True), content=TEXT, path=ID(stored=True), tags=KEYWORD, icon=STORED) See :doc:`schema` for more information. Once you have the schema, you can create an index using the ``create_in`` function:: import os.path from whoosh.index import create_in if not os.path.exists("index"): os.mkdir("index") ix = create_in("index", schema) (At a low level, this creates a *Storage* object to contain the index. A ``Storage`` object represents that medium in which the index will be stored. Usually this will be ``FileStorage``, which stores the index as a set of files in a directory.) After you've created an index, you can open it using the ``open_dir`` convenience function:: from whoosh.index import open_dir ix = open_dir("index") The ``IndexWriter`` object ========================== OK, so we've got an ``Index`` object, now we can start adding documents. The ``writer()`` method of the ``Index`` object returns an ``IndexWriter`` object that lets you add documents to the index. The IndexWriter's ``add_document(**kwargs)`` method accepts keyword arguments where the field name is mapped to a value:: writer = ix.writer() writer.add_document(title=u"My document", content=u"This is my document!", path=u"/a", tags=u"first short", icon=u"/icons/star.png") writer.add_document(title=u"Second try", content=u"This is the second example.", path=u"/b", tags=u"second short", icon=u"/icons/sheep.png") writer.add_document(title=u"Third time's the charm", content=u"Examples are many.", path=u"/c", tags=u"short", icon=u"/icons/book.png") writer.commit() Two important notes: * You don't have to fill in a value for every field. Whoosh doesn't care if you leave out a field from a document. * Indexed text fields must be passed a unicode value. Fields that are stored but not indexed (``STORED`` field type) can be passed any pickle-able object. If you have a text field that is both indexed and stored, you can index a unicode value but store a different object if necessary (it's usually not, but sometimes this is really useful) using this trick:: writer.add_document(title=u"Title to be indexed", _stored_title=u"Stored title") Calling commit() on the ``IndexWriter`` saves the added documents to the index:: writer.commit() See :doc:`indexing` for more information. Once your documents are committed to the index, you can search for them. 
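As a quick sanity check after committing, you can ask the index how many documents it contains (a minimal sketch; ``doc_count_all`` also counts documents that are deleted but not yet purged by a merge)::

    print(ix.doc_count())      # documents visible to searches
    print(ix.doc_count_all())  # including deleted-but-unpurged documents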
The ``Searcher`` object ======================= To begin searching the index, we'll need a ``Searcher`` object:: searcher = ix.searcher() You'll usually want to open the searcher using a ``with`` statement so the searcher is automatically closed when you're done with it (searcher objects represent a number of open files, so if you don't explicitly close them and the system is slow to collect them, you can run out of file handles):: with ix.searcher() as searcher: ... This is of course equivalent to:: try: searcher = ix.searcher() ... finally: searcher.close() The Searcher's ``search()`` method takes a *Query object*. You can construct query objects directly or use a query parser to parse a query string. For example, this query would match documents that contain both "apple" and "bear" in the "content" field:: # Construct query objects directly from whoosh.query import * myquery = And([Term("content", u"apple"), Term("content", "bear")]) To parse a query string, you can use the default query parser in the ``qparser`` module. The first argument to the ``QueryParser`` constructor is the default field to search. This is usually the "body text" field. The second optional argument is a schema to use to understand how to parse the fields:: # Parse a query string from whoosh.qparser import QueryParser parser = QueryParser("content", ix.schema) myquery = parser.parse(querystring) Once you have a ``Searcher`` and a query object, you can use the ``Searcher``'s ``search()`` method to run the query and get a ``Results`` object:: >>> results = searcher.search(myquery) >>> print(len(results)) 1 >>> print(results[0]) {"title": "Second try", "path": "/b", "icon": "/icons/sheep.png"} The default ``QueryParser`` implements a query language very similar to Lucene's. It lets you connect terms with ``AND`` or ``OR``, eleminate terms with ``NOT``, group terms together into clauses with parentheses, do range, prefix, and wilcard queries, and specify different fields to search. By default it joins clauses together with ``AND`` (so by default, all terms you specify must be in the document for the document to match):: >>> print(parser.parse(u"render shade animate")) And([Term("content", "render"), Term("content", "shade"), Term("content", "animate")]) >>> print(parser.parse(u"render OR (title:shade keyword:animate)")) Or([Term("content", "render"), And([Term("title", "shade"), Term("keyword", "animate")])]) >>> print(parser.parse(u"rend*")) Prefix("content", "rend") Whoosh includes extra features for dealing with search results, such as * Sorting results by the value of an indexed field, instead of by relelvance. * Highlighting the search terms in excerpts from the original documents. * Expanding the query terms based on the top few documents found. * Paginating the results (e.g. "Showing results 1-20, page 1 of 4"). See :doc:`searching` for more information. Whoosh-2.5.7/docs/source/recipes.rst0000644000076500000240000001437412254366350017517 0ustar mattstaff00000000000000============== Whoosh recipes ============== General ======= Get the stored fields for a document from the document number ------------------------------------------------------------- :: stored_fields = searcher.stored_fields(docnum) Analysis ======== Eliminate words shorter/longer than N ------------------------------------- Use a :class:`~whoosh.analysis.StopFilter` and the ``minsize`` and ``maxsize`` keyword arguments. 
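For example, as part of a simple analyzer chain (a sketch; the size bounds are arbitrary and the default stop word list is kept)::

    from whoosh import analysis

    # Tokenize, lowercase, then drop stop words plus any term shorter than
    # 3 characters or longer than 20
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.StopFilter(minsize=3, maxsize=20))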
If you just want to filter based on size and not common words, set the ``stoplist`` to ``None``:: sf = analysis.StopFilter(stoplist=None, minsize=2, maxsize=40) Allow optional case-sensitive searches -------------------------------------- A quick and easy way to do this is to index both the original and lowercased versions of each word. If the user searches for an all-lowercase word, it acts as a case-insensitive search, but if they search for a word with any uppercase characters, it acts as a case-sensitive search:: class CaseSensitivizer(analysis.Filter): def __call__(self, tokens): for t in tokens: yield t if t.mode == "index": low = t.text.lower() if low != t.text: t.text = low yield t ana = analysis.RegexTokenizer() | CaseSensitivizer() [t.text for t in ana("The new SuperTurbo 5000", mode="index")] # ["The", "the", "new", "SuperTurbo", "superturbo", "5000"] Searching ========= Find every document ------------------- :: myquery = query.Every() iTunes-style search-as-you-type ------------------------------- Use the :class:`whoosh.analysis.NgramWordAnalyzer` as the analyzer for the field you want to search as the user types. You can save space in the index by turning off positions in the field using ``phrase=False``, since phrase searching on N-gram fields usually doesn't make much sense:: # For example, to search the "title" field as the user types analyzer = analysis.NgramWordAnalyzer() title_field = fields.TEXT(analyzer=analyzer, phrase=False) schema = fields.Schema(title=title_field) See the documentation for the :class:`~whoosh.analysis.NgramWordAnalyzer` class for information on the available options. Shortcuts ========= Look up documents by a field value ---------------------------------- :: # Single document (unique field value) stored_fields = searcher.document(id="bacon") # Multiple documents for stored_fields in searcher.documents(tag="cake"): ... Sorting and scoring =================== See :doc:`facets`. Score results based on the position of the matched term ------------------------------------------------------- The following scoring function uses the position of the first occurance of a term in each document to calculate the score, so documents with the given term earlier in the document will score higher:: from whoosh import scoring def pos_score_fn(searcher, fieldname, text, matcher): poses = matcher.value_as("positions") return 1.0 / (poses[0] + 1) pos_weighting = scoring.FunctionWeighting(pos_score_fn) with myindex.searcher(weighting=pos_weighting) as s: ... Results ======= How many hits were there? ------------------------- The number of *scored* hits:: found = results.scored_length() Depending on the arguments to the search, the exact total number of hits may be known:: if results.has_exact_length(): print("Scored", found, "of exactly", len(results), "documents") Usually, however, the exact number of documents that match the query is not known, because the searcher can skip over blocks of documents it knows won't show up in the "top N" list. If you call ``len(results)`` on a query where the exact length is unknown, Whoosh will run an unscored version of the original query to get the exact number. This is faster than the scored search, but may still be noticeably slow on very large indexes or complex queries. 
As an alternative, you might display the *estimated* total hits:: found = results.scored_length() if results.has_exact_length(): print("Scored", found, "of exactly", len(results), "documents") else: low = results.estimated_min_length() high = results.estimated_length() print("Scored", found, "of between", low, "and", high, "documents") Which terms matched in each hit? -------------------------------- :: # Use terms=True to record term matches for each hit results = searcher.search(myquery, terms=True) for hit in results: # Which terms matched in this hit? print("Matched:", hit.matched_terms()) # Which terms from the query didn't match in this hit? print("Didn't match:", myquery.all_terms() - hit.matched_terms()) Global information ================== How many documents are in the index? ------------------------------------ :: # Including documents that are deleted but not yet optimized away numdocs = searcher.doc_count_all() # Not including deleted documents numdocs = searcher.doc_count() What fields are in the index? ----------------------------- :: return myindex.schema.names() Is term X in the index? ----------------------- :: return ("content", "wobble") in searcher How many times does term X occur in the index? ---------------------------------------------- :: # Number of times content:wobble appears in all documents freq = searcher.frequency("content", "wobble") # Number of documents containing content:wobble docfreq = searcher.doc_frequency("content", "wobble") Is term X in document Y? ------------------------ :: # Check if the "content" field of document 500 contains the term "wobble" # Without term vectors, skipping through list... postings = searcher.postings("content", "wobble") postings.skip_to(500) return postings.id() == 500 # ...or the slower but easier way docset = set(searcher.postings("content", "wobble").all_ids()) return 500 in docset # If field has term vectors, skipping through list... vector = searcher.vector(500, "content") vector.skip_to("wobble") return vector.id() == "wobble" # ...or the slower but easier way wordset = set(searcher.vector(500, "content").all_ids()) return "wobble" in wordset Whoosh-2.5.7/docs/source/releases/0000755000076500000240000000000012277504634017131 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/source/releases/0_3.rst0000644000076500000240000000467512254366350020254 0ustar mattstaff00000000000000======================== Whoosh 0.3 release notes ======================== * Major improvements to reading/writing of postings and query performance. * Changed default post limit (run size) from 4 MB to 32 MB. * Finished migrating backend-specific code into ``whoosh.filedb`` package. * Moved formats from whoosh.fields module into new whoosh.formats module. * DocReader and TermReader classes combined into new IndexReader interface. You can get an IndexReader implementation by calling Index.reader(). Searcher is now a wrapper around an IndexReader. * Range query object changed, with new signature and new syntax in the default query parser. Now you can use ``[start TO end]`` in the query parser for an inclusive range, and ``{start TO end}`` for an exclusive range. You can also mix the delimiters, for example ``[start TO end}`` for a range with an inclusive start but exclusive end term. 
* Added experimental DATETIME field type lets you pass a ``datetime.datetime`` object as a field value to ``add_document``:: from whoosh.fields import Schema, ID, DATETIME from whoosh.filedb.filestore import RamStorage from datetime import datetime schema = Schema(id=ID, date=DATETIME) storage = RamStorage() ix = storage.create_index(schema) w = ix.writer() w.add_document(id=u"A", date=datetime.now()) w.close() Internally, the DATETIME field indexes the datetime object as text using the format (4 digit year + 2 digit month + 2 digit day + 'T' + 2 digit hour + 2 digit minute + 2 digit second + 6 digit microsecond), for example ``20090817T160203109000``. * The default query parser now lets you use quoted strings in prefix and range queries, e.g. ``["2009-05" TO "2009-12"]``, ``"alfa/bravo"*``, making it easier to work with terms containing special characters. * ``DocReader.vector_as(docnum, fieldid, astype)`` is now ``IndexReader.vector_as(astype, docnum, fieldid)`` (i.e. the astype argument has moved from the last to the first argument), e.g. ``v = ixreader.vector_as("frequency", 102, "content")``. * Added whoosh.support.charset for translating Sphinx charset table files. * Added whoosh.analysis.CharsetTokenizer and CharsetFilter to enable case and accent folding. * Added experimental ``whoosh.ramdb`` in-memory backend. * Added experimental ``whoosh.query.FuzzyTerm`` query type. * Added ``whoosh.lang.wordnet`` module containing ``Thesaurus`` object for using WordNet synonym database. Whoosh-2.5.7/docs/source/releases/1_0.rst0000644000076500000240000004344712254366350020252 0ustar mattstaff00000000000000======================== Whoosh 1.x release notes ======================== Whoosh 1.8.3 ============ Whoosh 1.8.3 contains important bugfixes and new functionality. Thanks to all the mailing list and BitBucket users who helped with the fixes! Fixed a bad ``Collector`` bug where the docset of a Results object did not match the actual results. You can now pass a sequence of objects to a keyword argument in ``add_document`` and ``update_document`` (currently this will not work for unique fields in ``update_document``). This is useful for non-text fields such as ``DATETIME`` and ``NUMERIC``, allowing you to index multiple dates/numbers for a document:: writer.add_document(shoe=u"Saucony Kinvara", sizes=[10.0, 9.5, 12]) This version reverts to using the CDB hash function for hash files instead of Python's ``hash()`` because the latter is not meant to be stored externally. This change maintains backwards compatibility with old files. The ``Searcher.search`` method now takes a ``mask`` keyword argument. This is the opposite of the ``filter`` argument. Where the ``filter`` specifies the set of documents that can appear in the results, the ``mask`` specifies a set of documents that must not appear in the results. Fixed performance problems in ``Searcher.more_like``. This method now also takes a ``filter`` keyword argument like ``Searcher.search``. Improved documentation. Whoosh 1.8.2 ============ Whoosh 1.8.2 fixes some bugs, including a mistyped signature in Searcher.more_like and a bad bug in Collector that could screw up the ordering of results given certain parameters. Whoosh 1.8.1 ============ Whoosh 1.8.1 includes a few recent bugfixes/improvements: - ListMatcher.skip_to_quality() wasn't returning an integer, resulting in a "None + int" error. - Fixed locking and memcache sync bugs in the Google App Engine storage object. - MultifieldPlugin wasn't working correctly with groups. 
- The binary matcher trees of Or and And are now generated using a Huffman-like algorithm instead perfectly balanced. This gives a noticeable speed improvement because less information has to be passed up/down the tree. Whoosh 1.8 ========== This release relicensed the Whoosh source code under the Simplified BSD (A.K.A. "two-clause" or "FreeBSD") license. See LICENSE.txt for more information. Whoosh 1.7.7 ============ Setting a TEXT field to store term vectors is now much easier. Instead of having to pass an instantiated whoosh.formats.Format object to the vector= keyword argument, you can pass True to automatically use the same format and analyzer as the inverted index. Alternatively, you can pass a Format subclass and Whoosh will instantiate it for you. For example, to store term vectors using the same settings as the inverted index (Positions format and StandardAnalyzer):: from whoosh.fields import Schema, TEXT schema = Schema(content=TEXT(vector=True)) To store term vectors that use the same analyzer as the inverted index (StandardAnalyzer by default) but only store term frequency:: from whoosh.formats import Frequency schema = Schema(content=TEXT(vector=Frequency)) Note that currently the only place term vectors are used in Whoosh is keyword extraction/more like this, but they can be useful for expert users with custom code. Added :meth:`whoosh.searching.Searcher.more_like` and :meth:`whoosh.searching.Hit.more_like_this` methods, as shortcuts for doing keyword extraction yourself. Return a Results object. "python setup.py test" works again, as long as you have nose installed. The :meth:`whoosh.searching.Searcher.sort_query_using` method lets you sort documents matching a given query using an arbitrary function. Note that like "complex" searching with the Sorter object, this can be slow on large multi-segment indexes. Whoosh 1.7 ========== You can once again perform complex sorting of search results (that is, a sort with some fields ascending and some fields descending). You can still use the ``sortedby`` keyword argument to :meth:`whoosh.searching.Searcher.search` to do a simple sort (where all fields are sorted in the same direction), or you can use the new :class:`~whoosh.sorting.Sorter` class to do a simple or complex sort:: searcher = myindex.searcher() sorter = searcher.sorter() # Sort first by the group field, ascending sorter.add_field("group") # Then by the price field, descending sorter.add_field("price", reverse=True) # Get the Results results = sorter.sort_query(myquery) See the documentation for the :class:`~whoosh.sorting.Sorter` class for more information. Bear in mind that complex sorts will be much slower on large indexes because they can't use the per-segment field caches. You can now get highlighted snippets for a hit automatically using :meth:`whoosh.searching.Hit.highlights`:: results = searcher.search(myquery, limit=20) for hit in results: print hit["title"] print hit.highlights("content") See :meth:`whoosh.searching.Hit.highlights` for more information. Added the ability to filter search results so that only hits in a Results set, a set of docnums, or matching a query are returned. The filter is cached on the searcher. # Search within previous results newresults = searcher.search(newquery, filter=oldresults) # Search within the "basics" chapter results = searcher.search(userquery, filter=query.Term("chapter", "basics")) You can now specify a time limit for a search. 
If the search does not finish in the given time, a :class:`whoosh.searching.TimeLimit` exception is raised, but you can still retrieve the partial results from the collector. See the ``timelimit`` and ``greedy`` arguments in the :class:`whoosh.searching.Collector` documentation. Added back the ability to set :class:`whoosh.analysis.StemFilter` to use an unlimited cache. This is useful for one-shot batch indexing (see :doc:`../batch`). The ``normalize()`` method of the ``And`` and ``Or`` queries now merges overlapping range queries for more efficient queries. Query objects now have ``__hash__`` methods allowing them to be used as dictionary keys. The API of the highlight module has changed slightly. Most of the functions in the module have been converted to classes. However, most old code should still work. The ``NullFragmeter`` is now called ``WholeFragmenter``, but the old name is still available as an alias. Fixed MultiPool so it won't fill up the temp directory with job files. Fixed a bug where Phrase query objects did not use their boost factor. Fixed a bug where a fieldname after an open parenthesis wasn't parsed correctly. The change alters the semantics of certain parsing "corner cases" (such as ``a:b:c:d``). Whoosh 1.6 ========== The ``whoosh.writing.BatchWriter`` class is now called :class:`whoosh.writing.BufferedWriter`. It is similar to the old ``BatchWriter`` class but allows you to search and update the buffered documents as well as the documents that have been flushed to disk:: writer = writing.BufferedWriter(myindex) # You can update (replace) documents in RAM without having to commit them # to disk writer.add_document(path="/a", text="Hi there") writer.update_document(path="/a", text="Hello there") # Search committed and uncommited documents by getting a searcher from the # writer instead of the index searcher = writer.searcher() (BatchWriter is still available as an alias for backwards compatibility.) The :class:`whoosh.qparser.QueryParser` initialization method now requires a schema as the second argument. Previously the default was to create a ``QueryParser`` without a schema, which was confusing:: qp = qparser.QueryParser("content", myindex.schema) The :meth:`whoosh.searching.Searcher.search` method now takes a ``scored`` keyword. If you search with ``scored=False``, the results will be in "natural" order (the order the documents were added to the index). This is useful when you don't need scored results but want the convenience of the Results object. Added the :class:`whoosh.qparser.GtLtPlugin` parser plugin to allow greater than/less as an alternative syntax for ranges:: count:>100 tag:<=zebra date:>='29 march 2001' Added the ability to define schemas declaratively, similar to Django models:: from whoosh import index from whoosh.fields import SchemaClass, ID, KEYWORD, STORED, TEXT class MySchema(SchemaClass): uuid = ID(stored=True, unique=True) path = STORED tags = KEYWORD(stored=True) content = TEXT index.create_in("indexdir", MySchema) Whoosh 1.6.2: Added :class:`whoosh.searching.TermTrackingCollector` which tracks which part of the query matched which documents in the final results. Replaced the unbounded cache in :class:`whoosh.analysis.StemFilter` with a bounded LRU (least recently used) cache. This will make stemming analysis slightly slower but prevent it from eating up too much memory over time. 
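If you do want the old unbounded-cache behavior for one-shot batch indexing (as
described above for 1.7), a sketch like the following should work -- the
``cachesize`` values here are only illustrative, and ``-1`` requests an
unbounded cache::

    from whoosh import analysis

    # Larger (but still bounded) stem cache
    ana = analysis.StemmingAnalyzer(cachesize=100000)

    # Unbounded stem cache, for one-shot batch indexing
    batch_ana = analysis.StemmingAnalyzer(cachesize=-1)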
Added a simple :class:`whoosh.analysis.PyStemmerFilter` that works when the py-stemmer library is installed:: ana = RegexTokenizer() | PyStemmerFilter("spanish") The estimation of memory usage for the ``limitmb`` keyword argument to ``FileIndex.writer()`` is more accurate, which should help keep memory usage memory usage by the sorting pool closer to the limit. The ``whoosh.ramdb`` package was removed and replaced with a single ``whoosh.ramindex`` module. Miscellaneous bug fixes. Whoosh 1.5 ========== .. note:: Whoosh 1.5 is incompatible with previous indexes. You must recreate existing indexes with Whoosh 1.5. Fixed a bug where postings were not portable across different endian platforms. New generalized field cache system, using per-reader caches, for much faster sorting and faceting of search results, as well as much faster multi-term (e.g. prefix and wildcard) and range queries, especially for large indexes and/or indexes with multiple segments. Changed the faceting API. See :doc:`../facets`. Faster storage and retrieval of posting values. Added per-field ``multitoken_query`` attribute to control how the query parser deals with a "term" that when analyzed generates multiple tokens. The default value is `"first"` which throws away all but the first token (the previous behavior). Other possible values are `"and"`, `"or"`, or `"phrase"`. Added :class:`whoosh.analysis.DoubleMetaphoneFilter`, :class:`whoosh.analysis.SubstitutionFilter`, and :class:`whoosh.analysis.ShingleFilter`. Added :class:`whoosh.qparser.CopyFieldPlugin`. Added :class:`whoosh.query.Otherwise`. Generalized parsing of operators (such as OR, AND, NOT, etc.) in the query parser to make it easier to add new operators. In intend to add a better API for this in a future release. Switched NUMERIC and DATETIME fields to use more compact on-disk representations of numbers. Fixed a bug in the porter2 stemmer when stemming the string `"y"`. Added methods to :class:`whoosh.searching.Hit` to make it more like a `dict`. Short posting lists (by default, single postings) are inline in the term file instead of written to the posting file for faster retrieval and a small saving in disk space. Whoosh 1.3 ========== Whoosh 1.3 adds a more efficient DATETIME field based on the new tiered NUMERIC field, and the DateParserPlugin. See :doc:`../dates`. Whoosh 1.2 ========== Whoosh 1.2 adds tiered indexing for NUMERIC fields, resulting in much faster range queries on numeric fields. Whoosh 1.0 ========== Whoosh 1.0 is a major milestone release with vastly improved performance and several useful new features. *The index format of this version is not compatibile with indexes created by previous versions of Whoosh*. You will need to reindex your data to use this version. Orders of magnitude faster searches for common terms. Whoosh now uses optimizations similar to those in Xapian to skip reading low-scoring postings. Faster indexing and ability to use multiple processors (via ``multiprocessing`` module) to speed up indexing. Flexible Schema: you can now add and remove fields in an index with the :meth:`whoosh.writing.IndexWriter.add_field` and :meth:`whoosh.writing.IndexWriter.remove_field` methods. New hand-written query parser based on plug-ins. Less brittle, more robust, more flexible, and easier to fix/improve than the old pyparsing-based parser. On-disk formats now use 64-bit disk pointers allowing files larger than 4 GB. 
New :class:`whoosh.searching.Facets` class efficiently sorts results into facets based on any criteria that can be expressed as queries, for example tags or price ranges. New :class:`whoosh.writing.BatchWriter` class automatically batches up individual ``add_document`` and/or ``delete_document`` calls until a certain number of calls or a certain amount of time passes, then commits them all at once. New :class:`whoosh.analysis.BiWordFilter` lets you create bi-word indexed fields a possible alternative to phrase searching. Fixed bug where files could be deleted before a reader could open them in threaded situations. New :class:`whoosh.analysis.NgramFilter` filter, :class:`whoosh.analysis.NgramWordAnalyzer` analyzer, and :class:`whoosh.fields.NGRAMWORDS` field type allow producing n-grams from tokenized text. Errors in query parsing now raise a specific ``whoosh.qparse.QueryParserError`` exception instead of a generic exception. Previously, the query string ``*`` was optimized to a :class:`whoosh.query.Every` query which matched every document. Now the ``Every`` query only matches documents that actually have an indexed term from the given field, to better match the intuitive sense of what a query string like ``tag:*`` should do. New :meth:`whoosh.searching.Searcher.key_terms_from_text` method lets you extract key words from arbitrary text instead of documents in the index. Previously the :meth:`whoosh.searching.Searcher.key_terms` and :meth:`whoosh.searching.Results.key_terms` methods required that the given field store term vectors. They now also work if the given field is stored instead. They will analyze the stored string into a term vector on-the-fly. The field must still be indexed. User API changes ================ The default for the ``limit`` keyword argument to :meth:`whoosh.searching.Searcher.search` is now ``10``. To return all results in a single ``Results`` object, use ``limit=None``. The ``Index`` object no longer represents a snapshot of the index at the time the object was instantiated. Instead it always represents the index in the abstract. ``Searcher`` and ``IndexReader`` objects obtained from the ``Index`` object still represent the index as it was at the time they were created. Because the ``Index`` object no longer represents the index at a specific version, several methods such as ``up_to_date`` and ``refresh`` were removed from its interface. The Searcher object now has :meth:`~whoosh.searching.Searcher.last_modified`, :meth:`~whoosh.searching.Searcher.up_to_date`, and :meth:`~whoosh.searching.Searcher.refresh` methods similar to those that used to be on ``Index``. The document deletion and field add/remove methods on the ``Index`` object now create a writer behind the scenes to accomplish each call. This means they write to the index immediately, so you don't need to call ``commit`` on the ``Index``. 
Also, it will be much faster if you need to call them multiple times to create your own writer instead:: # Don't do this for id in my_list_of_ids_to_delete: myindex.delete_by_term("id", id) myindex.commit() # Instead do this writer = myindex.writer() for id in my_list_of_ids_to_delete: writer.delete_by_term("id", id) writer.commit() The ``postlimit`` argument to ``Index.writer()`` has been changed to ``postlimitmb`` and is now expressed in megabytes instead of bytes:: writer = myindex.writer(postlimitmb=128) Instead of having to import ``whoosh.filedb.filewriting.NO_MERGE`` or ``whoosh.filedb.filewriting.OPTIMIZE`` to use as arguments to ``commit()``, you can now simply do the following:: # Do not merge segments writer.commit(merge=False) # or # Merge all segments writer.commit(optimize=True) The ``whoosh.postings`` module is gone. The ``whoosh.matching`` module contains classes for posting list readers. Whoosh no longer maps field names to numbers for internal use or writing to disk. Any low-level method that accepted field numbers now accept field names instead. Custom Weighting implementations that use the ``final()`` method must now set the ``use_final`` attribute to ``True``:: from whoosh.scoring import BM25F class MyWeighting(BM25F): use_final = True def final(searcher, docnum, score): return score + docnum * 10 This disables the new optimizations, forcing Whoosh to score every matching document. :class:`whoosh.writing.AsyncWriter` now takes an :class:`whoosh.index.Index` object as its first argument, not a callable. Also, the keyword arguments to pass to the index's ``writer()`` method should now be passed as a dictionary using the ``writerargs`` keyword argument. Whoosh now stores per-document field length using an approximation rather than exactly. For low numbers the approximation is perfectly accurate, while high numbers will be approximated less accurately. The ``doc_field_length`` method on searchers and readers now takes a second argument representing the default to return if the given document and field do not have a length (i.e. the field is not scored or the field was not provided for the given document). The :class:`whoosh.analysis.StopFilter` now has a ``maxsize`` argument as well as a ``minsize`` argument to its initializer. Analyzers that use the ``StopFilter`` have the ``maxsize`` argument in their initializers now also. The interface of :class:`whoosh.writing.AsyncWriter` has changed. Misc ==== * Because the file backend now writes 64-bit disk pointers and field names instead of numbers, the size of an index on disk will grow compared to previous versions. * Unit tests should no longer leave directories and files behind. Whoosh-2.5.7/docs/source/releases/2_0.rst0000644000076500000240000003066212254366350020246 0ustar mattstaff00000000000000======================== Whoosh 2.x release notes ======================== Whoosh 2.5 ========== * Whoosh 2.5 will read existing indexes, but segments created by 2.5 will not be readable by older versions of Whoosh. * As a replacement for field caches to speed up sorting, Whoosh now supports adding a ``sortable=True`` keyword argument to fields. This makes Whoosh store a sortable representation of the field's values in a "column" format (which associates a "key" value with each document). This is more robust, efficient, and customizable than the old behavior. You should now specify ``sortable=True`` on fields that you plan on using to sort or group search results. 
(You can still sort/group on fields that don't have ``sortable=True``, however it will use more RAM and be slower as Whoosh caches the field values in memory.) Fields that use ``sortable=True`` can avoid specifying ``stored=True``. The field's value will still be available on ``Hit`` objects (the value will be retrieved from the column instead of from the stored fields). This may actually be faster for certain types of values. * Whoosh will now detect common types of OR queries and use optimized read-ahead matchers to speed them up by several times. * Whoosh now includes pure-Python implementations of the Snowball stemmers and stop word lists for various languages adapted from NLTK. These are available through the :class:`whoosh.analysis.LanguageAnalyzer` analyzer or through the ``lang=`` keyword argument to the :class:`~whoosh.fields.TEXT` field. * You can now use the :meth:`whoosh.filedb.filestore.Storage.create()` and :meth:`whoosh.filedb.filestore.Storage.destory()` methods as a consistent API to set up and tear down different types of storage. * Many bug fixes and speed improvements. * Switched unit tests to use ``py.test`` instead of ``nose``. * Removed obsolete ``SpellChecker`` class. Whoosh 2.4 ========== * By default, Whoosh now assembles the individual files of a segment into a single file when committing. This has a small performance penalty but solves a problem where Whoosh can keep too many files open. Whoosh is also now smarter about using mmap. * Added functionality to index and search hierarchical documents. See :doc:`/nested`. * Rewrote the Directed Acyclic Word Graph implementation (used in spell checking) to be faster and more space-efficient. Word graph files created by previous versions will be ignored, meaning that spell checking may become slower unless/until you replace the old segments (for example, by optimizing). * Rewrote multiprocessing indexing to be faster and simpler. You can now do ``myindex.writer(procs=n)`` to get a multiprocessing writer, or ``myindex.writer(procs=n, multisegment=True)`` to get a multiprocessing writer that leaves behind multiple segments, like the old MultiSegmentWriter. (``MultiSegmentWriter`` is still available as a function that returns the new class.) * When creating ``Term`` query objects for special fields (e.g. NUMERIC or BOOLEAN), you can now use the field's literal type instead of a string as the second argument, for example ``Term("num", 20)`` or ``Term("bool", True)``. (This change may cause problems interacting with functions that expect query objects to be pure textual, such as spell checking.) * All writing to and reading from on-disk indexes is now done through "codec" objects. This architecture should make it easier to add optional or experimental features, and maintain backwards compatibility. * Fixes issues #75, #137, #206, #213, #215, #219, #223, #226, #230, #233, #238, #239, #240, #241, #243, #244, #245, #252, #253, and other bugs. Thanks to Thomas Waldmann and Alexei Gousev for the help! Whoosh 2.3.2 ============ * Fixes bug in BM25F scoring function, leading to increased precision in search results. * Fixes issues #203, #205, #206, #208, #209, #212. Whoosh 2.3.1 ============ * Fixes issue #200. Whoosh 2.3 ========== * Added a :class:`whoosh.query.Regex` term query type, similar to :class:`whoosh.query.Wildcard`. The parser does not allow regex term queries by default. You need to add the :class:`whoosh.qparser.RegexPlugin` plugin. 
After you add the plugin, you can use ``r"expression"`` query syntax for regular expression term queries. For example, ``r"foo.*bar"``. * Added the :class:`whoosh.qparser.PseudoFieldPlugin` parser plugin. This plugin lets you create "pseudo-fields" that run a transform function on whatever query syntax the user applies the field to. This is fairly advanced functionality right now; I'm trying to think of ways to make its power easier to access. * The documents in the lists in the dictionary returned by ``Results.groups()`` by default are now in the same relative order as in the results. This makes it much easier to display the "top N" results in each category, for example. * The ``groupids`` keyword argument to ``Searcher.search`` has been removed. Instead you can now pass a :class:`whoosh.sorting.FacetMap` object to the ``Searcher.search`` method's ``maptype`` argument to control how faceted documents are grouped, and/or set the ``maptype`` argument on individual :class:`whoosh.sorting.FacetType`` objects to set custom grouping per facet. See :doc:`../facets` for more information. * Calling ``Searcher.documents()`` or ``Searcher.document_numbers()`` with no arguments now yields all documents/numbers. * Calling ``Writer.update_document()`` with no unique fields is now equivalent to calling ``Writer.add_document()`` with the same arguments. * Fixed a problem with keyword expansion where the code was building a cache that was fast on small indexes, but unacceptably slow on large indexes. * Added the hyphen (``-``) to the list of characters that match a "wildcard" token, to make parsing slightly more predictable. A true fix will have to wait for another parser rewrite. * Fixed an unused ``__future__`` import and use of ``float("nan")`` which were breaking under Python 2.5. * Fixed a bug where vectored fields with only one term stored an empty term vector. * Various other bug fixes. Whoosh 2.2 ========== * Fixes several bugs, including a bad bug in BM25F scoring. * Added ``allow_overlap`` option to :class:`whoosh.sorting.StoredFieldFacet`. * In :meth:`~whoosh.writing.IndexWriter.add_document`, You can now pass query-like strings for BOOLEAN and DATETIME fields (e.g ``boolfield="true"`` and ``dtfield="20101131-16:01"``) as an alternative to actual ``bool`` or ``datetime`` objects. The implementation of this is incomplete: it only works in the default ``filedb`` backend, and if the field is stored, the stored value will be the string, not the parsed object. * Added :class:`whoosh.analysis.CompoundWordFilter` and :class:`whoosh.analysis.TeeFilter`. Whoosh 2.1 ========== This release fixes several bugs, and contains speed improvments to highlighting. See :doc:`/highlight` for more information. Whoosh 2.0 ========== Improvements ------------ * Whoosh is now compatible with Python 3 (tested with Python 3.2). Special thanks to Vinay Sajip who did the work, and also Jordan Sherer who helped fix later issues. * Sorting and grouping (faceting) now use a new system of "facet" objects which are much more flexible than the previous field-based system. For example, to sort by first name and then score:: from whoosh import sorting mf = sorting.MultiFacet([sorting.FieldFacet("firstname"), sorting.ScoreFacet()]) results = searcher.search(myquery, sortedby=mf) In addition to the previously supported sorting/grouping by field contents and/or query results, you can now use numeric ranges, date ranges, score, and more. The new faceting system also supports overlapping groups. 
  (The old "Sorter" API still works but is deprecated and may be removed in a
  future version.)

  See :doc:`/facets` for more information.

* Completely revamped spell-checking to make it much faster, easier, and more
  flexible. You can enable generation of the graph files used by spell checking
  using the ``spelling=True`` argument to a field type::

      schema = fields.Schema(text=fields.TEXT(spelling=True))

  (Spelling suggestion methods will work on fields without ``spelling=True``
  but will be slower.)

  The spelling graph will be updated automatically as new documents are added
  -- it is no longer necessary to maintain a separate "spelling index".

  You can get suggestions for individual words using
  :meth:`whoosh.searching.Searcher.suggest`::

      suglist = searcher.suggest("content", "werd", limit=3)

  Whoosh now includes convenience methods to spell-check and correct user
  queries, with optional highlighting of corrections using the
  ``whoosh.highlight`` module::

      from whoosh import highlight, qparser

      # User query string
      qstring = request.get("q")

      # Parse into query object
      parser = qparser.QueryParser("content", myindex.schema)
      qobject = parser.parse(qstring)

      results = searcher.search(qobject)

      if not results:
          correction = searcher.correct_query(qobject, qstring)
          # correction.query = corrected query object
          # correction.string = corrected query string

          # Format the corrected query string with HTML highlighting
          cstring = correction.format_string(highlight.HtmlFormatter())

  Spelling suggestions can come from field contents and/or lists of words. For
  stemmed fields the spelling suggestions automatically use the unstemmed
  forms of the words.

  There are APIs for spelling suggestions and query correction, so highly
  motivated users could conceivably replace the defaults with more
  sophisticated behaviors (for example, to take context into account).

  See :doc:`/spelling` for more information.

* :class:`whoosh.query.FuzzyTerm` now uses the new word graph feature as well
  and so is much faster.

* You can now set a boost factor for individual documents as you index them,
  to increase the score of terms in those documents in searches. See the
  documentation for the :meth:`~whoosh.writing.IndexWriter.add_document` method
  for more information.

* Added built-in recording of which terms matched in which documents. Use the
  ``terms=True`` argument to :meth:`whoosh.searching.Searcher.search` and use
  :meth:`whoosh.searching.Hit.matched_terms` and
  :meth:`whoosh.searching.Hit.contains_term` to check matched terms.

* Whoosh now supports whole-term quality optimizations, so for example if the
  system knows that a UnionMatcher cannot possibly contribute to the "top N"
  results unless both sub-matchers match, it will replace the UnionMatcher with
  an IntersectionMatcher which is faster to compute. The performance
  improvement is not as dramatic as from block quality optimizations, but it
  can be noticeable.

* Fixed a bug that prevented block quality optimizations in queries with words
  not in the index, which could severely degrade performance.

* Block quality optimizations now use the actual scoring algorithm to
  calculate block quality instead of an approximation, which fixes issues
  where ordering of results could be different for searches with and without
  the optimizations.

* The BOOLEAN field type now supports field boosts.

* Re-architected the query parser to make the code easier to understand.
  Custom parser plugins from previous versions will probably break in
  Whoosh 2.0.

* Various bug-fixes and performance improvements.
* Removed the "read lock", which caused more problems than it solved. Now when opening a reader, if segments are deleted out from under the reader as it is opened, the code simply retries. Compatibility ------------- * The term quality optimizations required changes to the on-disk formats. Whoosh 2.0 if backwards-compatible with the old format. As you rewrite an index using Whoosh 2.0, by default it will use the new formats for new segments, making the index incompatible with older versions. To upgrade an existing index to use the new formats immediately, use ``Index.optimize()``. * Removed the experimental ``TermTrackingCollector`` since it is replaced by the new built-in term recording functionality. * Removed the experimental ``Searcher.define_facets`` feature until a future release when it will be replaced by a more robust and useful feature. * Reader iteration methods (``__iter__``, ``iter_from``, ``iter_field``, etc.) now yield :class:`whoosh.reading.TermInfo` objects. * The arguments to :class:`whoosh.query.FuzzyTerm` changed. Whoosh-2.5.7/docs/source/releases/index.rst0000644000076500000240000000014312254366350020764 0ustar mattstaff00000000000000============= Release notes ============= .. toctree:: :maxdepth: 2 2_0 1_0 0_3 Whoosh-2.5.7/docs/source/schema.rst0000644000076500000240000003551512254366350017325 0ustar mattstaff00000000000000================== Designing a schema ================== About schemas and fields ======================== The schema specifies the fields of documents in an index. Each document can have multiple fields, such as title, content, url, date, etc. Some fields can be indexed, and some fields can be stored with the document so the field value is available in search results. Some fields will be both indexed and stored. The schema is the set of all possible fields in a document. Each individual document might only use a subset of the available fields in the schema. For example, a simple schema for indexing emails might have fields like ``from_addr``, ``to_addr``, ``subject``, ``body``, and ``attachments``, where the ``attachments`` field lists the names of attachments to the email. For emails without attachments, you would omit the attachments field. Built-in field types ==================== Whoosh provides some useful predefined field types: :class:`whoosh.fields.TEXT` This type is for body text. It indexes (and optionally stores) the text and stores term positions to allow phrase searching. ``TEXT`` fields use :class:`~whoosh.analysis.StandardAnalyzer` by default. To specify a different analyzer, use the ``analyzer`` keyword argument to the constructor, e.g. ``TEXT(analyzer=analysis.StemmingAnalyzer())``. See :doc:`analysis`. By default, ``TEXT`` fields store position information for each indexed term, to allow you to search for phrases. If you don't need to be able to search for phrases in a text field, you can turn off storing term positions to save space. Use ``TEXT(phrase=False)``. By default, ``TEXT`` fields are not stored. Usually you will not want to store the body text in the search index. Usually you have the indexed documents themselves available to read or link to based on the search results, so you don't need to store their text in the search index. However, in some circumstances it can be useful (see :doc:`highlight`). Use ``TEXT(stored=True)`` to specify that the text should be stored in the index. :class:`whoosh.fields.KEYWORD` This field type is designed for space- or comma-separated keywords. 
    This type is indexed and searchable (and optionally stored). To save
    space, it does not support phrase searching.

    To store the value of the field in the index, use ``stored=True`` in the
    constructor. To automatically lowercase the keywords before indexing them,
    use ``lowercase=True``. By default, the keywords are space separated. To
    separate the keywords by commas instead (to allow keywords containing
    spaces), use ``commas=True``. If your users will use the keyword field for
    searching, use ``scorable=True``.

:class:`whoosh.fields.ID`
    The ``ID`` field type simply indexes (and optionally stores) the entire
    value of the field as a single unit (that is, it doesn't break it up into
    individual terms). This type of field does not store frequency
    information, so it's quite compact, but not very useful for scoring.

    Use ``ID`` for fields like url or path (the URL or file path of a
    document), date, category -- fields where the value must be treated as a
    whole, and each document only has one value for the field.

    By default, ``ID`` fields are not stored. Use ``ID(stored=True)`` to
    specify that the value of the field should be stored with the document for
    use in the search results. For example, you would want to store the value
    of a url field so you could provide links to the original in your search
    results.

:class:`whoosh.fields.STORED`
    This field is stored with the document, but not indexed and not
    searchable. This is useful for document information you want to display to
    the user in the search results, but don't need to be able to search for.

:class:`whoosh.fields.NUMERIC`
    This field stores int, long, or floating point numbers in a compact,
    sortable format.

:class:`whoosh.fields.DATETIME`
    This field stores datetime objects in a compact, sortable format.

:class:`whoosh.fields.BOOLEAN`
    This simple field indexes boolean values and allows users to search for
    ``yes``, ``no``, ``true``, ``false``, ``1``, ``0``, ``t`` or ``f``.

:class:`whoosh.fields.NGRAM`
    TBD.

Expert users can create their own field types.

Creating a Schema
=================

To create a schema::

    from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
    from whoosh.analysis import StemmingAnalyzer

    schema = Schema(from_addr=ID(stored=True),
                    to_addr=ID(stored=True),
                    subject=TEXT(stored=True),
                    body=TEXT(analyzer=StemmingAnalyzer()),
                    tags=KEYWORD)

If you aren't specifying any constructor keyword arguments to one of the
predefined fields, you can leave off the brackets (e.g. ``fieldname=TEXT``
instead of ``fieldname=TEXT()``). Whoosh will instantiate the class for you.

Alternatively you can create a schema declaratively using the ``SchemaClass``
base class::

    from whoosh.fields import SchemaClass, TEXT, KEYWORD, ID, STORED

    class MySchema(SchemaClass):
        path = ID(stored=True)
        title = TEXT(stored=True)
        content = TEXT
        tags = KEYWORD

You can pass a declarative class to :func:`~whoosh.index.create_in` or
:meth:`~whoosh.store.Storage.create_index()` instead of a
:class:`~whoosh.fields.Schema` instance.

Modifying the schema after indexing
===================================

After you have created an index, you can add fields to or remove fields from
the schema using the ``add_field()`` and ``remove_field()`` methods. These
methods are on the ``Writer`` object::

    writer = ix.writer()
    writer.add_field("fieldname", fields.TEXT(stored=True))
    writer.remove_field("content")
    writer.commit()

(If you're going to modify the schema *and* add documents using the same
writer, you must call ``add_field()`` and/or ``remove_field()`` *before* you
add any documents.)
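For example, a minimal sketch of the correct ordering when doing both with one
writer (the ``summary`` field and the document values here are only
illustrative)::

    writer = ix.writer()

    # Change the schema first...
    writer.add_field("summary", fields.TEXT(stored=True))

    # ...then add or update documents with the same writer
    writer.add_document(title=u"Some document", summary=u"A short summary")
    writer.commit()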
These methods are also on the ``Index`` object as a convenience, but when you call them on an ``Index``, the Index object simply creates the writer, calls the corresponding method on it, and commits, so if you want to add or remove more than one field, it's much more efficient to create the writer yourself:: ix.add_field("fieldname", fields.KEYWORD) In the ``filedb`` backend, removing a field simply removes that field from the *schema* -- the index will not get smaller, data about that field will remain in the index until you optimize. Optimizing will compact the index, removing references to the deleted field as it goes:: writer = ix.writer() writer.add_field("uuid", fields.ID(stored=True)) writer.remove_field("path") writer.commit(optimize=True) Because data is stored on disk with the field name, *do not* add a new field with the same name as a deleted field without optimizing the index in between:: writer = ix.writer() writer.delete_field("path") # Don't do this!!! writer.add_field("path", fields.KEYWORD) (A future version of Whoosh may automatically prevent this error.) Dynamic fields ============== Dynamic fields let you associate a field type with any field name that matches a given "glob" (a name pattern containing ``*``, ``?``, and/or ``[abc]`` wildcards). You can add dynamic fields to a new schema using the ``add()`` method with the ``glob`` keyword set to True:: schema = fields.Schema(...) # Any name ending in "_d" will be treated as a stored # DATETIME field schema.add("*_d", fields.DATETIME(stored=True), glob=True) To set up a dynamic field on an existing index, use the same ``IndexWriter.add_field`` method as if you were adding a regular field, but with the ``glob`` keyword argument set to ``True``:: writer = ix.writer() writer.add_field("*_d", fields.DATETIME(stored=True), glob=True) writer.commit() To remove a dynamic field, use the ``IndexWriter.remove_field()`` method with the glob as the name:: writer = ix.writer() writer.remove_field("*_d") writer.commit() For example, to allow documents to contain any field name that ends in ``_id`` and associate it with the ``ID`` field type:: schema = fields.Schema(path=fields.ID) schema.add("*_id", fields.ID, glob=True) ix = index.create_in("myindex", schema) w = ix.writer() w.add_document(path=u"/a", test_id=u"alfa") w.add_document(path=u"/b", class_id=u"MyClass") # ... w.commit() qp = qparser.QueryParser("path", schema=schema) q = qp.parse(u"test_id:alfa") with ix.searcher() as s: results = s.search(q) Advanced schema setup ===================== Field boosts ------------ You can specify a field boost for a field. This is a multiplier applied to the score of any term found in the field. For example, to make terms found in the title field score twice as high as terms in the body field:: schema = Schema(title=TEXT(field_boost=2.0), body=TEXT) Field types ----------- The predefined field types listed above are subclasses of ``fields.FieldType``. ``FieldType`` is a pretty simple class. Its attributes contain information that define the behavior of a field. ============ =============== ====================================================== Attribute Type Description ============ =============== ====================================================== format fields.Format Defines what kind of information a field records about each term, and how the information is stored on disk. vector fields.Format Optional: if defined, the format in which to store per-document forward-index information for this field. 
scorable bool If True, the length of (number of terms in) the field in each document is stored in the index. Slightly misnamed, since field lengths are not required for all scoring. However, field lengths are required to get proper results from BM25F. stored bool If True, the value of this field is stored in the index. unique bool If True, the value of this field may be used to replace documents with the same value when the user calls :meth:`~whoosh.writing.IndexWriter.document_update` on an ``IndexWriter``. ============ =============== ====================================================== The constructors for most of the predefined field types have parameters that let you customize these parts. For example: * Most of the predefined field types take a stored keyword argument that sets FieldType.stored. * The ``TEXT()`` constructor takes an ``analyzer`` keyword argument that is passed on to the format object. Formats ------- A ``Format`` object defines what kind of information a field records about each term, and how the information is stored on disk. For example, the ``Existence`` format would store postings like this: ==== ==== Doc ==== ==== 10 20 30 ==== ==== Whereas the ``Positions`` format would store postings like this: ===== ============= Doc Positions ===== ============= 10 ``[1,5,23]`` 20 ``[45]`` 30 ``[7,12]`` ===== ============= The indexing code passes the unicode string for a field to the field's ``Format`` object. The ``Format`` object calls its analyzer (see text analysis) to break the string into tokens, then encodes information about each token. Whoosh ships with the following pre-defined formats. =============== ================================================================ Class name Description =============== ================================================================ Stored A "null" format for fields that are stored but not indexed. Existence Records only whether a term is in a document or not, i.e. it does not store term frequency. Useful for identifier fields (e.g. path or id) and "tag"-type fields, where the frequency is expected to always be 0 or 1. Frequency Stores the number of times each term appears in each document. Positions Stores the number of times each term appears in each document, and at what positions. =============== ================================================================ The ``STORED`` field type uses the ``Stored`` format (which does nothing, so ``STORED`` fields are not indexed). The ``ID`` type uses the ``Existence`` format. The ``KEYWORD`` type uses the ``Frequency`` format. The ``TEXT`` type uses the ``Positions`` format if it is instantiated with ``phrase=True`` (the default), or ``Frequency`` if ``phrase=False``. In addition, the following formats are implemented for the possible convenience of expert users, but are not currently used in Whoosh: ================= ================================================================ Class name Description ================= ================================================================ DocBoosts Like Existence, but also stores per-document boosts Characters Like Positions, but also stores the start and end character indices of each term PositionBoosts Like Positions, but also stores per-position boosts CharacterBoosts Like Positions, but also stores the start and end character indices of each term and per-position boosts ================= ================================================================ Vectors ------- The main index is an inverted index. 
It maps terms to the documents they appear in. It is also sometimes useful to store a forward index, also known as a term vector, that maps documents to the terms that appear in them. For example, imagine an inverted index like this for a field: ========== ========================================================= Term Postings ========== ========================================================= apple ``[(doc=1, freq=2), (doc=2, freq=5), (doc=3, freq=1)]`` bear ``[(doc=2, freq=7)]`` ========== ========================================================= The corresponding forward index, or term vector, would be: ========== ====================================================== Doc Postings ========== ====================================================== 1 ``[(text=apple, freq=2)]`` 2 ``[(text=apple, freq=5), (text='bear', freq=7)]`` 3 ``[(text=apple, freq=1)]`` ========== ====================================================== If you set ``FieldType.vector`` to a ``Format`` object, the indexing code will use the ``Format`` object to store information about the terms in each document. Currently by default Whoosh does not make use of term vectors at all, but they are available to expert users who want to implement their own field types. Whoosh-2.5.7/docs/source/searching.rst0000644000076500000240000003270412254366350020025 0ustar mattstaff00000000000000============= How to search ============= Once you've created an index and added documents to it, you can search for those documents. The ``Searcher`` object ======================= To get a :class:`whoosh.searching.Searcher` object, call ``searcher()`` on your ``Index`` object:: searcher = myindex.searcher() You'll usually want to open the searcher using a ``with`` statement so the searcher is automatically closed when you're done with it (searcher objects represent a number of open files, so if you don't explicitly close them and the system is slow to collect them, you can run out of file handles):: with ix.searcher() as searcher: ... This is of course equivalent to:: try: searcher = ix.searcher() ... finally: searcher.close() The ``Searcher`` object is the main high-level interface for reading the index. It has lots of useful methods for getting information about the index, such as ``lexicon(fieldname)``. :: >>> list(searcher.lexicon("content")) [u"document", u"index", u"whoosh"] However, the most important method on the ``Searcher`` object is :meth:`~whoosh.searching.Searcher.search`, which takes a :class:`whoosh.query.Query` object and returns a :class:`~whoosh.searching.Results` object:: from whoosh.qparser import QueryParser qp = QueryParser("content", schema=myindex.schema) q = qp.parse(u"hello world") with myindex.searcher() as s: results = s.search(q) By default the results contains at most the first 10 matching documents. To get more results, use the ``limit`` keyword:: results = s.search(q, limit=20) If you want all results, use ``limit=None``. However, setting the limit whenever possible makes searches faster because Whoosh doesn't need to examine and score every document. Since displaying a page of results at a time is a common pattern, the ``search_page`` method lets you conveniently retrieve only the results on a given page:: results = s.search_page(q, 1) The default page length is 10 hits. You can use the ``pagelen`` keyword argument to set a different page length:: results = s.search_page(q, 5, pagelen=20) Results object ============== The :class:`~whoosh.searching.Results` object acts like a list of the matched documents. 
You can use it to access the stored fields of each hit document, to display to
the user.

::

    >>> # Show the best hit's stored fields
    >>> results[0]
    {"title": u"Hello World in Python", "path": u"/a/b/c"}

    >>> results[0:2]
    [{"title": u"Hello World in Python", "path": u"/a/b/c"},
     {"title": u"Foo", "path": u"/bar"}]

By default, ``Searcher.search(myquery)`` limits the number of hits to 10, so
the number of scored hits in the ``Results`` object may be less than the
number of matching documents in the index.

::

    >>> # How many documents in the entire index would have matched?
    >>> len(results)
    27
    >>> # How many scored and sorted documents in this Results object?
    >>> # This will often be less than len() if the number of hits was limited
    >>> # (the default).
    >>> results.scored_length()
    10

Calling ``len(Results)`` runs a fast (unscored) version of the query again to
figure out the total number of matching documents. This is usually very fast
but for large indexes it can cause a noticeable delay. If you want to avoid
this delay on very large indexes, you can use the
:meth:`~whoosh.searching.Results.has_exact_length`,
:meth:`~whoosh.searching.Results.estimated_length`, and
:meth:`~whoosh.searching.Results.estimated_min_length` methods to estimate the
number of matching documents without calling ``len()``::

    found = results.scored_length()
    if results.has_exact_length():
        print("Scored", found, "of exactly", len(results), "documents")
    else:
        low = results.estimated_min_length()
        high = results.estimated_length()
        print("Scored", found, "of between", low, "and", high, "documents")

Scoring and sorting
===================

Scoring
-------

Normally the list of result documents is sorted by *score*. The
:mod:`whoosh.scoring` module contains implementations of various scoring
algorithms. The default is :class:`~whoosh.scoring.BM25F`.

You can set the scoring object to use when you create the searcher using the
``weighting`` keyword argument::

    from whoosh import scoring

    with myindex.searcher(weighting=scoring.TF_IDF()) as s:
        ...

A weighting model is a :class:`~whoosh.scoring.WeightingModel` subclass with a
``scorer()`` method that produces a "scorer" instance. This instance has a
method that takes the current matcher and returns a floating point score.

Sorting
-------

See :doc:`facets`.

Highlighting snippets and More Like This
========================================

See :doc:`highlight` and :doc:`keywords` for information on these topics.

Filtering results
=================

You can use the ``filter`` keyword argument to ``search()`` to specify a set
of documents to permit in the results. The argument can be a
:class:`whoosh.query.Query` object, a :class:`whoosh.searching.Results`
object, or a set-like object containing document numbers. The searcher caches
filters, so if, for example, you use the same query filter with a searcher
multiple times, the additional searches will be faster because the searcher
will cache the results of running the filter query. You can also specify a
``mask`` keyword argument to specify a set of documents that are not permitted
in the results.
::

    with myindex.searcher() as s:
        qp = qparser.QueryParser("content", myindex.schema)
        user_q = qp.parse(query_string)

        # Only show documents in the "rendering" chapter
        allow_q = query.Term("chapter", "rendering")
        # Don't show any documents where the "tag" field contains "todo"
        restrict_q = query.Term("tag", "todo")

        results = s.search(user_q, filter=allow_q, mask=restrict_q)

(If you specify both a ``filter`` and a ``mask``, and a matching document
appears in both, the ``mask`` "wins" and the document is not permitted.)

To find out how many results were filtered out of the results, use
``results.filtered_count`` (or ``resultspage.results.filtered_count``)::

    with myindex.searcher() as s:
        qp = qparser.QueryParser("content", myindex.schema)
        user_q = qp.parse(query_string)

        # Filter documents older than 7 days
        old_q = query.DateRange("created", None, datetime.now() - timedelta(days=7))
        results = s.search(user_q, mask=old_q)

        print("Filtered out %d older documents" % results.filtered_count)

Which terms from my query matched?
==================================

You can use the ``terms=True`` keyword argument to ``search()`` to have the
search record which terms in the query matched which documents::

    with myindex.searcher() as s:
        results = s.search(myquery, terms=True)

You can then get information about which terms matched from the
:class:`whoosh.searching.Results` and :class:`whoosh.searching.Hit` objects::

    # Was this results object created with terms=True?
    if results.has_matched_terms():
        # What terms matched in the results?
        print(results.matched_terms())

        # What terms matched in each hit?
        for hit in results:
            print(hit.matched_terms())

.. _collapsing:

Collapsing results
==================

Whoosh lets you eliminate all but the top N documents with the same facet key
from the results. This can be useful in a few situations:

* Eliminating duplicates at search time.

* Restricting the number of matches per source. For example, in a web search
  application, you might want to show at most three matches from any website.

Whether a document should be collapsed is determined by the value of a
"collapse facet". If a document has an empty collapse key, it will never be
collapsed, but otherwise only the top N documents with the same collapse key
will appear in the results.

See :doc:`/facets` for information on facets.

::

    with myindex.searcher() as s:
        # Set the facet to collapse on and the maximum number of documents per
        # facet value (default is 1)
        results = s.collector(collapse="hostname", collapse_limit=3)

        # Dictionary mapping collapse keys to the number of documents that
        # were filtered out by collapsing on that key
        print(results.collapsed_counts)

Collapsing works with both scored and sorted results. You can use any of the
facet types available in the :mod:`whoosh.sorting` module.

By default, Whoosh uses the results order (score or sort key) to determine the
documents to collapse. For example, in scored results, the best scoring
documents would be kept. You can optionally specify a ``collapse_order`` facet
to control which documents to keep when collapsing.
For example, in a product search you could display results sorted by
decreasing price, and eliminate all but the highest rated item of each
product type::

    from whoosh import sorting

    with myindex.searcher() as s:
        price_facet = sorting.FieldFacet("price", reverse=True)
        type_facet = sorting.FieldFacet("type")
        rating_facet = sorting.FieldFacet("rating", reverse=True)

        results = s.search(myquery,
                           sortedby=price_facet,  # Sort by reverse price
                           collapse=type_facet,  # Collapse on product type
                           collapse_order=rating_facet  # Collapse to highest rated
                           )

The collapsing happens during the search, so it is usually more efficient
than finding everything and post-processing the results. However, if the
collapsing eliminates a large number of documents, collapsed search can take
longer because the search has to consider more documents and remove many
already-collected documents.

Since this collector must sometimes go back and remove already-collected
documents, if you use it in combination with
:class:`~whoosh.collectors.TermsCollector` and/or
:class:`~whoosh.collectors.FacetCollector`, those collectors may contain
information about documents that were filtered out of the final results by
collapsing.


Time limited searches
=====================

To limit the amount of time a search can take::

    from whoosh.collectors import TimeLimitCollector, TimeLimit

    with myindex.searcher() as s:
        # Get a collector object
        c = s.collector(limit=None, sortedby="title_exact")
        # Wrap it in a TimeLimitCollector and set the time limit to 10 seconds
        tlc = TimeLimitCollector(c, timelimit=10.0)

        # Try searching
        try:
            s.search_with_collector(myquery, tlc)
        except TimeLimit:
            print("Search took too long, aborting!")

        # You can still get partial results from the collector
        results = tlc.results()


Convenience methods
===================

The :meth:`~whoosh.searching.Searcher.document` and
:meth:`~whoosh.searching.Searcher.documents` methods on the ``Searcher``
object let you retrieve the stored fields of documents matching terms you
pass in keyword arguments.

This is especially useful for fields such as dates/times, identifiers,
paths, and so on.

::

    >>> list(searcher.documents(indexeddate=u"20051225"))
    [{"title": u"Christmas presents"}, {"title": u"Turkey dinner report"}]

    >>> print searcher.document(path=u"/a/b/c")
    {"title": "Document C"}

These methods have some limitations:

* The results are not scored.

* Multiple keywords are always AND-ed together.

* The entire value of each keyword argument is considered a single term; you
  can't search for multiple terms in the same field.


Combining Results objects
=========================

It is sometimes useful to use the results of another query to influence the
order of a :class:`whoosh.searching.Results` object.

For example, you might have a "best bet" field. This field contains
hand-picked keywords for documents. When the user searches for those
keywords, you want those documents to be placed at the top of the results
list. You could try to do this by boosting the "bestbet" field tremendously,
but that can have unpredictable effects on scoring.
It's much easier to simply run the query twice and combine the results:: # Parse the user query userquery = queryparser.parse(querystring) # Get the terms searched for termset = set() userquery.existing_terms(termset) # Formulate a "best bet" query for the terms the user # searched for in the "content" field bbq = Or([Term("bestbet", text) for fieldname, text in termset if fieldname == "content"]) # Find documents matching the searched for terms results = s.search(bbq, limit=5) # Find documents that match the original query allresults = s.search(userquery, limit=10) # Add the user query results on to the end of the "best bet" # results. If documents appear in both result sets, push them # to the top of the combined results. results.upgrade_and_extend(allresults) The ``Results`` object supports the following methods: ``Results.extend(results)`` Adds the documents in 'results' on to the end of the list of result documents. ``Results.filter(results)`` Removes the documents in 'results' from the list of result documents. ``Results.upgrade(results)`` Any result documents that also appear in 'results' are moved to the top of the list of result documents. ``Results.upgrade_and_extend(results)`` Any result documents that also appear in 'results' are moved to the top of the list of result documents. Then any other documents in 'results' are added on to the list of result documents. Whoosh-2.5.7/docs/source/spelling.rst0000644000076500000240000001161112254366350017671 0ustar mattstaff00000000000000===================================================== "Did you mean... ?" Correcting errors in user queries ===================================================== .. note:: In Whoosh 1.9 the old spelling system based on a separate N-gram index was replaced with this significantly more convenient and powerful implementation. Overview ======== Whoosh can quickly suggest replacements for mis-typed words by returning a list of words from the index (or a dictionary) that are close to the mis-typed word:: with ix.searcher() as s: corrector = s.corrector("text") for mistyped_word in mistyped_words: print corrector.suggest(mistyped_word, limit=3) See the :meth:`whoosh.spelling.Corrector.suggest` method documentation for information on the arguments. Currently the suggestion engine is more like a "typo corrector" than a real "spell checker" since it doesn't do the kind of sophisticated phonetic matching or semantic/contextual analysis a good spell checker might. However, it is still very useful. There are two main strategies for correcting words: * Use the terms from an index field. * Use words from a word list file. Pulling suggestions from an indexed field ========================================= To enable spell checking on the contents of a field, use the ``spelling=True`` keyword argument on the field in the schema definition:: schema = Schema(text=TEXT(spelling=True)) (If you have an existing index you want to enable spelling for, you can alter the schema in-place using the :func:`whoosh.writing.add_spelling` function to create the missing word graph files.) .. tip:: You can get suggestions for fields without the ``spelling`` attribute, but calculating the suggestions will be slower. You can then use the :meth:`whoosh.searching.Searcher.corrector` method to get a corrector for a field:: corrector = searcher.corrector("content") The advantage of using the contents of an index field is that when you are spell checking queries on that index, the suggestions are tailored to the contents of the index. 
The disadvantage is that if the indexed documents contain spelling errors, then the spelling suggestions will also be erroneous. Pulling suggestions from a word list ==================================== There are plenty of word lists available on the internet you can use to populate the spelling dictionary. (In the following examples, ``word_list`` can be a list of unicode strings, or a file object with one word on each line.) To create a :class:`whoosh.spelling.Corrector` object from a word list:: from whoosh.spelling import GraphCorrector corrector = GraphCorrector.from_word_list(word_list) Creating a corrector directly from a word list can be slow for large word lists, so you can save a corrector's graph to a more efficient on-disk form like this:: graphfile = myindex.storage.create_file("words.graph") # to_file() automatically closes the file when it's finished corrector.to_file(graphfile) To open the graph file again very quickly:: graphfile = myindex.storage.open_file("words.graph") corrector = GraphCorrector.from_graph_file(graphfile) Merging two or more correctors ============================== You can combine suggestions from two sources (for example, the contents of an index field and a word list) using a :class:`whoosh.spelling.MultiCorrector`:: c1 = searcher.corrector("content") c2 = GraphCorrector.from_graph_file(wordfile) corrector = MultiCorrector([c1, c2]) Correcting user queries ======================= You can spell-check a user query using the :meth:`whoosh.searching.Searcher.correct_query` method:: from whoosh import qparser # Parse the user query string qp = qparser.QueryParser("content", myindex.schema) q = qp.parse(qstring) # Try correcting the query with myindex.searcher() as s: corrected = s.correct_query(q, qstring) if corrected.query != q: print("Did you mean:", corrected.string) The ``correct_query`` method returns an object with the following attributes: ``query`` A corrected :class:`whoosh.query.Query` tree. You can test whether this is equal (``==``) to the original parsed query to check if the corrector actually changed anything. ``string`` A corrected version of the user's query string. ``tokens`` A list of corrected token objects representing the corrected terms. You can use this to reformat the user query (see below). You can use a :class:`whoosh.highlight.Formatter` object to format the corrected query string. For example, use the :class:`~whoosh.highlight.HtmlFormatter` to format the corrected string as HTML:: from whoosh import highlight hf = highlight.HtmlFormatter() corrected = s.correct_query(q, qstring, formatter=hf) See the documentation for :meth:`whoosh.searching.Searcher.correct_query` for information on the defaults and arguments. Whoosh-2.5.7/docs/source/stemming.rst0000644000076500000240000002013712254366350017702 0ustar mattstaff00000000000000======================================== Stemming, variations, and accent folding ======================================== The problem =========== The indexed text will often contain words in different form than the one the user searches for. For example, if the user searches for ``render``, we would like the search to match not only documents that contain the ``render``, but also ``renders``, ``rendering``, ``rendered``, etc. A related problem is one of accents. Names and loan words may contain accents in the original text but not in the user's query, or vice versa. For example, we want the user to be able to search for ``cafe`` and find documents containing ``café``. 
The default analyzer for the :class:`whoosh.fields.TEXT` field does not do stemming or accent folding. Stemming ======== Stemming is a heuristic process of removing suffixes (and sometimes prefixes) from words to arrive (hopefully, most of the time) at the base word. Whoosh includes several stemming algorithms such as Porter and Porter2, Paice Husk, and Lovins. :: >>> from whoosh.lang.porter import stem >>> stem("rendering") 'render' The stemming filter applies the stemming function to the terms it indexes, and to words in user queries. So in theory all variations of a root word ("render", "rendered", "renders", "rendering", etc.) are reduced to a single term in the index, saving space. And all possible variations users might use in a query are reduced to the root, so stemming enhances "recall". The :class:`whoosh.analysis.StemFilter` lets you add a stemming filter to an analyzer chain. :: >>> rext = RegexTokenizer() >>> stream = rext(u"fundamentally willows") >>> stemmer = StemFilter() >>> [token.text for token in stemmer(stream)] [u"fundament", u"willow"] The :func:`whoosh.analysis.StemmingAnalyzer` is a pre-packaged analyzer that combines a tokenizer, lower-case filter, optional stop filter, and stem filter:: from whoosh import fields from whoosh.analysis import StemmingAnalyzer stem_ana = StemmingAnalyzer() schema = fields.Schema(title=TEXT(analyzer=stem_ana, stored=True), content=TEXT(analyzer=stem_ana)) Stemming has pros and cons. * It allows the user to find documents without worrying about word forms. * It reduces the size of the index, since it reduces the number of separate terms indexed by "collapsing" multiple word forms into a single base word. * It's faster than using variations (see below) * The stemming algorithm can sometimes incorrectly conflate words or change the meaning of a word by removing suffixes. * The stemmed forms are often not proper words, so the terms in the field are not useful for things like creating a spelling dictionary. Variations ========== Whereas stemming encodes the words in the index in a base form, when you use variations you instead index words "as is" and *at query time* expand words in the user query using a heuristic algorithm to generate morphological variations of the word. :: >>> from whoosh.lang.morph_en import variations >>> variations("rendered") set(['rendered', 'rendernesses', 'render', 'renderless', 'rendering', 'renderness', 'renderes', 'renderer', 'renderements', 'rendereless', 'renderenesses', 'rendere', 'renderment', 'renderest', 'renderement', 'rendereful', 'renderers', 'renderful', 'renderings', 'renders', 'renderly', 'renderely', 'rendereness', 'renderments']) Many of the generated variations for a given word will not be valid words, but it's fairly fast for Whoosh to check which variations are actually in the index and only search for those. The :class:`whoosh.query.Variations` query object lets you search for variations of a word. Whereas the normal :class:`whoosh.query.Term` object only searches for the given term, the ``Variations`` query acts like an ``Or`` query for the variations of the given word in the index. 
For example, the query:: query.Variations("content", "rendered") ...might act like this (depending on what words are in the index):: query.Or([query.Term("content", "render"), query.Term("content", "rendered"), query.Term("content", "renders"), query.Term("content", "rendering")]) To have the query parser use :class:`whoosh.query.Variations` instead of :class:`whoosh.query.Term` for individual terms, use the ``termclass`` keyword argument to the parser initialization method:: from whoosh import qparser, query qp = qparser.QueryParser("content", termclass=query.Variations) Variations has pros and cons. * It allows the user to find documents without worrying about word forms. * The terms in the field are actual words, not stems, so you can use the field's contents for other purposes such as spell checking queries. * It increases the size of the index relative to stemming, because different word forms are indexed separately. * It acts like an ``Or`` search for all the variations, which is slower than searching for a single term. Lemmatization ============= Whereas stemming is a somewhat "brute force", mechanical attempt at reducing words to their base form using simple rules, lemmatization usually refers to more sophisticated methods of finding the base form ("lemma") of a word using language models, often involving analysis of the surrounding context and part-of-speech tagging. Whoosh does not include any lemmatization functions, but if you have separate lemmatizing code you could write a custom :class:`whoosh.analysis.Filter` to integrate it into a Whoosh analyzer. Character folding ================= You can set up an analyzer to treat, for example, ``á``, ``a``, ``å``, and ``â`` as equivalent to improve recall. This is often very useful, allowing the user to, for example, type ``cafe`` or ``resume`` and find documents containing ``café`` and ``resumé``. Character folding is especially useful for unicode characters that may appear in Asian language texts that should be treated as equivalent to their ASCII equivalent, such as "half-width" characters. Character folding is not always a panacea. See this article for caveats on where accent folding can break down. http://www.alistapart.com/articles/accent-folding-for-auto-complete/ Whoosh includes several mechanisms for adding character folding to an analyzer. The :class:`whoosh.analysis.CharsetFilter` applies a character map to token text. For example, it will filter the tokens ``u'café', u'resumé', ...`` to ``u'cafe', u'resume', ...``. This is usually the method you'll want to use unless you need to use a charset to tokenize terms:: from whoosh.analysis import CharsetFilter, StemmingAnalyzer from whoosh import fields from whoosh.support.charset import accent_map # For example, to add an accent-folding filter to a stemming analyzer: my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) # To use this analyzer in your schema: my_schema = fields.Schema(content=fields.TEXT(analyzer=my_analyzer)) The :class:`whoosh.analysis.CharsetTokenizer` uses a Sphinx charset table to both separate terms and perform character folding. This tokenizer is slower than the :class:`whoosh.analysis.RegexTokenizer` because it loops over each character in Python. If the language(s) you're indexing can be tokenized using regular expressions, it will be much faster to use ``RegexTokenizer`` and ``CharsetFilter`` in combination instead of using ``CharsetTokenizer``. 
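For a quick sanity check of the folding behaviour, you can run such an
analyzer directly and inspect the token text it produces. The following is a
minimal sketch using the accent-folded analyzer chain shown in the
``CharsetFilter`` example above::

    from whoosh.analysis import CharsetFilter, StemmingAnalyzer
    from whoosh.support.charset import accent_map

    my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)

    # Both spellings fold to the same token text ("cafe"), which is why a
    # query for "cafe" can match documents containing "café" when the same
    # analyzer is used at index time and query time
    print([t.text for t in my_analyzer(u"café")])
    print([t.text for t in my_analyzer(u"cafe")])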
The :mod:`whoosh.support.charset` module contains an accent folding map useful for most Western languages, as well as a much more extensive Sphinx charset table and a function to convert Sphinx charset tables into the character maps required by ``CharsetTokenizer`` and ``CharsetFilter``:: # To create a filter using an enourmous character map for most languages # generated from a Sphinx charset table from whoosh.analysis import CharsetFilter from whoosh.support.charset import default_charset, charset_table_to_dict charmap = charset_table_to_dict(default_charset) my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap) (The Sphinx charset table format is described at http://www.sphinxsearch.com/docs/current.html#conf-charset-table ) Whoosh-2.5.7/docs/source/tech/0000755000076500000240000000000012277504634016251 5ustar mattstaff00000000000000Whoosh-2.5.7/docs/source/tech/backend.rst0000644000076500000240000001110712254366350020366 0ustar mattstaff00000000000000============================== How to implement a new backend ============================== Index ===== * Subclass :class:`whoosh.index.Index`. * Indexes must implement the following methods. * :meth:`whoosh.index.Index.is_empty` * :meth:`whoosh.index.Index.doc_count` * :meth:`whoosh.index.Index.reader` * :meth:`whoosh.index.Index.writer` * Indexes that require/support locking must implement the following methods. * :meth:`whoosh.index.Index.lock` * :meth:`whoosh.index.Index.unlock` * Indexes that support deletion must implement the following methods. * :meth:`whoosh.index.Index.delete_document` * :meth:`whoosh.index.Index.doc_count_all` -- if the backend has delayed deletion. * Indexes that require/support versioning/transactions *may* implement the following methods. * :meth:`whoosh.index.Index.latest_generation` * :meth:`whoosh.index.Index.up_to_date` * :meth:`whoosh.index.Index.last_modified` * Index *may* implement the following methods (the base class's versions are no-ops). * :meth:`whoosh.index.Index.optimize` * :meth:`whoosh.index.Index.close` IndexWriter =========== * Subclass :class:`whoosh.writing.IndexWriter`. * IndexWriters must implement the following methods. * :meth:`whoosh.writing.IndexWriter.add_document` * :meth:`whoosh.writing.IndexWriter.add_reader` * Backends that support deletion must implement the following methods. * :meth:`whoosh.writing.IndexWriter.delete_document` * IndexWriters that work as transactions must implement the following methods. * :meth:`whoosh.reading.IndexWriter.commit` -- Save the additions/deletions done with this IndexWriter to the main index, and release any resources used by the IndexWriter. * :meth:`whoosh.reading.IndexWriter.cancel` -- Throw away any additions/deletions done with this IndexWriter, and release any resources used by the IndexWriter. IndexReader =========== * Subclass :class:`whoosh.reading.IndexReader`. * IndexReaders must implement the following methods. 
* :meth:`whoosh.reading.IndexReader.__contains__` * :meth:`whoosh.reading.IndexReader.__iter__` * :meth:`whoosh.reading.IndexReader.iter_from` * :meth:`whoosh.reading.IndexReader.stored_fields` * :meth:`whoosh.reading.IndexReader.doc_count_all` * :meth:`whoosh.reading.IndexReader.doc_count` * :meth:`whoosh.reading.IndexReader.doc_field_length` * :meth:`whoosh.reading.IndexReader.field_length` * :meth:`whoosh.reading.IndexReader.max_field_length` * :meth:`whoosh.reading.IndexReader.postings` * :meth:`whoosh.reading.IndexReader.has_vector` * :meth:`whoosh.reading.IndexReader.vector` * :meth:`whoosh.reading.IndexReader.doc_frequency` * :meth:`whoosh.reading.IndexReader.frequency` * Backends that support deleting documents should implement the following methods. * :meth:`whoosh.reading.IndexReader.has_deletions` * :meth:`whoosh.reading.IndexReader.is_deleted` * Backends that support versioning should implement the following methods. * :meth:`whoosh.reading.IndexReader.generation` * If the IndexReader object does not keep the schema in the ``self.schema`` attribute, it needs to override the following methods. * :meth:`whoosh.reading.IndexReader.field` * :meth:`whoosh.reading.IndexReader.field_names` * :meth:`whoosh.reading.IndexReader.scorable_names` * :meth:`whoosh.reading.IndexReader.vector_names` * IndexReaders *may* implement the following methods. * :meth:`whoosh.reading.DocReader.close` -- closes any open resources associated with the reader. Matcher ======= The :meth:`whoosh.reading.IndexReader.postings` method returns a :class:`whoosh.matching.Matcher` object. You will probably need to implement a custom Matcher class for reading from your posting lists. * Subclass :class:`whoosh.matching.Matcher`. * Implement the following methods at minimum. * :meth:`whoosh.matching.Matcher.is_active` * :meth:`whoosh.matching.Matcher.copy` * :meth:`whoosh.matching.Matcher.id` * :meth:`whoosh.matching.Matcher.next` * :meth:`whoosh.matching.Matcher.value` * :meth:`whoosh.matching.Matcher.value_as` * :meth:`whoosh.matching.Matcher.score` * Depending on the implementation, you *may* implement the following methods more efficiently. * :meth:`whoosh.matching.Matcher.skip_to` * :meth:`whoosh.matching.Matcher.weight` * If the implementation supports quality, you should implement the following methods. * :meth:`whoosh.matching.Matcher.supports_quality` * :meth:`whoosh.matching.Matcher.quality` * :meth:`whoosh.matching.Matcher.block_quality` * :meth:`whoosh.matching.Matcher.skip_to_quality` Whoosh-2.5.7/docs/source/tech/filedb.rst0000644000076500000240000000267612254366350020237 0ustar mattstaff00000000000000============ filedb notes ============ TBD. Files created ============= .toc The "master" file containing information about the index and its segments. The index directory will contain a set of files for each segment. A segment is like a mini-index -- when you add documents to the index, whoosh creates a new segment and then searches the old segment(s) and the new segment to avoid having to do a big merge every time you add a document. When you get enough small segments whoosh will merge them into larger segments or a single segment. .dci Contains per-document information (e.g. field lengths). This will grow linearly with the number of documents. .dcz Contains the stored fields for each document. .tiz Contains per-term information. The size of file will vary based on the number of unique terms. .pst Contains per-term postings. 
The size of this file depends on the size of the collection and the formats
used for each field (e.g. storing term positions takes more space than
storing frequency only).

.fvz

    Contains term vectors (forward indexes) for each document. This file is
    only created if at least one field in the schema stores term vectors.
    The size will vary based on the number of documents, field length, the
    formats used for each vector (e.g. storing term positions takes more
    space than storing frequency only), etc.

Whoosh-2.5.7/docs/source/tech/index.rst0000644000076500000240000000014112254366350020102 0ustar mattstaff00000000000000
===============
Technical notes
===============

.. toctree::
    :glob:
    :maxdepth: 2

    *

Whoosh-2.5.7/docs/source/threads.rst0000644000076500000240000000531512254366350017512 0ustar mattstaff00000000000000
====================================
Concurrency, locking, and versioning
====================================

Concurrency
===========

The ``FileIndex`` object is "stateless" and should be share-able between
threads.

A ``Reader`` object (which underlies the ``Searcher`` object) wraps open
files and often individual methods rely on consistent file cursor positions
(e.g. they do two ``file.read()``\ s in a row, so if another thread moves the
cursor between the two read calls Bad Things would happen). You should use
one Reader/Searcher per thread in your code.

Readers/Searchers tend to cache information (such as field caches for
sorting), so if you can share one across multiple search requests, it's a big
performance win.


Locking
=======

Only one thread/process can write to an index at a time. When you open a
writer, it locks the index. If you try to open a writer on the same index in
another thread/process, it will raise ``whoosh.store.LockError``.

In a multi-threaded or multi-process environment your code needs to be aware
that opening a writer may raise this exception if a writer is already open.
Whoosh includes a couple of example implementations
(:class:`whoosh.writing.AsyncWriter` and :class:`whoosh.writing.BufferedWriter`)
of ways to work around the write lock.

While the writer is open and during the commit, **the index is still
available for reading**. Existing readers are unaffected and new readers can
open the current index normally.


Lock files
----------

Locking the index is accomplished by acquiring an exclusive file lock on the
``_WRITELOCK`` file in the index directory. The file is not deleted after the
file lock is released, so the fact that the file exists **does not** mean the
index is locked.


Versioning
==========

When you open a reader/searcher, the reader represents a view of the
**current version** of the index. If someone writes changes to the index, any
readers that are already open **will not** pick up the changes automatically.
A reader always sees the index as it existed when the reader was opened.

If you are re-using a Searcher across multiple search requests, you can check
whether the Searcher is a view of the latest version of the index using
:meth:`whoosh.searching.Searcher.up_to_date`. If the searcher is not up to
date, you can get an up-to-date copy of the searcher using
:meth:`whoosh.searching.Searcher.refresh`::

    # If 'searcher' is not up-to-date, replace it
    searcher = searcher.refresh()

(If the searcher has the latest version of the index, ``refresh()`` simply
returns it.)

Calling ``Searcher.refresh()`` is more efficient than closing the searcher
and opening a new one, since it will re-use any underlying readers and caches
that haven't changed.
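For example, a single-threaded application that re-uses one searcher across
search requests might refresh it before each search. The following is only a
sketch; the ``search_request`` function, the ``"indexdir"`` location, and the
``"content"`` field are hypothetical::

    from whoosh import index
    from whoosh.qparser import QueryParser

    ix = index.open_dir("indexdir")  # hypothetical index directory
    searcher = ix.searcher()
    parser = QueryParser("content", ix.schema)

    def search_request(querystring):
        global searcher
        # Swap in an up-to-date view if the index has changed; refresh()
        # returns the same object if the searcher is already current
        searcher = searcher.refresh()
        return searcher.search(parser.parse(querystring))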
Whoosh-2.5.7/files/0000755000076500000240000000000012277504634014200 5ustar mattstaff00000000000000Whoosh-2.5.7/files/whoosh.svg0000644000076500000240000005160112254366350016227 0ustar mattstaff00000000000000 image/svg+xml Whoosh-2.5.7/files/whoosh_16.png0000644000076500000240000000161512254366350016522 0ustar mattstaff00000000000000PNG  IHDRasBIT|d pHYs B(xtEXtSoftwarewww.inkscape.org< IDAT8]HSqƟsvvN[\Y;tXM[+3㢠B Β/ )" Ûˢ"$jdJ˂ @q}9 ,޻\RLaPCTuXnG#yd$jŜ#:0L"1v2NtBs@P (߀btM#r`ȴodgի I$86xb (5R@fX,|p!#/x%-b횼2~l4藁js(r*l(((@83 S-Z`HOyh^! 7o#2D^^$ @F F!%O$yKKM=q}b~F?@$$nc8d4"EH E2OwMt8tugʒDS&C0!عsji!IEeuWAq133SDwgn.< y8DCSJY1EY:Ǖn&aCؽNroz私Z,}^_yïn޼wnSm7V՝߇/,&=momm7+: e@ Ϛ4Vi֬Y~m{P3B.dAA\vs??/~4@V:;;ٷoT NG44pt-|, pAYxo8֮]s c@*f9(@2$vi7m8?CeS=ɏ'oƙ 痗_;lV](..fkYrK~ŋ),FII iX K˼V/0zL=a=1YLz QiiZ&xq,={޽;!*>HCCdhx;Es<~g-%F`ʆ4|ƫ;}H$a4ϧDR)^xErssi biU{fucSvi#z[~Ɣ_<YYMB  `00 ' dO( 0999~F#N씔%//ʁdPbm/vl0E~GN_. Ldhlll6c41(B2$J2jL&j @:p=T O d2Qԣx"}˦mA>($jNkX,lvLbdeeiK>Yj9ք68xqfG)*%i Gڧn߼8@hc0/ZlE~׋^i7ۧwbbb4fe3#Nk#Np8xh v57n\/CUqy  .єZAA'0ۅ{>}NQ7o٪~# vl6fFI*,X@[8^oٰ)fjthtȃ3gݹk6}CjMhyOy]s{ Պtva |'|Q/(Pf>HD;- i (jD"f>k^+4 4lѭ[Nzw.7 +bNy{a)l6DtZJu*K^Φri@MuU[ rkï5 Jܽ75CQVZŘgJl6u{^>l%ab25/F= NF3_g!|8"#D>/'b3+dذaj㶍+nj֭իWիeRSS#PHnжGJKT"3!>,{Nzse ?߉ cH/D{%_}g}ޭCSٌNd2aQ :Ms^/ߓÆ{6løܯOL |ъ$:D S.lLڢ\2Dʐ2L4Q~ڵK/~_L2Y6_Zp RjjtU8~ql!${;E*rmJHַ}ETg*q],RWW'G`0(TJ>Ywynp>wPPڂωL1-M/5y|_4Hw<뤄.U~x?Mm۾]B ÒNe2馱MA wH4lJCUi[ٝ ;AVOu! g` .^6<`^w߿lt2w+WO~m P1Yeu ף CٗٗcR.GQvG(JdjH)4-!EA@M8_kknzW`DE`9B8^# n{cQv˽BN3$!>6oLyy9t Á(^/DUU!VRK 0`蝹sQS3KBQSY2Vt2BYc6֭~|>ai`UUQZG0X,@/րXf_;6jI7 {ʒ;t53CelٲMHLLk׮c4 0 .]/l6:^e΢ybpyoH:a鞕) ׯUUի999B!V+p2ك(Baa!=z 66)%a`24 `dwܡV )B6|"y!!%F `РA">l1gde MXWTVVҥK~IFg׮̛7]vD C۷/cZuχK,0'1t=M{ ĭ|ױcm!E]n6[:ge M믩wެY&N?G@oΒg凡p ,^:](&2RImF]]p88p ˗[o͂gϞ=$&&0gΣ MRYY\0n\?") |h.X0=d1pڒ7dpeZR"|#1gӧO ͛7eV{}Ea18v[G墢 9&`0EWCrP )Ov:ػw/ݽ{w@yyUUUlܸ׋ @AϞ=GZZQ^VNO릏RNz>?e鼽0^50"5b&j@ jSH+ RVVFYYV$:ub 6ƍQu'99  dddP]]Mxm>\}\ >DI]]X,zEJJ 26Lii)BѣG1Lf<]vmwdSNcȐaX, FCc#I)Y@ }.2jOHu~>➻TTTzxkD___gVc1pڬl UU)++UU9=! 0!{B:t(QxH7 cp\'(**8\GUO@;7n'NQ]]l>.pW|9P5fv󵵵v!vKEEMMMkXPU5!wXCM\cǗ-p |zB 7(/.KEEfڷcM[لP*++QUvfr @LLL vp8]ZCn޽{9P\&*7mj [#eKQҎ#@Q\.qKdm-++kbn ڑo`/QLcc#pIJJ 477G5ൂz;xMֲINN&33f*#i i:ǦoMyt;s`L;6џ \."ұV+fsWoͺ-\! )%-%%:~ 74r0lQidffH]C#5Ѝo |a !p^'wjMݙr(ByyryD]]^yILLLkwO8lں݆夲ł NQWWlMNnp\TQ[a!N,=: R2[Հ0.R9nXΊ.p~5 AdffY'>!P(5TB &ݎ DRWqEYnf%@gbh0- G puuS2|z4 ٌn'-- EQxs륗H]׍oU!z.}zرk7EtNOnz ]1(|X(.)[oQjNGt].iii ($Q͛xݎUr(2!o_p&EuscW_qL`?-=}И7伂}ޟv͞`0HMrR">łlzqѲ (*-Őe'YUu4\.^&/PTW,x0~qtA9h߱C|FNNp݄9?stp@A4 PHdHNkJ@LIY9X ,s԰l 6+6Gm@̌ oHUEȕD)YJRʫEYgif<0{\vSFR ;7öCӠfJgH$YķlLl6v{jbXX,4{}ѭ[7^{XVz9TUNrr2P|9`S ;BXB@!KÖM!)劢ٺm;/ Z=6c_CgCajзwc f_QU"F,#G`x<tBII),zw.钕E.BA, )Elu'I1_ V<.a˦[6y"'\'kݻ3|\udg]k# aq:x}>>|%vw|tqddd( ӦOfa`v?v+f C||<$$$ja:8s Cʩ,Umq lؼ4>Zo>\>J@ϼCRx' x?;p};-㶛obDf2Nlv<9y/Zm:D\..X"!!W_C}]ƟGa:QPQV ӀW)4ÄGK$:R0^&U7-ZcΝ>3z(bȑxv,ػ78~?W| iiitڵCP[WC]}! } ^ <9p% G`O?1*ၫ __ʖ/ m~'Nd_a[oY+w珩v-ۿf3zt:ll{ C?g'n&| 7<"mN1EsUZ/@^_GVog՜3n<,RamSSIlj%&&!?pˑxI0x]p$Sxd! #Y_a6)sr0١|B\kMiYolnn>LȔSH gHH9)ݗȩ^(MQݻw۷|sNYPP /%%%K{._%ܣ}"ݤ<[I)oOceKQfY7(ߺ3u46m#znt6oqݗl/P4iZdd2(<,Rf&q,,k .̘p5(;$Ns/[̀?*UK~p%j].9 _ȇCҎcbJ7Iڑ[,>\gG5ǜ<xΟkB=qS`l %^|A)g^^^/asͻb䘱kK)cC^ה+ߌ/KdCC,,,Ų\VUUɦ&i,*.9YG5HrRARJwI)'[^iUr{CR>6]!AJÐRaRΐV#X}q9y09xHw>zeqq,..eeeJJ'5Mur20. 
_RTjZrRR0FJ)彗K9$URνxR 0V>)[m #ƞ#~_VUUEȆ벦F{rѲݗ{[@MRwWI)_,XΉ>&~0zB'>Y,7fVcARSR|]MLL9h5xV!5\qTUW]t.E+3TZ H@ׯcB19IT1[o/4 ~KILLcioXK;ww@j@N]uoPZTy yYϸ@ʥ/KPQACJ)WH)=ٹ'se.pXB!Ysٹyr۳s2M>Y# C=%e].tNmIXGQu?CF fݙK)QG47<؂󈤏,.p"ۿ^/))mRsNW)e$V60 i%Aqa+OOn.uY- !IENDB`Whoosh-2.5.7/files/whoosh_small.svg0000644000076500000240000006051012254366350017416 0ustar mattstaff00000000000000 image/svg+xml Whoosh-2.5.7/LICENSE.txt0000644000076500000240000000271212254366350014717 0ustar mattstaff00000000000000Copyright 2011 Matt Chaput. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. The views and conclusions contained in the software and documentation are those of the authors and should not be interpreted as representing official policies, either expressed or implied, of Matt Chaput. Whoosh-2.5.7/MANIFEST.in0000644000076500000240000000037512254366350014635 0ustar mattstaff00000000000000include *.txt include benchmark/dcvgr10.txt.gz include benchmark/reuters21578.txt.gz recursive-include tests *.txt *.py recursive-include benchmark *.txt *.py recursive-include docs *.txt *.py *.rst recursive-include files *.txt *.py *.png *.jpg *.svg Whoosh-2.5.7/PKG-INFO0000644000076500000240000000703512277504634014200 0ustar mattstaff00000000000000Metadata-Version: 1.1 Name: Whoosh Version: 2.5.7 Summary: Fast, pure-Python full text indexing, search, and spell checking library. Home-page: http://bitbucket.org/mchaput/whoosh Author: Matt Chaput Author-email: matt@whoosh.ca License: Two-clause BSD license Description: About Whoosh ============ Whoosh is a fast, featureful full-text indexing and searching library implemented in pure Python. Programmers can use it to easily add search functionality to their applications and websites. Every part of how Whoosh works can be extended or replaced to meet your needs exactly. Some of Whoosh's features include: * Pythonic API. * Pure-Python. No compilation or binary packages needed, no mysterious crashes. * Fielded indexing and search. * Fast indexing and retrieval -- faster than any other pure-Python, scoring, full-text search solution I know of. * Pluggable scoring algorithm (including BM25F), text analysis, storage, posting format, etc. * Powerful query language. * Pure Python spell-checker (as far as I know, the only one). 
Whoosh might be useful in the following circumstances: * Anywhere a pure-Python solution is desirable to avoid having to build/compile native libraries (or force users to build/compile them). * As a research platform (at least for programmers that find Python easier to read and work with than Java ;) * When an easy-to-use Pythonic interface is more important to you than raw speed. Whoosh was created and is maintained by Matt Chaput. It was originally created for use in the online help system of Side Effects Software's 3D animation software Houdini. Side Effects Software Inc. graciously agreed to open-source the code. This software is licensed under the terms of the simplified BSD (A.K.A. "two clause" or "FreeBSD") license. See LICENSE.txt for information. Installing Whoosh ================= If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install`` or ``pip`` to download and install Whoosh automatically:: $ easy_install Whoosh or $ pip install Whoosh Learning more ============= * Read the online documentation at http://packages.python.org/Whoosh/ * Join the Whoosh mailing list at http://groups.google.com/group/whoosh * File bug reports and view the Whoosh wiki at http://bitbucket.org/mchaput/whoosh/ Getting the source ================== Download source releases from PyPI at http://pypi.python.org/pypi/Whoosh/ You can check out the latest version of the source code using Mercurial:: hg clone http://bitbucket.org/mchaput/whoosh Keywords: index search text spell Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: BSD License Classifier: Natural Language :: English Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python :: 2.5 Classifier: Programming Language :: Python :: 3 Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Topic :: Text Processing :: Indexing Whoosh-2.5.7/README.txt0000644000076500000240000000435412254366350014576 0ustar mattstaff00000000000000About Whoosh ============ Whoosh is a fast, featureful full-text indexing and searching library implemented in pure Python. Programmers can use it to easily add search functionality to their applications and websites. Every part of how Whoosh works can be extended or replaced to meet your needs exactly. Some of Whoosh's features include: * Pythonic API. * Pure-Python. No compilation or binary packages needed, no mysterious crashes. * Fielded indexing and search. * Fast indexing and retrieval -- faster than any other pure-Python, scoring, full-text search solution I know of. * Pluggable scoring algorithm (including BM25F), text analysis, storage, posting format, etc. * Powerful query language. * Pure Python spell-checker (as far as I know, the only one). Whoosh might be useful in the following circumstances: * Anywhere a pure-Python solution is desirable to avoid having to build/compile native libraries (or force users to build/compile them). * As a research platform (at least for programmers that find Python easier to read and work with than Java ;) * When an easy-to-use Pythonic interface is more important to you than raw speed. Whoosh was created and is maintained by Matt Chaput. It was originally created for use in the online help system of Side Effects Software's 3D animation software Houdini. Side Effects Software Inc. graciously agreed to open-source the code. This software is licensed under the terms of the simplified BSD (A.K.A. 
"two clause" or "FreeBSD") license. See LICENSE.txt for information. Installing Whoosh ================= If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install`` or ``pip`` to download and install Whoosh automatically:: $ easy_install Whoosh or $ pip install Whoosh Learning more ============= * Read the online documentation at http://packages.python.org/Whoosh/ * Join the Whoosh mailing list at http://groups.google.com/group/whoosh * File bug reports and view the Whoosh wiki at http://bitbucket.org/mchaput/whoosh/ Getting the source ================== Download source releases from PyPI at http://pypi.python.org/pypi/Whoosh/ You can check out the latest version of the source code using Mercurial:: hg clone http://bitbucket.org/mchaput/whoosh Whoosh-2.5.7/setup.cfg0000644000076500000240000000213712277504634014722 0ustar mattstaff00000000000000[wheel] universal = 1 [build_sphinx] build-dir = docs/build source-dir = docs/source [upload_sphinx] upload-dir = docs/build/html [sdist] formats = zip,gztar [aliases] push = sdist bdist_wheel upload pushdocs = build_sphinx upload_sphinx [pytest] addopts = -rs --tb=native norecursedirs = .hg .tox _build tmp* env* benchmark stress minversion = 2.0 python_files = test_*.py pep8ignore = *.py E121 E122 E123 E124 E125 E126 E127 E128 # continuation line indentation *.py E401 # imports on separate lines *.py W391 # blank line at end of file test_*.py E501 # Ignore long lines in tests upload.py ALL # 3rd party (and not in the repo): rietveld upload tool docs/source/conf.py ALL # sphinx stuff, automatically generated, don't check this src/whoosh/lang/*.py ALL # 3rd party / crashing py.test with non-ascii stuff src/whoosh/lang/snowball/*.py ALL # 3rd party src/whoosh/support/relativedelta.py ALL # 3rd party src/whoosh/support/charset.py ALL # non-ascii py.test crash src/whoosh/support/unicode.py ALL # non-ascii py.test crash [egg_info] tag_build = tag_date = 0 tag_svn_revision = 0 Whoosh-2.5.7/setup.py0000644000076500000240000000324012254366350014603 0ustar mattstaff00000000000000#!python import os.path, sys from setuptools import setup, find_packages from setuptools.command.test import test as TestCommand try: import pytest except ImportError: pytest = None sys.path.insert(0, os.path.abspath("src")) from whoosh import __version__, versionstring class PyTest(TestCommand): def finalize_options(self): TestCommand.finalize_options(self) self.test_args = [] self.test_suite = True def run_tests(self): #import here, cause outside the eggs aren't loaded import pytest pytest.main(self.test_args) if __name__ == "__main__": setup( name="Whoosh", version=versionstring(), package_dir={'': 'src'}, packages=find_packages("src"), author="Matt Chaput", author_email="matt@whoosh.ca", description="Fast, pure-Python full text indexing, search, and spell checking library.", long_description=open("README.txt").read(), license="Two-clause BSD license", keywords="index search text spell", url="http://bitbucket.org/mchaput/whoosh", zip_safe=True, tests_require=['pytest'], cmdclass={'test': PyTest}, classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python :: 2.5", "Programming Language :: Python :: 3", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Text Processing :: Indexing", ], ) Whoosh-2.5.7/src/0000755000076500000240000000000012277504634013665 5ustar 
mattstaff00000000000000Whoosh-2.5.7/src/whoosh/0000755000076500000240000000000012277504634015174 5ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/__init__.py0000644000076500000240000000401312277504551017301 0ustar mattstaff00000000000000# Copyright 2008 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. __version__ = (2, 5, 7) def versionstring(build=True, extra=True): """Returns the version number of Whoosh as a string. :param build: Whether to include the build number in the string. :param extra: Whether to include alpha/beta/rc etc. tags. Only checked if build is True. :rtype: str """ if build: first = 3 else: first = 2 s = ".".join(str(n) for n in __version__[:first]) if build and extra: s += "".join(str(n) for n in __version__[3:]) return s Whoosh-2.5.7/src/whoosh/analysis/0000755000076500000240000000000012277504634017017 5ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/analysis/__init__.py0000644000076500000240000000633012254366350021126 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """Classes and functions for turning a piece of text into an indexable stream of "tokens" (usually equivalent to words). There are three general classes involved in analysis: * Tokenizers are always at the start of the text processing pipeline. They take a string and yield Token objects (actually, the same token object over and over, for performance reasons) corresponding to the tokens (words) in the text. Every tokenizer is a callable that takes a string and returns an iterator of tokens. * Filters take the tokens from the tokenizer and perform various transformations on them. For example, the LowercaseFilter converts all tokens to lowercase, which is usually necessary when indexing regular English text. Every filter is a callable that takes a token generator and returns a token generator. * Analyzers are convenience functions/classes that "package up" a tokenizer and zero or more filters into a single unit. For example, the StandardAnalyzer combines a RegexTokenizer, LowercaseFilter, and StopFilter. Every analyzer is a callable that takes a string and returns a token iterator. (So Tokenizers can be used as Analyzers if you don't need any filtering). You can compose tokenizers and filters together using the ``|`` character:: my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() The first item must be a tokenizer and the rest must be filters (you can't put a filter first or a tokenizer after the first item). """ from whoosh.analysis.acore import * from whoosh.analysis.tokenizers import * from whoosh.analysis.filters import * from whoosh.analysis.morph import * from whoosh.analysis.intraword import * from whoosh.analysis.ngrams import * from whoosh.analysis.analyzers import * Whoosh-2.5.7/src/whoosh/analysis/acore.py0000644000076500000240000001262512254366350020464 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.compat import iteritems # Exceptions class CompositionError(Exception): pass # Utility functions def unstopped(tokenstream): """Removes tokens from a token stream where token.stopped = True. """ return (t for t in tokenstream if not t.stopped) def entoken(textstream, positions=False, chars=False, start_pos=0, start_char=0, **kwargs): """Takes a sequence of unicode strings and yields a series of Token objects (actually the same Token object over and over, for performance reasons), with the attributes filled in with reasonable values (for example, if ``positions`` or ``chars`` is True, the function assumes each token was separated by one space). """ pos = start_pos char = start_char t = Token(positions=positions, chars=chars, **kwargs) for text in textstream: t.text = text if positions: t.pos = pos pos += 1 if chars: t.startchar = char char = char + len(text) t.endchar = char yield t # Token object class Token(object): """ Represents a "token" (usually a word) extracted from the source text being indexed. See "Advanced analysis" in the user guide for more information. Because object instantiation in Python is slow, tokenizers should create ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes each time. This trick means that consumers of tokens (i.e. filters) must never try to hold onto the token object between loop iterations, or convert the token generator into a list. Instead, save the attributes between iterations, not the object:: def RemoveDuplicatesFilter(self, stream): # Removes duplicate words. lasttext = None for token in stream: # Only yield the token if its text doesn't # match the previous token. if lasttext != token.text: yield token lasttext = token.text ...or, call token.copy() to get a copy of the token object. """ def __init__(self, positions=False, chars=False, removestops=True, mode='', **kwargs): """ :param positions: Whether tokens should have the token position in the 'pos' attribute. :param chars: Whether tokens should have character offsets in the 'startchar' and 'endchar' attributes. :param removestops: whether to remove stop words from the stream (if the tokens pass through a stop filter). :param mode: contains a string describing the purpose for which the analyzer is being called, i.e. 'index' or 'query'. 
""" self.positions = positions self.chars = chars self.stopped = False self.boost = 1.0 self.removestops = removestops self.mode = mode self.__dict__.update(kwargs) def __repr__(self): parms = ", ".join("%s=%r" % (name, value) for name, value in iteritems(self.__dict__)) return "%s(%s)" % (self.__class__.__name__, parms) def copy(self): # This is faster than using the copy module return Token(**self.__dict__) # Composition support class Composable(object): is_morph = False def __or__(self, other): from whoosh.analysis.analyzers import CompositeAnalyzer if not isinstance(other, Composable): raise TypeError("%r is not composable with %r" % (self, other)) return CompositeAnalyzer(self, other) def __repr__(self): attrs = "" if self.__dict__: attrs = ", ".join("%s=%r" % (key, value) for key, value in iteritems(self.__dict__)) return self.__class__.__name__ + "(%s)" % attrs def has_morph(self): return self.is_morph Whoosh-2.5.7/src/whoosh/analysis/analyzers.py0000644000076500000240000002601512254366350021401 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.analysis.acore import Composable, CompositionError from whoosh.analysis.tokenizers import Tokenizer from whoosh.analysis.filters import LowercaseFilter from whoosh.analysis.filters import StopFilter, STOP_WORDS from whoosh.analysis.morph import StemFilter from whoosh.analysis.intraword import IntraWordFilter from whoosh.analysis.tokenizers import default_pattern from whoosh.analysis.tokenizers import CommaSeparatedTokenizer from whoosh.analysis.tokenizers import IDTokenizer from whoosh.analysis.tokenizers import RegexTokenizer from whoosh.analysis.tokenizers import SpaceSeparatedTokenizer from whoosh.lang.porter import stem # Analyzers class Analyzer(Composable): """ Abstract base class for analyzers. 
""" def __repr__(self): return "%s()" % self.__class__.__name__ def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.__dict__ == other.__dict__) def __call__(self, value, **kwargs): raise NotImplementedError def clean(self): pass class CompositeAnalyzer(Analyzer): def __init__(self, *composables): self.items = [] for comp in composables: if isinstance(comp, CompositeAnalyzer): self.items.extend(comp.items) else: self.items.append(comp) # Tokenizers must start a chain, and then only filters after that # (because analyzers take a string and return a generator of tokens, # and filters take and return generators of tokens) for item in self.items[1:]: if isinstance(item, Tokenizer): raise CompositionError("Only one tokenizer allowed at the start" " of the analyzer: %r" % self.items) def __repr__(self): return "%s(%s)" % (self.__class__.__name__, ", ".join(repr(item) for item in self.items)) def __call__(self, value, no_morph=False, **kwargs): items = self.items # Start with tokenizer gen = items[0](value, **kwargs) # Run filters for item in items[1:]: if not (no_morph and hasattr(item, "is_morph") and item.is_morph): gen = item(gen) return gen def __getitem__(self, item): return self.items.__getitem__(item) def __len__(self): return len(self.items) def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.items == other.items) def clean(self): for item in self.items: if hasattr(item, "clean"): item.clean() def has_morph(self): return any(item.is_morph for item in self.items) # Functions that return composed analyzers def IDAnalyzer(lowercase=False): """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if desired. """ tokenizer = IDTokenizer() if lowercase: tokenizer = tokenizer | LowercaseFilter() return tokenizer def KeywordAnalyzer(lowercase=False, commas=False): """Parses whitespace- or comma-separated tokens. >>> ana = KeywordAnalyzer() >>> [token.text for token in ana("Hello there, this is a TEST")] ["Hello", "there,", "this", "is", "a", "TEST"] :param lowercase: whether to lowercase the tokens. :param commas: if True, items are separated by commas rather than whitespace. """ if commas: tokenizer = CommaSeparatedTokenizer() else: tokenizer = SpaceSeparatedTokenizer() if lowercase: tokenizer = tokenizer | LowercaseFilter() return tokenizer def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False): """Deprecated, just use a RegexTokenizer directly. """ return RegexTokenizer(expression=expression, gaps=gaps) def SimpleAnalyzer(expression=default_pattern, gaps=False): """Composes a RegexTokenizer with a LowercaseFilter. >>> ana = SimpleAnalyzer() >>> [token.text for token in ana("Hello there, this is a TEST")] ["hello", "there", "this", "is", "a", "test"] :param expression: The regular expression pattern to use to extract tokens. :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression. """ return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter() def StandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS, minsize=2, maxsize=None, gaps=False): """Composes a RegexTokenizer with a LowercaseFilter and optional StopFilter. >>> ana = StandardAnalyzer() >>> [token.text for token in ana("Testing is testing and testing")] ["testing", "testing", "testing"] :param expression: The regular expression pattern to use to extract tokens. :param stoplist: A list of stop words. Set this to None to disable the stop word filter. 
:param minsize: Words smaller than this are removed from the stream. :param maxsize: Words longer that this are removed from the stream. :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression. """ ret = RegexTokenizer(expression=expression, gaps=gaps) chain = ret | LowercaseFilter() if stoplist is not None: chain = chain | StopFilter(stoplist=stoplist, minsize=minsize, maxsize=maxsize) return chain def StemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS, minsize=2, maxsize=None, gaps=False, stemfn=stem, ignore=None, cachesize=50000): """Composes a RegexTokenizer with a lower case filter, an optional stop filter, and a stemming filter. >>> ana = StemmingAnalyzer() >>> [token.text for token in ana("Testing is testing and testing")] ["test", "test", "test"] :param expression: The regular expression pattern to use to extract tokens. :param stoplist: A list of stop words. Set this to None to disable the stop word filter. :param minsize: Words smaller than this are removed from the stream. :param maxsize: Words longer that this are removed from the stream. :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression. :param ignore: a set of words to not stem. :param cachesize: the maximum number of stemmed words to cache. The larger this number, the faster stemming will be but the more memory it will use. Use None for no cache, or -1 for an unbounded cache. """ ret = RegexTokenizer(expression=expression, gaps=gaps) chain = ret | LowercaseFilter() if stoplist is not None: chain = chain | StopFilter(stoplist=stoplist, minsize=minsize, maxsize=maxsize) return chain | StemFilter(stemfn=stemfn, ignore=ignore, cachesize=cachesize) def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2, maxsize=None, gaps=True, splitwords=True, splitnums=True, mergewords=False, mergenums=False): """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and StopFilter. >>> ana = FancyAnalyzer() >>> [token.text for token in ana("Should I call getInt or get_real?")] ["should", "call", "getInt", "get", "int", "get_real", "get", "real"] :param expression: The regular expression pattern to use to extract tokens. :param stoplist: A list of stop words. Set this to None to disable the stop word filter. :param minsize: Words smaller than this are removed from the stream. :param maxsize: Words longer that this are removed from the stream. :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression. """ return (RegexTokenizer(expression=expression, gaps=gaps) | IntraWordFilter(splitwords=splitwords, splitnums=splitnums, mergewords=mergewords, mergenums=mergenums) | LowercaseFilter() | StopFilter(stoplist=stoplist, minsize=minsize) ) def LanguageAnalyzer(lang, expression=default_pattern, gaps=False, cachesize=50000): """Configures a simple analyzer for the given language, with a LowercaseFilter, StopFilter, and StemFilter. >>> ana = LanguageAnalyzer("es") >>> [token.text for token in ana("Por el mar corren las liebres")] ['mar', 'corr', 'liebr'] The list of available languages is in `whoosh.lang.languages`. You can use :func:`whoosh.lang.has_stemmer` and :func:`whoosh.lang.has_stopwords` to check if a given language has a stemming function and/or stop word list available. :param expression: The regular expression pattern to use to extract tokens. :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression. 
:param cachesize: the maximum number of stemmed words to cache. The larger this number, the faster stemming will be but the more memory it will use. """ from whoosh.lang import NoStemmer, NoStopWords # Make the start of the chain chain = (RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()) # Add a stop word filter try: chain = chain | StopFilter(lang=lang) except NoStopWords: pass # Add a stemming filter try: chain = chain | StemFilter(lang=lang, cachesize=cachesize) except NoStemmer: pass return chain Whoosh-2.5.7/src/whoosh/analysis/filters.py0000644000076500000240000004006512254366764021053 0ustar mattstaff00000000000000# coding=utf-8 # Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from itertools import chain from whoosh.compat import next, xrange from whoosh.analysis.acore import Composable from whoosh.util.text import rcompile # Default list of stop words (words so common it's usually wasteful to index # them). This list is used by the StopFilter class, which allows you to supply # an optional list to override this one. STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may', 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this', 'to', 'us', 'we', 'when', 'will', 'with', 'yet', 'you', 'your')) # Simple pattern for filtering URLs, may be useful url_pattern = rcompile(""" ( [A-Za-z+]+:// # URL protocol \\S+? # URL body (?=\\s|[.]\\s|$|[.]$) # Stop at space/end, or a dot followed by space/end ) | ( # or... \w+([:.]?\w+)* # word characters, with opt. internal colons/dots ) """, verbose=True) # Filters class Filter(Composable): """Base class for Filter objects. A Filter subclass must implement a filter() method that takes a single argument, which is an iterator of Token objects, and yield a series of Token objects in return. Filters that do morphological transformation of tokens (e.g. stemming) should set their ``is_morph`` attribute to True. 
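    For illustration, a minimal custom filter can simply override
    ``__call__`` and yield modified tokens, as the concrete filters in this
    module do. The name ``UppercaseFilter`` below is hypothetical (not part
    of Whoosh) and is only a sketch of the pattern:

    >>> class UppercaseFilter(Filter):
    ...     def __call__(self, tokens):
    ...         for t in tokens:
    ...             t.text = t.text.upper()
    ...             yield t
    >>> ana = RegexTokenizer() | UppercaseFilter()
    >>> [token.text for token in ana("hello there")]
    ["HELLO", "THERE"]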
""" def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.__dict__ == other.__dict__) def __ne__(self, other): return not self == other def __call__(self, tokens): raise NotImplementedError class PassFilter(Filter): """An identity filter: passes the tokens through untouched. """ def __call__(self, tokens): return tokens class LoggingFilter(Filter): """Prints the contents of every filter that passes through as a debug log entry. """ def __init__(self, logger=None): """ :param target: the logger to use. If omitted, the "whoosh.analysis" logger is used. """ if logger is None: import logging logger = logging.getLogger("whoosh.analysis") self.logger = logger def __call__(self, tokens): logger = self.logger for t in tokens: logger.debug(repr(t)) yield t class MultiFilter(Filter): """Chooses one of two or more sub-filters based on the 'mode' attribute of the token stream. """ default_filter = PassFilter() def __init__(self, **kwargs): """Use keyword arguments to associate mode attribute values with instantiated filters. >>> iwf_for_index = IntraWordFilter(mergewords=True, mergenums=False) >>> iwf_for_query = IntraWordFilter(mergewords=False, mergenums=False) >>> mf = MultiFilter(index=iwf_for_index, query=iwf_for_query) This class expects that the value of the mode attribute is consistent among all tokens in a token stream. """ self.filters = kwargs def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.filters == other.filters) def __call__(self, tokens): # Only selects on the first token t = next(tokens) filter = self.filters.get(t.mode, self.default_filter) return filter(chain([t], tokens)) class TeeFilter(Filter): """Interleaves the results of two or more filters (or filter chains). NOTE: because it needs to create copies of each token for each sub-filter, this filter is quite slow. >>> target = "ALFA BRAVO CHARLIE" >>> # In one branch, we'll lower-case the tokens >>> f1 = LowercaseFilter() >>> # In the other branch, we'll reverse the tokens >>> f2 = ReverseTextFilter() >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2) >>> [token.text for token in ana(target)] ["alfa", "AFLA", "bravo", "OVARB", "charlie", "EILRAHC"] To combine the incoming token stream with the output of a filter chain, use ``TeeFilter`` and make one of the filters a :class:`PassFilter`. >>> f1 = PassFilter() >>> f2 = BiWordFilter() >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2) | LowercaseFilter() >>> [token.text for token in ana(target)] ["alfa", "alfa-bravo", "bravo", "bravo-charlie", "charlie"] """ def __init__(self, *filters): if len(filters) < 2: raise Exception("TeeFilter requires two or more filters") self.filters = filters def __eq__(self, other): return (self.__class__ is other.__class__ and self.filters == other.fitlers) def __call__(self, tokens): from itertools import tee count = len(self.filters) # Tee the token iterator and wrap each teed iterator with the # corresponding filter gens = [filter(t.copy() for t in gen) for filter, gen in zip(self.filters, tee(tokens, count))] # Keep a count of the number of running iterators running = count while running: for i, gen in enumerate(gens): if gen is not None: try: yield next(gen) except StopIteration: gens[i] = None running -= 1 class ReverseTextFilter(Filter): """Reverses the text of each token. 
>>> ana = RegexTokenizer() | ReverseTextFilter() >>> [token.text for token in ana("hello there")] ["olleh", "ereht"] """ def __call__(self, tokens): for t in tokens: t.text = t.text[::-1] yield t class LowercaseFilter(Filter): """Uses unicode.lower() to lowercase token text. >>> rext = RegexTokenizer() >>> stream = rext("This is a TEST") >>> [token.text for token in LowercaseFilter(stream)] ["this", "is", "a", "test"] """ def __call__(self, tokens): for t in tokens: t.text = t.text.lower() yield t class StripFilter(Filter): """Calls unicode.strip() on the token text. """ def __call__(self, tokens): for t in tokens: t.text = t.text.strip() yield t class StopFilter(Filter): """Marks "stop" words (words too common to index) in the stream (and by default removes them). Make sure you precede this filter with a :class:`LowercaseFilter`. >>> stopper = RegexTokenizer() | StopFilter() >>> [token.text for token in stopper(u"this is a test")] ["test"] >>> es_stopper = RegexTokenizer() | StopFilter(lang="es") >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")] ["lapiz", "mesa"] The list of available languages is in `whoosh.lang.languages`. You can use :func:`whoosh.lang.has_stopwords` to check if a given language has a stop word list available. """ def __init__(self, stoplist=STOP_WORDS, minsize=2, maxsize=None, renumber=True, lang=None): """ :param stoplist: A collection of words to remove from the stream. This is converted to a frozenset. The default is a list of common English stop words. :param minsize: The minimum length of token texts. Tokens with text smaller than this will be stopped. The default is 2. :param maxsize: The maximum length of token texts. Tokens with text larger than this will be stopped. Use None to allow any length. :param renumber: Change the 'pos' attribute of unstopped tokens to reflect their position with the stopped words removed. :param lang: Automatically get a list of stop words for the given language """ stops = set() if stoplist: stops.update(stoplist) if lang: from whoosh.lang import stopwords_for_language stops.update(stopwords_for_language(lang)) self.stops = frozenset(stops) self.min = minsize self.max = maxsize self.renumber = renumber def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.stops == other.stops and self.min == other.min and self.renumber == other.renumber) def __call__(self, tokens): stoplist = self.stops minsize = self.min maxsize = self.max renumber = self.renumber pos = None for t in tokens: text = t.text if (len(text) >= minsize and (maxsize is None or len(text) <= maxsize) and text not in stoplist): # This is not a stop word if renumber and t.positions: if pos is None: pos = t.pos else: pos += 1 t.pos = pos t.stopped = False yield t else: # This is a stop word if not t.removestops: # This IS a stop word, but we're not removing them t.stopped = True yield t class CharsetFilter(Filter): """Translates the text of tokens by calling unicode.translate() using the supplied character mapping object. This is useful for case and accent folding. The ``whoosh.support.charset`` module has a useful map for accent folding. >>> from whoosh.support.charset import accent_map >>> retokenizer = RegexTokenizer() >>> chfilter = CharsetFilter(accent_map) >>> [t.text for t in chfilter(retokenizer(u'café'))] [u'cafe'] Another way to get a character mapping object is to convert a Sphinx charset table file using :func:`whoosh.support.charset.charset_table_to_dict`. 
>>> from whoosh.support.charset import charset_table_to_dict >>> from whoosh.support.charset import default_charset >>> retokenizer = RegexTokenizer() >>> charmap = charset_table_to_dict(default_charset) >>> chfilter = CharsetFilter(charmap) >>> [t.text for t in chfilter(retokenizer(u'Stra\\xdfe'))] [u'strase'] The Sphinx charset table format is described at http://www.sphinxsearch.com/docs/current.html#conf-charset-table. """ __inittypes__ = dict(charmap=dict) def __init__(self, charmap): """ :param charmap: a dictionary mapping from integer character numbers to unicode characters, as required by the unicode.translate() method. """ self.charmap = charmap def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.charmap == other.charmap) def __call__(self, tokens): assert hasattr(tokens, "__iter__") charmap = self.charmap for t in tokens: t.text = t.text.translate(charmap) yield t class DelimitedAttributeFilter(Filter): """Looks for delimiter characters in the text of each token and stores the data after the delimiter in a named attribute on the token. The defaults are set up to use the ``^`` character as a delimiter and store the value after the ``^`` as the boost for the token. >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost") >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter() >>> for t in ana(u("image render^2 file^0.5")) ... print("%r %f" % (t.text, t.boost)) 'image' 1.0 'render' 2.0 'file' 0.5 Note that you need to make sure your tokenizer includes the delimiter and data as part of the token! """ def __init__(self, delimiter="^", attribute="boost", default=1.0, type=float): """ :param delimiter: a string that, when present in a token's text, separates the actual text from the "data" payload. :param attribute: the name of the attribute in which to store the data on the token. :param default: the value to use for the attribute for tokens that don't have delimited data. :param type: the type of the data, for example ``str`` or ``float``. This is used to convert the string value of the data before storing it in the attribute. """ self.delim = delimiter self.attr = attribute self.default = default self.type = type def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.delim == other.delim and self.attr == other.attr and self.default == other.default) def __call__(self, tokens): delim = self.delim attr = self.attr default = self.default type_ = self.type for t in tokens: text = t.text pos = text.find(delim) if pos > -1: setattr(t, attr, type_(text[pos + 1:])) if t.chars: t.endchar -= len(t.text) - pos t.text = text[:pos] else: setattr(t, attr, default) yield t class SubstitutionFilter(Filter): """Performs a regular expression substitution on the token text. This is especially useful for removing text from tokens, for example hyphens:: ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("-", "") Because it has the full power of the re.sub() method behind it, this filter can perform some fairly complex transformations. For example, to take tokens like ``'a=b', 'c=d', 'e=f'`` and change them to ``'b=a', 'd=c', 'f=e'``:: # Analyzer that swaps the text on either side of an equal sign rt = RegexTokenizer(r"\\S+") sf = SubstitutionFilter("([^/]*)/(./*)", r"\\2/\\1") ana = rt | sf """ def __init__(self, pattern, replacement): """ :param pattern: a pattern string or compiled regular expression object describing the text to replace. :param replacement: the substitution text. 
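        A doctest-style sketch of the hyphen-removal case described in the
        class docstring (the sample text is illustrative):

        >>> ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("-", "")
        >>> [token.text for token in ana("wi-fi top-level")]
        ["wifi", "toplevel"]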
""" self.pattern = rcompile(pattern) self.replacement = replacement def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.pattern == other.pattern and self.replacement == other.replacement) def __call__(self, tokens): pattern = self.pattern replacement = self.replacement for t in tokens: t.text = pattern.sub(replacement, t.text) yield t Whoosh-2.5.7/src/whoosh/analysis/intraword.py0000644000076500000240000004505712254366350021411 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. import re from collections import deque from whoosh.compat import u, text_type from whoosh.compat import xrange from whoosh.analysis.filters import Filter class CompoundWordFilter(Filter): """Given a set of words (or any object with a ``__contains__`` method), break any tokens in the stream that are composites of words in the word set into their individual parts. Given the correct set of words, this filter can break apart run-together words and trademarks (e.g. "turbosquid", "applescript"). It can also be useful for agglutinative languages such as German. The ``keep_compound`` argument lets you decide whether to keep the compound word in the token stream along with the word segments. >>> cwf = CompoundWordFilter(wordset, keep_compound=True) >>> analyzer = RegexTokenizer(r"\S+") | cwf >>> [t.text for t in analyzer("I do not like greeneggs and ham") ["I", "do", "not", "like", "greeneggs", "green", "eggs", "and", "ham"] >>> cwf.keep_compound = False >>> [t.text for t in analyzer("I do not like greeneggs and ham") ["I", "do", "not", "like", "green", "eggs", "and", "ham"] """ def __init__(self, wordset, keep_compound=True): """ :param wordset: an object with a ``__contains__`` method, such as a set, containing strings to look for inside the tokens. :param keep_compound: if True (the default), the original compound token will be retained in the stream before the subwords. 
""" self.wordset = wordset self.keep_compound = keep_compound def subwords(self, s, memo): if s in self.wordset: return [s] if s in memo: return memo[s] for i in xrange(1, len(s)): prefix = s[:i] if prefix in self.wordset: suffix = s[i:] suffix_subs = self.subwords(suffix, memo) if suffix_subs: result = [prefix] + suffix_subs memo[s] = result return result return None def __call__(self, tokens): keep_compound = self.keep_compound memo = {} subwords = self.subwords for t in tokens: subs = subwords(t.text, memo) if subs: if len(subs) > 1 and keep_compound: yield t for subword in subs: t.text = subword yield t else: yield t class BiWordFilter(Filter): """Merges adjacent tokens into "bi-word" tokens, so that for example:: "the", "sign", "of", "four" becomes:: "the-sign", "sign-of", "of-four" This can be used to create fields for pseudo-phrase searching, where if all the terms match the document probably contains the phrase, but the searching is faster than actually doing a phrase search on individual word terms. The ``BiWordFilter`` is much faster than using the otherwise equivalent ``ShingleFilter(2)``. """ def __init__(self, sep="-"): self.sep = sep def __call__(self, tokens): sep = self.sep prev_text = None prev_startchar = None prev_pos = None atleastone = False for token in tokens: # Save the original text of this token text = token.text # Save the original position positions = token.positions if positions: ps = token.pos # Save the original start char chars = token.chars if chars: sc = token.startchar if prev_text is not None: # Use the pos and startchar from the previous token if positions: token.pos = prev_pos if chars: token.startchar = prev_startchar # Join the previous token text and the current token text to # form the biword token token.text = "".join((prev_text, sep, text)) yield token atleastone = True # Save the originals and the new "previous" values prev_text = text if chars: prev_startchar = sc if positions: prev_pos = ps # If no bi-words were emitted, that is, the token stream only had # a single token, then emit that single token. if not atleastone: yield token class ShingleFilter(Filter): """Merges a certain number of adjacent tokens into multi-word tokens, so that for example:: "better", "a", "witty", "fool", "than", "a", "foolish", "wit" with ``ShingleFilter(3, ' ')`` becomes:: 'better a witty', 'a witty fool', 'witty fool than', 'fool than a', 'than a foolish', 'a foolish wit' This can be used to create fields for pseudo-phrase searching, where if all the terms match the document probably contains the phrase, but the searching is faster than actually doing a phrase search on individual word terms. If you're using two-word shingles, you should use the functionally equivalent ``BiWordFilter`` instead because it's faster than ``ShingleFilter``. """ def __init__(self, size=2, sep="-"): self.size = size self.sep = sep def __call__(self, tokens): size = self.size sep = self.sep buf = deque() atleastone = False def make_token(): tk = buf[0] tk.text = sep.join([t.text for t in buf]) if tk.chars: tk.endchar = buf[-1].endchar return tk for token in tokens: if not token.stopped: buf.append(token.copy()) if len(buf) == size: atleastone = True yield make_token() buf.popleft() # If no shingles were emitted, that is, the token stream had fewer than # 'size' tokens, then emit a single token with whatever tokens there # were if not atleastone and buf: yield make_token() class IntraWordFilter(Filter): """Splits words into subwords and performs optional transformations on subword groups. 
This filter is funtionally based on yonik's WordDelimiterFilter in Solr, but shares no code with it. * Split on intra-word delimiters, e.g. `Wi-Fi` -> `Wi`, `Fi`. * When splitwords=True, split on case transitions, e.g. `PowerShot` -> `Power`, `Shot`. * When splitnums=True, split on letter-number transitions, e.g. `SD500` -> `SD`, `500`. * Leading and trailing delimiter characters are ignored. * Trailing possesive "'s" removed from subwords, e.g. `O'Neil's` -> `O`, `Neil`. The mergewords and mergenums arguments turn on merging of subwords. When the merge arguments are false, subwords are not merged. * `PowerShot` -> `0`:`Power`, `1`:`Shot` (where `0` and `1` are token positions). When one or both of the merge arguments are true, consecutive runs of alphabetic and/or numeric subwords are merged into an additional token with the same position as the last sub-word. * `PowerShot` -> `0`:`Power`, `1`:`Shot`, `1`:`PowerShot` * `A's+B's&C's` -> `0`:`A`, `1`:`B`, `2`:`C`, `2`:`ABC` * `Super-Duper-XL500-42-AutoCoder!` -> `0`:`Super`, `1`:`Duper`, `2`:`XL`, `2`:`SuperDuperXL`, `3`:`500`, `4`:`42`, `4`:`50042`, `5`:`Auto`, `6`:`Coder`, `6`:`AutoCoder` When using this filter you should use a tokenizer that only splits on whitespace, so the tokenizer does not remove intra-word delimiters before this filter can see them, and put this filter before any use of LowercaseFilter. >>> rt = RegexTokenizer(r"\\S+") >>> iwf = IntraWordFilter() >>> lcf = LowercaseFilter() >>> analyzer = rt | iwf | lcf One use for this filter is to help match different written representations of a concept. For example, if the source text contained `wi-fi`, you probably want `wifi`, `WiFi`, `wi-fi`, etc. to match. One way of doing this is to specify mergewords=True and/or mergenums=True in the analyzer used for indexing, and mergewords=False / mergenums=False in the analyzer used for querying. >>> iwf_i = IntraWordFilter(mergewords=True, mergenums=True) >>> iwf_q = IntraWordFilter(mergewords=False, mergenums=False) >>> iwf = MultiFilter(index=iwf_i, query=iwf_q) >>> analyzer = RegexTokenizer(r"\S+") | iwf | LowercaseFilter() (See :class:`MultiFilter`.) """ is_morph = True __inittypes__ = dict(delims=text_type, splitwords=bool, splitnums=bool, mergewords=bool, mergenums=bool) def __init__(self, delims=u("-_'\"()!@#$%^&*[]{}<>\|;:,./?`~=+"), splitwords=True, splitnums=True, mergewords=False, mergenums=False): """ :param delims: a string of delimiter characters. :param splitwords: if True, split at case transitions, e.g. `PowerShot` -> `Power`, `Shot` :param splitnums: if True, split at letter-number transitions, e.g. `SD500` -> `SD`, `500` :param mergewords: merge consecutive runs of alphabetic subwords into an additional token with the same position as the last subword. :param mergenums: merge consecutive runs of numeric subwords into an additional token with the same position as the last subword. 
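        An illustrative doctest (token texts only; compare the merge
        examples in the class docstring above):

        >>> iwf = IntraWordFilter(mergewords=True)
        >>> ana = RegexTokenizer(r"\\S+") | iwf
        >>> [token.text for token in ana(u("PowerShot SD500"))]
        ["Power", "Shot", "PowerShot", "SD", "500"]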
""" from whoosh.support.unicode import digits, lowercase, uppercase self.delims = re.escape(delims) # Expression for text between delimiter characters self.between = re.compile(u("[^%s]+") % (self.delims,), re.UNICODE) # Expression for removing "'s" from the end of sub-words dispat = u("(?<=[%s%s])'[Ss](?=$|[%s])") % (lowercase, uppercase, self.delims) self.possessive = re.compile(dispat, re.UNICODE) # Expression for finding case and letter-number transitions lower2upper = u("[%s][%s]") % (lowercase, uppercase) letter2digit = u("[%s%s][%s]") % (lowercase, uppercase, digits) digit2letter = u("[%s][%s%s]") % (digits, lowercase, uppercase) if splitwords and splitnums: splitpat = u("(%s|%s|%s)") % (lower2upper, letter2digit, digit2letter) self.boundary = re.compile(splitpat, re.UNICODE) elif splitwords: self.boundary = re.compile(text_type(lower2upper), re.UNICODE) elif splitnums: numpat = u("(%s|%s)") % (letter2digit, digit2letter) self.boundary = re.compile(numpat, re.UNICODE) self.splitting = splitwords or splitnums self.mergewords = mergewords self.mergenums = mergenums def __eq__(self, other): return other and self.__class__ is other.__class__\ and self.__dict__ == other.__dict__ def _split(self, string): bound = self.boundary # Yields (startchar, endchar) pairs for each indexable substring in # the given string, e.g. "WikiWord" -> (0, 4), (4, 8) # Whether we're splitting on transitions (case changes, letter -> num, # num -> letter, etc.) splitting = self.splitting # Make a list (dispos, for "dispossessed") of (startchar, endchar) # pairs for runs of text between "'s" if "'" in string: # Split on possessive 's dispos = [] prev = 0 for match in self.possessive.finditer(string): dispos.append((prev, match.start())) prev = match.end() if prev < len(string): dispos.append((prev, len(string))) else: # Shortcut if there's no apostrophe in the string dispos = ((0, len(string)),) # For each run between 's for sc, ec in dispos: # Split on boundary characters for part_match in self.between.finditer(string, sc, ec): part_start = part_match.start() part_end = part_match.end() if splitting: # The point to start splitting at prev = part_start # Find transitions (e.g. "iW" or "a0") for bmatch in bound.finditer(string, part_start, part_end): # The point in the middle of the transition pivot = bmatch.start() + 1 # Yield from the previous match to the transition yield (prev, pivot) # Make the transition the new starting point prev = pivot # If there's leftover text at the end, yield it too if prev < part_end: yield (prev, part_end) else: # Not splitting on transitions, just yield the part yield (part_start, part_end) def _merge(self, parts): mergewords = self.mergewords mergenums = self.mergenums # Current type (1=alpah, 2=digit) last = 0 # Where to insert a merged term in the original list insertat = 0 # Buffer for parts to merge buf = [] # Iterate on a copy of the parts list so we can modify the original as # we go def insert_item(buf, at, newpos): newtext = "".join(item[0] for item in buf) newsc = buf[0][2] # start char of first item in buffer newec = buf[-1][3] # end char of last item in buffer parts.insert(insertat, (newtext, newpos, newsc, newec)) for item in list(parts): # item = (text, pos, startchar, endchar) text = item[0] pos = item[1] # Set the type of this part if text.isalpha(): this = 1 elif text.isdigit(): this = 2 else: this = None # Is this the same type as the previous part? 
if (buf and (this == last == 1 and mergewords) or (this == last == 2 and mergenums)): # This part is the same type as the previous. Add it to the # buffer of parts to merge. buf.append(item) else: # This part is different than the previous. if len(buf) > 1: # If the buffer has at least two parts in it, merge them # and add them to the original list of parts. insert_item(buf, insertat, pos - 1) insertat += 1 # Reset the buffer buf = [item] last = this insertat += 1 # If there are parts left in the buffer at the end, merge them and add # them to the original list. if len(buf) > 1: insert_item(buf, len(parts), pos) def __call__(self, tokens): mergewords = self.mergewords mergenums = self.mergenums # This filter renumbers tokens as it expands them. New position # counter. newpos = None for t in tokens: text = t.text # If this is the first token we've seen, use it to set the new # position counter if newpos is None: if t.positions: newpos = t.pos else: # Token doesn't have positions, just use 0 newpos = 0 if ((text.isalpha() and (text.islower() or text.isupper())) or text.isdigit()): # Short-circuit the common cases of no delimiters, no case # transitions, only digits, etc. t.pos = newpos yield t newpos += 1 else: # Split the token text on delimiters, word and/or number # boundaries into a list of (text, pos, startchar, endchar) # tuples ranges = self._split(text) parts = [(text[sc:ec], i + newpos, sc, ec) for i, (sc, ec) in enumerate(ranges)] # Did the split yield more than one part? if len(parts) > 1: # If the options are set, merge consecutive runs of all- # letters and/or all-numbers. if mergewords or mergenums: self._merge(parts) # Yield tokens for the parts chars = t.chars if chars: base = t.startchar for text, pos, startchar, endchar in parts: t.text = text t.pos = pos if t.chars: t.startchar = base + startchar t.endchar = base + endchar yield t if parts: # Set the new position counter based on the last part newpos = parts[-1][1] + 1 Whoosh-2.5.7/src/whoosh/analysis/morph.py0000644000076500000240000002361512254366350020521 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.analysis.filters import Filter from whoosh.compat import integer_types from whoosh.lang.dmetaphone import double_metaphone from whoosh.lang.porter import stem from whoosh.util.cache import lfu_cache, unbound_cache class StemFilter(Filter): """Stems (removes suffixes from) the text of tokens using the Porter stemming algorithm. Stemming attempts to reduce multiple forms of the same root word (for example, "rendering", "renders", "rendered", etc.) to a single word in the index. >>> stemmer = RegexTokenizer() | StemFilter() >>> [token.text for token in stemmer("fundamentally willows")] ["fundament", "willow"] You can pass your own stemming function to the StemFilter. The default is the Porter stemming algorithm for English. >>> stemfilter = StemFilter(stem_function) You can also use one of the Snowball stemming functions by passing the `lang` keyword argument. >>> stemfilter = StemFilter(lang="ru") The list of available languages is in `whoosh.lang.languages`. You can use :func:`whoosh.lang.has_stemmer` to check if a given language has a stemming function available. By default, this class wraps an LRU cache around the stemming function. The ``cachesize`` keyword argument sets the size of the cache. To make the cache unbounded (the class caches every input), use ``cachesize=-1``. To disable caching, use ``cachesize=None``. If you compile and install the py-stemmer library, the :class:`PyStemmerFilter` provides slightly easier access to the language stemmers in that library. """ __inittypes__ = dict(stemfn=object, ignore=list) is_morph = True def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000): """ :param stemfn: the function to use for stemming. :param lang: if not None, overrides the stemfn with a language stemmer from the ``whoosh.lang.snowball`` package. :param ignore: a set/list of words that should not be stemmed. This is converted into a frozenset. If you omit this argument, all tokens are stemmed. :param cachesize: the maximum number of words to cache. Use ``-1`` for an unbounded cache, or ``None`` for no caching. 
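        A doctest-style sketch combining ``ignore`` with an unbounded cache
        (the sample words are illustrative):

        >>> stemmer = RegexTokenizer() | StemFilter(ignore=["rendering"], cachesize=-1)
        >>> [token.text for token in stemmer("rendering renders")]
        ["rendering", "render"]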
""" self.stemfn = stemfn self.lang = lang self.ignore = frozenset() if ignore is None else frozenset(ignore) self.cachesize = cachesize # clear() sets the _stem attr to a cached wrapper around self.stemfn self.clear() def __getstate__(self): # Can't pickle a dynamic function, so we have to remove the _stem # attribute from the state return dict([(k, self.__dict__[k]) for k in self.__dict__ if k != "_stem"]) def __setstate__(self, state): # Check for old instances of StemFilter class, which didn't have a # cachesize attribute and pickled the cache attribute if "cachesize" not in state: self.cachesize = 50000 if "ignores" in state: self.ignore = state["ignores"] elif "ignore" not in state: self.ignore = frozenset() if "lang" not in state: self.lang = None if "cache" in state: del state["cache"] self.__dict__.update(state) # Set the _stem attribute self.clear() def clear(self): if self.lang: from whoosh.lang import stemmer_for_language stemfn = stemmer_for_language(self.lang) else: stemfn = self.stemfn if isinstance(self.cachesize, integer_types) and self.cachesize != 0: if self.cachesize < 0: self._stem = unbound_cache(stemfn) elif self.cachesize > 1: self._stem = lfu_cache(self.cachesize)(stemfn) else: self._stem = stemfn def cache_info(self): if self.cachesize <= 1: return None return self._stem.cache_info() def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.stemfn == other.stemfn) def __call__(self, tokens): stemfn = self._stem ignore = self.ignore for t in tokens: if not t.stopped: text = t.text if text not in ignore: t.text = stemfn(text) yield t class PyStemmerFilter(StemFilter): """This is a simple subclass of StemFilter that works with the py-stemmer third-party library. You must have the py-stemmer library installed to use this filter. >>> PyStemmerFilter("spanish") """ def __init__(self, lang="english", ignore=None, cachesize=10000): """ :param lang: a string identifying the stemming algorithm to use. You can get a list of available algorithms by with the :meth:`PyStemmerFilter.algorithms` method. The identification strings are directly from the py-stemmer library. :param ignore: a set/list of words that should not be stemmed. This is converted into a frozenset. If you omit this argument, all tokens are stemmed. :param cachesize: the maximum number of words to cache. """ self.lang = lang self.ignore = frozenset() if ignore is None else frozenset(ignore) self.cachesize = cachesize self._stem = self._get_stemmer_fn() def algorithms(self): """Returns a list of stemming algorithms provided by the py-stemmer library. 
""" import Stemmer # @UnresolvedImport return Stemmer.algorithms() def cache_info(self): return None def _get_stemmer_fn(self): import Stemmer # @UnresolvedImport stemmer = Stemmer.Stemmer(self.lang) stemmer.maxCacheSize = self.cachesize return stemmer.stemWord def __getstate__(self): # Can't pickle a dynamic function, so we have to remove the _stem # attribute from the state return dict([(k, self.__dict__[k]) for k in self.__dict__ if k != "_stem"]) def __setstate__(self, state): # Check for old instances of StemFilter class, which didn't have a # cachesize attribute and pickled the cache attribute if "cachesize" not in state: self.cachesize = 10000 if "ignores" in state: self.ignore = state["ignores"] elif "ignore" not in state: self.ignore = frozenset() if "cache" in state: del state["cache"] self.__dict__.update(state) # Set the _stem attribute self._stem = self._get_stemmer_fn() class DoubleMetaphoneFilter(Filter): """Transforms the text of the tokens using Lawrence Philips's Double Metaphone algorithm. This algorithm attempts to encode words in such a way that similar-sounding words reduce to the same code. This may be useful for fields containing the names of people and places, and other uses where tolerance of spelling differences is desireable. """ is_morph = True def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False): """ :param primary_boost: the boost to apply to the token containing the primary code. :param secondary_boost: the boost to apply to the token containing the secondary code, if any. :param combine: if True, the original unencoded tokens are kept in the stream, preceding the encoded tokens. """ self.primary_boost = primary_boost self.secondary_boost = secondary_boost self.combine = combine def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.primary_boost == other.primary_boost) def __call__(self, tokens): primary_boost = self.primary_boost secondary_boost = self.secondary_boost combine = self.combine for t in tokens: if combine: yield t primary, secondary = double_metaphone(t.text) b = t.boost # Overwrite the token's text and boost and yield it if primary: t.text = primary t.boost = b * primary_boost yield t if secondary: t.text = secondary t.boost = b * secondary_boost yield t Whoosh-2.5.7/src/whoosh/analysis/ngrams.py0000644000076500000240000002112412254366350020654 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.compat import text_type from whoosh.compat import xrange from whoosh.analysis.acore import Token from whoosh.analysis.filters import Filter, LowercaseFilter from whoosh.analysis.tokenizers import Tokenizer, RegexTokenizer # Tokenizer class NgramTokenizer(Tokenizer): """Splits input text into N-grams instead of words. >>> ngt = NgramTokenizer(4) >>> [token.text for token in ngt("hi there")] ["hi t", "i th", " the", "ther", "here"] Note that this tokenizer does NOT use a regular expression to extract words, so the grams emitted by it will contain whitespace, punctuation, etc. You may want to massage the input or add a custom filter to this tokenizer's output. Alternatively, if you only want sub-word grams without whitespace, you could combine a RegexTokenizer with NgramFilter instead. """ __inittypes__ = dict(minsize=int, maxsize=int) def __init__(self, minsize, maxsize=None): """ :param minsize: The minimum size of the N-grams. :param maxsize: The maximum size of the N-grams. If you omit this parameter, maxsize == minsize. """ self.min = minsize self.max = maxsize or minsize def __eq__(self, other): if self.__class__ is other.__class__: if self.min == other.min and self.max == other.max: return True return False def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, mode='', **kwargs): assert isinstance(value, text_type), "%r is not unicode" % value inlen = len(value) t = Token(positions, chars, removestops=removestops, mode=mode) pos = start_pos if mode == "query": size = min(self.max, inlen) for start in xrange(0, inlen - size + 1): end = start + size if end > inlen: continue t.text = value[start:end] if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos if chars: t.startchar = start_char + start t.endchar = start_char + end yield t pos += 1 else: for start in xrange(0, inlen - self.min + 1): for size in xrange(self.min, self.max + 1): end = start + size if end > inlen: continue t.text = value[start:end] if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos if chars: t.startchar = start_char + start t.endchar = start_char + end yield t pos += 1 # Filter class NgramFilter(Filter): """Splits token text into N-grams. >>> rext = RegexTokenizer() >>> stream = rext("hello there") >>> ngf = NgramFilter(4) >>> [token.text for token in ngf(stream)] ["hell", "ello", "ther", "here"] """ __inittypes__ = dict(minsize=int, maxsize=int) def __init__(self, minsize, maxsize=None, at=None): """ :param minsize: The minimum size of the N-grams. :param maxsize: The maximum size of the N-grams. If you omit this parameter, maxsize == minsize. :param at: If 'start', only take N-grams from the start of each word. 
if 'end', only take N-grams from the end of each word. Otherwise, take all N-grams from the word (the default). """ self.min = minsize self.max = maxsize or minsize self.at = 0 if at == "start": self.at = -1 elif at == "end": self.at = 1 def __eq__(self, other): return other and self.__class__ is other.__class__\ and self.min == other.min and self.max == other.max def __call__(self, tokens): assert hasattr(tokens, "__iter__") at = self.at for t in tokens: text = t.text if len(text) < self.min: continue chars = t.chars if chars: startchar = t.startchar # Token positions don't mean much for N-grams, # so we'll leave the token's original position # untouched. if t.mode == "query": size = min(self.max, len(t.text)) if at == -1: t.text = text[:size] if chars: t.endchar = startchar + size yield t elif at == 1: t.text = text[0 - size:] if chars: t.startchar = t.endchar - size yield t else: for start in xrange(0, len(text) - size + 1): t.text = text[start:start + size] if chars: t.startchar = startchar + start t.endchar = startchar + start + size yield t else: if at == -1: limit = min(self.max, len(text)) for size in xrange(self.min, limit + 1): t.text = text[:size] if chars: t.endchar = startchar + size yield t elif at == 1: if chars: original_startchar = t.startchar start = max(0, len(text) - self.max) for i in xrange(start, len(text) - self.min + 1): t.text = text[i:] if chars: t.startchar = original_startchar + i yield t else: for start in xrange(0, len(text) - self.min + 1): for size in xrange(self.min, self.max + 1): end = start + size if end > len(text): continue t.text = text[start:end] if chars: t.startchar = startchar + start t.endchar = startchar + end yield t # Analyzers def NgramAnalyzer(minsize, maxsize=None): """Composes an NgramTokenizer and a LowercaseFilter. >>> ana = NgramAnalyzer(4) >>> [token.text for token in ana("hi there")] ["hi t", "i th", " the", "ther", "here"] """ return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter() def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, at=None): if not tokenizer: tokenizer = RegexTokenizer() return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, at=at) Whoosh-2.5.7/src/whoosh/analysis/tokenizers.py0000644000076500000240000003060612254366350021567 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.compat import u, text_type from whoosh.analysis.acore import Composable, Token from whoosh.util.text import rcompile default_pattern = rcompile(r"\w+(\.?\w+)*") # Tokenizers class Tokenizer(Composable): """Base class for Tokenizers. """ def __eq__(self, other): return other and self.__class__ is other.__class__ class IDTokenizer(Tokenizer): """Yields the entire input string as a single token. For use in indexed but untokenized fields, such as a document's path. >>> idt = IDTokenizer() >>> [token.text for token in idt("/a/b 123 alpha")] ["/a/b 123 alpha"] """ def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, mode='', **kwargs): assert isinstance(value, text_type), "%r is not unicode" % value t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) t.text = value t.boost = 1.0 if keeporiginal: t.original = value if positions: t.pos = start_pos + 1 if chars: t.startchar = start_char t.endchar = start_char + len(value) yield t class RegexTokenizer(Tokenizer): """ Uses a regular expression to extract tokens from text. >>> rex = RegexTokenizer() >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))] ["hi", "there", "3.141", "big", "time", "under_score"] """ def __init__(self, expression=default_pattern, gaps=False): """ :param expression: A regular expression object or string. Each match of the expression equals a token. Group 0 (the entire matched text) is used as the text of the token. If you require more complicated handling of the expression match, simply write your own tokenizer. :param gaps: If True, the tokenizer *splits* on the expression, rather than matching on the expression. """ self.expression = rcompile(expression) self.gaps = gaps def __eq__(self, other): if self.__class__ is other.__class__: if self.expression.pattern == other.expression.pattern: return True return False def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, tokenize=True, mode='', **kwargs): """ :param value: The unicode string to tokenize. :param positions: Whether to record token positions in the token. :param chars: Whether to record character offsets in the token. :param start_pos: The position number of the first token. For example, if you set start_pos=2, the tokens will be numbered 2,3,4,... instead of 0,1,2,... :param start_char: The offset of the first character of the first token. For example, if you set start_char=2, the text "aaa bbb" will have chars (2,5),(6,9) instead (0,3),(4,7). :param tokenize: if True, the text should be tokenized. 
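        An illustrative doctest showing the positions and character offsets
        recorded for a short string with the default pattern:

        >>> rex = RegexTokenizer()
        >>> [(t.text, t.pos) for t in rex(u("hi there"), positions=True)]
        [('hi', 0), ('there', 1)]
        >>> [(t.text, t.startchar, t.endchar) for t in rex(u("hi there"), chars=True)]
        [('hi', 0, 2), ('there', 3, 8)]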
""" assert isinstance(value, text_type), "%s is not unicode" % repr(value) t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) if not tokenize: t.original = t.text = value t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(value) yield t elif not self.gaps: # The default: expression matches are used as tokens for pos, match in enumerate(self.expression.finditer(value)): t.text = match.group(0) t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = start_pos + pos if chars: t.startchar = start_char + match.start() t.endchar = start_char + match.end() yield t else: # When gaps=True, iterate through the matches and # yield the text between them. prevend = 0 pos = start_pos for match in self.expression.finditer(value): start = prevend end = match.start() text = value[start:end] if text: t.text = text t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos pos += 1 if chars: t.startchar = start_char + start t.endchar = start_char + end yield t prevend = match.end() # If the last "gap" was before the end of the text, # yield the last bit of text as a final token. if prevend < len(value): t.text = value[prevend:] t.boost = 1.0 if keeporiginal: t.original = t.text t.stopped = False if positions: t.pos = pos if chars: t.startchar = prevend t.endchar = len(value) yield t class CharsetTokenizer(Tokenizer): """Tokenizes and translates text according to a character mapping object. Characters that map to None are considered token break characters. For all other characters the map is used to translate the character. This is useful for case and accent folding. This tokenizer loops character-by-character and so will likely be much slower than :class:`RegexTokenizer`. One way to get a character mapping object is to convert a Sphinx charset table file using :func:`whoosh.support.charset.charset_table_to_dict`. >>> from whoosh.support.charset import charset_table_to_dict >>> from whoosh.support.charset import default_charset >>> charmap = charset_table_to_dict(default_charset) >>> chtokenizer = CharsetTokenizer(charmap) >>> [t.text for t in chtokenizer(u'Stra\\xdfe ABC')] [u'strase', u'abc'] The Sphinx charset table format is described at http://www.sphinxsearch.com/docs/current.html#conf-charset-table. """ __inittype__ = dict(charmap=str) def __init__(self, charmap): """ :param charmap: a mapping from integer character numbers to unicode characters, as used by the unicode.translate() method. """ self.charmap = charmap def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.charmap == other.charmap) def __call__(self, value, positions=False, chars=False, keeporiginal=False, removestops=True, start_pos=0, start_char=0, tokenize=True, mode='', **kwargs): """ :param value: The unicode string to tokenize. :param positions: Whether to record token positions in the token. :param chars: Whether to record character offsets in the token. :param start_pos: The position number of the first token. For example, if you set start_pos=2, the tokens will be numbered 2,3,4,... instead of 0,1,2,... :param start_char: The offset of the first character of the first token. For example, if you set start_char=2, the text "aaa bbb" will have chars (2,5),(6,9) instead (0,3),(4,7). :param tokenize: if True, the text should be tokenized. 
""" assert isinstance(value, text_type), "%r is not unicode" % value t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs) if not tokenize: t.original = t.text = value t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(value) yield t else: text = u("") charmap = self.charmap pos = start_pos startchar = currentchar = start_char for char in value: tchar = charmap[ord(char)] if tchar: text += tchar else: if currentchar > startchar: t.text = text t.boost = 1.0 if keeporiginal: t.original = t.text if positions: t.pos = pos pos += 1 if chars: t.startchar = startchar t.endchar = currentchar yield t startchar = currentchar + 1 text = u("") currentchar += 1 if currentchar > startchar: t.text = value[startchar:currentchar] t.boost = 1.0 if keeporiginal: t.original = t.text if positions: t.pos = pos if chars: t.startchar = startchar t.endchar = currentchar yield t def SpaceSeparatedTokenizer(): """Returns a RegexTokenizer that splits tokens by whitespace. >>> sst = SpaceSeparatedTokenizer() >>> [token.text for token in sst("hi there big-time, what's up")] ["hi", "there", "big-time,", "what's", "up"] """ return RegexTokenizer(r"[^ \t\r\n]+") def CommaSeparatedTokenizer(): """Splits tokens by commas. Note that the tokenizer calls unicode.strip() on each match of the regular expression. >>> cst = CommaSeparatedTokenizer() >>> [token.text for token in cst("hi there, what's , up")] ["hi there", "what's", "up"] """ from whoosh.analysis.filters import StripFilter return RegexTokenizer(r"[^,]+") | StripFilter() class PathTokenizer(Tokenizer): """A simple tokenizer that given a string ``"/a/b/c"`` yields tokens ``["/a", "/a/b", "/a/b/c"]``. """ def __init__(self, expression="[^/]+"): self.expr = rcompile(expression) def __call__(self, value, positions=False, start_pos=0, **kwargs): assert isinstance(value, text_type), "%r is not unicode" % value token = Token(positions, **kwargs) pos = start_pos for match in self.expr.finditer(value): token.text = value[:match.end()] if positions: token.pos = pos pos += 1 yield token Whoosh-2.5.7/src/whoosh/automata/0000755000076500000240000000000012277504634017007 5ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/automata/__init__.py0000644000076500000240000000000112254366350021103 0ustar mattstaff00000000000000 Whoosh-2.5.7/src/whoosh/automata/fst.py0000644000076500000240000012747712254366350020173 0ustar mattstaff00000000000000# Copyright 2009 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module implements an FST/FSA writer and reader. An FST (Finite State Transducer) stores a directed acyclic graph with values associated with the leaves. Common elements of the values are pushed inside the tree. An FST that does not store values is a regular FSA. The format of the leaf values is pluggable using subclasses of the Values class. Whoosh uses these structures to store a directed acyclic word graph (DAWG) for use in (at least) spell checking. """ import sys, copy from array import array from hashlib import sha1 # @UnresolvedImport from whoosh.compat import b, u, BytesIO from whoosh.compat import xrange, iteritems, iterkeys, izip, array_tobytes from whoosh.compat import bytes_type, text_type from whoosh.filedb.structfile import StructFile from whoosh.system import _INT_SIZE from whoosh.system import pack_byte, pack_int, pack_uint, pack_long from whoosh.system import emptybytes from whoosh.util.text import utf8encode, utf8decode from whoosh.util.varints import varint class FileVersionError(Exception): pass class InactiveCursor(Exception): pass ARC_LAST = 1 ARC_ACCEPT = 2 ARC_STOP = 4 ARC_HAS_VAL = 8 ARC_HAS_ACCEPT_VAL = 16 MULTIBYTE_LABEL = 32 # FST Value types class Values(object): """Base for classes the describe how to encode and decode FST values. """ @staticmethod def is_valid(v): """Returns True if v is a valid object that can be stored by this class. """ raise NotImplementedError @staticmethod def common(v1, v2): """Returns the "common" part of the two values, for whatever "common" means for this class. For example, a string implementation would return the common shared prefix, for an int implementation it would return the minimum of the two numbers. If there is no common part, this method should return None. """ raise NotImplementedError @staticmethod def add(prefix, v): """Adds the given prefix (the result of a call to common()) to the given value. """ raise NotImplementedError @staticmethod def subtract(v, prefix): """Subtracts the "common" part (the prefix) from the given value. """ raise NotImplementedError @staticmethod def write(dbfile, v): """Writes value v to a file. """ raise NotImplementedError @staticmethod def read(dbfile): """Reads a value from the given file. """ raise NotImplementedError @classmethod def skip(cls, dbfile): """Skips over a value in the given file. """ cls.read(dbfile) @staticmethod def to_bytes(v): """Returns a str (Python 2.x) or bytes (Python 3) representation of the given value. This is used for calculating node digests, so it should be unique but fast to calculate, and does not have to be parseable. """ raise NotImplementedError @staticmethod def merge(v1, v2): raise NotImplementedError class IntValues(Values): """Stores integer values in an FST. 
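    A small illustration of how the common/add/subtract operations
    round-trip for plain integers (the numbers are arbitrary):

    >>> IntValues.common(10, 7)
    7
    >>> IntValues.subtract(10, 7)
    3
    >>> IntValues.add(7, 3)
    10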
""" @staticmethod def is_valid(v): return isinstance(v, int) and v >= 0 @staticmethod def common(v1, v2): if v1 is None or v2 is None: return None if v1 == v2: return v1 return min(v1, v2) @staticmethod def add(base, v): if base is None: return v if v is None: return base return base + v @staticmethod def subtract(v, base): if v is None: return None if base is None: return v return v - base @staticmethod def write(dbfile, v): dbfile.write_uint(v) @staticmethod def read(dbfile): return dbfile.read_uint() @staticmethod def skip(dbfile): dbfile.seek(_INT_SIZE, 1) @staticmethod def to_bytes(v): return pack_int(v) class SequenceValues(Values): """Abstract base class for value types that store sequences. """ @staticmethod def is_valid(v): return isinstance(self, (list, tuple)) @staticmethod def common(v1, v2): if v1 is None or v2 is None: return None i = 0 while i < len(v1) and i < len(v2): if v1[i] != v2[i]: break i += 1 if i == 0: return None if i == len(v1): return v1 if i == len(v2): return v2 return v1[:i] @staticmethod def add(prefix, v): if prefix is None: return v if v is None: return prefix return prefix + v @staticmethod def subtract(v, prefix): if prefix is None: return v if v is None: return None if len(v) == len(prefix): return None if len(v) < len(prefix) or len(prefix) == 0: raise ValueError((v, prefix)) return v[len(prefix):] @staticmethod def write(dbfile, v): dbfile.write_pickle(v) @staticmethod def read(dbfile): return dbfile.read_pickle() class BytesValues(SequenceValues): """Stores bytes objects (str in Python 2.x) in an FST. """ @staticmethod def is_valid(v): return isinstance(v, bytes_type) @staticmethod def write(dbfile, v): dbfile.write_int(len(v)) dbfile.write(v) @staticmethod def read(dbfile): length = dbfile.read_int() return dbfile.read(length) @staticmethod def skip(dbfile): length = dbfile.read_int() dbfile.seek(length, 1) @staticmethod def to_bytes(v): return v class ArrayValues(SequenceValues): """Stores array.array objects in an FST. """ def __init__(self, typecode): self.typecode = typecode self.itemsize = array(self.typecode).itemsize def is_valid(self, v): return isinstance(v, array) and v.typecode == self.typecode @staticmethod def write(dbfile, v): dbfile.write(b(v.typecode)) dbfile.write_int(len(v)) dbfile.write_array(v) def read(self, dbfile): typecode = u(dbfile.read(1)) length = dbfile.read_int() return dbfile.read_array(self.typecode, length) def skip(self, dbfile): length = dbfile.read_int() dbfile.seek(length * self.itemsize, 1) @staticmethod def to_bytes(v): return array_tobytes(v) class IntListValues(SequenceValues): """Stores lists of positive, increasing integers (that is, lists of integers where each number is >= 0 and each number is greater than or equal to the number that precedes it) in an FST. """ @staticmethod def is_valid(v): if isinstance(v, (list, tuple)): if len(v) < 2: return True for i in xrange(1, len(v)): if not isinstance(v[i], int) or v[i] < v[i - 1]: return False return True return False @staticmethod def write(dbfile, v): base = 0 dbfile.write_varint(len(v)) for x in v: delta = x - base assert delta >= 0 dbfile.write_varint(delta) base = x @staticmethod def read(dbfile): length = dbfile.read_varint() result = [] if length > 0: base = 0 for _ in xrange(length): base += dbfile.read_varint() result.append(base) return result @staticmethod def to_bytes(v): return b(repr(v)) # Node-like interface wrappers class Node(object): """A slow but easier-to-use wrapper for FSA/DAWGs. 
Translates the low-level arc-based interface of GraphReader into Node objects with methods to follow edges. """ def __init__(self, owner, address, accept=False): self.owner = owner self.address = address self._edges = None self.accept = accept def __iter__(self): if not self._edges: self._load() return iterkeys(self._edges) def __contains__(self, key): if self._edges is None: self._load() return key in self._edges def _load(self): owner = self.owner if self.address is None: d = {} else: d = dict((arc.label, Node(owner, arc.target, arc.accept)) for arc in self.owner.iter_arcs(self.address)) self._edges = d def keys(self): if self._edges is None: self._load() return self._edges.keys() def all_edges(self): if self._edges is None: self._load() return self._edges def edge(self, key): if self._edges is None: self._load() return self._edges[key] def flatten(self, sofar=emptybytes): if self.accept: yield sofar for key in sorted(self): node = self.edge(key) for result in node.flatten(sofar + key): yield result def flatten_strings(self): return (utf8decode(k)[0] for k in self.flatten()) class ComboNode(Node): """Base class for nodes that blend the nodes of two different graphs. Concrete subclasses need to implement the ``edge()`` method and possibly override the ``accept`` property. """ def __init__(self, a, b): self.a = a self.b = b def __repr__(self): return "<%s %r %r>" % (self.__class__.__name__, self.a, self.b) def __contains__(self, key): return key in self.a or key in self.b def __iter__(self): return iter(set(self.a) | set(self.b)) @property def accept(self): return self.a.accept or self.b.accept class UnionNode(ComboNode): """Makes two graphs appear to be the union of the two graphs. """ def edge(self, key): a = self.a b = self.b if key in a and key in b: return UnionNode(a.edge(key), b.edge(key)) elif key in a: return a.edge(key) else: return b.edge(key) class IntersectionNode(ComboNode): """Makes two graphs appear to be the intersection of the two graphs. """ def edge(self, key): a = self.a b = self.b if key in a and key in b: return IntersectionNode(a.edge(key), b.edge(key)) # Cursor class BaseCursor(object): """Base class for a cursor-type object for navigating an FST/word graph, represented by a :class:`GraphReader` object. >>> cur = GraphReader(dawgfile).cursor() >>> for key in cur.follow(): ... print(repr(key)) The cursor "rests" on arcs in the FSA/FST graph, rather than nodes. """ def is_active(self): """Returns True if this cursor is still active, that is it has not read past the last arc in the graph. """ raise NotImplementedError def label(self): """Returns the label bytes of the current arc. """ raise NotImplementedError def prefix(self): """Returns a sequence of the label bytes for the path from the root to the current arc. """ raise NotImplementedError def prefix_bytes(self): """Returns the label bytes for the path from the root to the current arc as a single joined bytes object. """ return emptybytes.join(self.prefix()) def prefix_string(self): """Returns the labels of the path from the root to the current arc as a decoded unicode string. """ return utf8decode(self.prefix_bytes())[0] def peek_key(self): """Returns a sequence of label bytes representing the next closest key in the graph. """ for label in self.prefix(): yield label c = self.copy() while not c.stopped(): c.follow() yield c.label() def peek_key_bytes(self): """Returns the next closest key in the graph as a single bytes object. 
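        A sketch of typical use, assuming ``cur`` is an active cursor over a
        graph whose next key happens to be ``b"alfa"`` (hypothetical data):

        >>> cur.peek_key_bytes()
        b'alfa'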
""" return emptybytes.join(self.peek_key()) def peek_key_string(self): """Returns the next closest key in the graph as a decoded unicode string. """ return utf8decode(self.peek_key_bytes())[0] def stopped(self): """Returns True if the current arc leads to a stop state. """ raise NotImplementedError def value(self): """Returns the value at the current arc, if reading an FST. """ raise NotImplementedError def accept(self): """Returns True if the current arc leads to an accept state (the end of a valid key). """ raise NotImplementedError def at_last_arc(self): """Returns True if the current arc is the last outgoing arc from the previous node. """ raise NotImplementedError def next_arc(self): """Moves to the next outgoing arc from the previous node. """ raise NotImplementedError def follow(self): """Follows the current arc. """ raise NotImplementedError def switch_to(self, label): """Switch to the sibling arc with the given label bytes. """ _label = self.label _at_last_arc = self.at_last_arc _next_arc = self.next_arc while True: thislabel = _label() if thislabel == label: return True if thislabel > label or _at_last_arc(): return False _next_arc() def skip_to(self, key): """Moves the cursor to the path represented by the given key bytes. """ _accept = self.accept _prefix = self.prefix _next_arc = self.next_arc keylist = list(key) while True: if _accept(): thiskey = list(_prefix()) if keylist == thiskey: return True elif keylist > thiskey: return False _next_arc() def flatten(self): """Yields the keys in the graph, starting at the current position. """ _is_active = self.is_active _accept = self.accept _stopped = self.stopped _follow = self.follow _next_arc = self.next_arc _prefix_bytes = self.prefix_bytes if not _is_active(): raise InactiveCursor while _is_active(): if _accept(): yield _prefix_bytes() if not _stopped(): _follow() continue _next_arc() def flatten_v(self): """Yields (key, value) tuples in an FST, starting at the current position. """ for key in self.flatten(): yield key, self.value() def flatten_strings(self): return (utf8decode(k)[0] for k in self.flatten()) def find_path(self, path): """Follows the labels in the given path, starting at the current position. 
""" path = to_labels(path) _switch_to = self.switch_to _follow = self.follow _stopped = self.stopped first = True for i, label in enumerate(path): if not first: _follow() if not _switch_to(label): return False if _stopped(): if i < len(path) - 1: return False first = False return True class Cursor(BaseCursor): def __init__(self, graph, root=None, stack=None): self.graph = graph self.vtype = graph.vtype self.root = root if root is not None else graph.default_root() if stack: self.stack = stack else: self.reset() def _current_attr(self, name): stack = self.stack if not stack: raise InactiveCursor return getattr(stack[-1], name) def is_active(self): return bool(self.stack) def stopped(self): return self._current_attr("target") is None def accept(self): return self._current_attr("accept") def at_last_arc(self): return self._current_attr("lastarc") def label(self): return self._current_attr("label") def reset(self): self.stack = [] self.sums = [None] self._push(self.graph.arc_at(self.root)) def copy(self): return self.__class__(self.graph, self.root, copy.deepcopy(self.stack)) def prefix(self): stack = self.stack if not stack: raise InactiveCursor return (arc.label for arc in stack) # Override: more efficient implementation using graph methods directly def peek_key(self): if not self.stack: raise InactiveCursor for label in self.prefix(): yield label arc = copy.copy(self.stack[-1]) graph = self.graph while not arc.accept and arc.target is not None: graph.arc_at(arc.target, arc) yield arc.label def value(self): stack = self.stack if not stack: raise InactiveCursor vtype = self.vtype if not vtype: raise Exception("No value type") v = self.sums[-1] current = stack[-1] if current.value: v = vtype.add(v, current.value) if current.accept and current.acceptval is not None: v = vtype.add(v, current.acceptval) return v def next_arc(self): stack = self.stack if not stack: raise InactiveCursor while stack and stack[-1].lastarc: self.pop() if stack: current = stack[-1] self.graph.arc_at(current.endpos, current) return current def follow(self): address = self._current_attr("target") if address is None: raise Exception("Can't follow a stop arc") self._push(self.graph.arc_at(address)) return self # Override: more efficient implementation manipulating the stack def skip_to(self, key): key = to_labels(key) stack = self.stack if not stack: raise InactiveCursor _follow = self.follow _next_arc = self.next_arc i = self._pop_to_prefix(key) while stack and i < len(key): curlabel = stack[-1].label keylabel = key[i] if curlabel == keylabel: _follow() i += 1 elif curlabel > keylabel: return else: _next_arc() # Override: more efficient implementation using find_arc def switch_to(self, label): stack = self.stack if not stack: raise InactiveCursor current = stack[-1] if label == current.label: return True else: arc = self.graph.find_arc(current.endpos, label, current) return arc def _push(self, arc): if self.vtype and self.stack: sums = self.sums sums.append(self.vtype.add(sums[-1], self.stack[-1].value)) self.stack.append(arc) def pop(self): self.stack.pop() if self.vtype: self.sums.pop() def _pop_to_prefix(self, key): stack = self.stack if not stack: raise InactiveCursor i = 0 maxpre = min(len(stack), len(key)) while i < maxpre and key[i] == stack[i].label: i += 1 if stack[i].label > key[i]: self.current = None return while len(stack) > i + 1: self.pop() self.next_arc() return i class UncompiledNode(object): # Represents an "in-memory" node used by the GraphWriter before it is # written to disk. 
compiled = False def __init__(self, owner): self.owner = owner self._digest = None self.clear() def clear(self): self.arcs = [] self.value = None self.accept = False self.inputcount = 0 def __repr__(self): return "<%r>" % ([(a.label, a.value) for a in self.arcs],) def digest(self): if self._digest is None: d = sha1() vtype = self.owner.vtype for arc in self.arcs: d.update(arc.label) if arc.target: d.update(pack_long(arc.target)) else: d.update(b("z")) if arc.value: d.update(vtype.to_bytes(arc.value)) if arc.accept: d.update(b("T")) self._digest = d.digest() return self._digest def edges(self): return self.arcs def last_value(self, label): assert self.arcs[-1].label == label return self.arcs[-1].value def add_arc(self, label, target): self.arcs.append(Arc(label, target)) def replace_last(self, label, target, accept, acceptval=None): arc = self.arcs[-1] assert arc.label == label, "%r != %r" % (arc.label, label) arc.target = target arc.accept = accept arc.acceptval = acceptval def delete_last(self, label, target): arc = self.arcs.pop() assert arc.label == label assert arc.target == target def set_last_value(self, label, value): arc = self.arcs[-1] assert arc.label == label, "%r->%r" % (arc.label, label) arc.value = value def prepend_value(self, prefix): add = self.owner.vtype.add for arc in self.arcs: arc.value = add(prefix, arc.value) if self.accept: self.value = add(prefix, self.value) class Arc(object): """ Represents a directed arc between two nodes in an FSA/FST graph. The ``lastarc`` attribute is True if this is the last outgoing arc from the previous node. """ __slots__ = ("label", "target", "accept", "value", "lastarc", "acceptval", "endpos") def __init__(self, label=None, target=None, value=None, accept=False, acceptval=None, lastarc=None, endpos=None): """ :param label: The label bytes for this arc. For a word graph, this will be a character. :param target: The address of the node at the endpoint of this arc. :param value: The inner FST value at the endpoint of this arc. :param accept: Whether the endpoint of this arc is an accept state (e.g. the end of a valid word). :param acceptval: If the endpoint of this arc is an accept state, the final FST value for that accepted state. """ self.label = label self.target = target self.value = value self.accept = accept self.acceptval = acceptval self.lastarc = lastarc self.endpos = endpos def __repr__(self): return "<%r-%s %s%s>" % (self.label, self.target, "." if self.accept else "", (" %r" % self.value) if self.value else "") def __eq__(self, other): if (isinstance(other, self.__class__) and self.accept == other.accept and self.lastarc == other.lastarc and self.target == other.target and self.value == other.value and self.label == other.label): return True return False def copy(self): # This is faster than using the copy module return Arc(label=self.label, target=self.target, value=self.value, accept=self.accept, acceptval=self.acceptval, lastarc=self.lastarc, endpos=self.endpos) # Graph writer class GraphWriter(object): """Writes an FSA/FST graph to disk. Call ``insert(key)`` to insert keys into the graph. You must insert keys in sorted order. Call ``close()`` to finish the graph and close the file. >>> gw = GraphWriter(my_file) >>> gw.insert("alfa") >>> gw.insert("bravo") >>> gw.insert("charlie") >>> gw.close() The graph writer can write separate graphs for multiple fields. Use ``start_field(name)`` and ``finish_field()`` to separate fields. 
>>> gw = GraphWriter(my_file) >>> gw.start_field("content") >>> gw.insert("alfalfa") >>> gw.insert("apple") >>> gw.finish_field() >>> gw.start_field("title") >>> gw.insert("artichoke") >>> gw.finish_field() >>> gw.close() """ version = 1 def __init__(self, dbfile, vtype=None, merge=None): """ :param dbfile: the file to write to. :param vtype: a :class:`Values` class to use for storing values. This is only necessary if you will be storing values for the keys. :param merge: a function that takes two values and returns a single value. This is called if you insert two identical keys with values. """ self.dbfile = dbfile self.vtype = vtype self.merge = merge self.fieldroots = {} self.arc_count = 0 self.node_count = 0 self.fixed_count = 0 dbfile.write(b("GRPH")) dbfile.write_int(self.version) dbfile.write_uint(0) self._infield = False def start_field(self, fieldname): """Starts a new graph for the given field. """ if not fieldname: raise ValueError("Field name cannot be equivalent to False") if self._infield: self.finish_field() self.fieldname = fieldname self.seen = {} self.nodes = [UncompiledNode(self)] self.lastkey = '' self._inserted = False self._infield = True def finish_field(self): """Finishes the graph for the current field. """ if not self._infield: raise Exception("Called finish_field before start_field") self._infield = False if self._inserted: self.fieldroots[self.fieldname] = self._finish() self.fieldname = None def close(self): """Finishes the current graph and closes the underlying file. """ if self.fieldname is not None: self.finish_field() dbfile = self.dbfile here = dbfile.tell() dbfile.write_pickle(self.fieldroots) dbfile.flush() dbfile.seek(4 + _INT_SIZE) # Seek past magic and version number dbfile.write_uint(here) dbfile.close() def insert(self, key, value=None): """Inserts the given key into the graph. :param key: a sequence of bytes objects, a bytes object, or a string. :param value: an optional value to encode in the graph along with the key. If the writer was not instantiated with a value type, passing a value here will raise an error. 
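        A usage sketch, assuming ``gw`` was created with ``vtype=IntValues``
        (keys must be inserted in sorted order):

        >>> gw.insert(b"alfa", 5)
        >>> gw.insert(b"bravo", 10)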
""" if not self._infield: raise Exception("Inserted %r before starting a field" % key) self._inserted = True key = to_labels(key) # Python 3 sucks vtype = self.vtype lastkey = self.lastkey nodes = self.nodes if len(key) < 1: raise KeyError("Can't store a null key %r" % (key,)) if lastkey and lastkey > key: raise KeyError("Keys out of order %r..%r" % (lastkey, key)) # Find the common prefix shared by this key and the previous one prefixlen = 0 for i in xrange(min(len(lastkey), len(key))): if lastkey[i] != key[i]: break prefixlen += 1 # Compile the nodes after the prefix, since they're not shared self._freeze_tail(prefixlen + 1) # Create new nodes for the parts of this key after the shared prefix for char in key[prefixlen:]: node = UncompiledNode(self) # Create an arc to this node on the previous node nodes[-1].add_arc(char, node) nodes.append(node) # Mark the last node as an accept state lastnode = nodes[-1] lastnode.accept = True if vtype: if value is not None and not vtype.is_valid(value): raise ValueError("%r is not valid for %s" % (value, vtype)) # Push value commonalities through the tree common = None for i in xrange(1, prefixlen + 1): node = nodes[i] parent = nodes[i - 1] lastvalue = parent.last_value(key[i - 1]) if lastvalue is not None: common = vtype.common(value, lastvalue) suffix = vtype.subtract(lastvalue, common) parent.set_last_value(key[i - 1], common) node.prepend_value(suffix) else: common = suffix = None value = vtype.subtract(value, common) if key == lastkey: # If this key is a duplicate, merge its value with the value of # the previous (same) key lastnode.value = self.merge(lastnode.value, value) else: nodes[prefixlen].set_last_value(key[prefixlen], value) elif value: raise Exception("Value %r but no value type" % value) self.lastkey = key def _freeze_tail(self, prefixlen): nodes = self.nodes lastkey = self.lastkey downto = max(1, prefixlen) while len(nodes) > downto: node = nodes.pop() parent = nodes[-1] inlabel = lastkey[len(nodes) - 1] self._compile_targets(node) accept = node.accept or len(node.arcs) == 0 address = self._compile_node(node) parent.replace_last(inlabel, address, accept, node.value) def _finish(self): nodes = self.nodes root = nodes[0] # Minimize nodes in the last word's suffix self._freeze_tail(0) # Compile remaining targets self._compile_targets(root) return self._compile_node(root) def _compile_targets(self, node): for arc in node.arcs: if isinstance(arc.target, UncompiledNode): n = arc.target if len(n.arcs) == 0: arc.accept = n.accept = True arc.target = self._compile_node(n) def _compile_node(self, uncnode): seen = self.seen if len(uncnode.arcs) == 0: # Leaf node address = self._write_node(uncnode) else: d = uncnode.digest() address = seen.get(d) if address is None: address = self._write_node(uncnode) seen[d] = address return address def _write_node(self, uncnode): vtype = self.vtype dbfile = self.dbfile arcs = uncnode.arcs numarcs = len(arcs) if not numarcs: if uncnode.accept: return None else: # What does it mean for an arc to stop but not be accepted? 
raise Exception self.node_count += 1 buf = StructFile(BytesIO()) nodestart = dbfile.tell() #self.count += 1 #self.arccount += numarcs fixedsize = -1 arcstart = buf.tell() for i, arc in enumerate(arcs): self.arc_count += 1 target = arc.target label = arc.label flags = 0 if len(label) > 1: flags += MULTIBYTE_LABEL if i == numarcs - 1: flags += ARC_LAST if arc.accept: flags += ARC_ACCEPT if target is None: flags += ARC_STOP if arc.value is not None: flags += ARC_HAS_VAL if arc.acceptval is not None: flags += ARC_HAS_ACCEPT_VAL buf.write(pack_byte(flags)) if len(label) > 1: buf.write(varint(len(label))) buf.write(label) if target is not None: buf.write(pack_uint(target)) if arc.value is not None: vtype.write(buf, arc.value) if arc.acceptval is not None: vtype.write(buf, arc.acceptval) here = buf.tell() thissize = here - arcstart arcstart = here if fixedsize == -1: fixedsize = thissize elif fixedsize > 0 and thissize != fixedsize: fixedsize = 0 if fixedsize > 0: # Write a fake arc containing the fixed size and number of arcs dbfile.write_byte(255) # FIXED_SIZE dbfile.write_int(fixedsize) dbfile.write_int(numarcs) self.fixed_count += 1 dbfile.write(buf.file.getvalue()) return nodestart # Graph reader class BaseGraphReader(object): def cursor(self, rootname=None): return Cursor(self, self.root(rootname)) def has_root(self, rootname): raise NotImplementedError def root(self, rootname=None): raise NotImplementedError # Low level methods def arc_at(self, address, arc): raise NotImplementedError def iter_arcs(self, address, arc=None): raise NotImplementedError def find_arc(self, address, label, arc=None): arc = arc or Arc() for arc in self.iter_arcs(address, arc): thislabel = arc.label if thislabel == label: return arc elif thislabel > label: return None # Convenience methods def list_arcs(self, address): return list(arc.copy() for arc in self.iter_arcs(address)) def arc_dict(self, address): return dict((arc.label, arc.copy()) for arc in self.iter_arcs(address)) def find_path(self, path, arc=None, address=None): path = to_labels(path) if arc: address = address if address is not None else arc.target else: arc = Arc() if address is None: address = self._root find_arc = self.find_arc for label in path: if address is None: return None if not find_arc(address, label, arc): return None address = arc.target return arc class GraphReader(BaseGraphReader): def __init__(self, dbfile, rootname=None, vtype=None, filebase=0): self.dbfile = dbfile self.vtype = vtype self.filebase = filebase dbfile.seek(filebase) magic = dbfile.read(4) if magic != b("GRPH"): raise FileVersionError self.version = dbfile.read_int() dbfile.seek(dbfile.read_uint()) self.roots = dbfile.read_pickle() self._root = None if rootname is None and len(self.roots) == 1: # If there's only one root, just use it. Have to wrap a list around # the keys() method here because of Python 3. 
rootname = list(self.roots.keys())[0] if rootname is not None: self._root = self.root(rootname) def close(self): self.dbfile.close() # Overrides def has_root(self, rootname): return rootname in self.roots def root(self, rootname=None): if rootname is None: return self._root else: return self.roots[rootname] def default_root(self): return self._root def arc_at(self, address, arc=None): arc = arc or Arc() self.dbfile.seek(address) return self._read_arc(arc) def iter_arcs(self, address, arc=None): arc = arc or Arc() _read_arc = self._read_arc self.dbfile.seek(address) while True: _read_arc(arc) yield arc if arc.lastarc: break def find_arc(self, address, label, arc=None): # Overrides the default scanning implementation arc = arc or Arc() dbfile = self.dbfile dbfile.seek(address) # If records are fixed size, we can do a binary search finfo = self._read_fixed_info() if finfo: size, count = finfo address = dbfile.tell() if count > 2: return self._binary_search(address, size, count, label, arc) # If records aren't fixed size, fall back to the parent's linear # search method return BaseGraphReader.find_arc(self, address, label, arc) # Implementations def _read_arc(self, toarc=None): toarc = toarc or Arc() dbfile = self.dbfile flags = dbfile.read_byte() if flags == 255: # This is a fake arc containing fixed size information; skip it # and read the next arc dbfile.seek(_INT_SIZE * 2, 1) flags = dbfile.read_byte() toarc.label = self._read_label(flags) return self._read_arc_data(flags, toarc) def _read_label(self, flags): dbfile = self.dbfile if flags & MULTIBYTE_LABEL: length = dbfile.read_varint() else: length = 1 label = dbfile.read(length) return label def _read_fixed_info(self): dbfile = self.dbfile flags = dbfile.read_byte() if flags == 255: size = dbfile.read_int() count = dbfile.read_int() return (size, count) else: return None def _read_arc_data(self, flags, arc): dbfile = self.dbfile accept = arc.accept = bool(flags & ARC_ACCEPT) arc.lastarc = flags & ARC_LAST if flags & ARC_STOP: arc.target = None else: arc.target = dbfile.read_uint() if flags & ARC_HAS_VAL: arc.value = self.vtype.read(dbfile) else: arc.value = None if accept and flags & ARC_HAS_ACCEPT_VAL: arc.acceptval = self.vtype.read(dbfile) arc.endpos = dbfile.tell() return arc def _binary_search(self, address, size, count, label, arc): dbfile = self.dbfile _read_label = self._read_label lo = 0 hi = count while lo < hi: mid = (lo + hi) // 2 midaddr = address + mid * size dbfile.seek(midaddr) flags = dbfile.read_byte() midlabel = self._read_label(flags) if midlabel == label: arc.label = midlabel return self._read_arc_data(flags, arc) elif midlabel < label: lo = mid + 1 else: hi = mid if lo == count: return None def to_labels(key): """Takes a string and returns a list of bytestrings, suitable for use as a key or path in an FSA/FST graph. 
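    A sketch of the conversion on Python 3 (the result is actually a tuple so
    it can be hashed):

    >>> to_labels("ab")
    (b'a', b'b')
    >>> to_labels(b"ab")
    (b'a', b'b')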
""" # Convert to tuples of bytestrings (must be tuples so they can be hashed) keytype = type(key) # I hate the Python 3 bytes object so friggin much if keytype is tuple or keytype is list: if not all(isinstance(e, bytes_type) for e in key): raise TypeError("%r contains a non-bytestring" % key) if keytype is list: key = tuple(key) elif isinstance(key, bytes_type): key = tuple(key[i:i + 1] for i in xrange(len(key))) elif isinstance(key, text_type): key = tuple(utf8encode(key[i:i + 1])[0] for i in xrange(len(key))) else: raise TypeError("Don't know how to convert %r" % key) return key # Within edit distance function def within(graph, text, k=1, prefix=0, address=None): """Yields a series of keys in the given graph within ``k`` edit distance of ``text``. If ``prefix`` is greater than 0, all keys must match the first ``prefix`` characters of ``text``. """ text = to_labels(text) if address is None: address = graph._root sofar = emptybytes accept = False if prefix: prefixchars = text[:prefix] arc = graph.find_path(prefixchars, address=address) if arc is None: return sofar = emptybytes.join(prefixchars) address = arc.target accept = arc.accept stack = [(address, k, prefix, sofar, accept)] seen = set() while stack: state = stack.pop() # Have we already tried this state? if state in seen: continue seen.add(state) address, k, i, sofar, accept = state # If we're at the end of the text (or deleting enough chars would get # us to the end and still within K), and we're in the accept state, # yield the current result if (len(text) - i <= k) and accept: yield utf8decode(sofar)[0] # If we're in the stop state, give up if address is None: continue # Exact match if i < len(text): arc = graph.find_arc(address, text[i]) if arc: stack.append((arc.target, k, i + 1, sofar + text[i], arc.accept)) # If K is already 0, can't do any more edits if k < 1: continue k -= 1 arcs = graph.arc_dict(address) # Insertions stack.extend((arc.target, k, i, sofar + char, arc.accept) for char, arc in iteritems(arcs)) # Deletion, replacement, and transpo only work before the end if i >= len(text): continue char = text[i] # Deletion stack.append((address, k, i + 1, sofar, False)) # Replacement for char2, arc in iteritems(arcs): if char2 != char: stack.append((arc.target, k, i + 1, sofar + char2, arc.accept)) # Transposition if i < len(text) - 1: char2 = text[i + 1] if char != char2 and char2 in arcs: # Find arc from next char to this char target = arcs[char2].target if target: arc = graph.find_arc(target, char) if arc: stack.append((arc.target, k, i + 2, sofar + char2 + char, arc.accept)) # Utility functions def dump_graph(graph, address=None, tab=0, out=None): if address is None: address = graph._root if out is None: out = sys.stdout here = "%06d" % address for i, arc in enumerate(graph.list_arcs(address)): if i == 0: out.write(here) else: out.write(" " * 6) out.write(" " * tab) out.write("%r %r %s %r\n" % (arc.label, arc.target, arc.accept, arc.value)) if arc.target is not None: dump_graph(graph, arc.target, tab + 1, out=out) Whoosh-2.5.7/src/whoosh/automata/glob.py0000644000076500000240000002251412254366350020304 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. 
Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.compat import b from whoosh.system import emptybytes from whoosh.automata.fst import to_labels, Arc # Implement glob matching on graph reader # Constants for glob _LIT = 0 _STAR = 1 _PLUS = 2 _QUEST = 3 _RANGE = 4 _END = 5 def parse_glob(pattern, _glob_multi=b("*"), _glob_single=b("?"), _glob_range1=b("["), _glob_range2=b("]"), _glob_range_not=b("!")): parsed = [] pos = 0 while pos < len(pattern): char = pattern[pos] pos += 1 if char == _glob_multi: # * # (Ignore more than one star in a row) if parsed: prev = parsed[-1][0] if prev == _STAR: continue parsed.append((_STAR,)) elif char == _glob_single: # ? # (Ignore ? after a star) if parsed: prev = parsed[-1][0] if prev == _STAR: continue parsed.append((_QUEST,)) elif char == _glob_range1: # [ chars = set() firstchar = True negate = False # Take the char range specification until the ] while pos < len(pattern): char = pattern[pos] pos += 1 if char == _glob_range2: break # If first char inside the range is !, negate the list if firstchar and char == _glob_range_not: negate = True else: chars.add(char) firstchar = False if chars: parsed.append((_RANGE, chars, negate)) else: if parsed and parsed[-1][0] == _LIT: parsed[-1][1] += char else: parsed.append([_LIT, char]) parsed.append((_END,)) return parsed def glob(graph, pattern, address=None): """Yields a series of keys in the given graph matching the given "glob" string. This function implements the same glob features found in the `fnmatch` module in the Python standard library: ``*`` matches any number of characters, ``?`` matches any single character, `[abc]` matches any of the characters in the list, and ``[!abc]`` matches any character not in the list. (Use ``[[]`` to match an open bracket.) As ``fnmatch``, the star is greedy. :param graph: a :class:`GraphReader` object. :param pattern: a string specifying the glob to match, e.g. `"a*b?c[def]"`. 
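    Keys are yielded as lists of label bytestrings (one label per arc
    followed) rather than as joined byte strings.  A usage sketch, assuming
    ``gr`` is an open GraphReader:

    >>> for key in glob(gr, "a*b?c[def]"):
    ...     print(b"".join(key))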
""" address = address if address is not None else graph._root if not isinstance(pattern, list): pattern = parse_glob(pattern) # address, pos, sofar, accept states = [(address, 0, [], False)] seen = set() arc = Arc() times = 0 while states: ns = [] for address, pos, sofar, accept in states: times += 1 op = pattern[pos] code = op[0] if accept and code == _END: if sofar not in seen: yield sofar seen.add(sofar) if code == _END: continue # Zero width match if code == _STAR: ns.append((address, pos + 1, sofar, accept)) if address is None: continue if code == _STAR: for arc in graph.iter_arcs(address, arc): ns.append((arc.target, pos + 1, sofar + [arc.label], arc.accept)) ns.append((arc.target, pos, sofar + [arc.label], arc.accept)) elif code == _QUEST: for arc in graph.iter_arcs(address, arc): ns.append((arc.target, pos + 1, sofar + [arc.label], arc.accept)) elif code == _LIT: labels = op[1] for label in labels: arc = graph.find_arc(address, label) address = arc.target if address is None: break if address is not None: ns.append((address, pos + 1, sofar + labels, arc.accept)) elif code == _RANGE: chars = op[1] negate = op[2] for arc in graph.iter_arcs(address, arc): take = (arc.label in chars) ^ negate if take: ns.append((arc.target, pos + 1, sofar + [arc.label], arc.accept)) else: raise ValueError(code) states = ns # glob limit constants LO = 0 HI = 1 def glob_graph_limit(graph, mode, pattern, address): low = mode == LO output = [] arc = Arc(target=address) for op in pattern: if arc.target is None: break code = op[0] if code == _STAR or code == _PLUS: while arc.target: if low: arc = graph.arc_at(arc.target, arc) else: for arc in graph.iter_arcs(arc.target, arc): pass output.append(arc.label) if low and arc.accept: break elif code == _QUEST: if low: arc = graph.arc_at(arc.target, arc) else: for arc in graph.iter_arcs(arc.target, arc): pass elif code == _LIT: labels = op[1] for label in labels: arc = graph.find_arc(arc.target, label) if arc is None: break output.append(label) if arc.target is None: break if arc is None: break elif code == _RANGE: chars = op[1] negate = op[2] newarc = None for a in graph.iter_arcs(arc.target): if (a.label in chars) ^ negate: newarc = a.copy() if low: break if newarc: output.append(newarc.label) arc = newarc else: break return emptybytes.join(output) def glob_vacuum_limit(mode, pattern): low = mode == LO output = [] for op in pattern: code = op[0] if code == _STAR or code == _PLUS or code == _QUEST: break elif code == _LIT: output.append(op[1]) elif code == _RANGE: if op[2]: # Don't do negated char lists break chars = op[1] if low: output.append(min(chars)) else: output.append(max(chars)) return emptybytes.join(output) # if __name__ == "__main__": # from whoosh import index, query # from whoosh.filedb.filestore import RamStorage # from whoosh.automata import fst # from whoosh.util.testing import timing # # st = RamStorage() # gw = fst.GraphWriter(st.create_file("test")) # gw.start_field("test") # for key in ["aaaa", "aaab", "aabb", "abbb", "babb", "bbab", "bbba"]: # gw.insert(key) # gw.close() # gr = fst.GraphReader(st.open_file("test")) # # print glob_graph_limit(gr, LO, "bbb*", gr._root) # print glob_graph_limit(gr, HI, "bbb*", gr._root) # # ix = index.open_dir("e:/dev/src/houdini/help/index") # r = ix.reader() # gr = r._get_graph() # p = "?[abc]*" # p = "*/" # # with timing(): # q = query.Wildcard("path", p) # x = list(q._btexts(r)) # # with timing(): # prog = parse_glob(p) # lo = glob_graph_limit(gr, LO, prog, address=gr.root("path")) # hi = glob_graph_limit(gr, 
HI, prog, address=gr.root("path")) # q = query.TermRange("path", lo, hi) # y = list(q._btexts(r)) # # Whoosh-2.5.7/src/whoosh/automata/nfa.py0000644000076500000240000002440212254366350020123 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.automata.fst import Arc class Instruction(object): def __repr__(self): return "%s()" % (self.__class__.__name__, ) class Char(Instruction): """ Matches a literal character. """ def __init__(self, c): self.c = c def __repr__(self): return "Char(%r)" % self.c class Lit(Instruction): """ Matches a literal string. """ def __init__(self, c): self.c = c def __repr__(self): return "Lit(%r)" % self.c class Any(Instruction): """ Matches any character. """ class Match(Instruction): """ Stop this thread: the string matched. """ def __repr__(self): return "Match()" class Jmp(Instruction): """ Jump to a specified instruction. """ def __init__(self, x): self.x = x def __repr__(self): return "Jmp(%s)" % self.x class Split(Instruction): """ Split execution: continue at two separate specified instructions. 
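    Together with Jmp, this is what the alt(), zero_or_one(), zero_or_more()
    and one_or_more() helpers below compile alternation and repetition into.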
""" def __init__(self, x, y): self.x = x self.y = y def __repr__(self): return "Split(%s, %s)" % (self.x, self.y) class Label(Instruction): """ Placeholder to act as a target for JMP instructions """ def __hash__(self): return id(self) def __repr__(self): return "L(%s)" % hex(id(self)) def concat(e1, e2): return e1 + e2 def alt(e1, e2): L1, L2, L3 = Label(), Label(), Label() return [L1] + e1 + [Jmp(L3), L2] + e2 + [L3] def zero_or_one(e): L1, L2 = Label(), Label() return [Split(L1, L2), L1] + e + [L2] def zero_or_more(e): L1, L2, L3 = Label(), Label(), Label() return [L1, Split(L2, L3), L2] + e + [Jmp(L1), L3] def one_or_more(e): L1, L2 = Label(), Label() return [L1] + e + [Split(L1, L2), L2] def fixup(program): refs = {} i = 0 while i < len(program): op = program[i] if isinstance(op, Label): refs[op] = i program.pop(i) else: i += 1 if refs: for op in program: if isinstance(op, (Jmp, Split)): op.x = refs[op.x] if isinstance(op, Split): op.y = refs[op.y] return program + [Match] class ThreadList(object): def __init__(self, program, max=1000): self.program = program self.max = max self.threads = [] def __nonzero__(self): return bool(self.threads) def current(self): return self.threads.pop() def add(self, thread): op = self.program[thread.pc] optype = type(op) if optype is Jmp: self.add(thread.at(op.x)) elif optype is Split: self.add(thread.copy_at(op.x)) self.add(thread.at(op.y)) else: self.threads.append(thread) class Thread(object): def __init__(self, pc, address, sofar='', accept=False): self.pc = pc self.address = address self.sofar = sofar self.accept = accept def at(self, pc): self.pc = pc return self def copy_at(self, pc): return Thread(pc, self.address, self.sofar, self.accept) def __repr__(self): d = self.__dict__ return "Thread(%s)" % ",".join("%s=%r" % (k, v) for k, v in d.items()) def advance(thread, arc, c): thread.pc += 1 thread.address = arc.target thread.sofar += c thread.accept = arc.accept def run(graph, program, address): threads = ThreadList(program) threads.add(Thread(0, address)) arc = Arc() while threads: thread = threads.current() address = thread.address op = program[thread.pc] optype = type(op) if optype is Char: if address: arc = graph.find_arc(address, op.c, arc) if arc: advance(thread, arc) threads.add(thread) elif optype is Lit: if address: c = op.c arc = graph.find_path(c, arc, address) if arc: advance(thread, arc, c) threads.add(thread) elif optype is Any: if address: sofar = thread.sofar pc = thread.pc + 1 for arc in graph.iter_arcs(address, arc): t = Thread(pc, arc.target, sofar + arc.label, arc.accept) threads.add(t) elif op is Match: if thread.accept: yield thread.sofar else: raise Exception("Don't know what to do with %r" % op) LO = 0 HI = 1 def regex_limit(graph, mode, program, address): low = mode == LO output = [] threads = ThreadList(program) threads.add(Thread(0, address)) arc = Arc() while threads: thread = threads.current() address = thread.address op = program[thread.pc] optype = type(op) if optype is Char: if address: arc = graph.find_arc(address, op.c, arc) if arc: if low and arc.accept: return thread.sofar + thread.label advance(thread, arc) threads.add(thread) elif optype is Lit: if address: labels = op.c for label in labels: arc = graph.find_arc(address, label) if arc is None: return thread.sofar elif thread.accept: return thread.sofar elif optype is Any: if address: if low: arc = graph.arc_at(address, arc) else: for arc in graph.iter_arcs(address): pass advance(thread, arc, arc.label) threads.add(thread) elif thread.accept: return 
thread.sofar elif op is Match: return thread.sofar else: raise Exception("Don't know what to do with %r" % op) # if __name__ == "__main__": # from whoosh import index, query # from whoosh.filedb.filestore import RamStorage # from whoosh.automata import fst # from whoosh.util.testing import timing # # st = RamStorage() # gw = fst.GraphWriter(st.create_file("test")) # gw.start_field("test") # for key in ["aaaa", "aaab", "aabb", "abbb", "babb", "bbab", "bbba"]: # gw.insert(key) # gw.close() # gr = fst.GraphReader(st.open_file("test")) # # program = one_or_more([Lit("a")]) # print program # program = fixup(program) # print program # print list(run(gr, program, gr.root("test"))) # # ix = index.open_dir("e:/dev/src/houdini/help/index") # r = ix.reader() # gr = r._get_graph() # # # program = fixup([Any(), Any(), Any(), Any(), Any()]) # # program = fixup(concat(zero_or_more([Any()]), [Char("/")])) # # with timing(): # # x = list(run(gr, program, gr.root("path"))) # # print len(x) # # q = query.Regex("path", "^.[abc].*/$") # with timing(): # y = list(q._btexts(r)) # print len(y) # print y[0], y[-1] # # pr = [Any()] + alt([Lit("c")], alt([Lit("b")], [Lit("a")])) + zero_or_more([Any()]) + [Lit("/")] # program = fixup(pr) # # with timing(): # # x = list(run(gr, program, gr.root("path"))) # # print len(x), x # # with timing(): # print "lo=", regex_limit(gr, LO, program, gr.root("path")) # print "hi=", regex_limit(gr, HI, program, gr.root("path")) # # # # #int # #backtrackingvm(Inst *prog, char *input) # #{ # # enum { MAXTHREAD = 1000 }; # # Thread ready[MAXTHREAD]; # # int nready; # # Inst *pc; # # char *sp; # # # # /* queue initial thread */ # # ready[0] = thread(prog, input); # # nready = 1; # # # # /* run threads in stack order */ # # while(nready > 0){ # # --nready; /* pop state for next thread to run */ # # pc = ready[nready].pc; # # sp = ready[nready].sp; # # for(;;){ # # switch(pc->opcode){ # # case Char: # # if(*sp != pc->c) # # goto Dead; # # pc++; # # sp++; # # continue; # # case Match: # # return 1; # # case Jmp: # # pc = pc->x; # # continue; # # case Split: # # if(nready >= MAXTHREAD){ # # fprintf(stderr, "regexp overflow"); # # return -1; # # } # # /* queue new thread */ # # ready[nready++] = thread(pc->y, sp); # # pc = pc->x; /* continue current thread */ # # continue; # # } # # } # # Dead:; # # } # # return 0; # #} # # Whoosh-2.5.7/src/whoosh/classify.py0000755000076500000240000002671712254366350017377 0ustar mattstaff00000000000000# Copyright 2008 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """Classes and functions for classifying and extracting information from documents. """ from __future__ import division import random from collections import defaultdict from math import log from whoosh.compat import xrange, iteritems # Expansion models class ExpansionModel(object): def __init__(self, doc_count, field_length): self.N = doc_count self.collection_total = field_length if self.N: self.mean_length = self.collection_total / self.N else: self.mean_length = 0 def normalizer(self, maxweight, top_total): raise NotImplementedError def score(self, weight_in_top, weight_in_collection, top_total): raise NotImplementedError class Bo1Model(ExpansionModel): def normalizer(self, maxweight, top_total): f = maxweight / self.N return (maxweight * log((1.0 + f) / f) + log(1.0 + f)) / log(2.0) def score(self, weight_in_top, weight_in_collection, top_total): f = weight_in_collection / self.N return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2) class Bo2Model(ExpansionModel): def normalizer(self, maxweight, top_total): f = maxweight * self.N / self.collection_total return maxweight * log((1.0 + f) / f, 2) + log(1.0 + f, 2) def score(self, weight_in_top, weight_in_collection, top_total): f = weight_in_top * top_total / self.collection_total return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2) class KLModel(ExpansionModel): def normalizer(self, maxweight, top_total): return (maxweight * log(self.collection_total / top_total) / log(2.0) * top_total) def score(self, weight_in_top, weight_in_collection, top_total): wit_over_tt = weight_in_top / top_total wic_over_ct = weight_in_collection / self.collection_total if wit_over_tt < wic_over_ct: return 0 else: return wit_over_tt * log(wit_over_tt / (weight_in_top / self.collection_total), 2) class Expander(object): """Uses an ExpansionModel to expand the set of query terms based on the top N result documents. """ def __init__(self, ixreader, fieldname, model=Bo1Model): """ :param reader: A :class:whoosh.reading.IndexReader object. :param fieldname: The name of the field in which to search. :param model: (classify.ExpansionModel) The model to use for expanding the query terms. If you omit this parameter, the expander uses :class:`Bo1Model` by default. """ self.ixreader = ixreader self.fieldname = fieldname doccount = self.ixreader.doc_count_all() fieldlen = self.ixreader.field_length(fieldname) if type(model) is type: model = model(doccount, fieldlen) self.model = model # Maps words to their weight in the top N documents. self.topN_weight = defaultdict(float) # Total weight of all terms in the top N documents. self.top_total = 0 def add(self, vector): """Adds forward-index information about one of the "top N" documents. 
:param vector: A series of (text, weight) tuples, such as is returned by Reader.vector_as("weight", docnum, fieldname). """ total_weight = 0 topN_weight = self.topN_weight for word, weight in vector: total_weight += weight topN_weight[word] += weight self.top_total += total_weight def add_document(self, docnum): ixreader = self.ixreader if self.ixreader.has_vector(docnum, self.fieldname): self.add(ixreader.vector_as("weight", docnum, self.fieldname)) elif self.ixreader.schema[self.fieldname].stored: self.add_text(ixreader.stored_fields(docnum).get(self.fieldname)) else: raise Exception("Field %r in document %s is not vectored or stored" % (self.fieldname, docnum)) def add_text(self, string): # Unfortunately since field.index() yields bytes texts, and we want # unicode, we end up encoding and decoding unnecessarily. # # TODO: Find a way around this field = self.ixreader.schema[self.fieldname] from_bytes = field.from_bytes self.add((from_bytes(text), weight) for text, _, weight, _ in field.index(string)) def expanded_terms(self, number, normalize=True): """Returns the N most important terms in the vectors added so far. :param number: The number of terms to return. :param normalize: Whether to normalize the weights. :returns: A list of ("term", weight) tuples. """ model = self.model fieldname = self.fieldname ixreader = self.ixreader field = ixreader.schema[fieldname] tlist = [] maxweight = 0 # If no terms have been added, return an empty list if not self.topN_weight: return [] for word, weight in iteritems(self.topN_weight): btext = field.to_bytes(word) if (fieldname, btext) in ixreader: cf = ixreader.frequency(fieldname, btext) score = model.score(weight, cf, self.top_total) if score > maxweight: maxweight = score tlist.append((score, word)) if normalize: norm = model.normalizer(maxweight, self.top_total) else: norm = maxweight tlist = [(weight / norm, t) for weight, t in tlist] tlist.sort(key=lambda x: (0 - x[0], x[1])) return [(t, weight) for weight, t in tlist[:number]] # Similarity functions def shingles(input, size=2): d = defaultdict(int) for shingle in (input[i:i + size] for i in xrange(len(input) - (size - 1))): d[shingle] += 1 return iteritems(d) def simhash(features, hashbits=32): if hashbits == 32: hashfn = hash else: hashfn = lambda s: _hash(s, hashbits) vs = [0] * hashbits for feature, weight in features: h = hashfn(feature) for i in xrange(hashbits): if h & (1 << i): vs[i] += weight else: vs[i] -= weight out = 0 for i, v in enumerate(vs): if v > 0: out |= 1 << i return out def _hash(s, hashbits): # A variable-length version of Python's builtin hash if s == "": return 0 else: x = ord(s[0]) << 7 m = 1000003 mask = 2 ** hashbits - 1 for c in s: x = ((x * m) ^ ord(c)) & mask x ^= len(s) if x == -1: x = -2 return x def hamming_distance(first_hash, other_hash, hashbits=32): x = (first_hash ^ other_hash) & ((1 << hashbits) - 1) tot = 0 while x: tot += 1 x &= x - 1 return tot # Clustering def kmeans(data, k, t=0.0001, distfun=None, maxiter=50, centers=None): """ One-dimensional K-means clustering function. :param data: list of data points. :param k: number of clusters. :param t: tolerance; stop if changes between iterations are smaller than this value. :param distfun: a distance function. :param centers: a list of centroids to start with. :param maxiter: maximum number of iterations to run. 
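    A usage sketch for one-dimensional data; the starting ``centers`` are
    passed explicitly here so the result does not depend on the random choice
    of initial centroids:

    >>> labels, centers = kmeans([1.0, 1.1, 0.9, 5.0, 5.2, 4.8], 2,
    ...                          centers=[1.0, 5.0])
    >>> labels
    [0, 0, 0, 1, 1, 1]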
""" # Adapted from a C version by Roger Zhang, # http://cs.smu.ca/~r_zhang/code/kmeans.c DOUBLE_MAX = 1.797693e308 n = len(data) error = DOUBLE_MAX # sum of squared euclidean distance counts = [0] * k # size of each cluster labels = [0] * n # output cluster label for each data point # c1 is an array of len k of the temp centroids c1 = [0] * k # choose k initial centroids if centers: c = centers else: c = random.sample(data, k) niter = 0 # main loop while True: # save error from last step old_error = error error = 0 # clear old counts and temp centroids for i in xrange(k): counts[i] = 0 c1[i] = 0 for h in xrange(n): # identify the closest cluster min_distance = DOUBLE_MAX for i in xrange(k): distance = (data[h] - c[i]) ** 2 if distance < min_distance: labels[h] = i min_distance = distance # update size and temp centroid of the destination cluster c1[labels[h]] += data[h] counts[labels[h]] += 1 # update standard error error += min_distance for i in xrange(k): # update all centroids c[i] = c1[i] / counts[i] if counts[i] else c1[i] niter += 1 if (abs(error - old_error) < t) or (niter > maxiter): break return labels, c # Sliding window clusters def two_pass_variance(data): n = 0 sum1 = 0 sum2 = 0 for x in data: n += 1 sum1 = sum1 + x mean = sum1 / n for x in data: sum2 += (x - mean) * (x - mean) variance = sum2 / (n - 1) return variance def weighted_incremental_variance(data_weight_pairs): mean = 0 S = 0 sumweight = 0 for x, weight in data_weight_pairs: temp = weight + sumweight Q = x - mean R = Q * weight / temp S += sumweight * Q * R mean += R sumweight = temp Variance = S / (sumweight - 1) # if sample is the population, omit -1 return Variance def swin(data, size): clusters = [] for i, left in enumerate(data): j = i right = data[j] while j < len(data) - 1 and right - left < size: j += 1 right = data[j] v = 99999 if j - i > 1: v = two_pass_variance(data[i:j + 1]) clusters.append((left, right, j - i, v)) clusters.sort(key=lambda x: (0 - x[2], x[3])) return clusters Whoosh-2.5.7/src/whoosh/codec/0000755000076500000240000000000012277504634016251 5ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/codec/__init__.py0000644000076500000240000000316112254366350020357 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. def default_codec(*args, **kwargs): from whoosh.codec.whoosh3 import W3Codec return W3Codec(*args, **kwargs) Whoosh-2.5.7/src/whoosh/codec/base.py0000644000076500000240000006025412254366350017540 0ustar mattstaff00000000000000# Copyright 2011 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module contains base classes/interfaces for "codec" objects. 
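A codec bundles together the objects used to read and write one segment: the
per-document writer/reader, the inverted-index field writer, the terms reader
and the postings writer/reader.  (``whoosh.codec.default_codec()`` returns the
default implementation, W3Codec.)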
""" from bisect import bisect_right from whoosh import columns from whoosh.compat import text_type from whoosh.compat import abstractmethod, izip, xrange from whoosh.filedb.compound import CompoundStorage from whoosh.system import emptybytes from whoosh.util import random_name # Exceptions class OutOfOrderError(Exception): pass # Base classes class Codec(object): length_stats = True # Per document value writer @abstractmethod def per_document_writer(self, storage, segment): raise NotImplementedError # Inverted index writer @abstractmethod def field_writer(self, storage, segment): raise NotImplementedError # Postings @abstractmethod def postings_writer(self, dbfile, byteids=False): raise NotImplementedError @abstractmethod def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): raise NotImplementedError # Index readers @abstractmethod def terms_reader(self, storage, segment): raise NotImplementedError @abstractmethod def per_document_reader(self, storage, segment): raise NotImplementedError def supports_graph(self): return False # Don't need to override this if supports_graph() return False def graph_reader(self, storage, segment): raise NotImplementedError # Segments and generations @abstractmethod def new_segment(self, storage, indexname): raise NotImplementedError class WrappingCodec(Codec): def __init__(self, child): self._child = child def per_document_writer(self, storage, segment): return self._child.per_document_writer(storage, segment) def field_writer(self, storage, segment): return self._child.field_writer(storage, segment) def postings_writer(self, dbfile, byteids=False): return self._child.postings_writer(dbfile, byteids=byteids) def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): return self._child.postings_reader(dbfile, terminfo, format_, term=term, scorer=scorer) def terms_reader(self, storage, segment): return self._child.terms_reader(storage, segment) def per_document_reader(self, storage, segment): return self._child.per_document_reader(storage, segment) def supports_graph(self): return self._child.supports_graph() def graph_reader(self, storage, segment): return self._child.graph_reader(storage, segment) def new_segment(self, storage, indexname): return self._child.new_segment(storage, indexname) # Writer classes class PerDocumentWriter(object): @abstractmethod def start_doc(self, docnum): raise NotImplementedError @abstractmethod def add_field(self, fieldname, fieldobj, value, length): raise NotImplementedError @abstractmethod def add_column_value(self, fieldname, columnobj, value): raise NotImplementedError("Codec does not implement writing columns") @abstractmethod def add_vector_items(self, fieldname, fieldobj, items): raise NotImplementedError def add_vector_matcher(self, fieldname, fieldobj, vmatcher): def readitems(): while vmatcher.is_active(): text = vmatcher.id() weight = vmatcher.weight() valuestring = vmatcher.value() yield (text, weight, valuestring) vmatcher.next() self.add_vector_items(fieldname, fieldobj, readitems()) def finish_doc(self): pass def close(self): pass class FieldWriter(object): def add_postings(self, schema, lengths, items): # This method translates a generator of (fieldname, btext, docnum, w, v) # postings into calls to start_field(), start_term(), add(), # finish_term(), finish_field(), etc. 
start_field = self.start_field start_term = self.start_term add = self.add finish_term = self.finish_term finish_field = self.finish_field if lengths: dfl = lengths.doc_field_length else: dfl = lambda docnum, fieldname: 0 # The fieldname of the previous posting lastfn = None # The bytes text of the previous posting lasttext = None # The (fieldname, btext) of the previous spelling posting lastspell = None # The field object for the current field fieldobj = None for fieldname, btext, docnum, weight, value in items: # Check for out-of-order postings. This is convoluted because Python # 3 removed the ability to compare a string to None if lastfn is not None and fieldname < lastfn: raise OutOfOrderError("Field %r .. %r" % (lastfn, fieldname)) if fieldname == lastfn and lasttext and btext < lasttext: raise OutOfOrderError("Term %s:%r .. %s:%r" % (lastfn, lasttext, fieldname, btext)) # If the fieldname of this posting is different from the last one, # tell the writer we're starting a new field if fieldname != lastfn: if lasttext is not None: finish_term() if lastfn is not None and fieldname != lastfn: finish_field() fieldobj = schema[fieldname] start_field(fieldname, fieldobj) lastfn = fieldname lasttext = None # HACK: items where docnum == -1 indicate words that should be added # to the spelling graph, not the postings if docnum == -1: spellterm = (fieldname, btext) # There can be duplicates of spelling terms, so only add a spell # term if it's greater than the last one if lastspell is None or spellterm > lastspell: spellword = fieldobj.from_bytes(btext) self.add_spell_word(fieldname, spellword) lastspell = spellterm continue # If this term is different from the term in the previous posting, # tell the writer to start a new term if btext != lasttext: if lasttext is not None: finish_term() start_term(btext) lasttext = btext # Add this posting length = dfl(docnum, fieldname) if value is None: value = emptybytes add(docnum, weight, value, length) if lasttext is not None: finish_term() if lastfn is not None: finish_field() @abstractmethod def start_field(self, fieldname, fieldobj): raise NotImplementedError @abstractmethod def start_term(self, text): raise NotImplementedError @abstractmethod def add(self, docnum, weight, vbytes, length): raise NotImplementedError def add_spell_word(self, fieldname, text): raise NotImplementedError @abstractmethod def finish_term(self): raise NotImplementedError def finish_field(self): pass def close(self): pass # Postings class PostingsWriter(object): @abstractmethod def start_postings(self, format_, terminfo): raise NotImplementedError @abstractmethod def add_posting(self, id_, weight, vbytes, length=None): raise NotImplementedError def finish_postings(self): pass @abstractmethod def written(self): """Returns True if this object has already written to disk. 
""" raise NotImplementedError # Reader classes class TermsReader(object): @abstractmethod def __contains__(self, term): raise NotImplementedError @abstractmethod def terms(self): raise NotImplementedError @abstractmethod def terms_from(self, fieldname, prefix): raise NotImplementedError @abstractmethod def items(self): raise NotImplementedError @abstractmethod def items_from(self, fieldname, prefix): raise NotImplementedError @abstractmethod def term_info(self, fieldname, text): raise NotImplementedError @abstractmethod def frequency(self, fieldname, text): return self.term_info(fieldname, text).weight() @abstractmethod def doc_frequency(self, fieldname, text): return self.term_info(fieldname, text).doc_frequency() @abstractmethod def matcher(self, fieldname, text, format_, scorer=None): raise NotImplementedError @abstractmethod def indexed_field_names(self): raise NotImplementedError def close(self): pass # Per-doc value reader class PerDocumentReader(object): def close(self): pass @abstractmethod def doc_count(self): raise NotImplementedError @abstractmethod def doc_count_all(self): raise NotImplementedError # Deletions @abstractmethod def has_deletions(self): raise NotImplementedError @abstractmethod def is_deleted(self, docnum): raise NotImplementedError @abstractmethod def deleted_docs(self): raise NotImplementedError def all_doc_ids(self): """Returns an iterator of all (undeleted) document IDs in the reader. """ is_deleted = self.is_deleted return (docnum for docnum in xrange(self.doc_count_all()) if not is_deleted(docnum)) def iter_docs(self): for docnum in self.all_doc_ids(): yield docnum, self.stored_fields(docnum) # Columns def supports_columns(self): return False def has_column(self, fieldname): return False def list_columns(self): raise NotImplementedError # Don't need to override this if supports_columns() returns False def column_reader(self, fieldname, column): raise NotImplementedError # Bitmaps def field_docs(self, fieldname): return None # Lengths @abstractmethod def doc_field_length(self, docnum, fieldname, default=0): raise NotImplementedError @abstractmethod def field_length(self, fieldname): raise NotImplementedError @abstractmethod def min_field_length(self, fieldname): raise NotImplementedError @abstractmethod def max_field_length(self, fieldname): raise NotImplementedError # Vectors def has_vector(self, docnum, fieldname): return False # Don't need to override this if has_vector() always returns False def vector(self, docnum, fieldname, format_): raise NotImplementedError # Stored @abstractmethod def stored_fields(self, docnum): raise NotImplementedError def all_stored_fields(self): # Must yield stored fields for deleted documents too for docnum in xrange(self.doc_count_all()): yield self.stored_fields(docnum) # Segment base class class Segment(object): """Do not instantiate this object directly. It is used by the Index object to hold information about a segment. A list of objects of this class are pickled as part of the TOC file. The TOC file stores a minimal amount of information -- mostly a list of Segment objects. Segments are the real reverse indexes. Having multiple segments allows quick incremental indexing: just create a new segment for the new documents, and have the index overlay the new segment over previous ones for purposes of reading/search. "Optimizing" the index combines the contents of existing segments into one (removing any deleted documents along the way). 
""" # Extension for compound segment files COMPOUND_EXT = ".seg" # self.indexname # self.segid def __init__(self, indexname): self.indexname = indexname self.segid = self._random_id() self.compound = False @classmethod def _random_id(cls, size=16): return random_name(size=size) def __repr__(self): return "<%s %s>" % (self.__class__.__name__, self.segment_id()) def codec(self): raise NotImplementedError def index_name(self): return self.indexname def segment_id(self): if hasattr(self, "name"): # Old segment class return self.name else: return "%s_%s" % (self.index_name(), self.segid) def is_compound(self): if not hasattr(self, "compound"): return False return self.compound # File convenience methods def make_filename(self, ext): return "%s%s" % (self.segment_id(), ext) def list_files(self, storage): prefix = "%s." % self.segment_id() return [name for name in storage.list() if name.startswith(prefix)] def create_file(self, storage, ext, **kwargs): """Convenience method to create a new file in the given storage named with this segment's ID and the given extension. Any keyword arguments are passed to the storage's create_file method. """ fname = self.make_filename(ext) return storage.create_file(fname, **kwargs) def open_file(self, storage, ext, **kwargs): """Convenience method to open a file in the given storage named with this segment's ID and the given extension. Any keyword arguments are passed to the storage's open_file method. """ fname = self.make_filename(ext) return storage.open_file(fname, **kwargs) def create_compound_file(self, storage): segfiles = self.list_files(storage) assert not any(name.endswith(self.COMPOUND_EXT) for name in segfiles) cfile = self.create_file(storage, self.COMPOUND_EXT) CompoundStorage.assemble(cfile, storage, segfiles) for name in segfiles: storage.delete_file(name) self.compound = True def open_compound_file(self, storage): name = self.make_filename(self.COMPOUND_EXT) dbfile = storage.open_file(name) return CompoundStorage(dbfile, use_mmap=storage.supports_mmap) # Abstract methods @abstractmethod def doc_count_all(self): """ Returns the total number of documents, DELETED OR UNDELETED, in this segment. """ raise NotImplementedError def doc_count(self): """ Returns the number of (undeleted) documents in this segment. """ return self.doc_count_all() - self.deleted_count() def set_doc_count(self, doccount): raise NotImplementedError def has_deletions(self): """ Returns True if any documents in this segment are deleted. """ return self.deleted_count() > 0 @abstractmethod def deleted_count(self): """ Returns the total number of deleted documents in this segment. """ raise NotImplementedError @abstractmethod def deleted_docs(self): raise NotImplementedError @abstractmethod def delete_document(self, docnum, delete=True): """Deletes the given document number. The document is not actually removed from the index until it is optimized. :param docnum: The document number to delete. :param delete: If False, this undeletes a deleted document. """ raise NotImplementedError @abstractmethod def is_deleted(self, docnum): """ Returns True if the given document number is deleted. 
""" raise NotImplementedError def should_assemble(self): return True # Wrapping Segment class WrappingSegment(Segment): def __init__(self, child): self._child = child def codec(self): return self._child.codec() def index_name(self): return self._child.index_name() def segment_id(self): return self._child.segment_id() def is_compound(self): return self._child.is_compound() def should_assemble(self): return self._child.should_assemble() def make_filename(self, ext): return self._child.make_filename(ext) def list_files(self, storage): return self._child.list_files(storage) def create_file(self, storage, ext, **kwargs): return self._child.create_file(storage, ext, **kwargs) def open_file(self, storage, ext, **kwargs): return self._child.open_file(storage, ext, **kwargs) def create_compound_file(self, storage): return self._child.create_compound_file(storage) def open_compound_file(self, storage): return self._child.open_compound_file(storage) def delete_document(self, docnum, delete=True): return self._child.delete_document(docnum, delete=delete) def has_deletions(self): return self._child.has_deletions() def deleted_count(self): return self._child.deleted_count() def deleted_docs(self): return self._child.deleted_docs() def is_deleted(self, docnum): return self._child.is_deleted(docnum) def set_doc_count(self, doccount): self._child.set_doc_count(doccount) def doc_count(self): return self._child.doc_count() def doc_count_all(self): return self._child.doc_count_all() # Multi per doc reader class MultiPerDocumentReader(PerDocumentReader): def __init__(self, readers, offset=0): self._readers = readers self._doc_offsets = [] self._doccount = 0 for pdr in readers: self._doc_offsets.append(self._doccount) self._doccount += pdr.doc_count_all() self.is_closed = False def close(self): for r in self._readers: r.close() self.is_closed = True def doc_count_all(self): return self._doccount def doc_count(self): total = 0 for r in self._readers: total += r.doc_count() return total def _document_reader(self, docnum): return max(0, bisect_right(self._doc_offsets, docnum) - 1) def _reader_and_docnum(self, docnum): rnum = self._document_reader(docnum) offset = self._doc_offsets[rnum] return rnum, docnum - offset # Deletions def has_deletions(self): return any(r.has_deletions() for r in self._readers) def is_deleted(self, docnum): x, y = self._reader_and_docnum(docnum) return self._readers[x].is_deleted(y) def deleted_docs(self): for r, offset in izip(self._readers, self._doc_offsets): for docnum in r.deleted_docs(): yield docnum + offset def all_doc_ids(self): for r, offset in izip(self._readers, self._doc_offsets): for docnum in r.all_doc_ids(): yield docnum + offset # Columns def has_column(self, fieldname): return any(r.has_column(fieldname) for r in self._readers) def column_reader(self, fieldname, column): if not self.has_column(fieldname): raise ValueError("No column %r" % (fieldname,)) default = column.default_value() colreaders = [] for r in self._readers: if r.has_column(fieldname): cr = r.column_reader(fieldname, column) else: cr = columns.EmptyColumnReader(default, r.doc_count_all()) colreaders.append(cr) if len(colreaders) == 1: return colreaders[0] else: return columns.MultiColumnReader(colreaders) # Lengths def doc_field_length(self, docnum, fieldname, default=0): x, y = self._reader_and_docnum(docnum) return self._readers[x].doc_field_length(y, fieldname, default) def field_length(self, fieldname): total = 0 for r in self._readers: total += r.field_length(fieldname) return total def 
min_field_length(self): return min(r.min_field_length() for r in self._readers) def max_field_length(self): return max(r.max_field_length() for r in self._readers) # Extended base classes class PerDocWriterWithColumns(PerDocumentWriter): def __init__(self): PerDocumentWriter.__init__(self) # Implementations need to set these attributes self._storage = None self._segment = None self._docnum = None @abstractmethod def _has_column(self, fieldname): raise NotImplementedError @abstractmethod def _create_column(self, fieldname, column): raise NotImplementedError @abstractmethod def _get_column(self, fieldname): raise NotImplementedError def add_column_value(self, fieldname, column, value): if not self._has_column(fieldname): self._create_column(fieldname, column) self._get_column(fieldname).add(self._docnum, value) class CodecWithGraph(Codec): FST_EXT = ".fst" # FSA/FST graph file def supports_graph(self): return True def graph_reader(self, storage, segment): from whoosh.automata.fst import GraphReader from whoosh.reading import NoGraphError filename = segment.make_filename(self.FST_EXT) if not storage.file_exists(filename): raise NoGraphError return GraphReader(storage.open_file(filename)) class FieldWriterWithGraph(FieldWriter): def __init__(self): FieldWriter.__init__(self) # Implementations need to set these attributes self._storage = None self._segment = None self._fieldname = None self._fieldobj = None FST_EXT = CodecWithGraph.FST_EXT def _prep_graph(self): from whoosh.automata.fst import GraphWriter gf = self._segment.create_file(self._storage, self.FST_EXT) self._gwriter = GraphWriter(gf) def _start_graph_field(self, fieldname, fieldobj): spelling = fieldobj.spelling separate = fieldobj.separate_spelling() self._needs_graph = spelling or separate self._auto_graph = spelling and not separate if self._needs_graph: if not hasattr(self, "_gwriter") or self._gwriter is None: self._prep_graph() self._gwriter.start_field(fieldname) def _insert_graph_key(self, btext): if self._auto_graph: key = self._fieldobj.from_bytes(btext) self.add_spell_word(self._fieldname, key) def add_spell_word(self, fieldname, word): assert fieldname == self._fieldname assert isinstance(word, text_type) self._gwriter.insert(word) def _finish_graph_field(self): if self._needs_graph: self._gwriter.finish_field() def _close_graph(self): if hasattr(self, "_gwriter") and self._gwriter: self._gwriter.close() Whoosh-2.5.7/src/whoosh/codec/memory.py0000644000076500000240000002605712254366350020141 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from __future__ import with_statement from bisect import bisect_left from threading import Lock, RLock from whoosh.compat import xrange from whoosh.codec import base from whoosh.matching import ListMatcher from whoosh.reading import SegmentReader, TermInfo, TermNotFound from whoosh.writing import SegmentWriter class MemWriter(SegmentWriter): def commit(self): self._finalize_segment() class MemoryCodec(base.CodecWithGraph): def __init__(self): from whoosh.filedb.filestore import RamStorage self.storage = RamStorage() self.segment = MemSegment(self, "blah") def writer(self, schema): ix = self.storage.create_index(schema) return MemWriter(ix, _lk=False, codec=self, docbase=self.segment._doccount) def reader(self, schema): return SegmentReader(self.storage, schema, self.segment, codec=self) def per_document_writer(self, storage, segment): return MemPerDocWriter(self.storage, self.segment) def field_writer(self, storage, segment): return MemFieldWriter(self.storage, self.segment) def per_document_reader(self, storage, segment): return MemPerDocReader(self.storage, self.segment) def terms_reader(self, storage, segment): return MemTermsReader(self.storage, self.segment) def new_segment(self, storage, indexname): return self.segment def graph_reader(self, storage, segment): return base.CodecWithGraph.graph_reader(self, self.storage, self.segment) class MemPerDocWriter(base.PerDocWriterWithColumns): def __init__(self, storage, segment): self._storage = storage self._segment = segment self.is_closed = False self._colwriters = {} self._doccount = 0 def _has_column(self, fieldname): return fieldname in self._colwriters def _create_column(self, fieldname, column): colfile = self._storage.create_file("%s.c" % fieldname) self._colwriters[fieldname] = (colfile, column.writer(colfile)) def _get_column(self, fieldname): return self._colwriters[fieldname][1] def start_doc(self, docnum): self._doccount += 1 self._docnum = docnum self._stored = {} self._lengths = {} self._vectors = {} def add_field(self, fieldname, fieldobj, value, length): if value is not None: self._stored[fieldname] = value if length is not None: self._lengths[fieldname] = length def add_vector_items(self, fieldname, fieldobj, items): self._vectors[fieldname] = tuple(items) def finish_doc(self): with self._segment._lock: docnum = self._docnum self._segment._stored[docnum] = self._stored self._segment._lengths[docnum] = self._lengths self._segment._vectors[docnum] = self._vectors def close(self): colwriters = self._colwriters for fieldname in colwriters: colfile, colwriter = colwriters[fieldname] colwriter.finish(self._doccount) colfile.close() self.is_closed = True class MemPerDocReader(base.PerDocumentReader): def __init__(self, storage, segment): self._storage = storage self._segment = segment def doc_count(self): 
return self._segment.doc_count() def doc_count_all(self): return self._segment.doc_count_all() def has_deletions(self): return self._segment.has_deletions() def is_deleted(self, docnum): return self._segment.is_deleted(docnum) def deleted_docs(self): return self._segment.deleted_docs() def supports_columns(self): return True def has_column(self, fieldname): filename = "%s.c" % fieldname return self._storage.file_exists(filename) def column_reader(self, fieldname, column): filename = "%s.c" % fieldname colfile = self._storage.open_file(filename) length = self._storage.file_length(filename) return column.reader(colfile, 0, length, self._segment.doc_count_all()) def doc_field_length(self, docnum, fieldname, default=0): return self._segment._lengths[docnum].get(fieldname, default) def field_length(self, fieldname): return sum(lens.get(fieldname, 0) for lens in self._segment._lengths.values()) def min_field_length(self, fieldname): return min(lens[fieldname] for lens in self._segment._lengths.values() if fieldname in lens) def max_field_length(self, fieldname): return max(lens[fieldname] for lens in self._segment._lengths.values() if fieldname in lens) def has_vector(self, docnum, fieldname): return (docnum in self._segment._vectors and fieldname in self._segment._vectors[docnum]) def vector(self, docnum, fieldname, format_): items = self._segment._vectors[docnum][fieldname] ids, weights, values = zip(*items) return ListMatcher(ids, weights, values, format_) def stored_fields(self, docnum): return self._segment._stored[docnum] def close(self): pass class MemFieldWriter(base.FieldWriterWithGraph): def __init__(self, storage, segment): self._storage = storage self._segment = segment self._fieldname = None self._btext = None self.is_closed = False def start_field(self, fieldname, fieldobj): if self._fieldname is not None: raise Exception("Called start_field in a field") with self._segment._lock: invindex = self._segment._invindex if fieldname not in invindex: invindex[fieldname] = {} self._fieldname = fieldname self._fieldobj = fieldobj self._start_graph_field(fieldname, fieldobj) def start_term(self, btext): if self._btext is not None: raise Exception("Called start_term in a term") fieldname = self._fieldname fielddict = self._segment._invindex[fieldname] terminfos = self._segment._terminfos with self._segment._lock: if btext not in fielddict: fielddict[btext] = [] if (fieldname, btext) not in terminfos: terminfos[fieldname, btext] = TermInfo() self._postings = fielddict[btext] self._terminfo = terminfos[fieldname, btext] self._btext = btext self._insert_graph_key(btext) def add(self, docnum, weight, vbytes, length): self._postings.append((docnum, weight, vbytes)) self._terminfo.add_posting(docnum, weight, length) def finish_term(self): if self._btext is None: raise Exception("Called finish_term outside a term") self._postings = None self._btext = None self._terminfo = None def finish_field(self): if self._fieldname is None: raise Exception("Called finish_field outside a field") self._fieldname = None self._fieldobj = None self._finish_graph_field() def close(self): self._close_graph() self.is_closed = True class MemTermsReader(base.TermsReader): def __init__(self, storage, segment): self._storage = storage self._segment = segment self._invindex = segment._invindex def __contains__(self, term): return term in self._segment._terminfos def terms(self): for fieldname in self._invindex: for btext in self._invindex[fieldname]: yield (fieldname, btext) def terms_from(self, fieldname, prefix): if 
fieldname not in self._invindex: raise TermNotFound("Unknown field %r" % (fieldname,)) terms = sorted(self._invindex[fieldname]) if not terms: return start = bisect_left(terms, prefix) for i in xrange(start, len(terms)): yield (fieldname, terms[i]) def term_info(self, fieldname, text): return self._segment._terminfos[fieldname, text] def matcher(self, fieldname, btext, format_, scorer=None): items = self._invindex[fieldname][btext] ids, weights, values = zip(*items) return ListMatcher(ids, weights, values, format_, scorer=scorer) def indexed_field_names(self): return self._invindex.keys() def close(self): pass class MemSegment(base.Segment): def __init__(self, codec, indexname): base.Segment.__init__(self, indexname) self._codec = codec self._doccount = 0 self._stored = {} self._lengths = {} self._vectors = {} self._invindex = {} self._terminfos = {} self._lock = Lock() def codec(self): return self._codec def set_doc_count(self, doccount): self._doccount = doccount def doc_count(self): return len(self._stored) def doc_count_all(self): return self._doccount def delete_document(self, docnum, delete=True): if not delete: raise Exception("MemoryCodec can't undelete") with self._lock: del self._stored[docnum] del self._lengths[docnum] del self._vectors[docnum] def has_deletions(self): with self._lock: return self._doccount - len(self._stored) def is_deleted(self, docnum): return docnum not in self._stored def deleted_docs(self): stored = self._stored for docnum in xrange(self.doc_count_all()): if docnum not in stored: yield docnum def should_assemble(self): return False Whoosh-2.5.7/src/whoosh/codec/plaintext.py0000644000076500000240000003354112254366350020635 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
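# Added usage sketch (not part of the original source): a hedged example of
# driving the in-memory codec defined in whoosh/codec/memory.py above.  It
# assumes the standard whoosh Schema/SegmentWriter APIs behave as usual; the
# schema and document are made up and the exact output is not verified here.
def _memory_codec_sketch():
    from whoosh import fields
    from whoosh.codec.memory import MemoryCodec

    schema = fields.Schema(id=fields.ID(stored=True), body=fields.TEXT)
    codec = MemoryCodec()

    # MemoryCodec.writer() wraps a SegmentWriter over a RamStorage index
    w = codec.writer(schema)
    w.add_document(id=u"1", body=u"hello world")
    w.commit()

    # MemoryCodec.reader() returns a SegmentReader over the same segment
    r = codec.reader(schema)
    return r.doc_count(), r.stored_fields(0)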
from ast import literal_eval from whoosh.compat import b, bytes_type, text_type, integer_types, PY3 from whoosh.compat import iteritems, dumps, loads, xrange from whoosh.codec import base from whoosh.matching import ListMatcher from whoosh.reading import TermInfo, TermNotFound if not PY3: class memoryview: pass _reprable = (bytes_type, text_type, integer_types, float) # Mixin classes for producing and consuming the simple text format class LineWriter(object): def _print_line(self, indent, command, **kwargs): self._dbfile.write(b(" ") * indent) self._dbfile.write(command.encode("latin1")) for k, v in iteritems(kwargs): if isinstance(v, memoryview): v = bytes(v) if v is not None and not isinstance(v, _reprable): raise TypeError(type(v)) self._dbfile.write(("\t%s=%r" % (k, v)).encode("latin1")) self._dbfile.write(b("\n")) class LineReader(object): def _reset(self): self._dbfile.seek(0) def _find_line(self, indent, command, **kwargs): here = self._dbfile.tell() self._dbfile.seek(here) while True: line = self._dbfile.readline() if not line: return None c = self._parse_line(line) if c is None: return None lindent, lcommand, largs = c if lindent == indent and lcommand == command: matched = True if kwargs: for k in kwargs: if kwargs[k] != largs.get(k): matched = False break if matched: return largs elif lindent < indent: return None def _parse_line(self, line): line = line.decode("latin1") line = line.rstrip() l = len(line) line = line.lstrip() if not line or line.startswith("#"): return None indent = (l - len(line)) // 2 parts = line.split("\t") command = parts[0] args = {} for i in xrange(1, len(parts)): n, v = parts[i].split("=") args[n] = literal_eval(v) return (indent, command, args) def _find_root(self, command): self._reset() c = self._find_line(0, command) if c is None: raise Exception("No root section %r" % (command,)) # Codec class class PlainTextCodec(base.Codec): length_stats = False def per_document_writer(self, storage, segment): return PlainPerDocWriter(storage, segment) def field_writer(self, storage, segment): return PlainFieldWriter(storage, segment) def per_document_reader(self, storage, segment): return PlainPerDocReader(storage, segment) def terms_reader(self, storage, segment): return PlainTermsReader(storage, segment) def new_segment(self, storage, indexname): return PlainSegment(indexname) def supports_graphs(self): return False class PlainPerDocWriter(base.PerDocumentWriter, LineWriter): def __init__(self, storage, segment): self._dbfile = storage.create_file(segment.make_filename(".dcs")) self._print_line(0, "DOCS") self.is_closed = False def start_doc(self, docnum): self._print_line(1, "DOC", dn=docnum) def add_field(self, fieldname, fieldobj, value, length): if value is not None: value = dumps(value, -1) self._print_line(2, "DOCFIELD", fn=fieldname, v=value, len=length) def add_column_value(self, fieldname, columnobj, value): self._print_line(2, "COLVAL", fn=fieldname, v=value) def add_vector_items(self, fieldname, fieldobj, items): self._print_line(2, "VECTOR", fn=fieldname) for text, weight, vbytes in items: self._print_line(3, "VPOST", t=text, w=weight, v=vbytes) def finish_doc(self): pass def close(self): self._dbfile.close() self.is_closed = True class PlainPerDocReader(base.PerDocumentReader, LineReader): def __init__(self, storage, segment): self._dbfile = storage.open_file(segment.make_filename(".dcs")) self._segment = segment self.is_closed = False def doc_count(self): return self._segment.doc_count() def doc_count_all(self): return self._segment.doc_count() 
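# Added descriptive note: the writer/reader pairs in this module share a very
# simple line-oriented text format (see LineWriter._print_line and
# LineReader._parse_line above).  Each record is one line: leading spaces
# encode the nesting level, then a command word, then tab-separated
# key=repr(value) pairs that _parse_line() turns back into Python values with
# literal_eval().  A per-document ".dcs" file therefore looks roughly like
# (values made up, tabs shown as spaces):
#
#   DOCS
#     DOC  dn=0
#       DOCFIELD  fn='title'  v=<pickled stored value>  len=2
#     DOC  dn=1
#       COLVAL  fn='tag'  v='news'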
def has_deletions(self): return False def is_deleted(self, docnum): return False def deleted_docs(self): return frozenset() def _find_doc(self, docnum): self._find_root("DOCS") c = self._find_line(1, "DOC") while c is not None: dn = c["dn"] if dn == docnum: return True elif dn > docnum: return False c = self._find_line(1, "DOC") return False def _iter_docs(self): self._find_root("DOCS") c = self._find_line(1, "DOC") while c is not None: yield c["dn"] c = self._find_line(1, "DOC") def _iter_docfields(self, fieldname): for c in self._iter_docs(): c = self._find_line(2, "DOCFIELD") if c is not None: yield c def _iter_lengths(self, fieldname): return (c.get("len", 0) for c in self._iter_docfields(fieldname)) def doc_field_length(self, docnum, fieldname, default=0): for dn in self._iter_docs(): if dn == docnum: c = self._find_line(2, "DOCFIELD") if c is not None: return c.get("len", default) elif dn > docnum: break return default def _column_values(self, fieldname): for i, docnum in enumerate(self._iter_docs()): if i != docnum: raise Exception("Missing column value for field %r doc %d?" % (fieldname, i)) c = self._find_line(2, "COLVAL", fn=fieldname) if c is None: raise Exception("Missing column value for field %r doc %d?" % (fieldname, docnum)) yield c.get("v") def has_column(self, fieldname): for _ in self._column_values(fieldname): return True return False def column_reader(self, fieldname, column): return list(self._column_values(fieldname)) def field_length(self, fieldname): return sum(self._iter_lengths(fieldname)) def min_field_length(self, fieldname): return min(self._iter_lengths(fieldname)) def max_field_length(self, fieldname): return max(self._iter_lengths(fieldname)) def has_vector(self, docnum, fieldname): if self._find_doc(docnum): if self._find_line(2, "VECTOR"): return True return False def vector(self, docnum, fieldname, format_): if not self._find_doc(docnum): raise Exception if not self._find_line(2, "VECTOR"): raise Exception ids = [] weights = [] values = [] c = self._find_line(3, "VPOST") while c is not None: ids.append(c["t"]) weights.append(c["w"]) values.append(c["v"]) c = self._find_line(3, "VPOST") return ListMatcher(ids, weights, values, format_,) def _read_stored_fields(self): sfs = {} c = self._find_line(2, "DOCFIELD") while c is not None: v = c.get("v") if v is not None: v = loads(v) sfs[c["fn"]] = v c = self._find_line(2, "DOCFIELD") return sfs def stored_fields(self, docnum): if not self._find_doc(docnum): raise Exception return self._read_stored_fields() def iter_docs(self): return enumerate(self.all_stored_fields()) def all_stored_fields(self): for _ in self._iter_docs(): yield self._read_stored_fields() def close(self): self._dbfile.close() self.is_closed = True class PlainFieldWriter(base.FieldWriter, LineWriter): def __init__(self, storage, segment): self._dbfile = storage.create_file(segment.make_filename(".trm")) self._print_line(0, "TERMS") @property def is_closed(self): return self._dbfile.is_closed def start_field(self, fieldname, fieldobj): self._fieldobj = fieldobj self._print_line(1, "TERMFIELD", fn=fieldname) def start_term(self, btext): self._terminfo = TermInfo() self._print_line(2, "BTEXT", t=btext) def add(self, docnum, weight, vbytes, length): self._terminfo.add_posting(docnum, weight, length) self._print_line(3, "POST", dn=docnum, w=weight, v=vbytes) def finish_term(self): ti = self._terminfo self._print_line(3, "TERMINFO", df=ti.doc_frequency(), weight=ti.weight(), minlength=ti.min_length(), maxlength=ti.max_length(), 
maxweight=ti.max_weight(), minid=ti.min_id(), maxid=ti.max_id()) def add_spell_word(self, fieldname, text): self._print_line(2, "SPELL", fn=fieldname, t=text) def close(self): self._dbfile.close() class PlainTermsReader(base.TermsReader, LineReader): def __init__(self, storage, segment): self._dbfile = storage.open_file(segment.make_filename(".trm")) self._segment = segment self.is_closed = False def _find_field(self, fieldname): self._find_root("TERMS") if self._find_line(1, "TERMFIELD", fn=fieldname) is None: raise TermNotFound("No field %r" % fieldname) def _iter_fields(self): self._find_root() c = self._find_line(1, "TERMFIELD") while c is not None: yield c["fn"] c = self._find_line(1, "TERMFIELD") def _iter_btexts(self): c = self._find_line(2, "BTEXT") while c is not None: yield c["t"] c = self._find_line(2, "BTEXT") def _find_term(self, fieldname, btext): self._find_field(fieldname) for t in self._iter_btexts(): if t == btext: return True elif t > btext: break return False def _find_terminfo(self): c = self._find_line(3, "TERMINFO") return TermInfo(**c) def __contains__(self, term): fieldname, btext = term return self._find_term(fieldname, btext) def indexed_field_names(self): return self._iter_fields() def terms(self): for fieldname in self._iter_fields(): for btext in self._iter_btexts(): yield (fieldname, btext) def terms_from(self, fieldname, prefix): self._find_field(fieldname) for btext in self._iter_btexts(): if btext < prefix: continue yield (fieldname, btext) def items(self): for fieldname, btext in self.terms(): yield (fieldname, btext), self._find_terminfo() def items_from(self, fieldname, prefix): for fieldname, btext in self.terms_from(fieldname, prefix): yield (fieldname, btext), self._find_terminfo() def term_info(self, fieldname, btext): if not self._find_term(fieldname, btext): raise TermNotFound((fieldname, btext)) return self._find_terminfo() def matcher(self, fieldname, btext, format_, scorer=None): if not self._find_term(fieldname, btext): raise TermNotFound((fieldname, btext)) ids = [] weights = [] values = [] c = self._find_line(3, "POST") while c is not None: ids.append(c["dn"]) weights.append(c["w"]) values.append(c["v"]) c = self._find_line(3, "POST") return ListMatcher(ids, weights, values, format_, scorer=scorer) def close(self): self._dbfile.close() self.is_closed = True class PlainSegment(base.Segment): def __init__(self, indexname): base.Segment.__init__(self, indexname) self._doccount = 0 def codec(self): return PlainTextCodec() def set_doc_count(self, doccount): self._doccount = doccount def doc_count(self): return self._doccount def should_assemble(self): return False Whoosh-2.5.7/src/whoosh/codec/whoosh2.py0000644000076500000240000021007512254366350020215 0ustar mattstaff00000000000000# Copyright 2011 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module implements a "codec" for writing/reading Whoosh 2 indexes. """ import struct import sys from array import array from binascii import crc32 from collections import defaultdict from decimal import Decimal from hashlib import md5 # @UnresolvedImport from struct import Struct try: import zlib except ImportError: zlib = None from whoosh.compat import b, PY3 from whoosh.compat import loads, dumps from whoosh.compat import xrange, iteritems from whoosh.compat import bytes_type, text_type, string_type, integer_types from whoosh.compat import array_frombytes, array_tobytes from whoosh.codec import base from whoosh.filedb.filestore import Storage from whoosh.matching import ListMatcher, ReadTooFar, LeafMatcher from whoosh.reading import NoGraphError, TermInfo, TermNotFound from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE, IS_LITTLE from whoosh.system import emptybytes from whoosh.system import pack_byte from whoosh.system import pack_ushort, unpack_ushort, pack_long, unpack_long from whoosh.automata.fst import GraphWriter, GraphReader from whoosh.util.numeric import byte_to_length, length_to_byte from whoosh.util.numeric import to_sortable, from_sortable, NaN from whoosh.util.numlists import GrowableArray from whoosh.util.text import utf8encode, utf8decode from whoosh.util.times import datetime_to_long, long_to_datetime # Old hash file implementations _4GB = 4 * 1024 * 1024 * 1024 def cdb_hash(key): h = 5381 for c in key: h = (h + (h << 5)) & 0xffffffff ^ ord(c) return h def md5_hash(key): return int(md5(key).hexdigest(), 16) & 0xffffffff def crc_hash(key): return crc32(key) & 0xffffffff hash_functions = (hash, cdb_hash, md5_hash, crc_hash) _header_entry_struct = struct.Struct("!qI") # Position, number of slots header_entry_size = _header_entry_struct.size pack_header_entry = _header_entry_struct.pack unpack_header_entry = _header_entry_struct.unpack _lengths_struct = struct.Struct("!II") # Length of key, length of data lengths_size = _lengths_struct.size pack_lengths = _lengths_struct.pack unpack_lengths = _lengths_struct.unpack _pointer_struct = struct.Struct("!Iq") # Hash value, position pointer_size = _pointer_struct.size pack_pointer = _pointer_struct.pack unpack_pointer = _pointer_struct.unpack # Table classes class HashWriter(object): def __init__(self, dbfile, hashtype=2): self.dbfile = dbfile self.hashtype = hashtype self.extras = {} self.startoffset = dbfile.tell() dbfile.write(b("HASH")) # Magic tag dbfile.write_byte(self.hashtype) # Identify hashing function used dbfile.write(b("\x00\x00\x00")) # Unused bytes dbfile.write_long(0) # Pointer to end of hashes self.header_size = 16 + 256 * header_entry_size self.hash_func = hash_functions[self.hashtype] # Seek past the first "header_size" bytes of the file... 
we'll come # back here to write the header later dbfile.seek(self.header_size) # Store the directory of hashed values self.hashes = defaultdict(list) def add(self, key, value): assert isinstance(key, bytes_type) assert isinstance(value, bytes_type) dbfile = self.dbfile pos = dbfile.tell() dbfile.write(pack_lengths(len(key), len(value))) dbfile.write(key) dbfile.write(value) h = self.hash_func(key) self.hashes[h & 255].append((h, pos)) def add_all(self, items): add = self.add for key, value in items: add(key, value) def _write_hashes(self): dbfile = self.dbfile hashes = self.hashes directory = self.directory = [] pos = dbfile.tell() for i in xrange(0, 256): entries = hashes[i] numslots = 2 * len(entries) directory.append((pos, numslots)) null = (0, 0) hashtable = [null] * numslots for hashval, position in entries: n = (hashval >> 8) % numslots while hashtable[n] != null: n = (n + 1) % numslots hashtable[n] = (hashval, position) write = dbfile.write for hashval, position in hashtable: write(pack_pointer(hashval, position)) pos += pointer_size dbfile.flush() self.extrasoffset = dbfile.tell() def _write_extras(self): self.dbfile.write_pickle(self.extras) # Seek back and write the pointer to the extras self.dbfile.flush() self.dbfile.seek(self.startoffset + 8) self.dbfile.write_long(self.extrasoffset) def _write_directory(self): dbfile = self.dbfile directory = self.directory # Seek back to the header dbfile.seek(self.startoffset + 8) # Write the pointer to the end of the hashes dbfile.write_long(self.extrasoffset) # Write the pointers to the hash tables for position, numslots in directory: dbfile.write(pack_header_entry(position, numslots)) dbfile.flush() assert dbfile.tell() == self.header_size def close(self): self._write_hashes() self._write_extras() self._write_directory() self.dbfile.close() class HashReader(object): def __init__(self, dbfile, startoffset=0): self.dbfile = dbfile self.startoffset = startoffset self.is_closed = False dbfile.seek(startoffset) # Check magic tag magic = dbfile.read(4) if magic != b("HASH"): raise Exception("Unknown file header %r" % magic) self.hashtype = dbfile.read_byte() # Hash function type self.hash_func = hash_functions[self.hashtype] dbfile.read(3) # Unused self.extrasoffset = dbfile.read_long() # Pointer to end of hashes self.header_size = 16 + 256 * header_entry_size assert self.extrasoffset >= self.header_size # Read pointers to hash tables self.buckets = [] for _ in xrange(256): he = unpack_header_entry(dbfile.read(header_entry_size)) self.buckets.append(he) self._start_of_hashes = self.buckets[0][0] dbfile.seek(self.extrasoffset) self._read_extras() def _read_extras(self): try: self.extras = self.dbfile.read_pickle() except EOFError: self.extras = {} def close(self): if self.is_closed: raise Exception("Tried to close %r twice" % self) self.dbfile.close() self.is_closed = True def read(self, position, length): self.dbfile.seek(position) return self.dbfile.read(length) def _ranges(self, pos=None): if pos is None: pos = self.header_size eod = self._start_of_hashes read = self.read while pos < eod: keylen, datalen = unpack_lengths(read(pos, lengths_size)) keypos = pos + lengths_size datapos = pos + lengths_size + keylen pos = datapos + datalen yield (keypos, keylen, datapos, datalen) def __iter__(self): return iter(self.items()) def items(self): read = self.read for keypos, keylen, datapos, datalen in self._ranges(): key = read(keypos, keylen) value = read(datapos, datalen) yield (key, value) def keys(self): read = self.read for keypos, keylen, _, _ 
in self._ranges(): yield read(keypos, keylen) def values(self): read = self.read for _, _, datapos, datalen in self._ranges(): yield read(datapos, datalen) def __getitem__(self, key): for data in self.all(key): return data raise KeyError(key) def get(self, key, default=None): for data in self.all(key): return data return default def all(self, key): read = self.read for datapos, datalen in self.ranges_for_key(key): yield read(datapos, datalen) def __contains__(self, key): for _ in self.ranges_for_key(key): return True return False def _hashtable_info(self, keyhash): # Return (directory_position, number_of_hash_entries) return self.buckets[keyhash & 255] def _key_position(self, key): keyhash = self.hash_func(key) hpos, hslots = self._hashtable_info(keyhash) if not hslots: raise KeyError(key) slotpos = hpos + (((keyhash >> 8) % hslots) * header_entry_size) return self.dbfile.get_long(slotpos + _INT_SIZE) def _key_at(self, pos): keylen = self.dbfile.get_uint(pos) return self.read(pos + lengths_size, keylen) def ranges_for_key(self, key): read = self.read if not isinstance(key, bytes_type): raise TypeError("Key %r should be bytes" % key) keyhash = self.hash_func(key) hpos, hslots = self._hashtable_info(keyhash) if not hslots: return slotpos = hpos + (((keyhash >> 8) % hslots) * pointer_size) for _ in xrange(hslots): slothash, pos = unpack_pointer(read(slotpos, pointer_size)) if not pos: return slotpos += pointer_size # If we reach the end of the hashtable, wrap around if slotpos == hpos + (hslots * pointer_size): slotpos = hpos if slothash == keyhash: keylen, datalen = unpack_lengths(read(pos, lengths_size)) if keylen == len(key): if key == read(pos + lengths_size, keylen): yield (pos + lengths_size + keylen, datalen) def range_for_key(self, key): for item in self.ranges_for_key(key): return item raise KeyError(key) class OrderedHashWriter(HashWriter): def __init__(self, dbfile): HashWriter.__init__(self, dbfile) self.index = GrowableArray("H") self.lastkey = emptybytes def add(self, key, value): if key <= self.lastkey: raise ValueError("Keys must increase: %r..%r" % (self.lastkey, key)) self.index.append(self.dbfile.tell()) HashWriter.add(self, key, value) self.lastkey = key def _write_extras(self): dbfile = self.dbfile # Save information about the index in the extras ndxarray = self.index self.extras["indexbase"] = dbfile.tell() self.extras["indextype"] = ndxarray.typecode self.extras["indexlen"] = len(ndxarray) # Write key index ndxarray.to_file(dbfile) # Call the super method to write the extras self.extrasoffset = dbfile.tell() HashWriter._write_extras(self) class OrderedHashReader(HashReader): def __init__(self, dbfile): HashReader.__init__(self, dbfile) self.indexbase = self.extras["indexbase"] self.indexlen = self.extras["indexlen"] self.indextype = indextype = self.extras["indextype"] self._ixsize = struct.calcsize(indextype) if indextype == "B": self._ixpos = dbfile.get_byte elif indextype == "H": self._ixpos = dbfile.get_ushort elif indextype == "i": self._ixpos = dbfile.get_int elif indextype == "I": self._ixpos = dbfile.get_uint elif indextype == "q": self._ixpos = dbfile.get_long else: raise Exception("Unknown index type %r" % indextype) def _closest_key(self, key): key_at = self._key_at indexbase = self.indexbase ixpos, ixsize = self._ixpos, self._ixsize lo = 0 hi = self.indexlen if not isinstance(key, bytes_type): raise TypeError("Key %r should be bytes" % key) while lo < hi: mid = (lo + hi) // 2 midkey = key_at(ixpos(indexbase + mid * ixsize)) if midkey < key: lo = mid + 1 
else: hi = mid #i = max(0, mid - 1) if lo == self.indexlen: return None return ixpos(indexbase + lo * ixsize) def closest_key(self, key): pos = self._closest_key(key) if pos is None: return None return self._key_at(pos) def _ranges_from(self, key): #read = self.read pos = self._closest_key(key) if pos is None: return for x in self._ranges(pos=pos): yield x def items_from(self, key): read = self.read for keypos, keylen, datapos, datalen in self._ranges_from(key): yield (read(keypos, keylen), read(datapos, datalen)) def keys_from(self, key): read = self.read for keypos, keylen, _, _ in self._ranges_from(key): yield read(keypos, keylen) # Standard codec top-level object class W2Codec(base.Codec): TERMS_EXT = ".trm" # Term index POSTS_EXT = ".pst" # Term postings DAWG_EXT = FST_EXT = ".dag" # Spelling graph file LENGTHS_EXT = ".fln" # Field lengths file VECTOR_EXT = ".vec" # Vector index VPOSTS_EXT = ".vps" # Vector postings STORED_EXT = ".sto" # Stored fields file def __init__(self, blocklimit=128, compression=3, loadlengths=False, inlinelimit=1): self.blocklimit = blocklimit self.compression = compression self.loadlengths = loadlengths self.inlinelimit = inlinelimit # Per-document value writer def per_document_writer(self, storage, segment): return W2PerDocWriter(storage, segment, blocklimit=self.blocklimit, compression=self.compression) # Inverted index writer def field_writer(self, storage, segment): return W2FieldWriter(storage, segment, blocklimit=self.blocklimit, compression=self.compression, inlinelimit=self.inlinelimit) # Readers def terms_reader(self, storage, segment): tifile = segment.open_file(storage, self.TERMS_EXT) postfile = segment.open_file(storage, self.POSTS_EXT) return W2TermsReader(tifile, postfile) def per_document_reader(self, storage, segment): return W2PerDocReader(storage, segment) def graph_reader(self, storage, segment): try: dawgfile = segment.open_file(storage, self.DAWG_EXT) except: raise NoGraphError return GraphReader(dawgfile) # Segments and generations def new_segment(self, storage, indexname): return W2Segment(indexname) # Per-document value writer class W2PerDocWriter(base.PerDocumentWriter): def __init__(self, storage, segment, blocklimit=128, compression=3): if not isinstance(blocklimit, int): raise ValueError self.storage = storage self.segment = segment self.blocklimit = blocklimit self.compression = compression self.doccount = 0 self.is_closed = False sffile = segment.create_file(storage, W2Codec.STORED_EXT) self.stored = StoredFieldWriter(sffile) self.storedfields = None self.lengths = InMemoryLengths() # We'll wait to create the vector files until someone actually tries # to add a vector self.vindex = self.vpostfile = None def _make_vector_files(self): vifile = self.segment.create_file(self.storage, W2Codec.VECTOR_EXT) self.vindex = VectorWriter(vifile) self.vpostfile = self.segment.create_file(self.storage, W2Codec.VPOSTS_EXT) def start_doc(self, docnum): self.docnum = docnum self.storedfields = {} self.doccount = max(self.doccount, docnum + 1) def add_field(self, fieldname, fieldobj, value, length): if length: self.lengths.add(self.docnum, fieldname, length) if value is not None: self.storedfields[fieldname] = value def _new_block(self, vformat): postingsize = vformat.posting_size return W2Block(postingsize, stringids=True) def add_vector_items(self, fieldname, fieldobj, items): if self.vindex is None: self._make_vector_files() # items = (text, weight, value_bytes) ... 
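# Added descriptive note: the code below lays the vector postings out on disk
# as W2Block structures: first the block magic bytes, then a placeholder uint
# for the number of blocks, then up to `blocklimit` postings per block (each
# block serialized by W2Block.to_file with the configured compression level,
# zlib-based when zlib is available).  Once all items are written it seeks
# back to fill in the real block count and records
# (docnum, fieldname) -> startoffset in the vector index (self.vindex).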
postfile = self.vpostfile blocklimit = self.blocklimit block = self._new_block(fieldobj.vector) startoffset = postfile.tell() postfile.write(block.magic) # Magic number blockcount = 0 postfile.write_uint(0) # Placeholder for block count countdown = blocklimit for text, weight, valuestring in items: block.add(text, weight, valuestring) countdown -= 1 if countdown == 0: block.to_file(postfile, compression=self.compression) block = self._new_block(fieldobj.vector) blockcount += 1 countdown = blocklimit # If there are leftover items in the current block, write them out if block: block.to_file(postfile, compression=self.compression) blockcount += 1 # Seek back to the start of this list of posting blocks and write the # number of blocks postfile.flush() here = postfile.tell() postfile.seek(startoffset + 4) postfile.write_uint(blockcount) postfile.seek(here) # Add to the index self.vindex.add((self.docnum, fieldname), startoffset) def finish_doc(self): self.stored.add(self.storedfields) self.storedfields = None def close(self): if self.storedfields is not None: self.stored.add(self.storedfields) self.stored.close() flfile = self.segment.create_file(self.storage, W2Codec.LENGTHS_EXT) self.lengths.to_file(flfile, self.doccount) if self.vindex: self.vindex.close() self.vpostfile.close() self.is_closed = True # Inverted index writer class W2FieldWriter(base.FieldWriter): def __init__(self, storage, segment, blocklimit=128, compression=3, inlinelimit=1): assert isinstance(storage, Storage) assert isinstance(segment, base.Segment) assert isinstance(blocklimit, int) assert isinstance(compression, int) assert isinstance(inlinelimit, int) self.storage = storage self.segment = segment self.fieldname = None self.text = None self.field = None self.format = None self.spelling = False tifile = segment.create_file(storage, W2Codec.TERMS_EXT) self.termsindex = TermIndexWriter(tifile) self.postfile = segment.create_file(storage, W2Codec.POSTS_EXT) # We'll wait to create the DAWG builder until someone actually adds # a spelled field self.dawg = None self.blocklimit = blocklimit self.compression = compression self.inlinelimit = inlinelimit self.block = None self.terminfo = None self._infield = False self.is_closed = False def _make_dawg_files(self): dawgfile = self.segment.create_file(self.storage, W2Codec.DAWG_EXT) self.dawg = GraphWriter(dawgfile) def _new_block(self): return W2Block(self.format.posting_size) def _reset_block(self): self.block = self._new_block() def _write_block(self): self.terminfo.add_block(self.block) self.block.to_file(self.postfile, compression=self.compression) self._reset_block() self.blockcount += 1 def _start_blocklist(self): postfile = self.postfile self._reset_block() # Magic number self.startoffset = postfile.tell() postfile.write(W2Block.magic) # Placeholder for block count self.blockcount = 0 postfile.write_uint(0) def start_field(self, fieldname, fieldobj): self.fieldname = fieldname self.field = fieldobj self.format = fieldobj.format self.spelling = fieldobj.spelling and not fieldobj.separate_spelling() self._dawgfield = False if self.spelling or fieldobj.separate_spelling(): if self.dawg is None: self._make_dawg_files() self.dawg.start_field(fieldname) self._dawgfield = True self._infield = True def start_term(self, text): if self.block is not None: raise Exception("Called start_term in a block") self.text = text self.terminfo = FileTermInfo() if self.spelling: self.dawg.insert(text.decode("utf-8")) # TODO: how to decode bytes? 
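# Added note: FieldWriterWithGraph._insert_graph_key() in codec/base.py above
# answers the "how to decode bytes?" question differently -- it turns the term
# bytes back into text with the field object's from_bytes() before inserting
# into the spelling graph, instead of assuming UTF-8 as this older codec does.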
self._start_blocklist() def add(self, docnum, weight, valuestring, length): self.block.add(docnum, weight, valuestring, length) if len(self.block) > self.blocklimit: self._write_block() def add_spell_word(self, fieldname, text): if self.dawg is None: self._make_dawg_files() self.dawg.insert(text) def finish_term(self): block = self.block if block is None: raise Exception("Called finish_term when not in a block") terminfo = self.terminfo if self.blockcount < 1 and block and len(block) < self.inlinelimit: # Inline the single block terminfo.add_block(block) vals = None if not block.values else tuple(block.values) postings = (tuple(block.ids), tuple(block.weights), vals) else: if block: # Write the current unfinished block to disk self._write_block() # Seek back to the start of this list of posting blocks and write # the number of blocks postfile = self.postfile postfile.flush() here = postfile.tell() postfile.seek(self.startoffset + 4) postfile.write_uint(self.blockcount) postfile.seek(here) self.block = None postings = self.startoffset self.block = None terminfo.postings = postings self.termsindex.add((self.fieldname, self.text), terminfo) def finish_field(self): if not self._infield: raise Exception("Called finish_field before start_field") self._infield = False if self._dawgfield: self.dawg.finish_field() self._dawgfield = False def close(self): self.termsindex.close() self.postfile.close() if self.dawg is not None: self.dawg.close() self.is_closed = True # Matcher class W2LeafMatcher(LeafMatcher): def __init__(self, postfile, startoffset, fmt, scorer=None, term=None, stringids=False): self.postfile = postfile self.startoffset = startoffset self.format = fmt self.scorer = scorer self._term = term self.stringids = stringids postfile.seek(startoffset) magic = postfile.read(4) assert magic == W2Block.magic self.blockclass = W2Block self.blockcount = postfile.read_uint() self.baseoffset = postfile.tell() self._active = True self.currentblock = -1 self._next_block() def id(self): return self.block.ids[self.i] def is_active(self): return self._active def weight(self): weights = self.block.weights if not weights: weights = self.block.read_weights() return weights[self.i] def value(self): values = self.block.values if values is None: values = self.block.read_values() return values[self.i] def all_ids(self): nextoffset = self.baseoffset for _ in xrange(self.blockcount): block = self._read_block(nextoffset) nextoffset = block.nextoffset ids = block.read_ids() for id in ids: yield id def next(self): if self.i == self.block.count - 1: self._next_block() return True else: self.i += 1 return False def skip_to(self, id): if not self.is_active(): raise ReadTooFar i = self.i # If we're already in the block with the target ID, do nothing if id <= self.block.ids[i]: return # Skip to the block that would contain the target ID if id > self.block.maxid: self._skip_to_block(lambda: id > self.block.maxid) if not self.is_active(): return # Iterate through the IDs in the block until we find or pass the # target ids = self.block.ids i = self.i while ids[i] < id: i += 1 if i == len(ids): self._active = False return self.i = i def skip_to_quality(self, minquality): bq = self.block_quality if bq() > minquality: return 0 return self._skip_to_block(lambda: bq() <= minquality) def block_min_length(self): return self.block.min_length() def block_max_length(self): return self.block.max_length() def block_max_weight(self): return self.block.max_weight() def block_max_wol(self): return self.block.max_wol() def 
_read_block(self, offset): pf = self.postfile pf.seek(offset) return self.blockclass.from_file(pf, self.format.posting_size, stringids=self.stringids) def _consume_block(self): self.block.read_ids() self.block.read_weights() self.i = 0 def _next_block(self, consume=True): if not (self.currentblock < self.blockcount): raise Exception("No next block") self.currentblock += 1 if self.currentblock == self.blockcount: self._active = False return if self.currentblock == 0: pos = self.baseoffset else: pos = self.block.nextoffset self.block = self._read_block(pos) if consume: self._consume_block() def _skip_to_block(self, targetfn): skipped = 0 while self._active and targetfn(): self._next_block(consume=False) skipped += 1 if self._active: self._consume_block() return skipped # Tables # Writers class TermIndexWriter(HashWriter): def __init__(self, dbfile): HashWriter.__init__(self, dbfile) self.index = [] self.fieldcounter = 0 self.fieldmap = {} def keycoder(self, term): # Encode term fieldmap = self.fieldmap fieldname, text = term if fieldname in fieldmap: fieldnum = fieldmap[fieldname] else: fieldnum = self.fieldcounter fieldmap[fieldname] = fieldnum self.fieldcounter += 1 key = pack_ushort(fieldnum) + text return key def valuecoder(self, terminfo): return terminfo.to_string() def add(self, key, value): pos = self.dbfile.tell() self.index.append(pos) HashWriter.add(self, self.keycoder(key), self.valuecoder(value)) def _write_extras(self): dbfile = self.dbfile dbfile.write_uint(len(self.index)) for n in self.index: dbfile.write_long(n) dbfile.write_pickle(self.fieldmap) class VectorWriter(TermIndexWriter): def keycoder(self, key): fieldmap = self.fieldmap docnum, fieldname = key if fieldname in fieldmap: fieldnum = fieldmap[fieldname] else: fieldnum = self.fieldcounter fieldmap[fieldname] = fieldnum self.fieldcounter += 1 return _vectorkey_struct.pack(docnum, fieldnum) def valuecoder(self, offset): return pack_long(offset) # Readers class PostingIndexBase(HashReader): def __init__(self, dbfile, postfile): HashReader.__init__(self, dbfile) self.postfile = postfile def _read_extras(self): dbfile = self.dbfile self.length = dbfile.read_uint() self.indexbase = dbfile.tell() dbfile.seek(self.indexbase + self.length * _LONG_SIZE) self.fieldmap = dbfile.read_pickle() self.names = [None] * len(self.fieldmap) for name, num in iteritems(self.fieldmap): self.names[num] = name def _closest_key(self, key): dbfile = self.dbfile key_at = self._key_at indexbase = self.indexbase lo = 0 hi = self.length if not isinstance(key, bytes_type): raise TypeError("Key %r should be bytes" % key) while lo < hi: mid = (lo + hi) // 2 midkey = key_at(dbfile.get_long(indexbase + mid * _LONG_SIZE)) if midkey < key: lo = mid + 1 else: hi = mid #i = max(0, mid - 1) if lo == self.length: return None return dbfile.get_long(indexbase + lo * _LONG_SIZE) def closest_key(self, key): pos = self._closest_key(key) if pos is None: return None return self._key_at(pos) def _ranges_from(self, key): #read = self.read pos = self._closest_key(key) if pos is None: return for x in self._ranges(pos=pos): yield x def __getitem__(self, key): k = self.keycoder(key) return self.valuedecoder(HashReader.__getitem__(self, k)) def __contains__(self, key): try: codedkey = self.keycoder(key) except KeyError: return False return HashReader.__contains__(self, codedkey) def range_for_key(self, key): return HashReader.range_for_key(self, self.keycoder(key)) def get(self, key, default=None): k = self.keycoder(key) return self.valuedecoder(HashReader.get(self, k, 
default)) def keys(self): kd = self.keydecoder for k in HashReader.keys(self): yield kd(k) def items(self): kd = self.keydecoder vd = self.valuedecoder for key, value in HashReader.items(self): yield (kd(key), vd(value)) def terms_from(self, fieldname, prefix): return self.keys_from((fieldname, prefix)) def keys_from(self, key): key = self.keycoder(key) kd = self.keydecoder read = self.read for keypos, keylen, _, _ in self._ranges_from(key): yield kd(read(keypos, keylen)) def items_from(self, fieldname, prefix): read = self.read key = self.keycoder((fieldname, prefix)) kd = self.keydecoder vd = self.valuedecoder for keypos, keylen, datapos, datalen in self._ranges_from(key): yield (kd(read(keypos, keylen)), vd(read(datapos, datalen))) def values(self): vd = self.valuedecoder for v in HashReader.values(self): yield vd(v) def close(self): HashReader.close(self) self.postfile.close() class W2TermsReader(PostingIndexBase): # Implements whoosh.codec.base.TermsReader def indexed_field_names(self): return self.fieldmap.keys() def terms(self): return self.keys() def term_info(self, fieldname, text): return self[fieldname, text] def matcher(self, fieldname, text, format_, scorer=None): # Note this does not filter out deleted documents; a higher level is # expected to wrap this matcher to eliminate deleted docs pf = self.postfile term = (fieldname, text) try: terminfo = self[term] except KeyError: raise TermNotFound("No term %s:%r" % (fieldname, text)) p = terminfo.postings if isinstance(p, integer_types): # terminfo.postings is an offset into the posting file pr = W2LeafMatcher(pf, p, format_, scorer=scorer, term=term) else: # terminfo.postings is an inlined tuple of (ids, weights, values) docids, weights, values = p pr = ListMatcher(docids, weights, values, format_, scorer=scorer, term=term) return pr def keycoder(self, key): fieldname, tbytes = key fnum = self.fieldmap.get(fieldname, 65535) return pack_ushort(fnum) + tbytes def keydecoder(self, v): assert isinstance(v, bytes_type) return (self.names[unpack_ushort(v[:2])[0]], v[2:]) def valuedecoder(self, v): assert isinstance(v, bytes_type) return FileTermInfo.from_string(v) def frequency(self, fieldname, btext): assert isinstance(btext, bytes_type) datapos = self.range_for_key((fieldname, btext))[0] return FileTermInfo.read_weight(self.dbfile, datapos) def doc_frequency(self, fieldname, btext): assert isinstance(btext, bytes_type) datapos = self.range_for_key((fieldname, btext))[0] return FileTermInfo.read_doc_freq(self.dbfile, datapos) # docnum, fieldnum _vectorkey_struct = Struct("!IH") class W2VectorReader(PostingIndexBase): # Implements whoosh.codec.base.VectorReader def matcher(self, docnum, fieldname, format_): pf = self.postfile offset = self[(docnum, fieldname)] pr = W2LeafMatcher(pf, offset, format_, stringids=True) return pr def keycoder(self, key): return _vectorkey_struct.pack(key[0], self.fieldmap[key[1]]) def keydecoder(self, v): docnum, fieldnum = _vectorkey_struct.unpack(v) return (docnum, self.names[fieldnum]) def valuedecoder(self, v): return unpack_long(v)[0] class W2PerDocReader(base.PerDocumentReader): def __init__(self, storage, segment): self._storage = storage self._segment = segment self._doccount = segment.doc_count_all() flfile = segment.open_file(storage, W2Codec.LENGTHS_EXT) self._lengths = InMemoryLengths.from_file(flfile, self._doccount) sffile = segment.open_file(storage, W2Codec.STORED_EXT) self._stored = StoredFieldReader(sffile) self._vectors = None # Lazy load def supports_columns(self): return False def 
close(self): self._lengths.close() if self._vectors: self._vectors.close() self._stored.close() def doc_count(self): return self._segment.doc_count() def doc_count_all(self): return self._doccount def has_deletions(self): return self._segment.has_deletions() def is_deleted(self, docnum): return self._segment.is_deleted(docnum) def deleted_docs(self): return self._segment.deleted_docs() # Lengths def doc_field_length(self, docnum, fieldname, default=0): return self._lengths.doc_field_length(docnum, fieldname, default) def field_length(self, fieldname): return self._lengths.field_length(fieldname) def min_field_length(self, fieldname): return self._lengths.min_field_length(fieldname) def max_field_length(self, fieldname): return self._lengths.max_field_length(fieldname) # Vectors def _prep_vectors(self): vifile = self._segment.open_file(self._storage, W2Codec.VECTOR_EXT) vpostfile = self._segment.open_file(self._storage, W2Codec.VPOSTS_EXT) self._vectors = W2VectorReader(vifile, vpostfile) def has_vector(self, docnum, fieldname): if self._vectors is None: try: self._prep_vectors() except (NameError, IOError): return False return (docnum, fieldname) in self._vectors def vector(self, docnum, fieldname, format_): if self._vectors is None: self._prep_vectors() return self._vectors.matcher(docnum, fieldname, format_) # Stored def stored_fields(self, docnum): return self._stored[docnum] # Single-byte field lengths implementations class ByteLengthsBase(object): magic = b("~LN1") def __init__(self): self.starts = {} self.totals = {} self.minlens = {} self.maxlens = {} def _read_header(self, dbfile, doccount): first = dbfile.read(4) # Magic assert first == self.magic version = dbfile.read_int() # Version number assert version == 1 self._count = dbfile.read_uint() # Number of documents saved fieldcount = dbfile.read_ushort() # Number of fields # Read per-field info for i in xrange(fieldcount): fieldname = dbfile.read_string().decode('utf-8') self.totals[fieldname] = dbfile.read_long() self.minlens[fieldname] = byte_to_length(dbfile.read_byte()) self.maxlens[fieldname] = byte_to_length(dbfile.read_byte()) self.starts[fieldname] = i * doccount # Add header length to per-field offsets eoh = dbfile.tell() # End of header for fieldname in self.starts: self.starts[fieldname] += eoh def doc_count_all(self): return self._count def field_length(self, fieldname): return self.totals.get(fieldname, 0) def min_field_length(self, fieldname): return self.minlens.get(fieldname, 0) def max_field_length(self, fieldname): return self.maxlens.get(fieldname, 0) class InMemoryLengths(ByteLengthsBase): def __init__(self): ByteLengthsBase.__init__(self) self.totals = defaultdict(int) self.lengths = {} self._count = 0 def close(self): pass # IO def to_file(self, dbfile, doccount): self._pad_arrays(doccount) fieldnames = list(self.lengths.keys()) dbfile.write(self.magic) dbfile.write_int(1) # Format version number dbfile.write_uint(doccount) # Number of documents dbfile.write_ushort(len(self.lengths)) # Number of fields # Write per-field info for fieldname in fieldnames: dbfile.write_string(fieldname.encode('utf-8')) # Fieldname dbfile.write_long(self.field_length(fieldname)) dbfile.write_byte(length_to_byte(self.min_field_length(fieldname))) dbfile.write_byte(length_to_byte(self.max_field_length(fieldname))) # Write byte arrays for fieldname in fieldnames: dbfile.write_array(self.lengths[fieldname]) dbfile.close() @classmethod def from_file(cls, dbfile, doccount=None): obj = cls() obj._read_header(dbfile, doccount) for 
fieldname, start in iteritems(obj.starts): obj.lengths[fieldname] = dbfile.get_array(start, "B", obj._count) dbfile.close() return obj # Get def doc_field_length(self, docnum, fieldname, default=0): try: arry = self.lengths[fieldname] except KeyError: return default if docnum >= len(arry): return default return byte_to_length(arry[docnum]) # Min/max cache setup -- not meant to be called while adding def _minmax(self, fieldname, op, cache): if fieldname in cache: return cache[fieldname] else: ls = self.lengths[fieldname] if ls: result = byte_to_length(op(ls)) else: result = 0 cache[fieldname] = result return result def min_field_length(self, fieldname): return self._minmax(fieldname, min, self.minlens) def max_field_length(self, fieldname): return self._minmax(fieldname, max, self.maxlens) # Add def _create_field(self, fieldname, docnum): dc = max(self._count, docnum + 1) self.lengths[fieldname] = array("B", (0 for _ in xrange(dc))) self._count = dc def _pad_arrays(self, doccount): # Pad out arrays to full length for fieldname in self.lengths.keys(): arry = self.lengths[fieldname] if len(arry) < doccount: for _ in xrange(doccount - len(arry)): arry.append(0) self._count = doccount def add(self, docnum, fieldname, length): lengths = self.lengths if length: if fieldname not in lengths: self._create_field(fieldname, docnum) arry = self.lengths[fieldname] count = docnum + 1 if len(arry) < count: for _ in xrange(count - len(arry)): arry.append(0) if count > self._count: self._count = count byte = length_to_byte(length) arry[docnum] = byte self.totals[fieldname] += length def add_other(self, other): lengths = self.lengths totals = self.totals doccount = self._count for fname in other.lengths: if fname not in lengths: lengths[fname] = array("B") self._pad_arrays(doccount) for fname in other.lengths: lengths[fname].extend(other.lengths[fname]) self._count = doccount + other._count self._pad_arrays(self._count) for fname in other.totals: totals[fname] += other.totals[fname] class OnDiskLengths(ByteLengthsBase): def __init__(self, dbfile, doccount=None): ByteLengthsBase.__init__(self) self.dbfile = dbfile self._read_header(dbfile, doccount) def doc_field_length(self, docnum, fieldname, default=0): try: start = self.starts[fieldname] except KeyError: return default return byte_to_length(self.dbfile.get_byte(start + docnum)) def close(self): self.dbfile.close() # Stored fields _stored_pointer_struct = Struct("!qI") # offset, length stored_pointer_size = _stored_pointer_struct.size pack_stored_pointer = _stored_pointer_struct.pack unpack_stored_pointer = _stored_pointer_struct.unpack class StoredFieldWriter(object): def __init__(self, dbfile): self.dbfile = dbfile self.length = 0 self.directory = [] self.dbfile.write_long(0) self.dbfile.write_uint(0) self.names = [] self.name_map = {} def add(self, vdict): f = self.dbfile names = self.names name_map = self.name_map vlist = [None] * len(names) for k, v in iteritems(vdict): if k in name_map: vlist[name_map[k]] = v else: name_map[k] = len(names) names.append(k) vlist.append(v) vstring = dumps(tuple(vlist), -1)[2:-1] self.length += 1 self.directory.append(pack_stored_pointer(f.tell(), len(vstring))) f.write(vstring) def add_reader(self, sfreader): add = self.add for vdict in sfreader: add(vdict) def close(self): f = self.dbfile dirpos = f.tell() f.write_pickle(self.names) for pair in self.directory: f.write(pair) f.flush() f.seek(0) f.write_long(dirpos) f.write_uint(self.length) f.close() class StoredFieldReader(object): def __init__(self, dbfile): 
self.dbfile = dbfile dbfile.seek(0) dirpos = dbfile.read_long() self.length = dbfile.read_uint() self.basepos = dbfile.tell() dbfile.seek(dirpos) nameobj = dbfile.read_pickle() if isinstance(nameobj, dict): # Previous versions stored the list of names as a map of names to # positions... it seemed to make sense at the time... self.names = [None] * len(nameobj) for name, pos in iteritems(nameobj): self.names[pos] = name else: self.names = nameobj self.directory_offset = dbfile.tell() def close(self): self.dbfile.close() def __iter__(self): dbfile = self.dbfile names = self.names lengths = array("I") dbfile.seek(self.directory_offset) for i in xrange(self.length): dbfile.seek(_LONG_SIZE, 1) lengths.append(dbfile.read_uint()) dbfile.seek(self.basepos) for length in lengths: vlist = loads(dbfile.read(length) + b(".")) vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist)) if vlist[i] is not None) yield vdict def __getitem__(self, num): if num > self.length - 1: raise IndexError("Tried to get document %s, file has %s" % (num, self.length)) dbfile = self.dbfile start = self.directory_offset + num * stored_pointer_size dbfile.seek(start) ptr = dbfile.read(stored_pointer_size) if len(ptr) != stored_pointer_size: raise Exception("Error reading %r @%s %s < %s" % (dbfile, start, len(ptr), stored_pointer_size)) position, length = unpack_stored_pointer(ptr) dbfile.seek(position) vlist = loads(dbfile.read(length) + b(".")) names = self.names # Recreate a dictionary by putting the field names and values back # together by position. We can't just use dict(zip(...)) because we # want to filter out the None values. vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist)) if vlist[i] is not None) return vdict # Segment object class W2Segment(base.Segment): def __init__(self, indexname, doccount=0, segid=None, deleted=None): """ :param name: The name of the segment (the Index object computes this from its name and the generation). :param doccount: The maximum document number in the segment. :param term_count: Total count of all terms in all documents. :param deleted: A set of deleted document numbers, or None if no deleted documents exist in this segment. 
""" assert isinstance(indexname, string_type) self.indexname = indexname assert isinstance(doccount, integer_types) self.doccount = doccount self.segid = self._random_id() if segid is None else segid self.deleted = deleted self.compound = False def codec(self, **kwargs): return W2Codec(**kwargs) def set_doc_count(self, dc): self.doccount = dc def doc_count_all(self): return self.doccount def doc_count(self): return self.doccount - self.deleted_count() def has_deletions(self): return self.deleted is not None and bool(self.deleted) def deleted_count(self): if self.deleted is None: return 0 return len(self.deleted) def delete_document(self, docnum, delete=True): if delete: if self.deleted is None: self.deleted = set() self.deleted.add(docnum) elif self.deleted is not None and docnum in self.deleted: self.deleted.clear(docnum) def is_deleted(self, docnum): if self.deleted is None: return False return docnum in self.deleted def deleted_docs(self): if self.deleted is None: return () else: return iter(self.deleted) # Posting blocks class W2Block(object): magic = b("Blk3") infokeys = ("count", "maxid", "maxweight", "minlength", "maxlength", "idcode", "compression", "idslen", "weightslen") def __init__(self, postingsize, stringids=False): self.postingsize = postingsize self.stringids = stringids self.ids = [] if stringids else array("I") self.weights = array("f") self.values = None self.minlength = None self.maxlength = 0 self.maxweight = 0 def __len__(self): return len(self.ids) def __nonzero__(self): return bool(self.ids) def min_id(self): if self.ids: return self.ids[0] else: raise IndexError def max_id(self): if self.ids: return self.ids[-1] else: raise IndexError def min_length(self): return self.minlength def max_length(self): return self.maxlength def max_weight(self): return self.maxweight def add(self, id_, weight, valuestring, length=None): self.ids.append(id_) self.weights.append(weight) if weight > self.maxweight: self.maxweight = weight if valuestring: if self.values is None: self.values = [] self.values.append(valuestring) if length: if self.minlength is None or length < self.minlength: self.minlength = length if length > self.maxlength: self.maxlength = length def to_file(self, postfile, compression=3): ids = self.ids idcode, idstring = minimize_ids(ids, self.stringids, compression) wtstring = minimize_weights(self.weights, compression) vstring = minimize_values(self.postingsize, self.values, compression) info = (len(ids), ids[-1], self.maxweight, length_to_byte(self.minlength), length_to_byte(self.maxlength), idcode, compression, len(idstring), len(wtstring)) infostring = dumps(info, -1) # Offset to next block postfile.write_uint(len(infostring) + len(idstring) + len(wtstring) + len(vstring)) # Block contents postfile.write(infostring) postfile.write(idstring) postfile.write(wtstring) postfile.write(vstring) @classmethod def from_file(cls, postfile, postingsize, stringids=False): block = cls(postingsize, stringids=stringids) block.postfile = postfile delta = postfile.read_uint() block.nextoffset = postfile.tell() + delta info = postfile.read_pickle() block.dataoffset = postfile.tell() for key, value in zip(cls.infokeys, info): if key in ("minlength", "maxlength"): value = byte_to_length(value) setattr(block, key, value) return block def read_ids(self): offset = self.dataoffset self.postfile.seek(offset) idstring = self.postfile.read(self.idslen) ids = deminimize_ids(self.idcode, self.count, idstring, self.compression) self.ids = ids return ids def read_weights(self): if 
self.weightslen == 0: weights = [1.0] * self.count else: offset = self.dataoffset + self.idslen self.postfile.seek(offset) wtstring = self.postfile.read(self.weightslen) weights = deminimize_weights(self.count, wtstring, self.compression) self.weights = weights return weights def read_values(self): postingsize = self.postingsize if postingsize == 0: values = [None] * self.count else: offset = self.dataoffset + self.idslen + self.weightslen self.postfile.seek(offset) vstring = self.postfile.read(self.nextoffset - offset) values = deminimize_values(postingsize, self.count, vstring, self.compression) self.values = values return values # File TermInfo NO_ID = 0xffffffff class FileTermInfo(TermInfo): # Freq, Doc freq, min len, max length, max weight, unused, min ID, max ID struct = Struct("!fIBBffII") def __init__(self, *args, **kwargs): self.postings = None if "postings" in kwargs: self.postings = kwargs["postings"] del kwargs["postings"] TermInfo.__init__(self, *args, **kwargs) # filedb specific methods def add_block(self, block): self._weight += sum(block.weights) self._df += len(block) ml = block.min_length() if self._minlength is None: self._minlength = ml else: self._minlength = min(self._minlength, ml) self._maxlength = max(self._maxlength, block.max_length()) self._maxweight = max(self._maxweight, block.max_weight()) if self._minid is None: self._minid = block.ids[0] self._maxid = block.ids[-1] def to_string(self): # Encode the lengths as 0-255 values ml = 0 if self._minlength is None else length_to_byte(self._minlength) xl = length_to_byte(self._maxlength) # Convert None values to the out-of-band NO_ID constant so they can be # stored as unsigned ints mid = NO_ID if self._minid is None else self._minid xid = NO_ID if self._maxid is None else self._maxid # Pack the term info into bytes st = self.struct.pack(self._weight, self._df, ml, xl, self._maxweight, 0, mid, xid) if isinstance(self.postings, tuple): # Postings are inlined - dump them using the pickle protocol isinlined = 1 st += dumps(self.postings, -1)[2:-1] else: # Append postings pointer as long to end of term info bytes isinlined = 0 # It's possible for a term info to not have a pointer to postings # on disk, in which case postings will be None. Convert a None # value to -1 so it can be stored as a long. 
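        # Illustrative sketch (editor's addition): the record produced below is
        # a one-byte "inlined" flag, a fixed-size struct of statistics, and then
        # either a pickled tuple of inlined postings or an 8-byte offset into
        # the postings file.  A minimal standalone encoder for that layout,
        # stdlib only (`pack_terminfo` is a hypothetical name; the "!fIBBffII"
        # format mirrors the `struct` declared on this class):
        #
        #     import struct
        #     HEADER = struct.Struct("!fIBBffII")
        #
        #     def pack_terminfo(weight, df, minlen_b, maxlen_b, maxweight,
        #                       minid, maxid, offset):
        #         body = HEADER.pack(weight, df, minlen_b, maxlen_b,
        #                            maxweight, 0.0, minid, maxid)
        #         # flag byte 0 = postings live on disk at `offset`
        #         return b"\x00" + body + struct.pack("!q", offset)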
        p = -1 if self.postings is None else self.postings
        st += pack_long(p)
        # Prepend byte indicating whether the postings are inlined to the term
        # info bytes
        return pack_byte(isinlined) + st

    @classmethod
    def from_string(cls, s):
        assert isinstance(s, bytes_type)

        if isinstance(s, string_type):
            hbyte = ord(s[0])  # Python 2.x - str
        else:
            hbyte = s[0]  # Python 3 - bytes

        if hbyte < 2:
            st = cls.struct
            # Weight, Doc freq, min len, max len, max w, unused, min ID, max ID
            w, df, ml, xl, xw, _, mid, xid = st.unpack(s[1:st.size + 1])
            mid = None if mid == NO_ID else mid
            xid = None if xid == NO_ID else xid
            # Postings
            pstr = s[st.size + 1:]
            if hbyte == 0:
                p = unpack_long(pstr)[0]
            else:
                p = loads(pstr + b("."))
        else:
            # Old format was encoded as a variable length pickled tuple
            v = loads(s + b("."))
            if len(v) == 1:
                w = df = 1
                p = v[0]
            elif len(v) == 2:
                w = df = v[1]
                p = v[0]
            else:
                w, p, df = v
            # Fake values for stats which weren't stored before
            ml = 1
            xl = 255
            xw = 999999999
            mid = -1
            xid = -1

        ml = byte_to_length(ml)
        xl = byte_to_length(xl)
        obj = cls(w, df, ml, xl, xw, mid, xid)
        obj.postings = p
        return obj

    @classmethod
    def read_weight(cls, dbfile, datapos):
        return dbfile.get_float(datapos + 1)

    @classmethod
    def read_doc_freq(cls, dbfile, datapos):
        return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE)

    @classmethod
    def read_min_and_max_length(cls, dbfile, datapos):
        lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE
        ml = byte_to_length(dbfile.get_byte(lenpos))
        xl = byte_to_length(dbfile.get_byte(lenpos + 1))
        return ml, xl

    @classmethod
    def read_max_weight(cls, dbfile, datapos):
        weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2
        return dbfile.get_float(weightspos)


# Utility functions

def minimize_ids(arry, stringids, compression=0):
    amax = arry[-1]

    if stringids:
        typecode = ''
        string = dumps(arry)
    else:
        typecode = arry.typecode
        if amax <= 255:
            typecode = "B"
        elif amax <= 65535:
            typecode = "H"

        if typecode != arry.typecode:
            arry = array(typecode, iter(arry))
        if not IS_LITTLE:
            arry.byteswap()
        string = array_tobytes(arry)
    if compression:
        string = zlib.compress(string, compression)
    return (typecode, string)


def deminimize_ids(typecode, count, string, compression=0):
    if compression:
        string = zlib.decompress(string)
    if typecode == '':
        return loads(string)
    else:
        arry = array(typecode)
        array_frombytes(arry, string)
        if not IS_LITTLE:
            arry.byteswap()
        return arry


def minimize_weights(weights, compression=0):
    if all(w == 1.0 for w in weights):
        string = b("")
    else:
        if not IS_LITTLE:
            weights.byteswap()
        string = array_tobytes(weights)
    if string and compression:
        string = zlib.compress(string, compression)
    return string


def deminimize_weights(count, string, compression=0):
    if not string:
        return array("f", (1.0 for _ in xrange(count)))
    if compression:
        string = zlib.decompress(string)
    arry = array("f")
    array_frombytes(arry, string)
    if not IS_LITTLE:
        arry.byteswap()
    return arry


def minimize_values(postingsize, values, compression=0):
    if postingsize < 0:
        string = dumps(values, -1)[2:]
    elif postingsize == 0:
        string = b('')
    else:
        string = b('').join(values)
    if string and compression:
        string = zlib.compress(string, compression)
    return string


def deminimize_values(postingsize, count, string, compression=0):
    if compression:
        string = zlib.decompress(string)

    if postingsize < 0:
        return loads(string)
    elif postingsize == 0:
        return [None] * count
    else:
        return [string[i:i + postingsize] for i
                in xrange(0, len(string), postingsize)]


# Legacy field types

from whoosh.compat import long_type
from whoosh.fields import NUMERIC


class OLD_NUMERIC(NUMERIC):
NUMERIC_DEFAULTS = {"b": 2 ** 7 - 1, "B": 2 ** 8 - 1, "h": 2 ** 15 - 1, "H": 2 ** 16 - 1, "i": 2 ** 31 - 1, "I": 2 ** 32 - 1, "q": 2 ** 63 - 1, "Q": 2 ** 64 - 1, "f": NaN, "d": NaN, } def __init__(self, type=int, stored=False, unique=False, field_boost=1.0, decimal_places=0, shift_step=4, signed=True): from whoosh import analysis, formats self.type = type if self.type is long_type: # This will catch the Python 3 int type self._to_text = self._long_to_text self._from_text = self._text_to_long self.sortable_typecode = "q" if signed else "Q" elif self.type is int: self._to_text = self._int_to_text self._from_text = self._text_to_int self.sortable_typecode = "i" if signed else "I" elif self.type is float: self._to_text = self._float_to_text self._from_text = self._text_to_float self.sortable_typecode = "f" elif self.type is Decimal: raise TypeError("To store Decimal instances, set type to int or " "float and use the decimal_places argument") else: raise TypeError("%s field type can't store %r" % (self.__class__, self.type)) self.stored = stored self.unique = unique self.decimal_places = decimal_places self.shift_step = shift_step self.signed = signed self.analyzer = analysis.IDAnalyzer() self.format = formats.Existence(field_boost=field_boost) def __setstate__(self, d): self.__dict__.update(d) self.numtype = d["type"] self.bits = 32 if (d["type"] is int and not PY3) else 64 def prepare_number(self, x): if x is None or x == emptybytes: return x if self.decimal_places: x = Decimal(x) x *= 10 ** self.decimal_places x = self.type(x) return x def unprepare_number(self, x): dc = self.decimal_places if dc: s = str(x) x = Decimal(s[:-dc] + "." + s[-dc:]) return x def to_bytes(self, x, shift=0): if isinstance(x, bytes_type): return x return utf8encode(self.to_text(x, shift))[0] def from_bytes(self, bs): return self.from_text(utf8decode(bs)[0]) def sortable_to_bytes(self, x, shift=0): if shift: x >>= shift return pack_byte(shift) + self._to_text() def to_text(self, x, shift=0): x = self.prepare_number(x) x = self._to_text(x, shift=shift, signed=self.signed) return x def from_text(self, t): x = self._from_text(t, signed=self.signed) return self.unprepare_number(x) def process_text(self, text, **kwargs): return (self.to_text(text),) def self_parsing(self): return True def parse_query(self, fieldname, qstring, boost=1.0): from whoosh import query if qstring == "*": return query.Every(fieldname, boost=boost) try: text = self.to_text(qstring) except Exception: e = sys.exc_info()[1] return query.error_query(e) return query.Term(fieldname, text, boost=boost) def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): from whoosh import query from whoosh.qparser.common import QueryParserError try: if start is not None: start = self.from_text(self.to_text(start)) if end is not None: end = self.from_text(self.to_text(end)) except Exception: e = sys.exc_info()[1] raise QueryParserError(e) return query.NumericRange(fieldname, start, end, startexcl, endexcl, boost=boost) def sortable_terms(self, ixreader, fieldname): for btext in ixreader.lexicon(fieldname): if btext[0:1] != "\x00": # Only yield the full-precision values break yield btext class OLD_DATETIME(OLD_NUMERIC): def __init__(self, stored=False, unique=False): OLD_NUMERIC.__init__(self, type=long_type, stored=stored, unique=unique, shift_step=8) def to_text(self, x, shift=0): from datetime import datetime from whoosh.util.times import floor try: if isinstance(x, text_type): # For indexing, support same strings as for query parsing x = 
self._parse_datestring(x) x = floor(x) # this makes most sense (unspecified = lowest) if isinstance(x, datetime): x = datetime_to_long(x) elif not isinstance(x, integer_types): raise TypeError() except Exception: raise ValueError("DATETIME.to_text can't convert from %r" % (x,)) x = OLD_NUMERIC.to_text(self, x, shift=shift) return x def from_text(self, x): x = OLD_NUMERIC.from_text(self, x) return long_to_datetime(x) def _parse_datestring(self, qstring): # This method parses a very simple datetime representation of the form # YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]] from whoosh.util.times import adatetime, fix, is_void qstring = qstring.replace(" ", "").replace("-", "").replace(".", "") year = month = day = hour = minute = second = microsecond = None if len(qstring) >= 4: year = int(qstring[:4]) if len(qstring) >= 6: month = int(qstring[4:6]) if len(qstring) >= 8: day = int(qstring[6:8]) if len(qstring) >= 10: hour = int(qstring[8:10]) if len(qstring) >= 12: minute = int(qstring[10:12]) if len(qstring) >= 14: second = int(qstring[12:14]) if len(qstring) == 20: microsecond = int(qstring[14:]) at = fix(adatetime(year, month, day, hour, minute, second, microsecond)) if is_void(at): raise Exception("%r is not a parseable date" % qstring) return at def parse_query(self, fieldname, qstring, boost=1.0): from whoosh import query from whoosh.util.times import is_ambiguous try: at = self._parse_datestring(qstring) except: e = sys.exc_info()[1] return query.error_query(e) if is_ambiguous(at): startnum = datetime_to_long(at.floor()) endnum = datetime_to_long(at.ceil()) return query.NumericRange(fieldname, startnum, endnum) else: return query.Term(fieldname, self.to_text(at), boost=boost) def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): from whoosh import query if start is None and end is None: return query.Every(fieldname, boost=boost) if start is not None: startdt = self._parse_datestring(start).floor() start = datetime_to_long(startdt) if end is not None: enddt = self._parse_datestring(end).ceil() end = datetime_to_long(enddt) return query.NumericRange(fieldname, start, end, boost=boost) # Functions for converting numbers to and from text def int_to_text(x, shift=0, signed=True): x = to_sortable(int, 32, signed, x) return sortable_int_to_text(x, shift) def text_to_int(text, signed=True): x = text_to_sortable_int(text) x = from_sortable(int, 32, signed, x) return x def long_to_text(x, shift=0, signed=True): x = to_sortable(long_type, 64, signed, x) return sortable_long_to_text(x, shift) def text_to_long(text, signed=True): x = text_to_sortable_long(text) x = from_sortable(long_type, 64, signed, x) return x def float_to_text(x, shift=0, signed=True): x = to_sortable(float, 32, signed, x) return sortable_long_to_text(x, shift) def text_to_float(text, signed=True): x = text_to_sortable_long(text) x = from_sortable(float, 32, signed, x) return x # Functions for converting sortable representations to and from text. 
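# Editor's sketch (not part of the original module): the helpers below prefix a
# "shift" byte onto a base-85 rendering of the sortable integer, so that all
# full-precision (shift 0) terms group together at the front of the field's
# lexicon, followed by the coarser shifted forms.  The demo below uses
# hexadecimal instead of the package's base85 helpers purely for illustration;
# `demo_sortable_text` is a hypothetical name:
#
#     def demo_sortable_text(x, shift=0):
#         if shift:
#             x >>= shift
#         return chr(shift) + "%08x" % x
#
#     # demo_sortable_text(5000)    -> '\x00' + '00001388'
#     # demo_sortable_text(5000, 4) -> '\x04' + '00000138'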
from whoosh.support.base85 import to_base85, from_base85 def sortable_int_to_text(x, shift=0): if shift: x >>= shift #text = chr(shift) + u"%08x" % x text = chr(shift) + to_base85(x, False) return text def sortable_long_to_text(x, shift=0): if shift: x >>= shift #text = chr(shift) + u"%016x" % x #assert len(text) == 17 text = chr(shift) + to_base85(x, True) return text def text_to_sortable_int(text): #assert len(text) == 9 #return int(text[1:], 16) return from_base85(text[1:]) def text_to_sortable_long(text): #assert len(text) == 17 #return long(text[1:], 16) return from_base85(text[1:]) Whoosh-2.5.7/src/whoosh/codec/whoosh3.py0000644000076500000240000011711112254366350020213 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module implements a "codec" for writing/reading Whoosh X indexes. 
""" import struct from array import array from collections import defaultdict from whoosh import columns, formats from whoosh.compat import b, bytes_type, string_type, integer_types from whoosh.compat import dumps, loads, iteritems, xrange from whoosh.codec import base from whoosh.filedb import compound, filetables from whoosh.matching import ListMatcher, ReadTooFar, LeafMatcher from whoosh.reading import TermInfo, TermNotFound from whoosh.system import emptybytes from whoosh.system import _SHORT_SIZE, _INT_SIZE, _LONG_SIZE, _FLOAT_SIZE from whoosh.system import pack_ushort, unpack_ushort from whoosh.system import pack_int, unpack_int, pack_long, unpack_long from whoosh.util.numlists import delta_encode, delta_decode from whoosh.util.numeric import length_to_byte, byte_to_length try: import zlib except ImportError: zlib = None # This byte sequence is written at the start of a posting list to identify the # codec/version WHOOSH3_HEADER_MAGIC = b("W3Bl") # Column type to store field length info LENGTHS_COLUMN = columns.NumericColumn("B", default=0) # Column type to store pointers to vector posting lists VECTOR_COLUMN = columns.NumericColumn("I") # Column type to store vector posting list lengths VECTOR_LEN_COLUMN = columns.NumericColumn("i") # Column type to store values of stored fields STORED_COLUMN = columns.PickleColumn(columns.CompressedBytesColumn()) class W3Codec(base.CodecWithGraph): # File extensions TERMS_EXT = ".trm" # Term index POSTS_EXT = ".pst" # Term postings VPOSTS_EXT = ".vps" # Vector postings COLUMN_EXT = ".col" # Per-document value columns def __init__(self, blocklimit=128, compression=3, inlinelimit=1): self._blocklimit = blocklimit self._compression = compression self._inlinelimit = inlinelimit # Per-document value writer def per_document_writer(self, storage, segment): return W3PerDocWriter(self, storage, segment) # Inverted index writer def field_writer(self, storage, segment): return W3FieldWriter(self, storage, segment) # Postings def postings_writer(self, dbfile, byteids=False): return W3PostingsWriter(dbfile, blocklimit=self._blocklimit, byteids=byteids, compression=self._compression, inlinelimit=self._inlinelimit) def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None): if terminfo.is_inlined(): # If the postings were inlined into the terminfo object, pull them # out and use a ListMatcher to wrap them in a Matcher interface ids, weights, values = terminfo.inlined_postings() m = ListMatcher(ids, weights, values, format_, scorer=scorer, term=term, terminfo=terminfo) else: offset, length = terminfo.extent() m = W3LeafMatcher(dbfile, offset, length, format_, term=term, scorer=scorer) return m # Readers def per_document_reader(self, storage, segment): return W3PerDocReader(storage, segment) def terms_reader(self, storage, segment): tiname = segment.make_filename(self.TERMS_EXT) tilen = storage.file_length(tiname) tifile = storage.open_file(tiname) postfile = segment.open_file(storage, self.POSTS_EXT) return W3TermsReader(self, tifile, tilen, postfile) # Graph methods provided by CodecWithGraph # Columns def supports_columns(self): return True @classmethod def column_filename(cls, segment, fieldname): ext = "".join((".", fieldname, cls.COLUMN_EXT)) return segment.make_filename(ext) # Segments and generations def new_segment(self, storage, indexname): return W3Segment(self, indexname) # Common functions def _vecfield(fieldname): return "_%s_vec" % fieldname def _lenfield(fieldname): return "_%s_len" % fieldname # Per-doc information writer class 
W3PerDocWriter(base.PerDocWriterWithColumns): def __init__(self, codec, storage, segment): self._codec = codec self._storage = storage self._segment = segment tempst = storage.temp_storage("%s.tmp" % segment.indexname) self._cols = compound.CompoundWriter(tempst) self._colwriters = {} self._create_column("_stored", STORED_COLUMN) self._fieldlengths = defaultdict(int) self._doccount = 0 self._docnum = None self._storedfields = None self._indoc = False self.is_closed = False # We'll wait to create the vector file until someone actually tries # to add a vector self._vpostfile = None def _create_file(self, ext): return self._segment.create_file(self._storage, ext) def _has_column(self, fieldname): return fieldname in self._colwriters def _create_column(self, fieldname, column): writers = self._colwriters if fieldname in writers: raise Exception("Already added column %r" % fieldname) f = self._cols.create_file(fieldname) writers[fieldname] = column.writer(f) def _get_column(self, fieldname): return self._colwriters[fieldname] def _prep_vectors(self): self._vpostfile = self._create_file(W3Codec.VPOSTS_EXT) # We'll use offset==0 as a marker for "no vectors", so we can't start # postings at position 0, so just write a few header bytes :) self._vpostfile.write(b("VPST")) def start_doc(self, docnum): if self._indoc: raise Exception("Called start_doc when already in a doc") if docnum != self._doccount: raise Exception("Called start_doc(%r) was expecting %r" % (docnum, self._doccount)) self._docnum = docnum self._doccount += 1 self._storedfields = {} self._indoc = True def add_field(self, fieldname, fieldobj, value, length): if value is not None: self._storedfields[fieldname] = value if length: # Add byte to length column lenfield = _lenfield(fieldname) lb = length_to_byte(length) self.add_column_value(lenfield, LENGTHS_COLUMN, lb) # Add length to total field length self._fieldlengths[fieldname] += length def add_vector_items(self, fieldname, fieldobj, items): if self._vpostfile is None: self._prep_vectors() # Write vector postings vpostwriter = self._codec.postings_writer(self._vpostfile, byteids=True) vpostwriter.start_postings(fieldobj.vector, W3TermInfo()) for text, weight, vbytes in items: vpostwriter.add_posting(text, weight, vbytes) # finish_postings() returns terminfo object vinfo = vpostwriter.finish_postings() # Add row to vector lookup column vecfield = _vecfield(fieldname) # Compute vector column name offset, length = vinfo.extent() self.add_column_value(vecfield, VECTOR_COLUMN, offset) self.add_column_value(vecfield + "L", VECTOR_LEN_COLUMN, length) def finish_doc(self): sf = self._storedfields if sf: self.add_column_value("_stored", STORED_COLUMN, sf) sf.clear() self._indoc = False def _column_filename(self, fieldname): return W3Codec.column_filename(self._segment, fieldname) def close(self): if self._indoc is not None: # Called close without calling finish_doc self.finish_doc() self._segment._fieldlengths = self._fieldlengths # Finish open columns and close the columns writer for writer in self._colwriters.values(): writer.finish(self._doccount) self._cols.save_as_files(self._storage, self._column_filename) # If vectors were written, close the vector writers if self._vpostfile: self._vpostfile.close() self.is_closed = True class W3FieldWriter(base.FieldWriterWithGraph): def __init__(self, codec, storage, segment): self._codec = codec self._storage = storage self._segment = segment self._fieldname = None self._fieldid = None self._btext = None self._fieldobj = None self._format = None 
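        # Illustrative sketch (editor's addition): the term index created just
        # below keys each entry on a 2-byte field number followed by the raw
        # term bytes, so entries cluster by field and sort bytewise within it.
        # A minimal encoder/decoder for that key shape, stdlib only
        # (`encode_term_key` / `decode_term_key` are hypothetical names):
        #
        #     import struct
        #
        #     def encode_term_key(fieldnum, termbytes):
        #         return struct.pack("!H", fieldnum) + termbytes
        #
        #     def decode_term_key(keybytes):
        #         fieldnum, = struct.unpack("!H", keybytes[:2])
        #         return fieldnum, keybytes[2:]
        #
        #     # encode_term_key(3, b"apple") == b"\x00\x03apple"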
_tifile = self._create_file(W3Codec.TERMS_EXT) self._tindex = filetables.OrderedHashWriter(_tifile) self._fieldmap = self._tindex.extras["fieldmap"] = {} self._postfile = self._create_file(W3Codec.POSTS_EXT) self._postwriter = None self._infield = False self.is_closed = False def _create_file(self, ext): return self._segment.create_file(self._storage, ext) def start_field(self, fieldname, fieldobj): fmap = self._fieldmap if fieldname in fmap: self._fieldid = fmap[fieldname] else: self._fieldid = len(fmap) fmap[fieldname] = self._fieldid self._fieldname = fieldname self._fieldobj = fieldobj self._format = fieldobj.format self._infield = True # Set up graph for this field if necessary self._start_graph_field(fieldname, fieldobj) # Start a new postwriter for this field self._postwriter = self._codec.postings_writer(self._postfile) def start_term(self, btext): if self._postwriter is None: raise Exception("Called start_term before start_field") self._btext = btext self._postwriter.start_postings(self._fieldobj.format, W3TermInfo()) # Add the word to the graph if necessary self._insert_graph_key(btext) def add(self, docnum, weight, vbytes, length): self._postwriter.add_posting(docnum, weight, vbytes, length) def finish_term(self): terminfo = self._postwriter.finish_postings() # Add row to term info table keybytes = pack_ushort(self._fieldid) + self._btext valbytes = terminfo.to_bytes() self._tindex.add(keybytes, valbytes) # FieldWriterWithGraph.add_spell_word def finish_field(self): if not self._infield: raise Exception("Called finish_field before start_field") self._infield = False self._postwriter = None self._finish_graph_field() def close(self): self._tindex.close() self._postfile.close() self._close_graph() self.is_closed = True # Reader objects class W3PerDocReader(base.PerDocumentReader): def __init__(self, storage, segment): self._storage = storage self._segment = segment self._doccount = segment.doc_count_all() self._vpostfile = None self._colfiles = {} self._readers = {} self._minlengths = {} self._maxlengths = {} def close(self): for colfile, _, _ in self._colfiles.values(): colfile.close() if self._vpostfile: self._vpostfile.close() def doc_count(self): return self._doccount - self._segment.deleted_count() def doc_count_all(self): return self._doccount # Deletions def has_deletions(self): return self._segment.has_deletions() def is_deleted(self, docnum): return self._segment.is_deleted(docnum) def deleted_docs(self): return self._segment.deleted_docs() # Columns def has_column(self, fieldname): filename = W3Codec.column_filename(self._segment, fieldname) return self._storage.file_exists(filename) def _get_column_file(self, fieldname): filename = W3Codec.column_filename(self._segment, fieldname) length = self._storage.file_length(filename) colfile = self._storage.open_file(filename) return colfile, 0, length def column_reader(self, fieldname, column): if fieldname not in self._colfiles: self._colfiles[fieldname] = self._get_column_file(fieldname) colfile, offset, length = self._colfiles[fieldname] return column.reader(colfile, offset, length, self._doccount) # Lengths def _cached_reader(self, fieldname, column): if fieldname in self._readers: return self._readers[fieldname] else: if not self.has_column(fieldname): return None reader = self.column_reader(fieldname, column) self._readers[fieldname] = reader return reader def doc_field_length(self, docnum, fieldname, default=0): if docnum > self._doccount: raise IndexError("Asked for docnum %r of %d" % (docnum, self._doccount)) lenfield 
= _lenfield(fieldname) reader = self._cached_reader(lenfield, LENGTHS_COLUMN) if reader is None: return default lbyte = reader[docnum] if lbyte: return byte_to_length(lbyte) def field_length(self, fieldname): return self._segment._fieldlengths.get(fieldname, 0) def _minmax_length(self, fieldname, op, cache): if fieldname in cache: return cache[fieldname] lenfield = _lenfield(fieldname) reader = self._cached_reader(lenfield, LENGTHS_COLUMN) length = byte_to_length(op(reader)) cache[fieldname] = length return length def min_field_length(self, fieldname): return self._minmax_length(fieldname, min, self._minlengths) def max_field_length(self, fieldname): return self._minmax_length(fieldname, max, self._maxlengths) # Vectors def _prep_vectors(self): f = self._segment.open_file(self._storage, W3Codec.VPOSTS_EXT) self._vpostfile = f def _vector_extent(self, docnum, fieldname): if docnum > self._doccount: raise IndexError("Asked for document %r of %d" % (docnum, self._doccount)) vecfield = _vecfield(fieldname) # Compute vector column name # Get the offset from the vector offset column offset = self._cached_reader(vecfield, VECTOR_COLUMN)[docnum] # Get the length from the length column, if it exists, otherwise return # -1 for the length (backwards compatibility with old dev versions) lreader = self._cached_reader(vecfield + "L", VECTOR_COLUMN) if lreader: length = [docnum] else: length = -1 return offset, length def has_vector(self, docnum, fieldname): return (self.has_column(_vecfield(fieldname)) and self._vector_extent(docnum, fieldname)) def vector(self, docnum, fieldname, format_): if self._vpostfile is None: self._prep_vectors() offset, length = self._vector_extent(docnum, fieldname) m = W3LeafMatcher(self._vpostfile, offset, length, format_, byteids=True) return m # Stored fields def stored_fields(self, docnum): reader = self._cached_reader("_stored", STORED_COLUMN) v = reader[docnum] if v is None: v = {} return v class W3TermsReader(base.TermsReader): def __init__(self, codec, dbfile, length, postfile): self._codec = codec self._dbfile = dbfile self._tindex = filetables.OrderedHashReader(dbfile, length) self._fieldmap = self._tindex.extras["fieldmap"] self._postfile = postfile self._fieldunmap = [None] * len(self._fieldmap) for fieldname, num in iteritems(self._fieldmap): self._fieldunmap[num] = fieldname def _keycoder(self, fieldname, tbytes): assert isinstance(tbytes, bytes_type), "tbytes=%r" % tbytes fnum = self._fieldmap.get(fieldname, 65535) return pack_ushort(fnum) + tbytes def _keydecoder(self, keybytes): fieldid = unpack_ushort(keybytes[:_SHORT_SIZE])[0] return self._fieldunmap[fieldid], keybytes[_SHORT_SIZE:] def _range_for_key(self, fieldname, tbytes): return self._tindex.range_for_key(self._keycoder(fieldname, tbytes)) def __contains__(self, term): return self._keycoder(*term) in self._tindex def indexed_field_names(self): return self._fieldmap.keys() def terms(self): keydecoder = self._keydecoder return (keydecoder(keybytes) for keybytes in self._tindex.keys()) def terms_from(self, fieldname, prefix): prefixbytes = self._keycoder(fieldname, prefix) keydecoder = self._keydecoder return (keydecoder(keybytes) for keybytes in self._tindex.keys_from(prefixbytes)) def items(self): tidecoder = W3TermInfo.from_bytes keydecoder = self._keydecoder return ((keydecoder(keybytes), tidecoder(valbytes)) for keybytes, valbytes in self._tindex.items()) def items_from(self, fieldname, prefix): prefixbytes = self._keycoder(fieldname, prefix) tidecoder = W3TermInfo.from_bytes keydecoder = 
self._keydecoder return ((keydecoder(keybytes), tidecoder(valbytes)) for keybytes, valbytes in self._tindex.items_from(prefixbytes)) def term_info(self, fieldname, tbytes): key = self._keycoder(fieldname, tbytes) try: return W3TermInfo.from_bytes(self._tindex[key]) except KeyError: raise TermNotFound("No term %s:%r" % (fieldname, tbytes)) def frequency(self, fieldname, tbytes): datapos = self._range_for_key(fieldname, tbytes)[0] return W3TermInfo.read_weight(self._dbfile, datapos) def doc_frequency(self, fieldname, tbytes): datapos = self._range_for_key(fieldname, tbytes)[0] return W3TermInfo.read_doc_freq(self._dbfile, datapos) def matcher(self, fieldname, tbytes, format_, scorer=None): terminfo = self.term_info(fieldname, tbytes) m = self._codec.postings_reader(self._postfile, terminfo, format_, term=(fieldname, tbytes), scorer=scorer) return m def close(self): self._tindex.close() self._postfile.close() # Postings class W3PostingsWriter(base.PostingsWriter): """This object writes posting lists to the postings file. It groups postings into blocks and tracks block level statistics to makes it easier to skip through the postings. """ def __init__(self, postfile, blocklimit, byteids=False, compression=3, inlinelimit=1): self._postfile = postfile self._blocklimit = blocklimit self._byteids = byteids self._compression = compression self._inlinelimit = inlinelimit self._blockcount = 0 self._format = None self._terminfo = None def written(self): return self._blockcount > 0 def start_postings(self, format_, terminfo): # Start a new term if self._terminfo: # If self._terminfo is not None, that means we are already in a term raise Exception("Called start in a term") assert isinstance(format_, formats.Format) self._format = format_ # Reset block count self._blockcount = 0 # Reset block bufferg self._new_block() # Remember terminfo object passed to us self._terminfo = terminfo # Remember where we started in the posting file self._startoffset = self._postfile.tell() def add_posting(self, id_, weight, vbytes, length=None): # Add a posting to the buffered block # If the number of buffered postings == the block limit, write out the # buffered block and reset before adding this one if len(self._ids) >= self._blocklimit: self._write_block() # Check types if self._byteids: assert isinstance(id_, string_type), "id_=%r" % id_ else: assert isinstance(id_, integer_types), "id_=%r" % id_ assert isinstance(weight, (int, float)), "weight=%r" % weight assert isinstance(vbytes, bytes_type), "vbytes=%r" % vbytes assert length is None or isinstance(length, integer_types) self._ids.append(id_) self._weights.append(weight) if weight > self._maxweight: self._maxweight = weight if vbytes: self._values.append(vbytes) if length: minlength = self._minlength if minlength is None or length < minlength: self._minlength = length if length > self._maxlength: self._maxlength = length def finish_postings(self): terminfo = self._terminfo # If we have fewer than "inlinelimit" postings in this posting list, # "inline" the postings into the terminfo instead of writing them to # the posting file if not self.written() and len(self) < self._inlinelimit: terminfo.add_block(self) terminfo.set_inline(self._ids, self._weights, self._values) else: # If there are leftover items in the current block, write them out if self._ids: self._write_block(last=True) startoffset = self._startoffset length = self._postfile.tell() - startoffset terminfo.set_extent(startoffset, length) # Clear self._terminfo to indicate we're between terms self._terminfo = 
None # Return the current terminfo object return terminfo def _new_block(self): # Reset block buffer # List of IDs (docnums for regular posting list, terms for vector PL) self._ids = [] if self._byteids else array("I") # List of weights self._weights = array("f") # List of encoded payloads self._values = [] # Statistics self._minlength = None self._maxlength = 0 self._maxweight = 0 def _write_block(self, last=False): # Write the buffered block to the postings file # If this is the first block, write a small header first if not self._blockcount: self._postfile.write(WHOOSH3_HEADER_MAGIC) # Add this block's statistics to the terminfo object, which tracks the # overall statistics for all term postings self._terminfo.add_block(self) # Minify the IDs, weights, and values, and put them in a tuple data = (self._mini_ids(), self._mini_weights(), self._mini_values()) # Pickle the tuple databytes = dumps(data) # If the pickle is less than 20 bytes, don't bother compressing if len(databytes) < 20: comp = 0 # Compress the pickle (if self._compression > 0) comp = self._compression if comp: databytes = zlib.compress(databytes, comp) # Make a tuple of block info. The posting reader can check this info # and decide whether to skip the block without having to decompress the # full block data # # - Number of postings in block # - Last ID in block # - Maximum weight in block # - Compression level # - Minimum length byte # - Maximum length byte ids = self._ids infobytes = dumps((len(ids), ids[-1], self._maxweight, comp, length_to_byte(self._minlength), length_to_byte(self._maxlength), )) # Write block length postfile = self._postfile blocklength = len(infobytes) + len(databytes) if last: # If this is the last block, use a negative number blocklength *= -1 postfile.write_int(blocklength) # Write block info postfile.write(infobytes) # Write block data postfile.write(databytes) self._blockcount += 1 # Reset block buffer self._new_block() # Methods to reduce the byte size of the various lists def _mini_ids(self): # Minify IDs ids = self._ids if not self._byteids: ids = delta_encode(ids) return tuple(ids) def _mini_weights(self): # Minify weights weights = self._weights if all(w == 1.0 for w in weights): return None elif all(w == weights[0] for w in weights): return weights[0] else: return tuple(weights) def _mini_values(self): # Minify values fixedsize = self._format.fixed_value_size() values = self._values if fixedsize is None or fixedsize < 0: vs = tuple(values) elif fixedsize == 0: vs = None else: vs = emptybytes.join(values) return vs # Block stats methods def __len__(self): # Returns the number of unwritten buffered postings return len(self._ids) def min_id(self): # First ID in the buffered block return self._ids[0] def max_id(self): # Last ID in the buffered block return self._ids[-1] def min_length(self): # Shortest field length in the buffered block return self._minlength def max_length(self): # Longest field length in the buffered block return self._maxlength def max_weight(self): # Highest weight in the buffered block return self._maxweight class W3LeafMatcher(LeafMatcher): """Reads on-disk postings from the postings file and presents the :class:`whoosh.matching.Matcher` interface. 
""" def __init__(self, postfile, startoffset, length, format_, term=None, byteids=None, scorer=None): self._postfile = postfile self._startoffset = startoffset self._length = length self.format = format_ self._term = term self._byteids = byteids self.scorer = scorer self._fixedsize = self.format.fixed_value_size() # Read the header tag at the start of the postings self._read_header() # "Reset" to read the first block self.reset() def _read_header(self): # Seek to the start of the postings and check the header tag postfile = self._postfile postfile.seek(self._startoffset) magic = postfile.read(4) if magic != WHOOSH3_HEADER_MAGIC: raise Exception("Block tag error %r" % magic) # Remember the base offset (start of postings, after the header) self._baseoffset = postfile.tell() def reset(self): # Reset block stats self._blocklength = None self._maxid = None self._maxweight = None self._compression = None self._minlength = None self._maxlength = None self._lastblock = False self._atend = False # Consume first block self._goto(self._baseoffset) def _goto(self, position): # Read the posting block at the given position postfile = self._postfile # Reset block data -- we'll lazy load the data from the new block as # needed self._data = None self._ids = None self._weights = None self._values = None # Reset pointer into the block self._i = 0 # Seek to the start of the block postfile.seek(position) # Read the block length length = postfile.read_int() # If the block length is negative, that means this is the last block if length < 0: self._lastblock = True length *= -1 # Remember the offset of the next block self._nextoffset = position + _INT_SIZE + length # Read the pickled block info tuple info = postfile.read_pickle() # Remember the offset of the block's data self._dataoffset = postfile.tell() # Decompose the info tuple to set the current block info (self._blocklength, self._maxid, self._maxweight, self._compression, mnlen, mxlen) = info self._minlength = byte_to_length(mnlen) self._maxlength = byte_to_length(mxlen) def _next_block(self): if self._atend: # We were already at the end, and yet somebody called _next_block() # again, so something is wrong somewhere raise Exception("No next block") elif self._lastblock: # Reached the end of the postings self._atend = True else: # Go to the next block self._goto(self._nextoffset) def _skip_to_block(self, skipwhile): # Skip blocks as long as the skipwhile() function returns True skipped = 0 while self.is_active() and skipwhile(): self._next_block() skipped += 1 return skipped def is_active(self): return not self._atend and self._i < self._blocklength def id(self): # Get the current ID (docnum for regular postings, term for vector) # If we haven't loaded the block IDs yet, load them now if self._ids is None: self._read_ids() return self._ids[self._i] def weight(self): # Get the weight for the current posting # If we haven't loaded the block weights yet, load them now if self._weights is None: self._read_weights() return self._weights[self._i] def value(self): # Get the value for the current posting # If we haven't loaded the block values yet, load them now if self._values is None: self._read_values() return self._values[self._i] def next(self): # Move to the next posting # Increment the in-block pointer self._i += 1 # If we reached the end of the block, move to the next block if self._i == self._blocklength: self._next_block() return True else: return False def skip_to(self, targetid): # Skip to the next ID equal to or greater than the given target ID if not 
self.is_active(): raise ReadTooFar # If we're already at or past target ID, do nothing if targetid <= self.id(): return # Skip to the block that would contain the target ID block_max_id = self.block_max_id if targetid > block_max_id(): self._skip_to_block(lambda: targetid > block_max_id()) # Iterate through the IDs in the block until we find or pass the # target while self.is_active() and self.id() < targetid: self.next() def skip_to_quality(self, minquality): # Skip blocks until we find one that might exceed the given minimum # quality block_quality = self.block_quality # If the quality of this block is already higher than the minimum, # do nothing if block_quality() > minquality: return 0 # Skip blocks as long as the block quality is not greater than the # minimum return self._skip_to_block(lambda: block_quality() <= minquality) def block_min_id(self): if self._ids is None: self._read_ids() return self._ids[0] def block_max_id(self): return self._maxid def block_min_length(self): return self._minlength def block_max_length(self): return self._maxlength def block_max_weight(self): return self._maxweight def _read_data(self): # Load block data tuple from disk datalen = self._nextoffset - self._dataoffset b = self._postfile.get(self._dataoffset, datalen) # Decompress the pickled data if necessary if self._compression: b = zlib.decompress(b) # Unpickle the data tuple and save it in an attribute self._data = loads(b) def _read_ids(self): # If we haven't loaded the data from disk yet, load it now if self._data is None: self._read_data() ids = self._data[0] # De-minify the IDs if not self._byteids: ids = tuple(delta_decode(ids)) self._ids = ids def _read_weights(self): # If we haven't loaded the data from disk yet, load it now if self._data is None: self._read_data() weights = self._data[1] # De-minify the weights postcount = self._blocklength if weights is None: self._weights = array("f", (1.0 for _ in xrange(postcount))) elif isinstance(weights, float): self._weights = array("f", (weights for _ in xrange(postcount))) else: self._weights = weights def _read_values(self): # If we haven't loaded the data from disk yet, load it now if self._data is None: self._read_data() # De-minify the values fixedsize = self._fixedsize vs = self._data[2] if fixedsize is None or fixedsize < 0: self._values = vs elif fixedsize is 0: self._values = (None,) * self._blocklength else: assert isinstance(vs, bytes_type) self._values = tuple(vs[i:i + fixedsize] for i in xrange(0, len(vs), fixedsize)) # Term info implementation class W3TermInfo(TermInfo): # B | Flags # f | Total weight # I | Total doc freq # B | Min length (encoded as byte) # B | Max length (encoded as byte) # f | Max weight # I | Minimum (first) ID # I | Maximum (last) ID _struct = struct.Struct("!BfIBBfII") def __init__(self, *args, **kwargs): TermInfo.__init__(self, *args, **kwargs) self._offset = None self._length = None self._inlined = None def add_block(self, block): self._weight += sum(block._weights) self._df += len(block) ml = block.min_length() if self._minlength is None: self._minlength = ml else: self._minlength = min(self._minlength, ml) self._maxlength = max(self._maxlength, block.max_length()) self._maxweight = max(self._maxweight, block.max_weight()) if self._minid is None: self._minid = block.min_id() self._maxid = block.max_id() def set_extent(self, offset, length): self._offset = offset self._length = length def extent(self): return self._offset, self._length def set_inlined(self, ids, weights, values): self._inlined = (tuple(ids), 
tuple(weights), tuple(values)) def is_inlined(self): return self._inlined is not None def inlined_postings(self): return self._inlined def to_bytes(self): isinlined = self.is_inlined() # Encode the lengths as 0-255 values minlength = (0 if self._minlength is None else length_to_byte(self._minlength)) maxlength = length_to_byte(self._maxlength) # Convert None values to the out-of-band NO_ID constant so they can be # stored as unsigned ints minid = 0xffffffff if self._minid is None else self._minid maxid = 0xffffffff if self._maxid is None else self._maxid # Pack the term info into bytes st = self._struct.pack(isinlined, self._weight, self._df, minlength, maxlength, self._maxweight, minid, maxid) if isinlined: # Postings are inlined - dump them using the pickle protocol postbytes = dumps(self._inlined, -1) else: postbytes = pack_long(self._offset) + pack_int(self._length) st += postbytes return st @classmethod def from_bytes(cls, s): st = cls._struct vals = st.unpack(s[:st.size]) terminfo = cls() flags = vals[0] terminfo._weight = vals[1] terminfo._df = vals[2] terminfo._minlength = byte_to_length(vals[3]) terminfo._maxlength = byte_to_length(vals[4]) terminfo._maxweight = vals[5] terminfo._minid = None if vals[6] == 0xffffffff else vals[6] terminfo._maxid = None if vals[7] == 0xffffffff else vals[7] if flags: # Postings are stored inline terminfo._inlined = loads(s[st.size:]) else: # Last bytes are pointer into posting file and length offpos = st.size lenpos = st.size + _LONG_SIZE terminfo._offset = unpack_long(s[offpos:lenpos])[0] terminfo._length = unpack_int(s[lenpos:lenpos + _INT_SIZE]) return terminfo @classmethod def read_weight(cls, dbfile, datapos): return dbfile.get_float(datapos + 1) @classmethod def read_doc_freq(cls, dbfile, datapos): return dbfile.get_uint(datapos + 1 + _FLOAT_SIZE) @classmethod def read_min_and_max_length(cls, dbfile, datapos): lenpos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE ml = byte_to_length(dbfile.get_byte(lenpos)) xl = byte_to_length(dbfile.get_byte(lenpos + 1)) return ml, xl @classmethod def read_max_weight(cls, dbfile, datapos): weightspos = datapos + 1 + _FLOAT_SIZE + _INT_SIZE + 2 return dbfile.get_float(weightspos) # Segment implementation class W3Segment(base.Segment): def __init__(self, codec, indexname, doccount=0, segid=None, deleted=None): self.indexname = indexname self.segid = self._random_id() if segid is None else segid self._codec = codec self._doccount = doccount self._deleted = deleted self.compound = False def codec(self, **kwargs): return self._codec def set_doc_count(self, dc): self._doccount = dc def doc_count_all(self): return self._doccount def deleted_count(self): if self._deleted is None: return 0 return len(self._deleted) def deleted_docs(self): if self._deleted is None: return () else: return iter(self._deleted) def delete_document(self, docnum, delete=True): if delete: if self._deleted is None: self._deleted = set() self._deleted.add(docnum) elif self._deleted is not None and docnum in self._deleted: self._deleted.clear(docnum) def is_deleted(self, docnum): if self._deleted is None: return False return docnum in self._deleted Whoosh-2.5.7/src/whoosh/collectors.py0000644000076500000240000012226512254366764017734 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. 
Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module contains "collector" objects. Collectors provide a way to gather "raw" results from a :class:`whoosh.matching.Matcher` object, implement sorting, filtering, collation, etc., and produce a :class:`whoosh.searching.Results` object. The basic collectors are: TopCollector Returns the top N matching results sorted by score, using block-quality optimizations to skip blocks of documents that can't contribute to the top N. The :meth:`whoosh.searching.Searcher.search` method uses this type of collector by default or when you specify a ``limit``. UnlimitedCollector Returns all matching results sorted by score. The :meth:`whoosh.searching.Searcher.search` method uses this type of collector when you specify ``limit=None`` or you specify a limit equal to or greater than the number of documents in the searcher. SortingCollector Returns all matching results sorted by a :class:`whoosh.sorting.Facet` object. The :meth:`whoosh.searching.Searcher.search` method uses this type of collector when you use the ``sortedby`` parameter. Here's an example of a simple collector that instead of remembering the matched documents just counts up the number of matches:: class CountingCollector(Collector): def prepare(self, top_searcher, q, context): # Always call super method in prepare Collector.prepare(self, top_searcher, q, context) self.count = 0 def collect(self, sub_docnum): self.count += 1 c = CountingCollector() mysearcher.search_with_collector(myquery, c) print(c.count) There are also several wrapping collectors that extend or modify the functionality of other collectors. The meth:`whoosh.searching.Searcher.search` method uses many of these when you specify various parameters. NOTE: collectors are not designed to be reentrant or thread-safe. It is generally a good idea to create a new collector for each search. 
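As a further illustration (a sketch only, reusing the ``mysearcher`` and
``myquery`` placeholders from the example above), wrapping collectors can be
combined with the basic ones, for example to take the top 20 scored hits but
give up after five seconds::

    from whoosh import collectors

    tc = collectors.TopCollector(limit=20)
    tlc = collectors.TimeLimitCollector(tc, timelimit=5.0)
    try:
        mysearcher.search_with_collector(myquery, tlc)
    except collectors.TimeLimit:
        pass
    print(tlc.results())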
""" import os import threading from array import array from bisect import insort from collections import defaultdict from heapq import heapify, heappush, heapreplace from whoosh import sorting from whoosh.compat import abstractmethod, iteritems, itervalues, xrange from whoosh.searching import Results, TimeLimit from whoosh.util import now # Functions def ilen(iterator): total = 0 for _ in iterator: total += 1 return total # Base class class Collector(object): """Base class for collectors. """ def prepare(self, top_searcher, q, context): """This method is called before a search. Subclasses can override this to perform set-up work, but they should still call the superclass's method because it sets several necessary attributes on the collector object: self.top_searcher The top-level searcher. self.q The query object self.context ``context.needs_current`` controls whether a wrapping collector requires that this collector's matcher be in a valid state at every call to ``collect()``. If this is ``False``, the collector is free to use faster methods that don't necessarily keep the matcher updated, such as ``matcher.all_ids()``. :param top_searcher: the top-level :class:`whoosh.searching.Searcher` object. :param q: the :class:`whoosh.query.Query` object being searched for. :param context: a :class:`whoosh.searching.SearchContext` object containing information about the search. """ self.top_searcher = top_searcher self.q = q self.context = context self.starttime = now() self.runtime = None self.docset = set() def run(self): # Collect matches for each sub-searcher try: for subsearcher, offset in self.top_searcher.leaf_searchers(): self.set_subsearcher(subsearcher, offset) self.collect_matches() finally: self.finish() def set_subsearcher(self, subsearcher, offset): """This method is called each time the collector starts on a new sub-searcher. Subclasses can override this to perform set-up work, but they should still call the superclass's method because it sets several necessary attributes on the collector object: self.subsearcher The current sub-searcher. If the top-level searcher is atomic, this is the same as the top-level searcher. self.offset The document number offset of the current searcher. You must add this number to the document number passed to :meth:`Collector.collect` to get the top-level document number for use in results. self.matcher A :class:`whoosh.matching.Matcher` object representing the matches for the query in the current sub-searcher. """ self.subsearcher = subsearcher self.offset = offset self.matcher = self.q.matcher(subsearcher, self.context) def computes_count(self): """Returns True if the collector naturally computes the exact number of matching documents. Collectors that use block optimizations will return False since they might skip blocks containing matching documents. Note that if this method returns False you can still call :meth:`count`, but it means that method might have to do more work to calculate the number of matching documents. """ return True def all_ids(self): """Returns a sequence of docnums matched in this collector. (Only valid after the collector is run.) The default implementation is based on the docset. If a collector does not maintain the docset, it will need to override this method. """ return self.docset def count(self): """Returns the total number of documents matched in this collector. (Only valid after the collector is run.) The default implementation is based on the docset. 
If a collector does not maintain the docset, it will need to override this method. """ return len(self.docset) def collect_matches(self): """This method calls :meth:`Collector.matches` and then for each matched document calls :meth:`Collector.collect`. Sub-classes that want to intervene between finding matches and adding them to the collection (for example, to filter out certain documents) can override this method. """ collect = self.collect for sub_docnum in self.matches(): collect(sub_docnum) @abstractmethod def collect(self, sub_docnum): """This method is called for every matched document. It should do the work of adding a matched document to the results, and it should return an object to use as a "sorting key" for the given document (such as the document's score, a key generated by a facet, or just None). Subclasses must implement this method. If you want the score for the current document, use ``self.matcher.score()``. Overriding methods should add the current document offset (``self.offset``) to the ``sub_docnum`` to get the top-level document number for the matching document to add to results. :param sub_docnum: the document number of the current match within the current sub-searcher. You must add ``self.offset`` to this number to get the document's top-level document number. """ raise NotImplementedError @abstractmethod def sort_key(self, sub_docnum): """Returns a sorting key for the current match. This should return the same value returned by :meth:`Collector.collect`, but without the side effect of adding the current document to the results. If the collector has been prepared with ``context.needs_current=True``, this method can use ``self.matcher`` to get information, for example the score. Otherwise, it should only use the provided ``sub_docnum``, since the matcher may be in an inconsistent state. Subclasses must implement this method. """ raise NotImplementedError def remove(self, global_docnum): """Removes a document from the collector. Not that this method uses the global document number as opposed to :meth:`Collector.collect` which takes a segment-relative docnum. """ items = self.items for i in xrange(len(items)): if items[i][1] == global_docnum: items.pop(i) return raise KeyError(global_docnum) def _step_through_matches(self): matcher = self.matcher while matcher.is_active(): yield matcher.id() matcher.next() def matches(self): """Yields a series of relative document numbers for matches in the current subsearcher. """ # We jump through a lot of hoops to avoid stepping through the matcher # "manually" if we can because all_ids() is MUCH faster if self.context.needs_current: return self._step_through_matches() else: return self.matcher.all_ids() def finish(self): """This method is called after a search. Subclasses can override this to perform set-up work, but they should still call the superclass's method because it sets several necessary attributes on the collector object: self.runtime The time (in seconds) the search took. """ self.runtime = now() - self.starttime def _results(self, items, **kwargs): # Fills in a Results object with the invariant information and the # given "items" (a list of (score, docnum) tuples) r = Results(self.top_searcher, self.q, items, **kwargs) r.runtime = self.runtime r.collector = self return r @abstractmethod def results(self): """Returns a :class:`~whoosh.searching.Results` object containing the results of the search. 
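        For example, a subclass that accumulates ``(sortkey, global_docnum)``
        pairs in ``self.items`` (as the collectors in this module do) can
        implement this using the ``_results()`` helper; a minimal sketch::

            def results(self):
                return self._results(self.items, docset=self.docset)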
Subclasses must implement this method """ raise NotImplementedError # Scored collectors class ScoredCollector(Collector): """Base class for collectors that sort the results based on document score. """ def __init__(self, replace=10): """ :param replace: Number of matches between attempts to replace the matcher with a more efficient version. """ Collector.__init__(self) self.replace = replace def prepare(self, top_searcher, q, context): # This collector requires a valid matcher at each step Collector.prepare(self, top_searcher, q, context) if top_searcher.weighting.use_final: self.final_fn = top_searcher.weighting.final else: self.final_fn = None # Heap containing top N (score, 0-docnum) pairs self.items = [] # Minimum score a document must have to make it into the top N. This is # used by the block-quality optimizations self.minscore = 0 # Number of times the matcher was replaced (for debugging) self.replaced_times = 0 # Number of blocks skipped by quality optimizations (for debugging) self.skipped_times = 0 def sort_key(self, sub_docnum): return 0 - self.matcher.score() def _collect(self, global_docnum, score): # Concrete subclasses should override this method to collect matching # documents raise NotImplementedError def _use_block_quality(self): # Concrete subclasses should override this method to return True if the # collector should use block quality optimizations return False def collect(self, sub_docnum): # Do common work to calculate score and top-level document number global_docnum = self.offset + sub_docnum score = self.matcher.score() if self.final_fn: score = self.final_fn(self.top_searcher, global_docnum, score) # Call specialized method on subclass return self._collect(global_docnum, score) def matches(self): minscore = self.minscore matcher = self.matcher usequality = self._use_block_quality() replace = self.replace replacecounter = 0 # A flag to indicate whether we should check block quality at the start # of the next loop checkquality = True while matcher.is_active(): # If the replacement counter has reached 0, try replacing the # matcher with a more efficient version if replace: if replacecounter == 0 or self.minscore != minscore: self.matcher = matcher = matcher.replace(minscore or 0) self.replaced_times += 1 if not matcher.is_active(): break usequality = self._use_block_quality() replacecounter = self.replace if self.minscore != minscore: checkquality = True minscore = self.minscore replacecounter -= 1 # If we're using block quality optimizations, and the checkquality # flag is true, try to skip ahead to the next block with the # minimum required quality if usequality and checkquality and minscore is not None: self.skipped_times += matcher.skip_to_quality(minscore) # Skipping ahead might have moved the matcher to the end of the # posting list if not matcher.is_active(): break yield matcher.id() # Move to the next document. This method returns True if the # matcher has entered a new block, so we should check block quality # again. checkquality = matcher.next() class TopCollector(ScoredCollector): """A collector that only returns the top "N" scored results. """ def __init__(self, limit=10, usequality=True, **kwargs): """ :param limit: the maximum number of results to return. :param usequality: whether to use block-quality optimizations. This may be useful for debugging. 
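        This is the collector :meth:`whoosh.searching.Searcher.search` uses by
        default, but you can also drive it yourself; a sketch, with
        ``mysearcher`` and ``myquery`` standing in for real objects::

            c = TopCollector(limit=10)
            mysearcher.search_with_collector(myquery, c)
            print(c.results())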
""" ScoredCollector.__init__(self, **kwargs) self.limit = limit self.usequality = usequality self.total = 0 def _use_block_quality(self): return (self.usequality and not self.top_searcher.weighting.use_final and self.matcher.supports_block_quality()) def computes_count(self): return not self._use_block_quality() def all_ids(self): # Since this collector can skip blocks, it doesn't track the total # number of matching documents, so if the user asks for all matched # docs we need to re-run the search using docs_for_query return self.top_searcher.docs_for_query(self.q) def count(self): if self.computes_count(): return self.total else: return ilen(self.all_ids()) # ScoredCollector.collect calls this def _collect(self, global_docnum, score): items = self.items self.total += 1 # Document numbers are negated before putting them in the heap so that # higher document numbers have lower "priority" in the queue. Lower # document numbers should always come before higher document numbers # with the same score to keep the order stable. if len(items) < self.limit: # The heap isn't full, so add this document heappush(items, (score, 0 - global_docnum)) # Negate score to act as sort key so higher scores appear first return 0 - score elif score > items[0][0]: # The heap is full, but if this document has a high enough # score to make the top N, add it to the heap heapreplace(items, (score, 0 - global_docnum)) self.minscore = items[0][0] # Negate score to act as sort key so higher scores appear first return 0 - score else: return 0 def remove(self, global_docnum): negated = 0 - global_docnum items = self.items # Remove the document if it's on the list (it may not be since # TopCollector forgets documents that don't make the top N list) for i in xrange(len(items)): if items[i][1] == negated: items.pop(i) # Restore the heap invariant heapify(items) self.minscore = items[0][0] if items else 0 return def results(self): # The items are stored (postive score, negative docnum) so the heap # keeps the highest scores and lowest docnums, in order from lowest to # highest. Since for the results we want the highest scores first, # sort the heap in reverse order items = self.items items.sort(reverse=True) # De-negate the docnums for presentation to the user items = [(score, 0 - docnum) for score, docnum in items] return self._results(items) class UnlimitedCollector(ScoredCollector): """A collector that returns **all** scored results. """ def __init__(self, reverse=False): ScoredCollector.__init__(self) self.reverse = reverse # ScoredCollector.collect calls this def _collect(self, global_docnum, score): self.items.append((score, global_docnum)) self.docset.add(global_docnum) # Negate score to act as sort key so higher scores appear first return 0 - score def results(self): # Sort by negated scores so that higher scores go first, then by # document number to keep the order stable when documents have the # same score self.items.sort(key=lambda x: (0 - x[0], x[1]), reverse=self.reverse) return self._results(self.items, docset=self.docset) # Sorting collector class SortingCollector(Collector): """A collector that returns results sorted by a given :class:`whoosh.sorting.Facet` object. See :doc:`/facets` for more information. """ def __init__(self, sortedby, limit=10, reverse=False): """ :param sortedby: see :doc:`/facets`. :param reverse: If True, reverse the overall results. Note that you can reverse individual facets in a multi-facet sort key as well. 
""" Collector.__init__(self) self.sortfacet = sorting.MultiFacet.from_sortedby(sortedby) self.limit = limit self.reverse = reverse def prepare(self, top_searcher, q, context): self.categorizer = self.sortfacet.categorizer(top_searcher) # If the categorizer requires a valid matcher, then tell the child # collector that we need it rm = context.needs_current or self.categorizer.needs_current Collector.prepare(self, top_searcher, q, context.set(needs_current=rm)) # List of (sortkey, docnum) pairs self.items = [] def set_subsearcher(self, subsearcher, offset): Collector.set_subsearcher(self, subsearcher, offset) self.categorizer.set_searcher(subsearcher, offset) def sort_key(self, sub_docnum): return self.categorizer.key_for(self.matcher, sub_docnum) def collect(self, sub_docnum): global_docnum = self.offset + sub_docnum sortkey = self.sort_key(sub_docnum) self.items.append((sortkey, global_docnum)) self.docset.add(global_docnum) return sortkey def results(self): items = self.items items.sort(reverse=self.reverse) if self.limit: items = items[:self.limit] return self._results(items, docset=self.docset) class UnsortedCollector(Collector): def prepare(self, top_searcher, q, context): Collector.prepare(self, top_searcher, q, top_searcher.boolean_context()) self.items = [] def collect(self, sub_docnum): global_docnum = self.offset + sub_docnum self.items.append((None, global_docnum)) self.docset.add(global_docnum) def results(self): items = self.items return self._results(items, docset=self.docset) # Wrapping collectors class WrappingCollector(Collector): """Base class for collectors that wrap other collectors. """ def __init__(self, child): self.child = child @property def top_searcher(self): return self.child.top_searcher def prepare(self, top_searcher, q, context): self.child.prepare(top_searcher, q, context) def set_subsearcher(self, subsearcher, offset): self.child.set_subsearcher(subsearcher, offset) self.subsearcher = subsearcher self.matcher = self.child.matcher self.offset = self.child.offset def all_ids(self): return self.child.all_ids() def count(self): return self.child.count() def collect_matches(self): for sub_docnum in self.matches(): self.collect(sub_docnum) def sort_key(self, sub_docnum): return self.child.sort_key(sub_docnum) def collect(self, sub_docnum): return self.child.collect(sub_docnum) def matches(self): return self.child.matches() def finish(self): self.child.finish() def results(self): return self.child.results() # Allow and disallow collector class FilterCollector(WrappingCollector): """A collector that lets you allow and/or restrict certain document numbers in the results:: uc = collectors.UnlimitedCollector() ins = query.Term("chapter", "rendering") outs = query.Term("status", "restricted") fc = FilterCollector(uc, allow=ins, restrict=outs) mysearcher.search_with_collector(myquery, fc) print(fc.results()) This collector discards a document if: * The allowed set is not None and a document number is not in the set, or * The restrict set is not None and a document number is in the set. (So, if the same document number is in both sets, that document will be discarded.) If you have a reference to the collector, you can use ``FilterCollector.filtered_count`` to get the number of matching documents filtered out of the results by the collector. """ def __init__(self, child, allow=None, restrict=None): """ :param child: the collector to wrap. 
:param allow: a query, Results object, or set-like object containing docnument numbers that are allowed in the results, or None (meaning everything is allowed). :param restrict: a query, Results object, or set-like object containing document numbers to disallow from the results, or None (meaning nothing is disallowed). """ self.child = child self.allow = allow self.restrict = restrict def prepare(self, top_searcher, q, context): self.child.prepare(top_searcher, q, context) allow = self.allow restrict = self.restrict ftc = top_searcher._filter_to_comb self._allow = ftc(allow) if allow else None self._restrict = ftc(restrict) if restrict else None self.filtered_count = 0 def all_ids(self): child = self.child _allow = self._allow _restrict = self._restrict for global_docnum in child.all_ids(): if ((_allow and global_docnum not in _allow) or (_restrict and global_docnum in _restrict)): continue yield global_docnum def count(self): child = self.child if child.computes_count(): return child.count() - self.filtered_count else: return ilen(self.all_ids()) def collect_matches(self): child = self.child _allow = self._allow _restrict = self._restrict if _allow is not None or _restrict is not None: filtered_count = self.filtered_count for sub_docnum in child.matches(): global_docnum = self.offset + sub_docnum if ((_allow is not None and global_docnum not in _allow) or (_restrict is not None and global_docnum in _restrict)): filtered_count += 1 continue child.collect(sub_docnum) self.filtered_count = filtered_count else: # If there was no allow or restrict set, don't do anything special, # just forward the call to the child collector child.collect_matches() def results(self): r = self.child.results() r.filtered_count = self.filtered_count r.allowed = self.allow r.restricted = self.restrict return r # Facet grouping collector class FacetCollector(WrappingCollector): """A collector that creates groups of documents based on :class:`whoosh.sorting.Facet` objects. See :doc:`/facets` for more information. This collector is used if you specify a ``groupedby`` parameter in the :meth:`whoosh.searching.Searcher.search` method. You can use the :meth:`whoosh.searching.Results.groups` method to access the facet groups. If you have a reference to the collector can also use ``FacetedCollector.facetmaps`` to access the groups directly:: uc = collectors.UnlimitedCollector() fc = FacetedCollector(uc, sorting.FieldFacet("category")) mysearcher.search_with_collector(myquery, fc) print(fc.facetmaps) """ def __init__(self, child, groupedby, maptype=None): """ :param groupedby: see :doc:`/facets`. :param maptype: a :class:`whoosh.sorting.FacetMap` type to use for any facets that don't specify their own. 
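        For example, grouping on two fields at once (a sketch; the field names
        are made up)::

            from whoosh import collectors, sorting

            uc = collectors.UnlimitedCollector()
            facets = {"category": sorting.FieldFacet("category"),
                      "tag": sorting.FieldFacet("tag")}
            fc = collectors.FacetCollector(uc, facets)
            mysearcher.search_with_collector(myquery, fc)
            print(fc.facetmaps["category"])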
""" self.child = child self.facets = sorting.Facets.from_groupedby(groupedby) self.maptype = maptype def prepare(self, top_searcher, q, context): facets = self.facets # For each facet we're grouping by: # - Create a facetmap (to hold the groups) # - Create a categorizer (to generate document keys) self.facetmaps = {} self.categorizers = {} # Set needs_current to True if any of the categorizers require the # current document to work needs_current = context.needs_current for facetname, facet in facets.items(): self.facetmaps[facetname] = facet.map(self.maptype) ctr = facet.categorizer(top_searcher) self.categorizers[facetname] = ctr needs_current = needs_current or ctr.needs_current context = context.set(needs_current=needs_current) self.child.prepare(top_searcher, q, context) def set_subsearcher(self, subsearcher, offset): WrappingCollector.set_subsearcher(self, subsearcher, offset) # Tell each categorizer about the new subsearcher and offset for categorizer in itervalues(self.categorizers): categorizer.set_searcher(self.child.subsearcher, self.child.offset) def collect(self, sub_docnum): matcher = self.child.matcher global_docnum = sub_docnum + self.child.offset # We want the sort key for the document so we can (by default) sort # the facet groups sortkey = self.child.collect(sub_docnum) # For each facet we're grouping by for name, categorizer in iteritems(self.categorizers): add = self.facetmaps[name].add # We have to do more work if the facet allows overlapping groups if categorizer.allow_overlap: for key in categorizer.keys_for(matcher, sub_docnum): add(categorizer.key_to_name(key), global_docnum, sortkey) else: key = categorizer.key_for(matcher, sub_docnum) key = categorizer.key_to_name(key) add(key, global_docnum, sortkey) return sortkey def results(self): r = self.child.results() r._facetmaps = self.facetmaps return r # Collapsing collector class CollapseCollector(WrappingCollector): """A collector that collapses results based on a facet. That is, it eliminates all but the top N results that share the same facet key. Documents with an empty key for the facet are never eliminated. The "top" results within each group is determined by the result ordering (e.g. highest score in a scored search) or an optional second "ordering" facet. If you have a reference to the collector you can use ``CollapseCollector.collapsed_counts`` to access the number of documents eliminated based on each key:: tc = TopCollector(limit=20) cc = CollapseCollector(tc, "group", limit=3) mysearcher.search_with_collector(myquery, cc) print(cc.collapsed_counts) See :ref:`collapsing` for more information. """ def __init__(self, child, keyfacet, limit=1, order=None): """ :param child: the collector to wrap. :param keyfacet: a :class:`whoosh.sorting.Facet` to use for collapsing. All but the top N documents that share a key will be eliminated from the results. :param limit: the maximum number of documents to keep for each key. :param order: an optional :class:`whoosh.sorting.Facet` to use to determine the "top" document(s) to keep when collapsing. The default (``orderfaceet=None``) uses the results order (e.g. the highest score in a scored search). 
""" self.child = child self.keyfacet = sorting.MultiFacet.from_sortedby(keyfacet) self.limit = limit if order: self.orderfacet = sorting.MultiFacet.from_sortedby(order) else: self.orderfacet = None def prepare(self, top_searcher, q, context): # Categorizer for getting the collapse key of a document self.keyer = self.keyfacet.categorizer(top_searcher) # Categorizer for getting the collapse order of a document self.orderer = None if self.orderfacet: self.orderer = self.orderfacet.categorizer(top_searcher) # Dictionary mapping keys to lists of (sortkey, global_docnum) pairs # representing the best docs for that key self.lists = defaultdict(list) # Dictionary mapping keys to the number of documents that have been # filtered out with that key self.collapsed_counts = defaultdict(int) # Total number of documents filtered out by collapsing self.collapsed_total = 0 # If the keyer or orderer require a valid matcher, tell the child # collector we need it needs_current = (context.needs_current or self.keyer.needs_current or (self.orderer and self.orderer.needs_current)) self.child.prepare(top_searcher, q, context.set(needs_current=needs_current)) def set_subsearcher(self, subsearcher, offset): WrappingCollector.set_subsearcher(self, subsearcher, offset) # Tell the keyer and (optional) orderer about the new subsearcher self.keyer.set_searcher(subsearcher, offset) if self.orderer: self.orderer.set_searcher(subsearcher, offset) def all_ids(self): child = self.child limit = self.limit counters = defaultdict(int) for subsearcher, offset in child.subsearchers(): self.set_subsearcher(subsearcher, offset) matcher = child.matcher keyer = self.keyer for sub_docnum in child.matches(): ckey = keyer.key_for(matcher, sub_docnum) if ckey is not None: if ckey in counters and counters[ckey] >= limit: continue else: counters[ckey] += 1 yield offset + sub_docnum def count(self): if self.child.computes_count(): return self.child.count() - self.collapsed_total else: return ilen(self.all_ids()) def collect_matches(self): lists = self.lists limit = self.limit keyer = self.keyer orderer = self.orderer collapsed_counts = self.collapsed_counts child = self.child matcher = child.matcher offset = child.offset for sub_docnum in child.matches(): # Collapsing category key ckey = keyer.key_to_name(keyer.key_for(matcher, sub_docnum)) if not ckey: # If the document isn't in a collapsing category, just add it child.collect(sub_docnum) else: global_docnum = offset + sub_docnum if orderer: # If user specified a collapse order, use it sortkey = orderer.key_for(child.matcher, sub_docnum) else: # Otherwise, use the results order sortkey = child.sort_key(sub_docnum) # Current list of best docs for this collapse key best = lists[ckey] add = False if len(best) < limit: # If the heap is not full yet, just add this document add = True elif sortkey < best[-1][0]: # If the heap is full but this document has a lower sort # key than the highest key currently on the heap, replace # the "least-best" document # Tell the child collector to remove the document child.remove(best.pop()[1]) add = True if add: insort(best, (sortkey, global_docnum)) child.collect(sub_docnum) else: # Remember that a document was filtered collapsed_counts[ckey] += 1 self.collapsed_total += 1 def results(self): r = self.child.results() r.collapsed_counts = self.collapsed_counts return r # Time limit collector class TimeLimitCollector(WrappingCollector): """A collector that raises a :class:`TimeLimit` exception if the search does not complete within a certain number of seconds:: 
uc = collectors.UnlimitedCollector() tlc = TimeLimitedCollector(uc, timelimit=5.8) try: mysearcher.search_with_collector(myquery, tlc) except collectors.TimeLimit: print("The search ran out of time!") # We can still get partial results from the collector print(tlc.results()) IMPORTANT: On Unix systems (systems where signal.SIGALRM is defined), the code uses signals to stop searching immediately when the time limit is reached. On Windows, the OS does not support this functionality, so the search only checks the time between each found document, so if a matcher is slow the search could exceed the time limit. """ def __init__(self, child, timelimit, greedy=False, use_alarm=True): """ :param child: the collector to wrap. :param timelimit: the maximum amount of time (in seconds) to allow for searching. If the search takes longer than this, it will raise a ``TimeLimit`` exception. :param greedy: if ``True``, the collector will finish adding the most recent hit before raising the ``TimeLimit`` exception. :param use_alarm: if ``True`` (the default), the collector will try to use signal.SIGALRM (on UNIX). """ self.child = child self.timelimit = timelimit self.greedy = greedy if use_alarm: import signal self.use_alarm = use_alarm and hasattr(signal, "SIGALRM") else: self.use_alarm = False self.timer = None self.timedout = False def prepare(self, top_searcher, q, context): self.child.prepare(top_searcher, q, context) self.timedout = False if self.use_alarm: import signal signal.signal(signal.SIGALRM, self._was_signaled) # Start a timer thread. If the timer fires, it will call this object's # _timestop() method self.timer = threading.Timer(self.timelimit, self._timestop) self.timer.start() def _timestop(self): # Called when the timer expires self.timer = None # Set an attribute that will be noticed in the collect_matches() loop self.timedout = True if self.use_alarm: import signal os.kill(os.getpid(), signal.SIGALRM) def _was_signaled(self, signum, frame): raise TimeLimit def collect_matches(self): child = self.child greedy = self.greedy for sub_docnum in child.matches(): # If the timer fired since the last loop and we're not greedy, # raise the exception if self.timedout and not greedy: raise TimeLimit child.collect(sub_docnum) # If the timer fired since we entered the loop or it fired earlier # but we were greedy, raise now if self.timedout: raise TimeLimit def finish(self): if self.timer: self.timer.cancel() self.timer = None self.child.finish() # Matched terms collector class TermsCollector(WrappingCollector): """A collector that remembers which terms appeared in which terms appeared in each matched document. This collector is used if you specify ``terms=True`` in the :meth:`whoosh.searching.Searcher.search` method. 
If you have a reference to the collector can also use ``TermsCollector.termslist`` to access the term lists directly:: uc = collectors.UnlimitedCollector() tc = TermsCollector(uc) mysearcher.search_with_collector(myquery, tc) # tc.termdocs is a dictionary mapping (fieldname, text) tuples to # sets of document numbers print(tc.termdocs) # tc.docterms is a dictionary mapping docnums to lists of # (fieldname, text) tuples print(tc.docterms) """ def __init__(self, child, settype=set): self.child = child self.settype = settype def prepare(self, top_searcher, q, context): # This collector requires a valid matcher at each step self.child.prepare(top_searcher, q, context.set(needs_current=True)) # A dictionary mapping (fieldname, text) pairs to arrays of docnums self.termdocs = defaultdict(lambda: array("I")) # A dictionary mapping docnums to lists of (fieldname, text) pairs self.docterms = defaultdict(list) def set_subsearcher(self, subsearcher, offset): WrappingCollector.set_subsearcher(self, subsearcher, offset) # Store a list of all the term matchers in the matcher tree self.termmatchers = list(self.child.matcher.term_matchers()) def collect(self, sub_docnum): child = self.child termdocs = self.termdocs docterms = self.docterms child.collect(sub_docnum) global_docnum = child.offset + sub_docnum # For each term matcher... for tm in self.termmatchers: # If the term matcher is matching the current document... if tm.is_active() and tm.id() == sub_docnum: # Add it to the list of matching documents for the term term = tm.term() termdocs[term].append(global_docnum) docterms[global_docnum].append(term) def results(self): r = self.child.results() r.termdocs = dict(self.termdocs) r.docterms = dict(self.docterms) return r Whoosh-2.5.7/src/whoosh/columns.py0000644000076500000240000013153412254366764017242 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ The API and implementation of columns may change in the next version of Whoosh! 
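In short, you pass a ``Column`` instance as a field's ``sortable=`` argument
when defining a schema. A quick sketch (the field names are made up, and the
choice of ``TEXT``/``KEYWORD`` is just for illustration)::

    from whoosh import columns, fields

    schema = fields.Schema(
        title=fields.TEXT(stored=True),
        category=fields.KEYWORD(sortable=columns.RefBytesColumn()),
    )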
This module contains "Column" objects which you can use as the argument to a Field object's ``sortable=`` keyword argument. Each field defines a default column type for when the user specifies ``sortable=True`` (the object returned by the field's ``default_column()`` method). The default column type for most fields is ``VarBytesColumn``, although numeric and date fields use ``NumericColumn``. Expert users may use other field types that may be faster or more storage efficient based on the field contents. For example, if a field always contains one of a limited number of possible values, a ``RefBytesColumn`` will save space by only storing the values once. If a field's values are always a fixed length, the ``FixedBytesColumn`` saves space by not storing the length of each value. A ``Column`` object basically exists to store configuration information and provides two important methods: ``writer()`` to return a ``ColumnWriter`` object and ``reader()`` to return a ``ColumnReader`` object. """ from __future__ import division, with_statement import struct, warnings from array import array from bisect import bisect_right try: import zlib except ImportError: zlib = None from whoosh.compat import b, bytes_type, BytesIO from whoosh.compat import array_tobytes, xrange from whoosh.compat import dumps, loads from whoosh.filedb.structfile import StructFile from whoosh.idsets import BitSet, OnDiskBitSet from whoosh.system import emptybytes from whoosh.util.cache import lru_cache from whoosh.util.numeric import typecode_max, typecode_min from whoosh.util.numlists import GrowableArray from whoosh.util.varints import varint # Utility functions def _mintype(maxn): if maxn < 2 ** 8: typecode = "B" elif maxn < 2 ** 16: typecode = "H" elif maxn < 2 ** 31: typecode = "i" else: typecode = "I" return typecode # Python does not support arrays of long long see Issue 1172711 # These functions help write/read a simulated an array of q/Q using lists def write_qsafe_array(typecode, arry, dbfile): if typecode == "q": for num in arry: dbfile.write_long(num) elif typecode == "Q": for num in arry: dbfile.write_ulong(num) else: dbfile.write_array(arry) def read_qsafe_array(typecode, size, dbfile): if typecode == "q": arry = [dbfile.read_long() for _ in xrange(size)] elif typecode == "Q": arry = [dbfile.read_ulong() for _ in xrange(size)] else: arry = dbfile.read_array(typecode, size) return arry def make_array(typecode, size=0, default=None): if typecode.lower() == "q": # Python does not support arrays of long long see Issue 1172711 if default is not None and size: arry = [default] * size else: arry = [] else: if default is not None and size: arry = array(typecode, (default for _ in xrange(size))) else: arry = array(typecode) return arry # Base classes class Column(object): """Represents a "column" of rows mapping docnums to document values. The interface requires that you store the start offset of the column, the length of the column data, and the number of documents (rows) separately, and pass them to the reader object. """ reversible = False def writer(self, dbfile): """Returns a :class:`ColumnWriter` object you can use to use to create a column of this type on disk. :param dbfile: the :class:`~whoosh.filedb.structfile.StructFile` to write to. """ return self.Writer(dbfile) def reader(self, dbfile, basepos, length, doccount): """Returns a :class:`ColumnReader` object you can use to read a column of this type from disk. :param dbfile: the :class:`~whoosh.filedb.structfile.StructFile` to read from. 
:param basepos: the offset within the file at which the column starts. :param length: the length in bytes of the column occupies in the file. :param doccount: the number of rows (documents) in the column. """ return self.Reader(dbfile, basepos, length, doccount) def default_value(self, reverse=False): """Returns the default value for this column type. """ return self._default def stores_lists(self): """Returns True if the column stores a list of values for each document instead of a single value. """ return False class ColumnWriter(object): def __init__(self, dbfile): self._dbfile = dbfile self._count = 0 def fill(self, docnum): write = self._dbfile.write default = self._defaultbytes if docnum > self._count: for _ in xrange(docnum - self._count): write(default) def add(self, docnum, value): raise NotImplementedError def finish(self, docnum): pass class ColumnReader(object): def __init__(self, dbfile, basepos, length, doccount): self._dbfile = dbfile self._basepos = basepos self._length = length self._doccount = doccount def __len__(self): return self._doccount def __getitem__(self, docnum): raise NotImplementedError def sort_key(self, docnum): return self[docnum] def __iter__(self): for i in xrange(self._doccount): yield self[i] def load(self): return list(self) def set_reverse(self): raise NotImplementedError # Arbitrary bytes column class VarBytesColumn(Column): """Stores variable length byte strings. See also :class:`RefBytesColumn`. The current implementation limits the total length of all document values a segment to 2 GB. The default value (the value returned for a document that didn't have a value assigned to it at indexing time) is an empty bytestring (``b''``). """ _default = emptybytes class Writer(ColumnWriter): def __init__(self, dbfile): assert isinstance(dbfile, StructFile) self._dbfile = dbfile self._count = 0 self._lengths = GrowableArray(allow_longs=False) def __repr__(self): return "" def fill(self, docnum): if docnum > self._count: self._lengths.extend(0 for _ in xrange(docnum - self._count)) def add(self, docnum, v): self.fill(docnum) self._dbfile.write(v) self._lengths.append(len(v)) self._count = docnum + 1 def finish(self, doccount): self.fill(doccount) lengths = self._lengths.array self._dbfile.write_array(lengths) # Write the typecode for the lengths self._dbfile.write_byte(ord(lengths.typecode)) class Reader(ColumnReader): def __init__(self, dbfile, basepos, length, doccount): self._dbfile = dbfile self._basepos = basepos self._length = length self._doccount = doccount self._read_lengths() # Create an array of offsets into the strings using the lengths offsets = array("L", (0,)) for length in self._lengths: offsets.append(offsets[-1] + length) self._offsets = offsets def __repr__(self): return "" def _read_lengths(self): dbfile = self._dbfile basepos = self._basepos length = self._length doccount = self._doccount # The end of the lengths array is the end of the data minus the # typecode byte endoflens = basepos + length - 1 # Load the length typecode from before the key length typecode = chr(dbfile.get_byte(endoflens)) # Load the length array from before the typecode itemsize = struct.calcsize(typecode) lengthsbase = endoflens - (itemsize * doccount) self._lengths = dbfile.get_array(lengthsbase, typecode, doccount) @lru_cache() def __getitem__(self, docnum): length = self._lengths[docnum] if not length: return emptybytes offset = self._offsets[docnum] return self._dbfile.get(self._basepos + offset, length) def __iter__(self): get = self._dbfile.get pos = 
self._basepos for length in self._lengths: yield get(pos, length) pos += length class FixedBytesColumn(Column): """Stores fixed-length byte strings. """ def __init__(self, fixedlen, default=None): """ :param fixedlen: the fixed length of byte strings in this column. :param default: the default value to use for documents that don't specify a value. If you don't specify a default, the column will use ``b'\\x00' * fixedlen``. """ self._fixedlen = fixedlen if default is None: default = b("\x00") * fixedlen elif len(default) != fixedlen: raise ValueError self._default = default def writer(self, dbfile): return self.Writer(dbfile, self._fixedlen, self._default) def reader(self, dbfile, basepos, length, doccount): return self.Reader(dbfile, basepos, length, doccount, self._fixedlen, self._default) class Writer(ColumnWriter): def __init__(self, dbfile, fixedlen, default): self._dbfile = dbfile self._fixedlen = fixedlen self._default = self._defaultbytes = default self._count = 0 def __repr__(self): return "" def add(self, docnum, v): if v == self._default: return if docnum > self._count: self.fill(docnum) assert len(v) == self._fixedlen self._dbfile.write(v) self._count = docnum + 1 class Reader(ColumnReader): def __init__(self, dbfile, basepos, length, doccount, fixedlen, default): self._dbfile = dbfile self._basepos = basepos self._doccount = doccount self._fixedlen = fixedlen self._default = self._defaultbytes = default self._count = length // fixedlen def __repr__(self): return "" def __getitem__(self, docnum): if docnum >= self._count: return self._defaultbytes pos = self._basepos + self._fixedlen * docnum return self._dbfile.get(pos, self._fixedlen) def __iter__(self): count = self._count default = self._default for i in xrange(self._doccount): if i < count: yield self[i] else: yield default # Variable/fixed length reference (enum) column class RefBytesColumn(Column): """Stores variable-length or fixed-length byte strings, similar to :class:`VarBytesColumn` and :class:`FixedBytesColumn`. However, where those columns stores a value for each document, this column keeps a list of all the unique values in the field, and for each document stores a short pointer into the unique list. For fields where the number of possible values is smaller than the number of documents (for example, "category" or "chapter"), this saves significant space. This column type supports a maximum of 65535 unique values across all documents in a segment. You should generally use this column type where the number of unique values is in no danger of approaching that number (for example, a "tags" field). If you try to index too many unique values, the column will convert additional unique values to the default value and issue a warning using the ``warnings`` module (this will usually be preferable to crashing the indexer and potentially losing indexed documents). """ # NOTE that RefBytes is reversible within a single column (we could just # negate the reference number), but it's NOT reversible ACROSS SEGMENTS # (since different segments can have different uniques values in their # columns), so we have to say that the column type is not reversible reversible = False def __init__(self, fixedlen=0, default=None): """ :param fixedlen: an optional fixed length for the values. If you specify a number other than 0, the column will require all values to be the specified length. :param default: a default value to use for documents that don't specify one. 
If you don't specify a default, the column will use an empty bytestring (``b''``), or if you specify a fixed length, ``b'\\x00' * fixedlen``. """ self._fixedlen = fixedlen if default is None: default = b("\x00") * fixedlen if fixedlen else emptybytes elif fixedlen and len(default) != fixedlen: raise ValueError self._default = default def writer(self, dbfile): return self.Writer(dbfile, self._fixedlen, self._default) def reader(self, dbfile, basepos, length, doccount): return self.Reader(dbfile, basepos, length, doccount, self._fixedlen) class Writer(ColumnWriter): def __init__(self, dbfile, fixedlen, default): self._dbfile = dbfile self._fixedlen = fixedlen self._default = default # At first we'll buffer refs in a byte array. If the number of # uniques stays below 256, we can just write the byte array. As # soon as the ref count goes above 255, we know we're going to have # to write shorts, so we'll switch to writing directly. self._refs = array("B") self._uniques = {default: 0} self._count = 0 def __repr__(self): return "" def fill(self, docnum): if docnum > self._count: if self._refs is not None: self._refs.extend(0 for _ in xrange(docnum - self._count)) else: dbfile = self._dbfile for _ in xrange(docnum - self._count): dbfile.write_ushort(0) def add(self, docnum, v): dbfile = self._dbfile refs = self._refs self.fill(docnum) uniques = self._uniques try: ref = uniques[v] except KeyError: uniques[v] = ref = len(uniques) if refs is not None and ref >= 256: # We won't be able to use bytes, we have to switch to # writing unbuffered ushorts for n in refs: dbfile.write_ushort(n) refs = self._refs = None if refs is not None: self._refs.append(ref) else: if ref > 65535: warnings.warn("RefBytesColumn dropped unique value %r" % v, UserWarning) ref = 0 dbfile.write_ushort(ref) self._count = docnum + 1 def _write_uniques(self, typecode): dbfile = self._dbfile fixedlen = self._fixedlen uniques = self._uniques dbfile.write_varint(len(uniques)) # Sort unique values by position vs = sorted(uniques.keys(), key=lambda key: uniques[key]) for v in vs: if not fixedlen: dbfile.write_varint(len(v)) dbfile.write(v) def finish(self, doccount): dbfile = self._dbfile refs = self._refs self.fill(doccount) typecode = "H" if refs is not None: dbfile.write_array(refs) typecode = refs.typecode self._write_uniques(typecode) dbfile.write_byte(ord(typecode)) class Reader(ColumnReader): def __init__(self, dbfile, basepos, length, doccount, fixedlen): self._dbfile = dbfile self._basepos = basepos self._doccount = doccount self._fixedlen = fixedlen self._typecode = chr(dbfile.get_byte(basepos + length - 1)) st = struct.Struct("!" 
+ self._typecode) self._unpack = st.unpack self._itemsize = st.size dbfile.seek(basepos + doccount * self._itemsize) self._uniques = self._read_uniques() def __repr__(self): return "" def _read_uniques(self): dbfile = self._dbfile fixedlen = self._fixedlen ucount = dbfile.read_varint() length = fixedlen uniques = [] for _ in xrange(ucount): if not fixedlen: length = dbfile.read_varint() uniques.append(dbfile.read(length)) return uniques def __getitem__(self, docnum): pos = self._basepos + docnum * self._itemsize ref = self._unpack(self._dbfile.get(pos, self._itemsize))[0] return self._uniques[ref] def __iter__(self): get = self._dbfile.get basepos = self._basepos uniques = self._uniques unpack = self._unpack itemsize = self._itemsize for i in xrange(self._doccount): pos = basepos + i * itemsize ref = unpack(get(pos, itemsize))[0] yield uniques[ref] # Numeric column class NumericColumn(FixedBytesColumn): """Stores numbers (integers and floats) as compact binary. """ reversible = True def __init__(self, typecode, default=0): """ :param typecode: a typecode character (as used by the ``struct`` module) specifying the number type. For example, ``"i"`` for signed integers. :param default: the default value to use for documents that don't specify one. """ self._typecode = typecode self._default = default def writer(self, dbfile): return self.Writer(dbfile, self._typecode, self._default) def reader(self, dbfile, basepos, length, doccount): return self.Reader(dbfile, basepos, length, doccount, self._typecode, self._default) def default_value(self, reverse=False): v = self._default if reverse: v = 0 - v return v class Writer(FixedBytesColumn.Writer): def __init__(self, dbfile, typecode, default): self._dbfile = dbfile self._pack = struct.Struct("!" + typecode).pack self._default = default self._defaultbytes = self._pack(default) self._fixedlen = struct.calcsize(typecode) self._count = 0 def __repr__(self): return "" def add(self, docnum, v): if v == self._default: return if docnum > self._count: self.fill(docnum) self._dbfile.write(self._pack(v)) self._count = docnum + 1 class Reader(FixedBytesColumn.Reader): def __init__(self, dbfile, basepos, length, doccount, typecode, default): self._dbfile = dbfile self._basepos = basepos self._doccount = doccount self._default = default self._reverse = False self._typecode = typecode self._unpack = struct.Struct("!" + typecode).unpack self._defaultbytes = struct.pack("!" + typecode, default) self._fixedlen = struct.calcsize(typecode) self._count = length // self._fixedlen def __repr__(self): return "" def __getitem__(self, docnum): s = FixedBytesColumn.Reader.__getitem__(self, docnum) return self._unpack(s)[0] def sort_key(self, docnum): key = self[docnum] if self._reverse: key = 0 - key return key def load(self): if self._typecode in "qQ": return list(self) else: return array(self._typecode, self) def set_reverse(self): self._reverse = True # Column of boolean values class BitColumn(Column): """Stores a column of True/False values compactly. """ reversible = True _default = False def __init__(self, compress_at=2048): """ :param compress_at: columns with this number of values or fewer will be saved compressed on disk, and loaded into RAM for reading. Set this to 0 to disable compression. 
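A minimal sketch (illustrative only, using only names defined in this class):

>>> col = BitColumn(compress_at=0)  # never compress the bit array
>>> col.default_value()
False
>>> col.default_value(reverse=True)
True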
""" self._compressat = compress_at def writer(self, dbfile): return self.Writer(dbfile, self._compressat) def default_value(self, reverse=False): return self._default ^ reverse class Writer(ColumnWriter): def __init__(self, dbfile, compressat): self._dbfile = dbfile self._compressat = compressat self._bitset = BitSet() def __repr__(self): return "" def add(self, docnum, value): if value: self._bitset.add(docnum) def finish(self, doccount): dbfile = self._dbfile bits = self._bitset.bits if zlib and len(bits) <= self._compressat: compressed = zlib.compress(array_tobytes(bits), 3) dbfile.write(compressed) dbfile.write_byte(1) else: dbfile.write_array(bits) dbfile.write_byte(0) class Reader(ColumnReader): def __init__(self, dbfile, basepos, length, doccount): self._dbfile = dbfile self._basepos = basepos self._length = length self._doccount = doccount self._reverse = False compressed = dbfile.get_byte(basepos + (length - 1)) if compressed: bbytes = zlib.decompress(dbfile.get(basepos, length - 1)) bitset = BitSet.from_bytes(bbytes) else: dbfile.seek(basepos) bitset = OnDiskBitSet(dbfile, basepos, length - 1) self._bitset = bitset def id_set(self): return self._bitset def __repr__(self): return "" def __getitem__(self, i): return i in self._bitset def sort_key(self, docnum): return int(self[docnum] ^ self._reverse) def __iter__(self): i = 0 for num in self._bitset: if num > i: for _ in xrange(num - i): yield False yield True i = num + 1 if self._doccount > i: for _ in xrange(self._doccount - i): yield False def load(self): if isinstance(self._bitset, OnDiskBitSet): bs = self._dbfile.get_array(self._basepos, "B", self._length - 1) self._bitset = BitSet.from_bytes(bs) return self def set_reverse(self): self._reverse = True # Compressed variants class CompressedBytesColumn(Column): """Stores variable-length byte strings compressed using deflate (by default). """ def __init__(self, level=3, module="zlib"): """ :param level: the compression level to use. :param module: a string containing the name of the compression module to use. The default is "zlib". The module should export "compress" and "decompress" functions. """ self._level = level self._module = module def writer(self, dbfile): return self.Writer(dbfile, self._level, self._module) def reader(self, dbfile, basepos, length, doccount): return self.Reader(dbfile, basepos, length, doccount, self._module) class Writer(VarBytesColumn.Writer): def __init__(self, dbfile, level, module): VarBytesColumn.Writer.__init__(self, dbfile) self._level = level self._compress = __import__(module).compress def __repr__(self): return "" def add(self, docnum, v): v = self._compress(v, self._level) VarBytesColumn.Writer.add(self, docnum, v) class Reader(VarBytesColumn.Reader): def __init__(self, dbfile, basepos, length, doccount, module): VarBytesColumn.Reader.__init__(self, dbfile, basepos, length, doccount) self._decompress = __import__(module).decompress def __repr__(self): return "" def __getitem__(self, docnum): v = VarBytesColumn.Reader.__getitem__(self, docnum) if v: v = self._decompress(v) return v def __iter__(self): for v in VarBytesColumn.Reader.__iter__(self): yield self._decompress(v) def load(self): return list(self) class CompressedBlockColumn(Column): """An experimental column type that compresses and decompresses blocks of values at a time. This can lead to high compression and decent performance for columns with lots of very short values, but random access times are usually terrible. 
""" def __init__(self, level=3, blocksize=32, module="zlib"): """ :param level: the compression level to use. :param blocksize: the size (in KB) of each compressed block. :param module: a string containing the name of the compression module to use. The default is "zlib". The module should export "compress" and "decompress" functions. """ self._level = level self._blocksize = blocksize self._module = module def writer(self, dbfile): return self.Writer(dbfile, self._level, self._blocksize, self._module) def reader(self, dbfile, basepos, length, doccount): return self.Reader(dbfile, basepos, length, doccount, self._module) class Writer(ColumnWriter): def __init__(self, dbfile, level, blocksize, module): self._dbfile = dbfile self._blocksize = blocksize * 1024 self._level = level self._compress = __import__(module).compress self._reset() def __repr__(self): return "" def _reset(self): self._startdoc = None self._block = emptybytes self._lengths = [] def _emit(self): dbfile = self._dbfile block = self._compress(self._block, self._level) header = (self._startdoc, self._lastdoc, len(block), tuple(self._lengths)) dbfile.write_pickle(header) dbfile.write(block) def add(self, docnum, v): if self._startdoc is None: self._startdoc = docnum self._lengths.append((docnum, len(v))) self._lastdoc = docnum self._block += v if len(self._block) >= self._blocksize: self._emit() self._reset() def finish(self, doccount): # If there's still a pending block, write it out if self._startdoc is not None: self._emit() class Reader(ColumnReader): def __init__(self, dbfile, basepos, length, doccount, module): ColumnReader.__init__(self, dbfile, basepos, length, doccount) self._decompress = __import__(module).decompress self._blocks = [] dbfile.seek(basepos) pos = 0 while pos < length: startdoc, enddoc, blocklen, lengths = dbfile.read_pickle() here = dbfile.tell() self._blocks.append((startdoc, enddoc, here, blocklen, lengths)) dbfile.seek(blocklen, 1) pos = here + blocklen def __repr__(self): return "" def _find_block(self, docnum): # TODO: use binary search instead of linear for i, b in enumerate(self._blocks): if docnum < b[0]: return None elif docnum <= b[1]: return i return None def _get_block(self, blocknum): block = self._blocks[blocknum] pos = block[2] blocklen = block[3] lengths = block[4] data = self._decompress(self._dbfile.get(self._basepos + pos, blocklen)) values = {} base = 0 for docnum, vlen in lengths: values[docnum] = data[base:base + vlen] base += vlen return values def __getitem__(self, docnum): i = self._find_block(docnum) if i is None: return emptybytes return self._get_block(i)[docnum] def __iter__(self): last = -1 for i, block in enumerate(self._blocks): startdoc = block[0] enddoc = block[1] if startdoc > (last + 1): for _ in xrange(startdoc - last): yield emptybytes values = self._get_block(i) for docnum in xrange(startdoc, enddoc + 1): if docnum in values: yield values[docnum] else: yield emptybytes last = enddoc if enddoc < self._doccount - 1: for _ in xrange(self._doccount - enddoc): yield emptybytes class StructColumn(FixedBytesColumn): def __init__(self, spec, default): self._spec = spec self._fixedlen = struct.calcsize(spec) self._default = default def writer(self, dbfile): return self.Writer(dbfile, self._spec, self._default) def reader(self, dbfile, basepos, length, doccount): return self.Reader(dbfile, basepos, length, doccount, self._spec, self._default) class Writer(FixedBytesColumn.Writer): def __init__(self, dbfile, spec, default): self._dbfile = dbfile self._struct = 
struct.Struct(spec) self._fixedlen = self._struct.size self._default = default self._defaultbytes = self._struct.pack(*default) self._count = 0 def __repr__(self): return "" def add(self, docnum, v): b = self._struct.pack(*v) FixedBytesColumn.Writer.add(self, docnum, b) class Reader(FixedBytesColumn.Reader): def __init__(self, dbfile, basepos, length, doccount, spec, default): self._dbfile = dbfile self._basepos = basepos self._doccount = doccount self._struct = struct.Struct(spec) self._fixedlen = self._struct.size self._default = default self._defaultbytes = self._struct.pack(*default) self._count = length // self._fixedlen def __repr__(self): return "" def __getitem__(self, docnum): v = FixedBytesColumn.Reader.__getitem__(self, docnum) return self._struct.unpack(v) # Utility readers class EmptyColumnReader(ColumnReader): """Acts like a reader for a column with no stored values. Always returns the default. """ def __init__(self, default, doccount): """ :param default: the value to return for all "get" requests. :param doccount: the number of documents in the nominal column. """ self._default = default self._doccount = doccount def __getitem__(self, docnum): return self._default def __iter__(self): return (self._default for _ in xrange(self._doccount)) def load(self): return self class MultiColumnReader(ColumnReader): """Serializes access to multiple column readers, making them appear to be one large column. """ def __init__(self, readers): """ :param readers: a sequence of column reader objects. """ self._readers = readers self._doc_offsets = [] self._doccount = 0 for r in readers: self._doc_offsets.append(self._doccount) self._doccount += len(r) def _document_reader(self, docnum): return max(0, bisect_right(self._doc_offsets, docnum) - 1) def _reader_and_docnum(self, docnum): rnum = self._document_reader(docnum) offset = self._doc_offsets[rnum] return rnum, docnum - offset def __getitem__(self, docnum): x, y = self._reader_and_docnum(docnum) return self._readers[x][y] def __iter__(self): for r in self._readers: for v in r: yield v class TranslatingColumnReader(ColumnReader): """Calls a function to "translate" values from an underlying column reader object before returning them. ``IndexReader`` objects can wrap a column reader with this object to call ``FieldType.from_column_value`` on the stored column value before returning it the the user. """ def __init__(self, reader, translate): """ :param reader: the underlying ColumnReader object to get values from. :param translate: a function that takes a value from the underlying reader and returns a translated value. """ self._reader = reader self._translate = translate def raw_column(self): """Returns the underlying column reader. 
""" return self._reader def __len__(self): return len(self._reader) def __getitem__(self, docnum): return self._translate(self._reader[docnum]) def sort_key(self, docnum): return self._reader.sort_key(docnum) def __iter__(self): translate = self._translate return (translate(v) for v in self._reader) def set_reverse(self): self._reader.set_reverse() # Column wrappers class WrappedColumn(Column): def __init__(self, child): self._child = child def writer(self, *args, **kwargs): return self.Writer(self._child.writer(*args, **kwargs)) def reader(self, *args, **kwargs): return self.Reader(self._child.reader(*args, **kwargs)) def stores_lists(self): return self._child.stores_lists() class WrappedColumnWriter(ColumnWriter): def __init__(self, child): self._child = child def fill(self, docnum): return self._child.fill(docnum) def add(self, docnum, value): return self._child.add(docnum, value) def finish(self, docnum): return self._child.finish(docnum) class WrappedColumnReader(ColumnReader): def __init__(self, child): self._child = child def __len__(self): return len(self._child) def __getitem__(self, docnum): return self._child[docnum] def sort_key(self, docnum): return self._child.sort_key(docnum) def __iter__(self): return iter(self._child) def load(self): return list(self) def set_reverse(self): self._child.set_reverse() class ClampedNumericColumn(WrappedColumn): """An experimental wrapper type for NumericColumn that clamps out-of-range values instead of raising an exception. """ def reader(self, *args, **kwargs): return self._child.reader(*args, **kwargs) class Writer(WrappedColumnWriter): def __init__(self, child): self._child = child self._min = typecode_min[child._typecode] self._max = typecode_max[child._typecode] def add(self, docnum, v): v = min(v, self._min) v = max(v, self._max) self._child.add(docnum, v) class PickleColumn(WrappedColumn): """Converts arbitrary objects to pickled bytestrings and stores them using the wrapped column (usually a :class:`VarBytesColumn` or :class:`CompressedBytesColumn`). If you can express the value you want to store as a number or bytestring, you should use the appropriate column type to avoid the time and size overhead of pickling and unpickling. 
""" class Writer(WrappedColumnWriter): def __repr__(self): return "" def add(self, docnum, v): if v is None: v = emptybytes else: v = dumps(v, -1) self._child.add(docnum, v) class Reader(WrappedColumnReader): def __repr__(self): return "" def __getitem__(self, docnum): v = self._child[docnum] if not v: return None else: return loads(v) def __iter__(self): for v in self._child: if not v: yield None else: yield loads(v) # List columns class ListColumn(WrappedColumn): def stores_lists(self): return True class ListColumnReader(ColumnReader): def sort_key(self, docnum): return self[docnum][0] def __iter__(self): for docnum in xrange(len(self)): yield self[docnum] class VarBytesListColumn(ListColumn): def __init__(self): self._child = VarBytesColumn() class Writer(WrappedColumnWriter): def add(self, docnum, ls): out = [varint(len(ls))] for v in ls: assert isinstance(v, bytes_type) out.append(varint(len(v))) out.append(v) self._child.add(emptybytes.join(out)) class Reader(WrappedColumnReader, ListColumnReader): def __getitem__(self, docnum): bio = BytesIO(self._child[docnum]) count = bio.read_varint() out = [] for _ in xrange(count): vlen = bio.read_varint() v = bio.read(vlen) out.append(v) return out class FixedBytesListColumn(ListColumn): def __init__(self, fixedlen): self._fixedlen = fixedlen self._child = VarBytesColumn() def writer(self, *args, **kwargs): return self.Writer(self._child.writer(*args, **kwargs), self._fixedlen) def reader(self, *args, **kwargs): return self.Reader(self._child.reader(*args, **kwargs), self._fixedlen) class Writer(WrappedColumnWriter): def __init__(self, child, fixedlen): self._child = child self._fixedlen = fixedlen self._lengths = GrowableArray() self._count = 0 def add(self, docnum, ls): out = [] for v in ls: assert len(v) == self._fixedlen out.append(v) b = emptybytes.join(out) self._child.add(docnum, b) class Reader(WrappedColumnReader, ListColumnReader): def __init__(self, child, fixedlen): self._child = child self._fixedlen = fixedlen def __getitem__(self, docnum): fixedlen = self._fixedlen v = self._child[docnum] if not v: return [] ls = [v[i:i + fixedlen] for i in xrange(0, len(v), fixedlen)] return ls #class RefListColumn(Column): # def __init__(self, fixedlen=0): # """ # :param fixedlen: an optional fixed length for the values. If you # specify a number other than 0, the column will require all values # to be the specified length. # :param default: a default value to use for documents that don't specify # one. If you don't specify a default, the column will use an empty # bytestring (``b''``), or if you specify a fixed length, # ``b'\\x00' * fixedlen``. 
# """ # # self._fixedlen = fixedlen # # def stores_lists(self): # return True # # def writer(self, dbfile): # return self.Writer(dbfile, self._fixedlen) # # def reader(self, dbfile, basepos, length, doccount): # return self.Reader(dbfile, basepos, length, doccount, self._fixedlen) # # class Writer(ColumnWriter): # def __init__(self, dbfile, fixedlen): # self._dbfile = dbfile # self._fixedlen = fixedlen # # self._refs = GrowableArray(allow_longs=False) # self._lengths = GrowableArray(allow_longs=False) # self._count = 0 # # def __repr__(self): # return "" # # def fill(self, docnum): # if docnum > self._count: # self._lengths.extend(0 for _ in xrange(docnum - self._count)) # # def add(self, docnum, ls): # uniques = self._uniques # refs = self._refs # # self.fill(docnum) # self._lengths.append(len(ls)) # for v in ls: # try: # i = uniques[v] # except KeyError: # uniques[v] = i = len(uniques) # refs.append(i) # # self._count = docnum + 1 # # def finish(self, doccount): # dbfile = self._dbfile # refs = self._refs.array # lengths = self._lengths.array # # self.fill(doccount) # dbfile.write_byte(ord(lengths.typecode)) # dbfile.write_array(lengths) # dbfile.write_byte(ord(refs.typecode)) # self._write_uniques(refs.typecode) # dbfile.write_array(refs) # # class Reader(ListColumnReader): # def __init__(self, dbfile, basepos, length, doccount, fixedlen): # self._dbfile = dbfile # self._basepos = basepos # self._doccount = doccount # self._fixedlen = fixedlen # # dbfile.seek(basepos) # lencode = chr(dbfile.read_byte()) # self._lengths = dbfile.read_array(lencode, doccount) # # self._typecode = chr(dbfile.read_byte()) # refst = struct.Struct("!" + self._typecode) # self._unpack = refst.unpack # self._itemsize = refst.size # # self._read_uniques() # self._refbase = dbfile.tell() # # # Create an array of offsets into the references using the lengths # offsets = array("i", (0,)) # for length in self._lengths: # offsets.append(offsets[-1] + length) # self._offsets = offsets # # def __repr__(self): # return "" # # def _get_ref(self, docnum): # pos = self._basepos + 1 + docnum * self._itemsize # return self._unpack(self._dbfile.get(pos, self._itemsize))[0] # # def __getitem__(self, docnum): # offset = self._offsets[docnum] # length = self._lengths[docnum] # # pos = self._refbase + offset * self._itemsize # reflist = self._dbfile.get_array(pos, self._typecode, length) # return [self._uniques[ref] for ref in reflist] Whoosh-2.5.7/src/whoosh/compat.py0000644000076500000240000001231212254366350017024 0ustar mattstaff00000000000000import array, sys # Run time aliasing of Python2/3 differences def htmlescape(s, quote=True): # this is html.escape reimplemented with cgi.escape, # so it works for python 2.x, 3.0 and 3.1 import cgi s = cgi.escape(s, quote) if quote: # python 3.2 also replaces the single quotes: s = s.replace("'", "'") return s if sys.version_info[0] < 3: PY3 = False def b(s): return s import cStringIO as StringIO StringIO = BytesIO = StringIO.StringIO callable = callable integer_types = (int, long) iteritems = lambda o: o.iteritems() itervalues = lambda o: o.itervalues() iterkeys = lambda o: o.iterkeys() from itertools import izip long_type = long next = lambda o: o.next() import cPickle as pickle from cPickle import dumps, loads, dump, load string_type = basestring text_type = unicode bytes_type = str unichr = unichr from urllib import urlretrieve def byte(num): return chr(num) def u(s): return unicode(s, "unicode_escape") def with_metaclass(meta, base=object): class _WhooshBase(base): __metaclass__ = 
meta return _WhooshBase xrange = xrange zip_ = zip def memoryview_(source, offset=None, length=None): if offset or length: return buffer(source, offset, length) else: return buffer(source) else: PY3 = True import collections def b(s): return s.encode("latin-1") import io BytesIO = io.BytesIO callable = lambda o: isinstance(o, collections.Callable) exec_ = eval("exec") integer_types = (int,) iteritems = lambda o: o.items() itervalues = lambda o: o.values() iterkeys = lambda o: iter(o.keys()) izip = zip long_type = int next = next import pickle from pickle import dumps, loads, dump, load StringIO = io.StringIO string_type = str text_type = str bytes_type = bytes unichr = chr from urllib.request import urlretrieve def byte(num): return bytes((num,)) def u(s): if isinstance(s, bytes): return s.decode("ascii") return s def with_metaclass(meta, base=object): ns = dict(base=base, meta=meta) exec_("""class _WhooshBase(base, metaclass=meta): pass""", ns) return ns["_WhooshBase"] xrange = range zip_ = lambda * args: list(zip(*args)) def memoryview_(source, offset=None, length=None): mv = memoryview(source) if offset or length: return mv[offset:offset + length] else: return mv try: # for python >= 3.2, avoid DeprecationWarning for cgi.escape from html import escape as htmlescape except ImportError: pass if hasattr(array.array, "tobytes"): def array_tobytes(arry): return arry.tobytes() def array_frombytes(arry, bs): return arry.frombytes(bs) else: def array_tobytes(arry): return arry.tostring() def array_frombytes(arry, bs): return arry.fromstring(bs) # Implementations missing from older versions of Python try: from itertools import permutations # @UnusedImport except ImportError: # Python 2.5 def permutations(iterable, r=None): pool = tuple(iterable) n = len(pool) r = n if r is None else r if r > n: return indices = range(n) cycles = range(n, n - r, -1) yield tuple(pool[i] for i in indices[:r]) while n: for i in reversed(range(r)): cycles[i] -= 1 if cycles[i] == 0: indices[i:] = indices[i + 1:] + indices[i:i + 1] cycles[i] = n - i else: j = cycles[i] indices[i], indices[-j] = indices[-j], indices[i] yield tuple(pool[i] for i in indices[:r]) break else: return try: # Python 2.6-2.7 from itertools import izip_longest # @UnusedImport except ImportError: try: # Python 3.0 from itertools import zip_longest as izip_longest # @UnusedImport except ImportError: # Python 2.5 from itertools import chain, izip, repeat def izip_longest(*args, **kwds): fillvalue = kwds.get('fillvalue') def sentinel(counter=([fillvalue] * (len(args) - 1)).pop): yield counter() fillers = repeat(fillvalue) iters = [chain(it, sentinel(), fillers) for it in args] try: for tup in izip(*iters): yield tup except IndexError: pass try: from operator import methodcaller # @UnusedImport except ImportError: # Python 2.5 def methodcaller(name, *args, **kwargs): def caller(obj): return getattr(obj, name)(*args, **kwargs) return caller try: from abc import abstractmethod # @UnusedImport except ImportError: # Python 2.5 def abstractmethod(funcobj): """A decorator indicating abstract methods. """ funcobj.__isabstractmethod__ = True return funcobj Whoosh-2.5.7/src/whoosh/externalsort.py0000644000076500000240000001746112254366350020305 0ustar mattstaff00000000000000# Copyright 2011 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. 
Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module implements a general external merge sort for Python objects. """ from __future__ import with_statement import os, tempfile from heapq import heapify, heappop, heapreplace from whoosh.compat import dump, load ## Python 3.2 had a bug that make marshal.load unusable #if (hasattr(platform, "python_implementation") # and platform.python_implementation() == "CPython" # and platform.python_version() == "3.2.0"): # # Use pickle instead of marshal on Python 3.2 # from whoosh.compat import dump as dump_pickle # from whoosh.compat import load # # def dump(obj, f): # dump_pickle(obj, f, -1) #else: # from marshal import dump, load try: from heapq import merge def imerge(iterables): return merge(*iterables) except ImportError: def imerge(iterables): _hpop, _hreplace, _Stop = (heappop, heapreplace, StopIteration) h = [] h_append = h.append for itnum, it in enumerate(map(iter, iterables)): try: nx = it.next h_append([nx(), itnum, nx]) except _Stop: pass heapify(h) while 1: try: while 1: v, itnum, nx = s = h[0] yield v s[0] = nx() _hreplace(h, s) except _Stop: _hpop(h) except IndexError: return class SortingPool(object): """This object implements a general K-way external merge sort for Python objects. >>> pool = MergePool() >>> # Add an unlimited number of items in any order >>> for item in my_items: ... pool.add(item) ... >>> # Get the items back in sorted order >>> for item in pool.items(): ... print(item) This class uses the `marshal` module to write the items to temporary files, so you can only sort marshal-able types (generally: numbers, strings, tuples, lists, and dicts). """ def __init__(self, maxsize=1000000, tempdir=None, prefix="", suffix=".run"): """ :param maxsize: the maximum number of items to keep in memory at once. :param tempdir: the path of a directory to use for temporary file storage. The default is to use the system's temp directory. :param prefix: a prefix to add to temporary filenames. :param suffix: a suffix to add to temporary filenames. 
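An illustrative sketch (hypothetical values; run files are written to the
system temp directory unless ``tempdir`` is given):

>>> pool = SortingPool(maxsize=4, prefix="whoosh-", suffix=".run")
>>> for n in (3, 1, 4, 1, 5, 9, 2, 6):
...     pool.add(n)
>>> list(pool.items())
[1, 1, 2, 3, 4, 5, 6, 9]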
""" self.tempdir = tempdir if maxsize < 1: raise ValueError("maxsize=%s must be >= 1" % maxsize) self.maxsize = maxsize self.prefix = prefix self.suffix = suffix # Current run queue self.current = [] # List of run filenames self.runs = [] def _new_run(self): fd, path = tempfile.mkstemp(prefix=self.prefix, suffix=self.suffix, dir=self.tempdir) f = os.fdopen(fd, "wb") return path, f def _open_run(self, path): return open(path, "rb") def _remove_run(self, path): os.remove(path) def _read_run(self, path): f = self._open_run(path) try: while True: yield load(f) except EOFError: return finally: f.close() self._remove_run(path) def _merge_runs(self, paths): iters = [self._read_run(path) for path in paths] for item in imerge(iters): yield item def add(self, item): """Adds `item` to the pool to be sorted. """ if len(self.current) >= self.maxsize: self.save() self.current.append(item) def _write_run(self, f, items): for item in items: dump(item, f, -1) f.close() def _add_run(self, filename): self.runs.append(filename) def save(self): current = self.current if current: current.sort() path, f = self._new_run() self._write_run(f, current) self._add_run(path) self.current = [] def cleanup(self): for path in self.runs: try: os.remove(path) except OSError: pass def reduce_to(self, target, k): # Reduce the number of runs to "target" by merging "k" runs at a time if k < 2: raise ValueError("k=%s must be > 2" % k) if target < 1: raise ValueError("target=%s must be >= 1" % target) runs = self.runs while len(runs) > target: newpath, f = self._new_run() # Take k runs off the end of the run list tomerge = [] while runs and len(tomerge) < k: tomerge.append(runs.pop()) # Merge them into a new run and add it at the start of the list self._write_run(f, self._merge_runs(tomerge)) runs.insert(0, newpath) def items(self, maxfiles=128): """Returns a sorted list or iterator of the items in the pool. :param maxfiles: maximum number of files to open at once. """ if maxfiles < 2: raise ValueError("maxfiles=%s must be >= 2" % maxfiles) if not self.runs: # We never wrote a run to disk, so just sort the queue in memory # and return that return sorted(self.current) # Write a new run with the leftover items in the queue self.save() # If we have more runs than allowed open files, merge some of the runs if maxfiles < len(self.runs): self.reduce_to(maxfiles, maxfiles) # Take all the runs off the run list and merge them runs = self.runs self.runs = [] # Minor detail, makes this object reusable return self._merge_runs(runs) def sort(items, maxsize=100000, tempdir=None, maxfiles=128): """Sorts the given items using an external merge sort. :param tempdir: the path of a directory to use for temporary file storage. The default is to use the system's temp directory. :param maxsize: the maximum number of items to keep in memory at once. :param maxfiles: maximum number of files to open at once. """ p = SortingPool(maxsize=maxsize, tempdir=tempdir) for item in items: p.add(item) return p.items(maxfiles=maxfiles) Whoosh-2.5.7/src/whoosh/fields.py0000644000076500000240000014471012254366764017030 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. 
Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ Contains functions and classes related to fields. """ import datetime, fnmatch, re, struct, sys from array import array from decimal import Decimal from whoosh import analysis, columns, formats from whoosh.compat import u, b, PY3 from whoosh.compat import with_metaclass from whoosh.compat import itervalues, xrange from whoosh.compat import bytes_type, string_type, integer_types, text_type from whoosh.system import emptybytes from whoosh.system import pack_byte, unpack_byte from whoosh.util.numeric import to_sortable, from_sortable from whoosh.util.numeric import typecode_max, NaN from whoosh.util.text import utf8encode, utf8decode from whoosh.util.times import datetime_to_long, long_to_datetime # Exceptions class FieldConfigurationError(Exception): pass class UnknownFieldError(Exception): pass # Field Types class FieldType(object): """Represents a field configuration. The FieldType object supports the following attributes: * format (formats.Format): the storage format for the field's contents. * analyzer (analysis.Analyzer): the analyzer to use to turn text into terms. * vector (formats.Format): the storage format for the field's vectors (forward index), or None if the field should not store vectors. * scorable (boolean): whether searches against this field may be scored. This controls whether the index stores per-document field lengths for this field. * stored (boolean): whether the content of this field is stored for each document. For example, in addition to indexing the title of a document, you usually want to store the title so it can be presented as part of the search results. * unique (boolean): whether this field's value is unique to each document. For example, 'path' or 'ID'. IndexWriter.update_document() will use fields marked as 'unique' to find the previous version of a document being updated. * multitoken_query is a string indicating what kind of query to use when a "word" in a user query parses into multiple tokens. The string is interpreted by the query parser. The strings understood by the default query parser are "first" (use first token only), "and" (join the tokens with an AND query), "or" (join the tokens with OR), "phrase" (join the tokens with a phrase query), and "default" (use the query parser's default join type). 
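As a purely illustrative sketch (assuming ``formats.Frequency`` and
``analysis.StandardAnalyzer`` accept their default arguments), a
hand-configured field type might combine these attributes like so; in
practice most users rely on preconfigured subclasses such as
:class:`TEXT`, :class:`ID`, and :class:`KEYWORD`:

>>> from whoosh import analysis, formats
>>> ft = FieldType(format=formats.Frequency(),
...                analyzer=analysis.StandardAnalyzer(),
...                stored=True)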
The constructor for the base field type simply lets you supply your own configured field format, vector format, and scorable and stored values. Subclasses may configure some or all of this for you. """ analyzer = format = vector = scorable = stored = unique = None indexed = True multitoken_query = "default" sortable_typecode = None spelling = False column_type = None def __init__(self, format, analyzer, vector=None, scorable=False, stored=False, unique=False, multitoken_query="default", sortable=False): assert isinstance(format, formats.Format) self.format = format self.analyzer = analyzer self.vector = vector self.scorable = scorable self.stored = stored self.unique = unique self.multitoken_query = multitoken_query self.set_sortable(sortable) def __repr__(self): temp = "%s(format=%r, vector=%r, scorable=%s, stored=%s, unique=%s)" return temp % (self.__class__.__name__, self.format, self.vector, self.scorable, self.stored, self.unique) def __eq__(self, other): return all((isinstance(other, FieldType), (self.format == other.format), (self.vector == other.vector), (self.scorable == other.scorable), (self.stored == other.stored), (self.unique == other.unique), (self.column_type == other.column_type))) def __ne__(self, other): return not(self.__eq__(other)) # Column methods def set_sortable(self, sortable): if sortable: if isinstance(sortable, columns.Column): self.column_type = sortable else: self.column_type = self.default_column() else: self.column_type = None def default_column(self): return columns.VarBytesColumn() # Methods for converting input into indexing information def index(self, value, **kwargs): """Returns an iterator of (btext, frequency, weight, encoded_value) tuples for each unique word in the input value. The default implementation uses the ``analyzer`` attribute to tokenize the value into strings, then encodes them into bytes using UTF-8. """ if not self.format: raise Exception("%s field %r cannot index without a format" % (self.__class__.__name__, self)) if not isinstance(value, (text_type, list, tuple)): raise ValueError("%r is not unicode or sequence" % value) assert isinstance(self.format, formats.Format) if "mode" not in kwargs: kwargs["mode"] = "index" word_values = self.format.word_values ana = self.analyzer for tstring, freq, wt, vbytes in word_values(value, ana, **kwargs): yield (utf8encode(tstring)[0], freq, wt, vbytes) def process_text(self, qstring, mode='', **kwargs): """Analyzes the given string and returns an iterator of token texts. >>> field = fields.TEXT() >>> list(field.process_text("The ides of March")) ["ides", "march"] """ if not self.format: raise Exception("%s field has no format" % self) return (t.text for t in self.tokenize(qstring, mode=mode, **kwargs)) def tokenize(self, value, **kwargs): """Analyzes the given string and returns an iterator of Token objects (note: for performance reasons, actually the same token yielded over and over with different attributes). """ if not self.analyzer: raise Exception("%s field has no analyzer" % self.__class__) return self.analyzer(value, **kwargs) def to_bytes(self, value): """Returns a bytes representation of the given value, appropriate to be written to disk. The default implementation assumes a unicode value and encodes it using UTF-8. """ if isinstance(value, (list, tuple)): value = value[0] if not isinstance(value, bytes_type): value = utf8encode(value)[0] return value def to_column_value(self, value): """Returns an object suitable to be inserted into the document values column for this field. 
The default implementation simply calls ``self.to_bytes(value)``. """ return self.to_bytes(value) def from_column_value(self, value): return self.from_bytes(value) def from_bytes(self, bs): return utf8decode(bs)[0] # Methods related to query parsing def self_parsing(self): """Subclasses should override this method to return True if they want the query parser to call the field's ``parse_query()`` method instead of running the analyzer on text in this field. This is useful where the field needs full control over how queries are interpreted, such as in the numeric field type. """ return False def parse_query(self, fieldname, qstring, boost=1.0): """When ``self_parsing()`` returns True, the query parser will call this method to parse basic query text. """ raise NotImplementedError(self.__class__.__name__) def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): """When ``self_parsing()`` returns True, the query parser will call this method to parse range query text. If this method returns None instead of a query object, the parser will fall back to parsing the start and end terms using process_text(). """ return None # Methods related to sortings def sortable_terms(self, ixreader, fieldname): """Returns an iterator of the "sortable" tokens in the given reader and field. These values can be used for sorting. The default implementation simply returns all tokens in the field. This can be overridden by field types such as NUMERIC where some values in a field are not useful for sorting. """ return ixreader.lexicon(fieldname) # Methods related to spelling def separate_spelling(self): """Returns True if this field requires special handling of the words that go into the field's word graph. The default behavior is to return True if the field is "spelled" but not indexed, or if the field is indexed but the analyzer has morphological transformations (e.g. stemming). Exotic field types may need to override this behavior. This method should return False if the field does not support spelling (i.e. the ``spelling`` attribute is False). """ return self.spelling and self.analyzer.has_morph() def spellable_words(self, value): """Returns an iterator of each unique word (in sorted order) in the input value, suitable for inclusion in the field's word graph. The default behavior is to call the field analyzer with the keyword argument ``no_morph=True``, which should make the analyzer skip any morphological transformation filters (e.g. stemming) to preserve the original form of the words. Exotic field types may need to override this behavior. """ if isinstance(value, (list, tuple)): words = value else: words = [token.text for token in self.analyzer(value, no_morph=True)] return iter(sorted(set(words))) def has_morph(self): """Returns True if this field by default performs morphological transformations on its terms, e.g. stemming. """ if self.analyzer: return self.analyzer.has_morph() else: return False # Methods related to the posting/vector formats def supports(self, name): """Returns True if the underlying format supports the given posting value type. >>> field = TEXT() >>> field.supports("positions") True >>> field.supports("characters") False """ return self.format.supports(name) def clean(self): """Clears any cached information in the field and any child objects. 
""" if self.format and hasattr(self.format, "clean"): self.format.clean() if self.vector and hasattr(self.vector, "clean"): self.vector.clean() # Event methods def on_add(self, schema, fieldname): pass def on_remove(self, schema, fieldname): pass class ID(FieldType): """Configured field type that indexes the entire value of the field as one token. This is useful for data you don't want to tokenize, such as the path of a file. """ __inittypes__ = dict(stored=bool, unique=bool, field_boost=float) def __init__(self, stored=False, unique=False, field_boost=1.0, spelling=False, sortable=False, analyzer=None): """ :param stored: Whether the value of this field is stored with the document. """ self.analyzer = analyzer or analysis.IDAnalyzer() self.format = formats.Existence(field_boost=field_boost) self.stored = stored self.unique = unique self.spelling = spelling self.set_sortable(sortable) class IDLIST(FieldType): """Configured field type for fields containing IDs separated by whitespace and/or punctuation (or anything else, using the expression param). """ __inittypes__ = dict(stored=bool, unique=bool, expression=bool, field_boost=float) def __init__(self, stored=False, unique=False, expression=None, field_boost=1.0, spelling=False): """ :param stored: Whether the value of this field is stored with the document. :param unique: Whether the value of this field is unique per-document. :param expression: The regular expression object to use to extract tokens. The default expression breaks tokens on CRs, LFs, tabs, spaces, commas, and semicolons. """ expression = expression or re.compile(r"[^\r\n\t ,;]+") self.analyzer = analysis.RegexAnalyzer(expression=expression) self.format = formats.Existence(field_boost=field_boost) self.stored = stored self.unique = unique self.spelling = spelling class NUMERIC(FieldType): """Special field type that lets you index integer or floating point numbers in relatively short fixed-width terms. The field converts numbers to sortable bytes for you before indexing. You specify the numeric type of the field (``int`` or ``float``) when you create the ``NUMERIC`` object. The default is ``int``. For ``int``, you can specify a size in bits (``32`` or ``64``). For both ``int`` and ``float`` you can specify a ``signed`` keyword argument (default is ``True``). >>> schema = Schema(path=STORED, position=NUMERIC(int, 64, signed=False)) >>> ix = storage.create_index(schema) >>> with ix.writer() as w: ... w.add_document(path="/a", position=5820402204) ... You can also use the NUMERIC field to store Decimal instances by specifying a type of ``int`` or ``long`` and the ``decimal_places`` keyword argument. This simply multiplies each number by ``(10 ** decimal_places)`` before storing it as an integer. Of course this may throw away decimal prcesision (by truncating, not rounding) and imposes the same maximum value limits as ``int``/``long``, but these may be acceptable for certain applications. >>> from decimal import Decimal >>> schema = Schema(path=STORED, position=NUMERIC(int, decimal_places=4)) >>> ix = storage.create_index(schema) >>> with ix.writer() as w: ... w.add_document(path="/a", position=Decimal("123.45") ... """ def __init__(self, numtype=int, bits=32, stored=False, unique=False, field_boost=1.0, decimal_places=0, shift_step=4, signed=True, sortable=False, default=None): """ :param numtype: the type of numbers that can be stored in this field, either ``int``, ``float``. 
If you use ``Decimal``, use the ``decimal_places`` argument to control how many decimal places the field will store. :param bits: When ``numtype`` is ``int``, the number of bits to use to store the number: 8, 16, 32, or 64. :param stored: Whether the value of this field is stored with the document. :param unique: Whether the value of this field is unique per-document. :param decimal_places: specifies the number of decimal places to save when storing Decimal instances. If you set this, you will always get Decimal instances back from the field. :param shift_steps: The number of bits of precision to shift away at each tiered indexing level. Values should generally be 1-8. Lower values yield faster searches but take up more space. A value of `0` means no tiered indexing. :param signed: Whether the numbers stored in this field may be negative. """ # Allow users to specify strings instead of Python types in case # docstring isn't clear if numtype == "int": numtype = int if numtype == "float": numtype = float # Raise an error if the user tries to use a type other than int or # float if numtype is Decimal: numtype = int if not decimal_places: raise TypeError("To store Decimal instances, you must set the " "decimal_places argument") elif numtype not in (int, float): raise TypeError("Can't use %r as a type, use int or float" % numtype) # Sanity check if numtype is float and decimal_places: raise Exception("A float type and decimal_places argument %r are " "incompatible" % decimal_places) intsizes = [8, 16, 32, 64] intcodes = ["B", "H", "I", "Q"] # Set up field configuration based on type and size if numtype is float: bits = 64 # Floats are converted to 64 bit ints else: if bits not in intsizes: raise Exception("Invalid bits %r, use 8, 16, 32, or 64" % bits) # Type code for the *sortable* representation self.sortable_typecode = intcodes[intsizes.index(bits)] self._struct = struct.Struct(">" + self.sortable_typecode) self.numtype = numtype self.bits = bits self.stored = stored self.unique = unique self.decimal_places = decimal_places self.shift_step = shift_step self.signed = signed self.analyzer = analysis.IDAnalyzer() self.format = formats.Existence(field_boost=field_boost) self.min_value, self.max_value = self._min_max() # Column configuration if default is None: if numtype is int: default = typecode_max[self.sortable_typecode] else: default = NaN elif not self.is_valid(default): raise Exception("The default %r is not a valid number for this " "field" % default) self.default = default self.set_sortable(sortable) def __getstate__(self): d = self.__dict__.copy() if "_struct" in d: del d["_struct"] return d def __setstate__(self, d): self.__dict__.update(d) self._struct = struct.Struct(">" + self.sortable_typecode) if "min_value" not in d: d["min_value"], d["max_value"] = self._min_max() def _min_max(self): numtype = self.numtype bits = self.bits signed = self.signed # Calculate the minimum and maximum possible values for error checking min_value = from_sortable(numtype, bits, signed, 0) max_value = from_sortable(numtype, bits, signed, 2 ** bits - 1) return min_value, max_value def default_column(self): return columns.NumericColumn(self.sortable_typecode, default=self.default) def is_valid(self, x): try: x = self.to_bytes(x) except ValueError: return False except OverflowError: return False return True def index(self, num, **kwargs): # If the user gave us a list of numbers, recurse on the list if isinstance(num, (list, tuple)): for n in num: for item in self.index(n): yield item return # word, freq, 
weight, valuestring if self.shift_step: for shift in xrange(0, self.bits, self.shift_step): yield (self.to_bytes(num, shift), 1, 1.0, emptybytes) else: yield (self.to_bytes(num), 1, 1.0, emptybytes) def prepare_number(self, x): if x == emptybytes or x is None: return x dc = self.decimal_places if dc and isinstance(x, (string_type, Decimal)): x = Decimal(x) * (10 ** dc) elif isinstance(x, Decimal): raise TypeError("Can't index a Decimal object unless you specified " "decimal_places on the field") try: x = self.numtype(x) except OverflowError: raise ValueError("Value %r overflowed number type %r" % (x, self.numtype)) if x < self.min_value or x > self.max_value: raise ValueError("Numeric field value %s out of range [%s, %s]" % (x, self.min_value, self.max_value)) return x def unprepare_number(self, x): dc = self.decimal_places if dc: s = str(x) x = Decimal(s[:-dc] + "." + s[-dc:]) return x def to_column_value(self, x): if isinstance(x, (list, tuple, array)): x = x[0] x = self.prepare_number(x) return to_sortable(self.numtype, self.bits, self.signed, x) def from_column_value(self, x): x = from_sortable(self.numtype, self.bits, self.signed, x) return self.unprepare_number(x) def to_bytes(self, x, shift=0): # Try to avoid re-encoding; this sucks because on Python 2 we can't # tell the difference between a string and encoded bytes, so we have # to require the user use unicode when they mean string if isinstance(x, bytes_type): return x if x == emptybytes or x is None: return self.sortable_to_bytes(0) x = self.prepare_number(x) x = to_sortable(self.numtype, self.bits, self.signed, x) return self.sortable_to_bytes(x, shift) def sortable_to_bytes(self, x, shift=0): if shift: x >>= shift return pack_byte(shift) + self._struct.pack(x) def from_bytes(self, bs): x = self._struct.unpack(bs[1:])[0] x = from_sortable(self.numtype, self.bits, self.signed, x) x = self.unprepare_number(x) return x def process_text(self, text, **kwargs): return (self.to_bytes(text),) def self_parsing(self): return True def parse_query(self, fieldname, qstring, boost=1.0): from whoosh import query from whoosh.qparser.common import QueryParserError if qstring == "*": return query.Every(fieldname, boost=boost) if not self.is_valid(qstring): raise QueryParserError("%r is not a valid number" % qstring) token = self.to_bytes(qstring) return query.Term(fieldname, token, boost=boost) def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): from whoosh import query from whoosh.qparser.common import QueryParserError if start is not None: if not self.is_valid(start): raise QueryParserError("Range start %r is not a valid number" % start) start = self.prepare_number(start) if end is not None: if not self.is_valid(end): raise QueryParserError("Range end %r is not a valid number" % end) end = self.prepare_number(end) return query.NumericRange(fieldname, start, end, startexcl, endexcl, boost=boost) def sortable_terms(self, ixreader, fieldname): zero = b("\x00") for token in ixreader.lexicon(fieldname): if token[0:1] != zero: # Only yield the full-precision values break yield token class DATETIME(NUMERIC): """Special field type that lets you index datetime objects. The field converts the datetime objects to sortable text for you before indexing. Since this field is based on Python's datetime module it shares all the limitations of that module, such as the inability to represent dates before year 1 in the proleptic Gregorian calendar. 
However, since this field stores datetimes as an integer number of microseconds, it could easily represent a much wider range of dates if the Python datetime implementation ever supports them. >>> schema = Schema(path=STORED, date=DATETIME) >>> ix = storage.create_index(schema) >>> w = ix.writer() >>> w.add_document(path="/a", date=datetime.now()) >>> w.commit() """ __inittypes__ = dict(stored=bool, unique=bool) def __init__(self, stored=False, unique=False, sortable=False): """ :param stored: Whether the value of this field is stored with the document. :param unique: Whether the value of this field is unique per-document. """ super(DATETIME, self).__init__(int, 64, stored=stored, unique=unique, shift_step=8, sortable=sortable) def prepare_datetime(self, x): from whoosh.util.times import floor if isinstance(x, text_type): # For indexing, support same strings as for query parsing -- # convert unicode to datetime object x = self._parse_datestring(x) x = floor(x) # this makes most sense (unspecified = lowest) if isinstance(x, datetime.datetime): return datetime_to_long(x) elif isinstance(x, bytes_type): return x else: raise Exception("%r is not a datetime" % (x,)) def to_column_value(self, x): if isinstance(x, bytes_type): raise Exception("%r is not a datetime" % (x,)) if isinstance(x, (list, tuple)): x = x[0] return self.prepare_datetime(x) def from_column_value(self, x): return long_to_datetime(x) def to_bytes(self, x, shift=0): x = self.prepare_datetime(x) return NUMERIC.to_bytes(self, x, shift=shift) def from_bytes(self, bs): x = NUMERIC.from_bytes(self, bs) return long_to_datetime(x) def _parse_datestring(self, qstring): # This method parses a very simple datetime representation of the form # YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]] from whoosh.util.times import adatetime, fix, is_void qstring = qstring.replace(" ", "").replace("-", "").replace(".", "") year = month = day = hour = minute = second = microsecond = None if len(qstring) >= 4: year = int(qstring[:4]) if len(qstring) >= 6: month = int(qstring[4:6]) if len(qstring) >= 8: day = int(qstring[6:8]) if len(qstring) >= 10: hour = int(qstring[8:10]) if len(qstring) >= 12: minute = int(qstring[10:12]) if len(qstring) >= 14: second = int(qstring[12:14]) if len(qstring) == 20: microsecond = int(qstring[14:]) at = fix(adatetime(year, month, day, hour, minute, second, microsecond)) if is_void(at): raise Exception("%r is not a parseable date" % qstring) return at def parse_query(self, fieldname, qstring, boost=1.0): from whoosh import query from whoosh.util.times import is_ambiguous try: at = self._parse_datestring(qstring) except: e = sys.exc_info()[1] return query.error_query(e) if is_ambiguous(at): startnum = datetime_to_long(at.floor()) endnum = datetime_to_long(at.ceil()) return query.NumericRange(fieldname, startnum, endnum) else: return query.Term(fieldname, at, boost=boost) def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): from whoosh import query if start is None and end is None: return query.Every(fieldname, boost=boost) if start is not None: startdt = self._parse_datestring(start).floor() start = datetime_to_long(startdt) if end is not None: enddt = self._parse_datestring(end).ceil() end = datetime_to_long(enddt) return query.NumericRange(fieldname, start, end, boost=boost) class BOOLEAN(FieldType): """Special field type that lets you index boolean values (True and False). The field converts the boolean values to text for you before indexing. 
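The strings ``t``, ``true``, ``yes``, and ``1`` are interpreted as True, and
``f``, ``false``, ``no``, and ``0`` as False (see the ``trues`` and
``falses`` sets below). A small illustration:

>>> BOOLEAN().to_bytes("yes") == BOOLEAN().to_bytes(True)
True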
>>> schema = Schema(path=STORED, done=BOOLEAN) >>> ix = storage.create_index(schema) >>> w = ix.writer() >>> w.add_document(path="/a", done=False) >>> w.commit() """ bytestrings = (b("f"), b("t")) trues = frozenset(u("t true yes 1").split()) falses = frozenset(u("f false no 0").split()) __inittypes__ = dict(stored=bool, field_boost=float) def __init__(self, stored=False, field_boost=1.0): """ :param stored: Whether the value of this field is stored with the document. """ self.stored = stored self.field_boost = field_boost self.format = formats.Existence(field_boost=field_boost) def _obj_to_bool(self, x): # We special case strings such as "true", "false", "yes", "no", but # otherwise call bool() on the query value. This lets you pass objects # as query values and do the right thing. if isinstance(x, string_type) and x.lower() in self.trues: x = True elif isinstance(x, string_type) and x.lower() in self.falses: x = False else: x = bool(x) return x def to_bytes(self, x): if isinstance(x, bytes_type): return x elif isinstance(x, string_type): x = x.lower() in self.trues else: x = bool(x) bs = self.bytestrings[int(x)] return bs def index(self, bit, **kwargs): if isinstance(bit, string_type): bit = bit.lower() in self.trues else: bit = bool(bit) # word, freq, weight, valuestring return [(self.bytestrings[int(bit)], 1, 1.0, emptybytes)] def self_parsing(self): return True def parse_query(self, fieldname, qstring, boost=1.0): from whoosh import query if qstring == "*": return query.Every(fieldname, boost=boost) return query.Term(fieldname, self._obj_to_bool(qstring), boost=boost) class STORED(FieldType): """Configured field type for fields you want to store but not index. """ indexed = False stored = True def __init__(self): pass class COLUMN(FieldType): """Configured field type for fields you want to store as a per-document value column but not index. """ indexed = False stored = False def __init__(self, columnobj=None): if columnobj is None: columnobj = columns.VarBytesColumn() if not isinstance(columnobj, columns.Column): raise TypeError("%r is not a column object" % (columnobj,)) self.column_type = columnobj def to_bytes(self, v): return v def from_bytes(self, b): return b class KEYWORD(FieldType): """Configured field type for fields containing space-separated or comma-separated keyword-like data (such as tags). The default is to not store positional information (so phrase searching is not allowed in this field) and to not make the field scorable. """ __inittypes__ = dict(stored=bool, lowercase=bool, commas=bool, scorable=bool, unique=bool, field_boost=float) def __init__(self, stored=False, lowercase=False, commas=False, vector=None, scorable=False, unique=False, field_boost=1.0, spelling=False, sortable=False): """ :param stored: Whether to store the value of the field with the document. :param comma: Whether this is a comma-separated field. If this is False (the default), it is treated as a space-separated field. :param scorable: Whether this field is scorable. 
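As a minimal sketch (assuming the usual imports from ``whoosh.fields``),
a stored, comma-separated, lowercased tags field might be declared as::

    schema = Schema(path=ID(stored=True),
                    tags=KEYWORD(stored=True, commas=True, lowercase=True))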
""" self.analyzer = analysis.KeywordAnalyzer(lowercase=lowercase, commas=commas) self.format = formats.Frequency(field_boost=field_boost) self.scorable = scorable self.stored = stored self.unique = unique self.spelling = spelling if vector: if type(vector) is type: vector = vector() elif isinstance(vector, formats.Format): pass else: vector = self.format else: vector = None self.vector = vector if sortable: self.column_type = self.default_column() class TEXT(FieldType): """Configured field type for text fields (for example, the body text of an article). The default is to store positional information to allow phrase searching. This field type is always scorable. """ __inittypes__ = dict(analyzer=analysis.Analyzer, phrase=bool, vector=object, stored=bool, field_boost=float) def __init__(self, analyzer=None, phrase=True, chars=False, vector=None, stored=False, field_boost=1.0, multitoken_query="default", spelling=False, sortable=False, lang=None): """ :param analyzer: The analysis.Analyzer to use to index the field contents. See the analysis module for more information. If you omit this argument, the field uses analysis.StandardAnalyzer. :param phrase: Whether the store positional information to allow phrase searching. :param chars: Whether to store character ranges along with positions. If this is True, "phrase" is also implied. :param vector: A :class:`whoosh.formats.Format` object to use to store term vectors, or ``True`` to store vectors using the same format as the inverted index, or ``None`` or ``False`` to not store vectors. By default, fields do not store term vectors. :param stored: Whether to store the value of this field with the document. Since this field type generally contains a lot of text, you should avoid storing it with the document unless you need to, for example to allow fast excerpts in the search results. :param spelling: Whether to generate word graphs for this field to make spelling suggestions much faster. :param sortable: If True, make this field sortable using the default column type. If you pass a :class:`whoosh.columns.Column` instance instead of True, the field will use the given column type. :param lang: automaticaly configure a :class:`whoosh.analysis.LanguageAnalyzer` for the given language. This is ignored if you also specify an ``analyzer``. """ if analyzer: self.analyzer = analyzer elif lang: self.analyzer = analysis.LanguageAnalyzer(lang) else: self.analyzer = analysis.StandardAnalyzer() if chars: formatclass = formats.Characters elif phrase: formatclass = formats.Positions else: formatclass = formats.Frequency self.format = formatclass(field_boost=field_boost) if vector: if type(vector) is type: vector = vector() elif isinstance(vector, formats.Format): pass else: vector = formatclass() else: vector = None self.vector = vector if sortable: if isinstance(sortable, columns.Column): self.column_type = sortable else: self.column_type = columns.VarBytesColumn() else: self.column_type = None self.multitoken_query = multitoken_query self.scorable = True self.stored = stored self.spelling = spelling class NGRAM(FieldType): """Configured field that indexes text as N-grams. For example, with a field type NGRAM(3,4), the value "hello" will be indexed as tokens "hel", "hell", "ell", "ello", "llo". This field type chops the entire text into N-grams, including whitespace and punctuation. See :class:`NGRAMWORDS` for a field type that breaks the text into words first before chopping the words into N-grams. 
""" __inittypes__ = dict(minsize=int, maxsize=int, stored=bool, field_boost=float, queryor=bool, phrase=bool) scorable = True def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0, queryor=False, phrase=False, sortable=False): """ :param minsize: The minimum length of the N-grams. :param maxsize: The maximum length of the N-grams. :param stored: Whether to store the value of this field with the document. Since this field type generally contains a lot of text, you should avoid storing it with the document unless you need to, for example to allow fast excerpts in the search results. :param queryor: if True, combine the N-grams with an Or query. The default is to combine N-grams with an And query. :param phrase: store positions on the N-grams to allow exact phrase searching. The default is off. """ formatclass = formats.Frequency if phrase: formatclass = formats.Positions self.analyzer = analysis.NgramAnalyzer(minsize, maxsize) self.format = formatclass(field_boost=field_boost) self.stored = stored self.queryor = queryor self.set_sortable(sortable) def self_parsing(self): return True def parse_query(self, fieldname, qstring, boost=1.0): from whoosh import query terms = [query.Term(fieldname, g) for g in self.process_text(qstring, mode='query')] cls = query.Or if self.queryor else query.And return cls(terms, boost=boost) class NGRAMWORDS(NGRAM): """Configured field that chops text into words using a tokenizer, lowercases the words, and then chops the words into N-grams. """ __inittypes__ = dict(minsize=int, maxsize=int, stored=bool, field_boost=float, tokenizer=analysis.Tokenizer, at=str, queryor=bool) scorable = True def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0, tokenizer=None, at=None, queryor=False, sortable=False): """ :param minsize: The minimum length of the N-grams. :param maxsize: The maximum length of the N-grams. :param stored: Whether to store the value of this field with the document. Since this field type generally contains a lot of text, you should avoid storing it with the document unless you need to, for example to allow fast excerpts in the search results. :param tokenizer: an instance of :class:`whoosh.analysis.Tokenizer` used to break the text into words. :param at: if 'start', only takes N-grams from the start of the word. If 'end', only takes N-grams from the end. Otherwise the default is to take all N-grams from each word. :param queryor: if True, combine the N-grams with an Or query. The default is to combine N-grams with an And query. """ self.analyzer = analysis.NgramWordAnalyzer(minsize, maxsize, tokenizer, at=at) self.format = formats.Frequency(field_boost=field_boost) self.stored = stored self.queryor = queryor self.set_sortable(sortable) # Schema class class MetaSchema(type): def __new__(cls, name, bases, attrs): super_new = super(MetaSchema, cls).__new__ if not any(b for b in bases if isinstance(b, MetaSchema)): # If this isn't a subclass of MetaSchema, don't do anything special return super_new(cls, name, bases, attrs) # Create the class special_attrs = {} for key in list(attrs.keys()): if key.startswith("__"): special_attrs[key] = attrs.pop(key) new_class = super_new(cls, name, bases, special_attrs) fields = {} for b in bases: if hasattr(b, "_clsfields"): fields.update(b._clsfields) fields.update(attrs) new_class._clsfields = fields return new_class def schema(self): return Schema(**self._clsfields) class Schema(object): """Represents the collection of fields in an index. 
Maps field names to FieldType objects which define the behavior of each field. Low-level parts of the index use field numbers instead of field names for compactness. This class has several methods for converting between the field name, field number, and field object itself. """ def __init__(self, **fields): """ All keyword arguments to the constructor are treated as fieldname = fieldtype pairs. The fieldtype can be an instantiated FieldType object, or a FieldType sub-class (in which case the Schema will instantiate it with the default constructor before adding it). For example:: s = Schema(content = TEXT, title = TEXT(stored = True), tags = KEYWORD(stored = True)) """ self._fields = {} self._dyn_fields = {} for name in sorted(fields.keys()): self.add(name, fields[name]) def copy(self): """Returns a shallow copy of the schema. The field instances are not deep copied, so they are shared between schema copies. """ return self.__class__(**self._fields) def __eq__(self, other): return (other.__class__ is self.__class__ and list(self.items()) == list(other.items())) def __ne__(self, other): return not(self.__eq__(other)) def __repr__(self): return "<%s: %r>" % (self.__class__.__name__, self.names()) def __iter__(self): """Returns the field objects in this schema. """ return iter(self._fields.values()) def __getitem__(self, name): """Returns the field associated with the given field name. """ if name in self._fields: return self._fields[name] for expr, fieldtype in itervalues(self._dyn_fields): if expr.match(name): return fieldtype raise KeyError("No field named %r" % (name,)) def __len__(self): """Returns the number of fields in this schema. """ return len(self._fields) def __contains__(self, fieldname): """Returns True if a field by the given name is in this schema. """ # Defined in terms of __getitem__ so that there's only one method to # override to provide dynamic fields try: field = self[fieldname] return field is not None except KeyError: return False def items(self): """Returns a list of ("fieldname", field_object) pairs for the fields in this schema. """ return sorted(self._fields.items()) def names(self, check_names=None): """Returns a list of the names of the fields in this schema. :param check_names: (optional) sequence of field names to check whether the schema accepts them as (dynamic) field names - acceptable names will also be in the result list. Note: You may also have static field names in check_names, that won't create duplicates in the result list. Unsupported names will not be in the result list. """ fieldnames = set(self._fields.keys()) if check_names is not None: check_names = set(check_names) - fieldnames fieldnames.update(fieldname for fieldname in check_names if fieldname in self) return sorted(fieldnames) def clean(self): for field in self: field.clean() def add(self, name, fieldtype, glob=False): """Adds a field to this schema. :param name: The name of the field. :param fieldtype: An instantiated fields.FieldType object, or a FieldType subclass. If you pass an instantiated object, the schema will use that as the field configuration for this field. If you pass a FieldType subclass, the schema will automatically instantiate it with the default constructor. 
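:param glob: if True, the name is treated as an ``fnmatch``-style glob
    pattern and the field becomes a dynamic field, matching any field
    name that fits the pattern.

As a minimal sketch::

    s = Schema()
    s.add("content", TEXT)
    s.add("tag_*", KEYWORD(stored=True), glob=True)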
""" # Check field name if name.startswith("_"): raise FieldConfigurationError("Field names cannot start with an " "underscore") if " " in name: raise FieldConfigurationError("Field names cannot contain spaces") if name in self._fields or (glob and name in self._dyn_fields): raise FieldConfigurationError("Schema already has a field %r" % name) # If the user passed a type rather than an instantiated field object, # instantiate it automatically if type(fieldtype) is type: try: fieldtype = fieldtype() except: e = sys.exc_info()[1] raise FieldConfigurationError("Error: %s instantiating field " "%r: %r" % (e, name, fieldtype)) if not isinstance(fieldtype, FieldType): raise FieldConfigurationError("%r is not a FieldType object" % fieldtype) if glob: expr = re.compile(fnmatch.translate(name)) self._dyn_fields[name] = (expr, fieldtype) else: fieldtype.on_add(self, name) self._fields[name] = fieldtype def remove(self, fieldname): if fieldname in self._fields: self._fields[fieldname].on_remove(self, fieldname) del self._fields[fieldname] elif fieldname in self._dyn_fields: del self._dyn_fields[fieldname] else: raise KeyError("No field named %r" % fieldname) def has_vectored_fields(self): """Returns True if any of the fields in this schema store term vectors. """ return any(ftype.vector for ftype in self) def has_scorable_fields(self): return any(ftype.scorable for ftype in self) def stored_names(self): """Returns a list of the names of fields that are stored. """ return [name for name, field in self.items() if field.stored] def scorable_names(self): """Returns a list of the names of fields that store field lengths. """ return [name for name, field in self.items() if field.scorable] def vector_names(self): """Returns a list of the names of fields that store vectors. """ return [name for name, field in self.items() if field.vector] def separate_spelling_names(self): """Returns a list of the names of fields that require special handling for generating spelling graphs... either because they store graphs but aren't indexed, or because the analyzer is stemmed. """ return [name for name, field in self.items() if field.spelling and field.separate_spelling()] class SchemaClass(with_metaclass(MetaSchema, Schema)): """Allows you to define a schema using declarative syntax, similar to Django models:: class MySchema(SchemaClass): path = ID date = DATETIME content = TEXT You can use inheritance to share common fields between schemas:: class Parent(SchemaClass): path = ID(stored=True) date = DATETIME class Child1(Parent): content = TEXT(positions=False) class Child2(Parent): tags = KEYWORD This class overrides ``__new__`` so instantiating your sub-class always results in an instance of ``Schema``. >>> class MySchema(SchemaClass): ... title = TEXT(stored=True) ... content = TEXT ... 
>>> s = MySchema() >>> type(s) """ def __new__(cls, *args, **kwargs): obj = super(Schema, cls).__new__(Schema) kw = getattr(cls, "_clsfields", {}) kw.update(kwargs) obj.__init__(*args, **kw) return obj def ensure_schema(schema): if isinstance(schema, type) and issubclass(schema, Schema): schema = schema.schema() if not isinstance(schema, Schema): raise FieldConfigurationError("%r is not a Schema" % schema) return schema def merge_fielddict(d1, d2): keyset = set(d1.keys()) | set(d2.keys()) out = {} for name in keyset: field1 = d1.get(name) field2 = d2.get(name) if field1 and field2 and field1 != field2: raise Exception("Inconsistent field %r: %r != %r" % (name, field1, field2)) out[name] = field1 or field2 return out def merge_schema(s1, s2): schema = Schema() schema._fields = merge_fielddict(s1._fields, s2._fields) schema._dyn_fields = merge_fielddict(s1._dyn_fields, s2._dyn_fields) return schema def merge_schemas(schemas): schema = schemas[0] for i in xrange(1, len(schemas)): schema = merge_schema(schema, schemas[i]) return schema Whoosh-2.5.7/src/whoosh/filedb/0000755000076500000240000000000012277504634016421 5ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/filedb/__init__.py0000644000076500000240000000000012254366350020514 0ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/filedb/compound.py0000644000076500000240000002552212277504454020625 0ustar mattstaff00000000000000# Copyright 2011 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
import errno import os import sys from threading import Lock from shutil import copyfileobj try: import mmap except ImportError: mmap = None from whoosh.compat import BytesIO, memoryview_ from whoosh.filedb.structfile import BufferFile, StructFile from whoosh.filedb.filestore import FileStorage, StorageError from whoosh.system import emptybytes from whoosh.util import random_name class CompoundStorage(FileStorage): readonly = True def __init__(self, dbfile, use_mmap=True, basepos=0): self._file = dbfile self.is_closed = False # Seek to the end to get total file size (to check if mmap is OK) dbfile.seek(0, os.SEEK_END) filesize = self._file.tell() dbfile.seek(basepos) self._diroffset = self._file.read_long() self._dirlength = self._file.read_int() self._file.seek(self._diroffset) self._dir = self._file.read_pickle() self._options = self._file.read_pickle() self._locks = {} self._source = None use_mmap = ( use_mmap and hasattr(self._file, "fileno") # check file is a real file and filesize < sys.maxsize # check fit on 32-bit Python ) if mmap and use_mmap: # Try to open the entire segment as a memory-mapped object try: fileno = self._file.fileno() self._source = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ) except (mmap.error, OSError): e = sys.exc_info()[1] # If we got an error because there wasn't enough memory to # open the map, ignore it and fall through, we'll just use the # (slower) "sub-file" implementation if e.errno == errno.ENOMEM: pass else: raise else: # If that worked, we can close the file handle we were given self._file.close() self._file = None def __repr__(self): return "<%s (%s)>" % (self.__class__.__name__, self._name) def close(self): if self.is_closed: raise Exception("Already closed") self.is_closed = True if self._source: try: self._source.close() except BufferError: del self._source if self._file: self._file.close() def range(self, name): try: fileinfo = self._dir[name] except KeyError: raise NameError("Unknown file %r" % (name,)) return fileinfo["offset"], fileinfo["length"] def open_file(self, name, *args, **kwargs): if self.is_closed: raise StorageError("Storage was closed") offset, length = self.range(name) if self._source: # Create a memoryview/buffer from the mmap buf = memoryview_(self._source, offset, length) f = BufferFile(buf, name=name) elif hasattr(self._file, "subset"): f = self._file.subset(offset, length, name=name) else: f = StructFile(SubFile(self._file, offset, length), name=name) return f def list(self): return list(self._dir.keys()) def file_exists(self, name): return name in self._dir def file_length(self, name): info = self._dir[name] return info["length"] def file_modified(self, name): info = self._dir[name] return info["modified"] def lock(self, name): if name not in self._locks: self._locks[name] = Lock() return self._locks[name] @staticmethod def assemble(dbfile, store, names, **options): assert names, names directory = {} basepos = dbfile.tell() dbfile.write_long(0) # Directory position dbfile.write_int(0) # Directory length # Copy the files into the compound file for name in names: if name.endswith(".toc") or name.endswith(".seg"): raise Exception(name) for name in names: offset = dbfile.tell() length = store.file_length(name) modified = store.file_modified(name) directory[name] = {"offset": offset, "length": length, "modified": modified} f = store.open_file(name) copyfileobj(f, dbfile) f.close() CompoundStorage.write_dir(dbfile, basepos, directory, options) @staticmethod def write_dir(dbfile, basepos, directory, options=None): options = 
options or {} dirpos = dbfile.tell() # Remember the start of the directory dbfile.write_pickle(directory) # Write the directory dbfile.write_pickle(options) endpos = dbfile.tell() # Remember the end of the directory dbfile.flush() dbfile.seek(basepos) # Seek back to the start dbfile.write_long(dirpos) # Directory position dbfile.write_int(endpos - dirpos) # Directory length dbfile.close() class SubFile(object): def __init__(self, parentfile, offset, length, name=None): self._file = parentfile self._offset = offset self._length = length self._end = offset + length self._pos = 0 self.name = name self.closed = False def close(self): self.closed = True def subset(self, position, length, name=None): start = self._offset + position end = start + length name = name or self.name assert self._offset >= start >= self._end assert self._offset >= end >= self._end return SubFile(self._file, self._offset + position, length, name=name) def read(self, size=None): if size is None: size = self._length - self._pos else: size = min(size, self._length - self._pos) if size < 0: size = 0 if size > 0: self._file.seek(self._offset + self._pos) self._pos += size return self._file.read(size) else: return emptybytes def readline(self): maxsize = self._length - self._pos self._file.seek(self._offset + self._pos) data = self._file.readline() if len(data) > maxsize: data = data[:maxsize] self._pos += len(data) return data def seek(self, where, whence=0): if whence == 0: # Absolute pos = where elif whence == 1: # Relative pos = self._pos + where elif whence == 2: # From end pos = self._length - where else: raise ValueError self._pos = pos def tell(self): return self._pos class CompoundWriter(object): def __init__(self, tempstorage, buffersize=32 * 1024): assert isinstance(buffersize, int) self._tempstorage = tempstorage self._tempname = "%s.ctmp" % random_name() self._temp = tempstorage.create_file(self._tempname, mode="w+b") self._buffersize = buffersize self._streams = {} def create_file(self, name): ss = self.SubStream(self._temp, self._buffersize) self._streams[name] = ss return StructFile(ss) def _readback(self): temp = self._temp for name, substream in self._streams.items(): substream.close() def gen(): for f, offset, length in substream.blocks: if f is None: f = temp f.seek(offset) yield f.read(length) yield (name, gen) temp.close() self._tempstorage.delete_file(self._tempname) def save_as_compound(self, dbfile): basepos = dbfile.tell() dbfile.write_long(0) # Directory offset dbfile.write_int(0) # Directory length directory = {} for name, blocks in self._readback(): filestart = dbfile.tell() for block in blocks(): dbfile.write(block) directory[name] = {"offset": filestart, "length": dbfile.tell() - filestart} CompoundStorage.write_dir(dbfile, basepos, directory) def save_as_files(self, storage, name_fn): for name, blocks in self._readback(): f = storage.create_file(name_fn(name)) for block in blocks(): f.write(block) f.close() class SubStream(object): def __init__(self, dbfile, buffersize): self._dbfile = dbfile self._buffersize = buffersize self._buffer = BytesIO() self.blocks = [] def tell(self): return sum(b[2] for b in self.blocks) + self._buffer.tell() def write(self, inbytes): bio = self._buffer buflen = bio.tell() length = buflen + len(inbytes) if length >= self._buffersize: offset = self._dbfile.tell() self._dbfile.write(bio.getvalue()[:buflen]) self._dbfile.write(inbytes) self.blocks.append((None, offset, length)) self._buffer.seek(0) else: bio.write(inbytes) def close(self): bio = self._buffer length = 
bio.tell() if length: self.blocks.append((bio, 0, length)) Whoosh-2.5.7/src/whoosh/filedb/filestore.py0000644000076500000240000005163212254366350020772 0ustar mattstaff00000000000000# Copyright 2009 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from __future__ import with_statement import errno, os, sys, tempfile from threading import Lock from whoosh.compat import BytesIO, memoryview_ from whoosh.filedb.structfile import BufferFile, StructFile from whoosh.index import _DEF_INDEX_NAME, EmptyIndexError from whoosh.util import random_name from whoosh.util.filelock import FileLock # Exceptions class StorageError(Exception): pass class ReadOnlyError(StorageError): pass # Base class class Storage(object): """Abstract base class for storage objects. A storage object is a virtual flat filesystem, allowing the creation and retrieval of file-like objects (:class:`~whoosh.filedb.structfile.StructFile` objects). The default implementation (:class:`FileStorage`) uses actual files in a directory. All access to files in Whoosh goes through this object. This allows more different forms of storage (for example, in RAM, in a database, in a single file) to be used transparently. For example, to create a :class:`FileStorage` object:: # Create a storage object st = FileStorage("indexdir") # Create the directory if it doesn't already exist st.create() The :meth:`Storage.create` method makes it slightly easier to swap storage implementations. The ``create()`` method handles set-up of the storage object. For example, ``FileStorage.create()`` creates the directory. A database implementation might create tables. This is designed to let you avoid putting implementation-specific setup code in your application. """ readonly = False supports_mmap = False def __iter__(self): return iter(self.list()) def __enter__(self): self.create() return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def create(self): """Creates any required implementation-specific resources. For example, a filesystem-based implementation might create a directory, while a database implementation might create tables. 
For example:: from whoosh.filedb.filestore import FileStorage # Create a storage object st = FileStorage("indexdir") # Create any necessary resources st.create() This method returns ``self`` so you can also say:: st = FileStorage("indexdir").create() Storage implementations should be written so that calling create() a second time on the same storage :return: a :class:`Storage` instance. """ return self def destroy(self, *args, **kwargs): """Removes any implementation-specific resources related to this storage object. For example, a filesystem-based implementation might delete a directory, and a database implementation might drop tables. The arguments are implementation-specific. """ pass def create_index(self, schema, indexname=_DEF_INDEX_NAME, indexclass=None): """Creates a new index in this storage. >>> from whoosh import fields >>> from whoosh.filedb.filestore import FileStorage >>> schema = fields.Schema(content=fields.TEXT) >>> # Create the storage directory >>> st = FileStorage.create("indexdir") >>> # Create an index in the storage >>> ix = st.create_index(schema) :param schema: the :class:`whoosh.fields.Schema` object to use for the new index. :param indexname: the name of the index within the storage object. You can use this option to store multiple indexes in the same storage. :param indexclass: an optional custom ``Index`` sub-class to use to create the index files. The default is :class:`whoosh.index.FileIndex`. This method will call the ``create`` class method on the given class to create the index. :return: a :class:`whoosh.index.Index` instance. """ if self.readonly: raise ReadOnlyError if indexclass is None: import whoosh.index indexclass = whoosh.index.FileIndex return indexclass.create(self, schema, indexname) def open_index(self, indexname=_DEF_INDEX_NAME, schema=None, indexclass=None): """Opens an existing index (created using :meth:`create_index`) in this storage. >>> from whoosh.filedb.filestore import FileStorage >>> st = FileStorage("indexdir") >>> # Open an index in the storage >>> ix = st.open_index() :param indexname: the name of the index within the storage object. You can use this option to store multiple indexes in the same storage. :param schema: if you pass in a :class:`whoosh.fields.Schema` object using this argument, it will override the schema that was stored with the index. :param indexclass: an optional custom ``Index`` sub-class to use to open the index files. The default is :class:`whoosh.index.FileIndex`. This method will instantiate the class with this storage object. :return: a :class:`whoosh.index.Index` instance. """ if indexclass is None: import whoosh.index indexclass = whoosh.index.FileIndex return indexclass(self, schema=schema, indexname=indexname) def index_exists(self, indexname=None): """Returns True if a non-empty index exists in this storage. :param indexname: the name of the index within the storage object. You can use this option to store multiple indexes in the same storage. :rtype: bool """ if indexname is None: indexname = _DEF_INDEX_NAME try: ix = self.open_index(indexname) gen = ix.latest_generation() ix.close() return gen > -1 except EmptyIndexError: pass return False def create_file(self, name): """Creates a file with the given name in this storage. :param name: the name for the new file. :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ raise NotImplementedError def open_file(self, name, *args, **kwargs): """Opens a file with the given name in this storage. :param name: the name for the new file. 
:return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ raise NotImplementedError def list(self): """Returns a list of file names in this storage. :return: a list of strings """ raise NotImplementedError def file_exists(self, name): """Returns True if the given file exists in this storage. :param name: the name to check. :rtype: bool """ raise NotImplementedError def file_modified(self, name): """Returns the last-modified time of the given file in this storage (as a "ctime" UNIX timestamp). :param name: the name to check. :return: a "ctime" number. """ raise NotImplementedError def file_length(self, name): """Returns the size (in bytes) of the given file in this storage. :param name: the name to check. :rtype: int """ raise NotImplementedError def delete_file(self, name): """Removes the given file from this storage. :param name: the name to delete. """ raise NotImplementedError def rename_file(self, frm, to, safe=False): """Renames a file in this storage. :param frm: The current name of the file. :param to: The new name for the file. :param safe: if True, raise an exception if a file with the new name already exists. """ raise NotImplementedError def lock(self, name): """Return a named lock object (implementing ``.acquire()`` and ``.release()`` methods). Different storage implementations may use different lock types with different guarantees. For example, the RamStorage object uses Python thread locks, while the FileStorage object uses filesystem-based locks that are valid across different processes. :param name: a name for the lock. :return: a lock-like object. """ raise NotImplementedError def close(self): """Closes any resources opened by this storage object. For some storage implementations this will be a no-op, but for others it is necessary to release locks and/or prevent leaks, so it's a good idea to call it when you're done with a storage object. """ pass def optimize(self): """Optimizes the storage object. The meaning and cost of "optimizing" will vary by implementation. For example, a database implementation might run a garbage collection procedure on the underlying database. """ pass def temp_storage(self, name=None): """Creates a new storage object for temporary files. You can call :meth:`Storage.destroy` on the new storage when you're finished with it. :param name: a name for the new storage. This may be optional or required depending on the storage implementation. :rtype: :class:`Storage` """ raise NotImplementedError class OverlayStorage(Storage): """Overlays two storage objects. Reads are processed from the first if it has the named file, otherwise the second. Writes always go to the second. 
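As a rough sketch (assuming an ``indexdir`` directory containing an
existing index), an on-disk storage can be overlaid with an in-memory
storage so that reads prefer the files already on disk while any new
files are written to RAM::

    from whoosh.filedb.filestore import (FileStorage, OverlayStorage,
                                         RamStorage)

    st = OverlayStorage(FileStorage("indexdir"), RamStorage())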
""" def __init__(self, a, b): self.a = a self.b = b def create_index(self, *args, **kwargs): self.b.create_index(*args, **kwargs) def open_index(self, *args, **kwargs): self.a.open_index(*args, **kwargs) def create_file(self, *args, **kwargs): return self.b.create_file(*args, **kwargs) def open_file(self, name, *args, **kwargs): if self.a.file_exists(name): return self.a.open_file(name, *args, **kwargs) else: return self.b.open_file(name, *args, **kwargs) def list(self): return list(set(self.a.list()) | set(self.b.list())) def file_exists(self, name): return self.a.file_exists(name) or self.b.file_exists(name) def file_modified(self, name): if self.a.file_exists(name): return self.a.file_modified(name) else: return self.b.file_modified(name) def file_length(self, name): if self.a.file_exists(name): return self.a.file_length(name) else: return self.b.file_length(name) def delete_file(self, name): return self.b.delete_file(name) def rename_file(self, *args, **kwargs): raise NotImplementedError def lock(self, name): return self.b.lock(name) def close(self): self.a.close() self.b.close() def optimize(self): self.a.optimize() self.b.optimize() def temp_storage(self, name=None): return self.b.temp_storage(name=name) class FileStorage(Storage): """Storage object that stores the index as files in a directory on disk. Prior to version 3, the initializer would raise an IOError if the directory did not exist. As of version 3, the object does not check if the directory exists at initialization. This change is to support using the :meth:`FileStorage.create` method. """ supports_mmap = True def __init__(self, path, supports_mmap=True, readonly=False, debug=False): """ :param path: a path to a directory. :param supports_mmap: if True (the default), use the ``mmap`` module to open memory mapped files. You can open the storage object with ``supports_mmap=False`` to force Whoosh to open files normally instead of with ``mmap``. :param readonly: If ``True``, the object will raise an exception if you attempt to create or rename a file. """ self.folder = path self.supports_mmap = supports_mmap self.readonly = readonly self._debug = debug self.locks = {} def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self.folder) def create(self): """Creates this storage object's directory path using ``os.makedirs`` if it doesn't already exist. >>> from whoosh.filedb.filestore import FileStorage >>> st = FileStorage("indexdir") >>> st.create() This method returns ``self``, you can say:: st = FileStorage("indexdir").create() Note that you can simply create handle the creation of the directory yourself and open the storage object using the initializer:: dirname = "indexdir" os.mkdir(dirname) st = FileStorage(dirname) However, using the ``create()`` method allows you to potentially swap in other storage implementations more easily. :return: a :class:`Storage` instance. """ dirpath = os.path.abspath(self.folder) # If the given directory does not already exist, try to create it try: os.makedirs(dirpath) except OSError: # This is necessary for compatibility between Py2 and Py3 e = sys.exc_info()[1] # If we get an error because the path already exists, ignore it if e.errno != errno.EEXIST: raise # Raise an exception if the given path is not a directory if not os.path.isdir(dirpath): e = IOError("%r is not a directory" % dirpath) e.errno = errno.ENOTDIR raise e return self def destroy(self): """Removes any files in this storage object and then removes the storage object's directory. 
What happens if any of the files or the directory are in use depends on the underlying platform. """ # Remove all files self.clean() # Try to remove the directory os.rmdir(self.folder) def create_file(self, name, excl=False, mode="wb", **kwargs): """Creates a file with the given name in this storage. :param name: the name for the new file. :param excl: if True, try to open the file in "exclusive" mode. :param mode: the mode flags with which to open the file. The default is ``"wb"``. :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ if self.readonly: raise ReadOnlyError path = self._fpath(name) if excl: flags = os.O_CREAT | os.O_EXCL | os.O_RDWR if hasattr(os, "O_BINARY"): flags |= os.O_BINARY fd = os.open(path, flags) fileobj = os.fdopen(fd, mode) else: fileobj = open(path, mode) f = StructFile(fileobj, name=name, **kwargs) return f def open_file(self, name, **kwargs): """Opens an existing file in this storage. :param name: the name of the file to open. :param kwargs: additional keyword arguments are passed through to the :class:`~whoosh.filedb.structfile.StructFile` initializer. :return: a :class:`whoosh.filedb.structfile.StructFile` instance. """ f = StructFile(open(self._fpath(name), "rb"), name=name, **kwargs) return f def _fpath(self, fname): return os.path.abspath(os.path.join(self.folder, fname)) def clean(self, ignore=False): if self.readonly: raise ReadOnlyError path = self.folder files = self.list() for fname in files: try: os.remove(os.path.join(path, fname)) except OSError: if not ignore: raise def list(self): try: files = os.listdir(self.folder) except IOError: files = [] return files def file_exists(self, name): return os.path.exists(self._fpath(name)) def file_modified(self, name): return os.path.getmtime(self._fpath(name)) def file_length(self, name): return os.path.getsize(self._fpath(name)) def delete_file(self, name): if self.readonly: raise ReadOnlyError os.remove(self._fpath(name)) def rename_file(self, oldname, newname, safe=False): if self.readonly: raise ReadOnlyError if os.path.exists(self._fpath(newname)): if safe: raise NameError("File %r exists" % newname) else: os.remove(self._fpath(newname)) os.rename(self._fpath(oldname), self._fpath(newname)) def lock(self, name): return FileLock(self._fpath(name)) def temp_storage(self, name=None): name = name or "%s.tmp" % random_name() path = os.path.join(self.folder, name) tempstore = FileStorage(path) return tempstore.create() class RamStorage(Storage): """Storage object that keeps the index in memory. 
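As a minimal sketch, an entire index can be kept in memory, which is
useful for testing or for small, transient indexes::

    from whoosh import fields
    from whoosh.filedb.filestore import RamStorage

    schema = fields.Schema(content=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)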
""" supports_mmap = False def __init__(self): self.files = {} self.locks = {} self.folder = '' def destroy(self): del self.files del self.locks def list(self): return list(self.files.keys()) def clean(self): self.files = {} def total_size(self): return sum(self.file_length(f) for f in self.list()) def file_exists(self, name): return name in self.files def file_length(self, name): if name not in self.files: raise NameError(name) return len(self.files[name]) def file_modified(self, name): return -1 def delete_file(self, name): if name not in self.files: raise NameError(name) del self.files[name] def rename_file(self, name, newname, safe=False): if name not in self.files: raise NameError(name) if safe and newname in self.files: raise NameError("File %r exists" % newname) content = self.files[name] del self.files[name] self.files[newname] = content def create_file(self, name, **kwargs): def onclose_fn(sfile): self.files[name] = sfile.file.getvalue() f = StructFile(BytesIO(), name=name, onclose=onclose_fn) return f def open_file(self, name, **kwargs): if name not in self.files: raise NameError(name) buf = memoryview_(self.files[name]) return BufferFile(buf, name=name, **kwargs) def lock(self, name): if name not in self.locks: self.locks[name] = Lock() return self.locks[name] def temp_storage(self, name=None): tdir = tempfile.gettempdir() name = name or "%s.tmp" % random_name() path = os.path.join(tdir, name) tempstore = FileStorage(path) return tempstore.create() def copy_storage(sourcestore, deststore): """Copies the files from the source storage object to the destination storage object using ``shutil.copyfileobj``. """ from shutil import copyfileobj for name in sourcestore.list(): with sourcestore.open_file(name) as source: with deststore.create_file(name) as dest: copyfileobj(source, dest) def copy_to_ram(storage): """Copies the given FileStorage object into a new RamStorage object. :rtype: :class:`RamStorage` """ ram = RamStorage() copy_storage(storage, ram) return ram Whoosh-2.5.7/src/whoosh/filedb/filetables.py0000644000076500000240000006026412254366350021111 0ustar mattstaff00000000000000# Copyright 2009 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """This module defines writer and reader classes for a fast, immutable on-disk key-value database format. The current format is based heavily on D. J. Bernstein's CDB format (http://cr.yp.to/cdb.html). """ import os, struct from binascii import crc32 from bisect import bisect_left from hashlib import md5 # @UnresolvedImport from whoosh.compat import b, bytes_type from whoosh.compat import xrange from whoosh.util.numlists import GrowableArray from whoosh.system import _INT_SIZE, emptybytes # Exceptions class FileFormatError(Exception): pass # Hash functions def cdb_hash(key): h = 5381 for c in key: h = (h + (h << 5)) & 0xffffffff ^ ord(c) return h def md5_hash(key): return int(md5(key).hexdigest(), 16) & 0xffffffff def crc_hash(key): return crc32(key) & 0xffffffff _hash_functions = (md5_hash, crc_hash, cdb_hash) # Structs # Two uints before the key/value pair giving the length of the key and value _lengths = struct.Struct("!ii") # A pointer in a hash table, giving the hash value and the key position _pointer = struct.Struct("!Iq") # A pointer in the hash table directory, giving the position and number of slots _dir_entry = struct.Struct("!qi") _directory_size = 256 * _dir_entry.size # Basic hash file class HashWriter(object): """Implements a fast on-disk key-value store. This hash uses a two-level hashing scheme, where a key is hashed, the low eight bits of the hash value are used to index into one of 256 hash tables. This is basically the CDB algorithm, but unlike CDB this object writes all data serially (it doesn't seek backwards to overwrite information at the end). Also unlike CDB, this format uses 64-bit file pointers, so the file length is essentially unlimited. However, each key and value must be less than 2 GB in length. """ def __init__(self, dbfile, magic=b("HSH3"), hashtype=0): """ :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object to write to. :param magic: the format tag bytes to write at the start of the file. :param hashtype: an integer indicating which hashing algorithm to use. Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash). """ self.dbfile = dbfile self.hashtype = hashtype self.hashfn = _hash_functions[self.hashtype] # A place for subclasses to put extra metadata self.extras = {} self.startoffset = dbfile.tell() # Write format tag dbfile.write(magic) # Write hash type dbfile.write_byte(self.hashtype) # Unused future expansion bits dbfile.write_int(0) dbfile.write_int(0) # 256 lists of hashed keys and positions self.buckets = [[] for _ in xrange(256)] # List to remember the positions of the hash tables self.directory = [] def tell(self): return self.dbfile.tell() def add(self, key, value): """Adds a key/value pair to the file. Note that keys DO NOT need to be unique. You can store multiple values under the same key and retrieve them using :meth:`HashReader.all`. """ assert isinstance(key, bytes_type) assert isinstance(value, bytes_type) dbfile = self.dbfile pos = dbfile.tell() dbfile.write(_lengths.pack(len(key), len(value))) dbfile.write(key) dbfile.write(value) # Get hash value for the key h = self.hashfn(key) # Add hash and on-disk position to appropriate bucket self.buckets[h & 255].append((h, pos)) def add_all(self, items): """Convenience method to add a sequence of ``(key, value)`` pairs. 
This is the same as calling :meth:`HashWriter.add` on each pair in the sequence. """ add = self.add for key, value in items: add(key, value) def _write_hashes(self): # Writes 256 hash tables containing pointers to the key/value pairs dbfile = self.dbfile # Represent and empty slot in the hash table using 0,0 (no key can # start at position 0 because of the header) null = (0, 0) for entries in self.buckets: # Start position of this bucket's hash table pos = dbfile.tell() # Remember the start position and the number of slots numslots = 2 * len(entries) self.directory.append((pos, numslots)) # Create the empty hash table hashtable = [null] * numslots # For each (hash value, key position) tuple in the bucket for hashval, position in entries: # Bitshift and wrap to get the slot for this entry slot = (hashval >> 8) % numslots # If the slot is taken, keep going until we find an empty slot while hashtable[slot] != null: slot = (slot + 1) % numslots # Insert the entry into the hashtable hashtable[slot] = (hashval, position) # Write the hash table for this bucket to disk for hashval, position in hashtable: dbfile.write(_pointer.pack(hashval, position)) def _write_directory(self): # Writes a directory of pointers to the 256 hash tables dbfile = self.dbfile for position, numslots in self.directory: dbfile.write(_dir_entry.pack(position, numslots)) def _write_extras(self): self.dbfile.write_pickle(self.extras) def close(self): dbfile = self.dbfile # Write hash tables self._write_hashes() # Write directory of pointers to hash tables self._write_directory() expos = dbfile.tell() # Write extra information self._write_extras() # Write length of pickle dbfile.write_int(dbfile.tell() - expos) endpos = dbfile.tell() dbfile.close() return endpos class HashReader(object): """Reader for the fast on-disk key-value files created by :class:`HashWriter`. """ def __init__(self, dbfile, length=None, magic=b("HSH3"), startoffset=0): """ :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object to read from. :param length: the length of the file data. This is necessary since the hashing information is written at the end of the file. :param magic: the format tag bytes to look for at the start of the file. If the file's format tag does not match these bytes, the object raises a :class:`FileFormatError` exception. :param startoffset: the starting point of the file data. 
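As a rough sketch (assuming a ``FileStorage`` object ``st`` and that the
file name ``"test.hsh"`` is free), a hash file written with
:class:`HashWriter` can be read back like a read-only mapping of bytes
keys to bytes values::

    hw = HashWriter(st.create_file("test.hsh"))
    hw.add(b"alfa", b"1")
    hw.add(b"bravo", b"2")
    hw.close()

    hr = HashReader.open(st, "test.hsh")
    assert hr[b"alfa"] == b"1"
    hr.close()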
""" self.dbfile = dbfile self.startoffset = startoffset self.is_closed = False if length is None: dbfile.seek(0, os.SEEK_END) length = dbfile.tell() - startoffset dbfile.seek(startoffset) # Check format tag filemagic = dbfile.read(4) if filemagic != magic: raise FileFormatError("Unknown file header %r" % filemagic) # Read hash type self.hashtype = dbfile.read_byte() self.hashfn = _hash_functions[self.hashtype] # Skip unused future expansion bits dbfile.read_int() dbfile.read_int() self.startofdata = dbfile.tell() exptr = startoffset + length - _INT_SIZE # Get the length of extras from the end of the file exlen = dbfile.get_int(exptr) # Read the extras expos = exptr - exlen dbfile.seek(expos) self._read_extras() # Calculate the directory base from the beginning of the extras dbfile.seek(expos - _directory_size) # Read directory of hash tables self.tables = [] entrysize = _dir_entry.size unpackentry = _dir_entry.unpack for _ in xrange(256): # position, numslots self.tables.append(unpackentry(dbfile.read(entrysize))) # The position of the first hash table is the end of the key/value pairs self.endofdata = self.tables[0][0] @classmethod def open(cls, storage, name): """Convenience method to open a hash file given a :class:`whoosh.filedb.filestore.Storage` object and a name. This takes care of opening the file and passing its length to the initializer. """ length = storage.file_length(name) dbfile = storage.open_file(name) return cls(dbfile, length) def file(self): return self.dbfile def _read_extras(self): try: self.extras = self.dbfile.read_pickle() except EOFError: self.extras = {} def close(self): if self.is_closed: raise Exception("Tried to close %r twice" % self) self.dbfile.close() self.is_closed = True def _key_at(self, pos): # Returns the key bytes at the given position dbfile = self.dbfile keylen = dbfile.get_uint(pos) return dbfile.get(pos + _lengths.size, keylen) def _ranges(self, pos=None, eod=None): # Yields a series of (keypos, keylength, datapos, datalength) tuples # for the key/value pairs in the file dbfile = self.dbfile pos = pos or self.startofdata eod = eod or self.endofdata lenssize = _lengths.size unpacklens = _lengths.unpack while pos < eod: keylen, datalen = unpacklens(dbfile.get(pos, lenssize)) keypos = pos + lenssize datapos = keypos + keylen yield (keypos, keylen, datapos, datalen) pos = datapos + datalen def __getitem__(self, key): for value in self.all(key): return value raise KeyError(key) def __iter__(self): dbfile = self.dbfile for keypos, keylen, datapos, datalen in self._ranges(): key = dbfile.get(keypos, keylen) value = dbfile.get(datapos, datalen) yield (key, value) def __contains__(self, key): for _ in self.ranges_for_key(key): return True return False def keys(self): dbfile = self.dbfile for keypos, keylen, _, _ in self._ranges(): yield dbfile.get(keypos, keylen) def values(self): dbfile = self.dbfile for _, _, datapos, datalen in self._ranges(): yield dbfile.get(datapos, datalen) def items(self): dbfile = self.dbfile for keypos, keylen, datapos, datalen in self._ranges(): yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen)) def get(self, key, default=None): for value in self.all(key): return value return default def all(self, key): """Yields a sequence of values associated with the given key. """ dbfile = self.dbfile for datapos, datalen in self.ranges_for_key(key): yield dbfile.get(datapos, datalen) def ranges_for_key(self, key): """Yields a sequence of ``(datapos, datalength)`` tuples associated with the given key. 
""" if not isinstance(key, bytes_type): raise TypeError("Key %r should be bytes" % key) dbfile = self.dbfile # Hash the key keyhash = self.hashfn(key) # Get the position and number of slots for the hash table in which the # key may be found tablestart, numslots = self.tables[keyhash & 255] # If the hash table is empty, we know the key doesn't exists if not numslots: return ptrsize = _pointer.size unpackptr = _pointer.unpack lenssize = _lengths.size unpacklens = _lengths.unpack # Calculate where the key's slot should be slotpos = tablestart + (((keyhash >> 8) % numslots) * ptrsize) # Read slots looking for our key's hash value for _ in xrange(numslots): slothash, itempos = unpackptr(dbfile.get(slotpos, ptrsize)) # If this slot is empty, we're done if not itempos: return # If the key hash in this slot matches our key's hash, we might have # a match, so read the actual key and see if it's our key if slothash == keyhash: # Read the key and value lengths keylen, datalen = unpacklens(dbfile.get(itempos, lenssize)) # Only bother reading the actual key if the lengths match if keylen == len(key): keystart = itempos + lenssize if key == dbfile.get(keystart, keylen): # The keys match, so yield (datapos, datalen) yield (keystart + keylen, datalen) slotpos += ptrsize # If we reach the end of the hashtable, wrap around if slotpos == tablestart + (numslots * ptrsize): slotpos = tablestart def range_for_key(self, key): for item in self.ranges_for_key(key): return item raise KeyError(key) # Ordered hash file class OrderedHashWriter(HashWriter): """Implements an on-disk hash, but requires that keys be added in order. An :class:`OrderedHashReader` can then look up "nearest keys" based on the ordering. """ def __init__(self, dbfile): HashWriter.__init__(self, dbfile) # Keep an array of the positions of all keys self.index = GrowableArray("H") # Keep track of the last key added self.lastkey = emptybytes def add(self, key, value): if key <= self.lastkey: raise ValueError("Keys must increase: %r..%r" % (self.lastkey, key)) self.index.append(self.dbfile.tell()) HashWriter.add(self, key, value) self.lastkey = key def _write_extras(self): dbfile = self.dbfile index = self.index # Store metadata about the index array self.extras["indextype"] = index.typecode self.extras["indexlen"] = len(index) # Write the extras HashWriter._write_extras(self) # Write the index array index.to_file(dbfile) class OrderedBase(HashReader): def closest_key(self, key): """Returns the closest key equal to or greater than the given key. If there is no key in the file equal to or greater than the given key, returns None. """ pos = self._closest_key_pos(key) if pos is None: return None return self._key_at(pos) def ranges_from(self, key): """Yields a series of ``(keypos, keylen, datapos, datalen)`` tuples for the ordered series of keys equal or greater than the given key. """ pos = self._closest_key_pos(key) if pos is None: return for item in self._ranges(pos=pos): yield item def keys_from(self, key): """Yields an ordered series of keys equal to or greater than the given key. """ dbfile = self.dbfile for keypos, keylen, _, _ in self.ranges_from(key): yield dbfile.get(keypos, keylen) def items_from(self, key): """Yields an ordered series of ``(key, value)`` tuples for keys equal to or greater than the given key. 
""" dbfile = self.dbfile for keypos, keylen, datapos, datalen in self.ranges_from(key): yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen)) class OrderedHashReader(OrderedBase): def _read_extras(self): dbfile = self.dbfile # Read the extras HashReader._read_extras(self) # Set up for reading the index array indextype = self.extras["indextype"] self.indexbase = dbfile.tell() self.indexlen = self.extras["indexlen"] self.indexsize = struct.calcsize(indextype) # Set up the function to read values from the index array if indextype == "B": self._get_pos = dbfile.get_byte elif indextype == "H": self._get_pos = dbfile.get_ushort elif indextype == "i": self._get_pos = dbfile.get_int elif indextype == "I": self._get_pos = dbfile.get_uint elif indextype == "q": self._get_pos = dbfile.get_long else: raise Exception("Unknown index type %r" % indextype) def _closest_key_pos(self, key): # Given a key, return the position of that key OR the next highest key # if the given key does not exist if not isinstance(key, bytes_type): raise TypeError("Key %r should be bytes" % key) indexbase = self.indexbase indexsize = self.indexsize _key_at = self._key_at _get_pos = self._get_pos # Do a binary search of the positions in the index array lo = 0 hi = self.indexlen while lo < hi: mid = (lo + hi) // 2 midkey = _key_at(_get_pos(indexbase + mid * indexsize)) if midkey < key: lo = mid + 1 else: hi = mid # If we went off the end, return None if lo == self.indexlen: return None # Return the closest key return _get_pos(indexbase + lo * indexsize) # Fielded Ordered hash file class FieldedOrderedHashWriter(HashWriter): """Implements an on-disk hash, but writes separate position indexes for each field. """ def __init__(self, dbfile): HashWriter.__init__(self, dbfile) # Map field names to (startpos, indexpos, length, typecode) self.fieldmap = self.extras["fieldmap"] = {} # Keep track of the last key added self.lastkey = emptybytes def start_field(self, fieldname): self.fieldstart = self.dbfile.tell() self.fieldname = fieldname # Keep an array of the positions of all keys self.poses = GrowableArray("H") self.lastkey = emptybytes def add(self, key, value): if key <= self.lastkey: raise ValueError("Keys must increase: %r..%r" % (self.lastkey, key)) self.poses.append(self.dbfile.tell() - self.fieldstart) HashWriter.add(self, key, value) self.lastkey = key def end_field(self): dbfile = self.dbfile fieldname = self.fieldname poses = self.poses self.fieldmap[fieldname] = (self.fieldstart, dbfile.tell(), len(poses), poses.typecode) poses.to_file(dbfile) class FieldedOrderedHashReader(HashReader): def __init__(self, *args, **kwargs): HashReader.__init__(self, *args, **kwargs) self.fieldmap = self.extras["fieldmap"] # Make a sorted list of the field names with their start and end ranges self.fieldlist = [] for fieldname in sorted(self.fieldmap.keys()): startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] self.fieldlist.append((fieldname, startpos, ixpos)) def fielded_ranges(self, pos=None, eod=None): flist = self.fieldlist fpos = 0 fieldname, start, end = flist[fpos] for keypos, keylen, datapos, datalen in self._ranges(pos, eod): if keypos >= end: fpos += 1 fieldname, start, end = flist[fpos] yield fieldname, keypos, keylen, datapos, datalen def iter_terms(self): get = self.dbfile.get for fieldname, keypos, keylen, _, _ in self.fielded_ranges(): yield fieldname, get(keypos, keylen) def iter_term_items(self): get = self.dbfile.get for item in self.fielded_ranges(): fieldname, keypos, keylen, datapos, datalen = item 
yield fieldname, get(keypos, keylen), get(datapos, datalen) def contains_term(self, fieldname, btext): try: x = self.range_for_term(fieldname, btext) return True except KeyError: return False def range_for_term(self, fieldname, btext): start, ixpos, ixsize, code = self.fieldmap[fieldname] for datapos, datalen in self.ranges_for_key(btext): if start < datapos < ixpos: return datapos, datalen raise KeyError((fieldname, btext)) def term_data(self, fieldname, btext): datapos, datalen = self.range_for_term(fieldname, btext) return self.dbfile.get(datapos, datalen) def term_get(self, fieldname, btext, default=None): try: return self.term_data(fieldname, btext) except KeyError: return default def _closest_term_pos(self, fieldname, key): # Given a key, return the position of that key OR the next highest key # if the given key does not exist if not isinstance(key, bytes_type): raise TypeError("Key %r should be bytes" % key) dbfile = self.dbfile _key_at = self._key_at startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] if ixtype == "B": get_pos = dbfile.get_byte elif ixtype == "H": get_pos = dbfile.get_ushort elif ixtype == "i": get_pos = dbfile.get_int elif ixtype == "I": get_pos = dbfile.get_uint elif ixtype == "q": get_pos = dbfile.get_long else: raise Exception("Unknown index type %r" % ixtype) # Do a binary search of the positions in the index array lo = 0 hi = ixsize while lo < hi: mid = (lo + hi) // 2 midkey = _key_at(startpos + get_pos(ixpos + mid * ixsize)) if midkey < key: lo = mid + 1 else: hi = mid # If we went off the end, return None if lo == ixsize: return None # Return the closest key return startpos + get_pos(ixpos + lo * ixsize) def closest_term(self, fieldname, btext): pos = self._closest_term_pos(fieldname, btext) if pos is None: return None return self._key_at(pos) def term_ranges_from(self, fieldname, btext): pos = self._closest_term_pos(fieldname, btext) if pos is None: return startpos, ixpos, ixsize, ixtype = self.fieldmap[fieldname] for item in self._ranges(pos, ixpos): yield item def terms_from(self, fieldname, btext): dbfile = self.dbfile for keypos, keylen, _, _ in self.term_ranges_from(fieldname, btext): yield dbfile.get(keypos, keylen) def term_items_from(self, fieldname, btext): dbfile = self.dbfile for item in self.term_ranges_from(fieldname, btext): keypos, keylen, datapos, datalen = item yield (dbfile.get(keypos, keylen), dbfile.get(datapos, datalen)) Whoosh-2.5.7/src/whoosh/filedb/gae.py0000644000076500000240000001141012254366350017520 0ustar mattstaff00000000000000""" This module contains EXPERIMENTAL support for storing a Whoosh index's files in the Google App Engine blobstore. This will use a lot of RAM since all files are loaded into RAM, but it potentially useful as a workaround for the lack of file storage in Google App Engine. Use at your own risk, but please report any problems to me so I can fix them. 
To create a new index:: from whoosh.filedb.gae import DatastoreStorage ix = DatastoreStorage().create_index(schema) To open an existing index:: ix = DatastoreStorage().open_index() """ import time from google.appengine.api import memcache # @UnresolvedImport from google.appengine.ext import db # @UnresolvedImport from whoosh.compat import BytesIO from whoosh.index import TOC, FileIndex, _DEF_INDEX_NAME from whoosh.filedb.filestore import ReadOnlyError, Storage from whoosh.filedb.structfile import StructFile class DatastoreFile(db.Model): """A file-like object that is backed by a BytesIO() object whose contents is loaded from a BlobProperty in the app engine datastore. """ value = db.BlobProperty() mtime = db.IntegerProperty(default=0) def __init__(self, *args, **kwargs): super(DatastoreFile, self).__init__(*args, **kwargs) self.data = BytesIO() @classmethod def loadfile(cls, name): value = memcache.get(name, namespace="DatastoreFile") if value is None: file = cls.get_by_key_name(name) memcache.set(name, file.value, namespace="DatastoreFile") else: file = cls(value=value) file.data = BytesIO(file.value) return file def close(self): oldvalue = self.value self.value = self.getvalue() if oldvalue != self.value: self.mtime = int(time.time()) self.put() memcache.set(self.key().id_or_name(), self.value, namespace="DatastoreFile") def tell(self): return self.data.tell() def write(self, data): return self.data.write(data) def read(self, length): return self.data.read(length) def seek(self, *args): return self.data.seek(*args) def readline(self): return self.data.readline() def getvalue(self): return self.data.getvalue() class MemcacheLock(object): def __init__(self, name): self.name = name def acquire(self, blocking=False): val = memcache.add(self.name, "L", 360, namespace="whooshlocks") if blocking and not val: # Simulate blocking by retrying the acquire over and over import time while not val: time.sleep(0.1) val = memcache.add(self.name, "", 360, namespace="whooshlocks") return val def release(self): memcache.delete(self.name, namespace="whooshlocks") class DatastoreStorage(Storage): """An implementation of :class:`whoosh.store.Storage` that stores files in the app engine datastore as blob properties. 
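    A rough usage sketch (hedged; ``myquery`` is assumed to be a
    :class:`whoosh.query.Query` you have already built)::

        ix = DatastoreStorage().open_index()
        with ix.searcher() as s:
            results = s.search(myquery)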
""" def create_index(self, schema, indexname=_DEF_INDEX_NAME): if self.readonly: raise ReadOnlyError TOC.create(self, schema, indexname) return FileIndex(self, schema, indexname) def open_index(self, indexname=_DEF_INDEX_NAME, schema=None): return FileIndex(self, schema=schema, indexname=indexname) def list(self): query = DatastoreFile.all() keys = [] for file in query: keys.append(file.key().id_or_name()) return keys def clean(self): pass def total_size(self): return sum(self.file_length(f) for f in self.list()) def file_exists(self, name): return DatastoreFile.get_by_key_name(name) is not None def file_modified(self, name): return DatastoreFile.get_by_key_name(name).mtime def file_length(self, name): return len(DatastoreFile.get_by_key_name(name).value) def delete_file(self, name): memcache.delete(name, namespace="DatastoreFile") return DatastoreFile.get_by_key_name(name).delete() def rename_file(self, name, newname, safe=False): file = DatastoreFile.get_by_key_name(name) newfile = DatastoreFile(key_name=newname) newfile.value = file.value newfile.mtime = file.mtime newfile.put() file.delete() def create_file(self, name, **kwargs): f = StructFile(DatastoreFile(key_name=name), name=name, onclose=lambda sfile: sfile.file.close()) return f def open_file(self, name, *args, **kwargs): return StructFile(DatastoreFile.loadfile(name)) def lock(self, name): return MemcacheLock(name) def temp_storage(self, name=None): tempstore = DatastoreStorage() return tempstore.create() Whoosh-2.5.7/src/whoosh/filedb/structfile.py0000644000076500000240000003024512254366350021157 0ustar mattstaff00000000000000# Copyright 2009 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
from array import array from copy import copy from struct import calcsize from whoosh.compat import BytesIO, bytes_type from whoosh.compat import dump as dump_pickle from whoosh.compat import load as load_pickle from whoosh.compat import array_frombytes, array_tobytes from whoosh.system import _INT_SIZE, _SHORT_SIZE, _FLOAT_SIZE, _LONG_SIZE from whoosh.system import IS_LITTLE from whoosh.system import pack_byte, unpack_byte, pack_sbyte, unpack_sbyte from whoosh.system import pack_ushort, unpack_ushort from whoosh.system import pack_ushort_le, unpack_ushort_le from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint from whoosh.system import pack_uint_le, unpack_uint_le from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong from whoosh.system import pack_float, unpack_float from whoosh.util.varints import varint, read_varint from whoosh.util.varints import signed_varint, decode_signed_varint _SIZEMAP = dict((typecode, calcsize(typecode)) for typecode in "bBiIhHqQf") _ORDERMAP = {"little": "<", "big": ">"} _types = (("sbyte", "b"), ("ushort", "H"), ("int", "i"), ("long", "q"), ("float", "f")) # Main function class StructFile(object): """Returns a "structured file" object that wraps the given file object and provides numerous additional methods for writing structured data, such as "write_varint" and "write_long". """ def __init__(self, fileobj, name=None, onclose=None): self.file = fileobj self._name = name self.onclose = onclose self.is_closed = False self.is_real = hasattr(fileobj, "fileno") if self.is_real: self.fileno = fileobj.fileno def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self._name) def __str__(self): return self._name def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def __iter__(self): return iter(self.file) def raw_file(self): return self.file def read(self, *args, **kwargs): return self.file.read(*args, **kwargs) def readline(self, *args, **kwargs): return self.file.readline(*args, **kwargs) def write(self, *args, **kwargs): return self.file.write(*args, **kwargs) def tell(self, *args, **kwargs): return self.file.tell(*args, **kwargs) def seek(self, *args, **kwargs): return self.file.seek(*args, **kwargs) def truncate(self, *args, **kwargs): return self.file.truncate(*args, **kwargs) def flush(self): """Flushes the buffer of the wrapped file. This is a no-op if the wrapped file does not have a flush method. """ if hasattr(self.file, "flush"): self.file.flush() def close(self): """Closes the wrapped file. """ if self.is_closed: raise Exception("This file is already closed") if self.onclose: self.onclose(self) if hasattr(self.file, "close"): self.file.close() self.is_closed = True def subset(self, offset, length, name=None): from whoosh.filedb.compound import SubFile name = name or self._name return StructFile(SubFile(self.file, offset, length), name=name) def write_string(self, s): """Writes a string to the wrapped file. This method writes the length of the string first, so you can read the string back without having to know how long it was. """ self.write_varint(len(s)) self.write(s) def write_string2(self, s): self.write(pack_ushort(len(s)) + s) def write_string4(self, s): self.write(pack_int(len(s)) + s) def read_string(self): """Reads a string from the wrapped file. 
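        The length prefix written by :meth:`write_string` is read back first,
        so a round trip is straightforward (a sketch using an in-memory
        buffer)::

            sf = StructFile(BytesIO())
            sf.write_string(b"hello")
            sf.seek(0)
            assert sf.read_string() == b"hello"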
""" return self.read(self.read_varint()) def read_string2(self): l = self.read_ushort() return self.read(l) def read_string4(self): l = self.read_int() return self.read(l) def get_string2(self, pos): l = self.get_ushort(pos) base = pos + _SHORT_SIZE return self.get(base, l), base + l def get_string4(self, pos): l = self.get_int(pos) base = pos + _INT_SIZE return self.get(base, l), base + l def skip_string(self): l = self.read_varint() self.seek(l, 1) def write_varint(self, i): """Writes a variable-length unsigned integer to the wrapped file. """ self.write(varint(i)) def write_svarint(self, i): """Writes a variable-length signed integer to the wrapped file. """ self.write(signed_varint(i)) def read_varint(self): """Reads a variable-length encoded unsigned integer from the wrapped file. """ return read_varint(self.read) def read_svarint(self): """Reads a variable-length encoded signed integer from the wrapped file. """ return decode_signed_varint(read_varint(self.read)) def write_tagint(self, i): """Writes a sometimes-compressed unsigned integer to the wrapped file. This is similar to the varint methods but uses a less compressed but faster format. """ # Store numbers 0-253 in one byte. Byte 254 means "an unsigned 16-bit # int follows." Byte 255 means "An unsigned 32-bit int follows." if i <= 253: self.write(chr(i)) elif i <= 65535: self.write("\xFE" + pack_ushort(i)) else: self.write("\xFF" + pack_uint(i)) def read_tagint(self): """Reads a sometimes-compressed unsigned integer from the wrapped file. This is similar to the varint methods but uses a less compressed but faster format. """ tb = ord(self.read(1)) if tb == 254: return self.read_ushort() elif tb == 255: return self.read_uint() else: return tb def write_byte(self, n): """Writes a single byte to the wrapped file, shortcut for ``file.write(chr(n))``. """ self.write(pack_byte(n)) def read_byte(self): return ord(self.read(1)) def write_pickle(self, obj, protocol=-1): """Writes a pickled representation of obj to the wrapped file. """ dump_pickle(obj, self.file, protocol) def read_pickle(self): """Reads a pickled object from the wrapped file. 
""" return load_pickle(self.file) def write_sbyte(self, n): self.write(pack_sbyte(n)) def write_int(self, n): self.write(pack_int(n)) def write_uint(self, n): self.write(pack_uint(n)) def write_uint_le(self, n): self.write(pack_uint_le(n)) def write_ushort(self, n): self.write(pack_ushort(n)) def write_ushort_le(self, n): self.write(pack_ushort_le(n)) def write_long(self, n): self.write(pack_long(n)) def write_ulong(self, n): self.write(pack_ulong(n)) def write_float(self, n): self.write(pack_float(n)) def write_array(self, arry): if IS_LITTLE: arry = copy(arry) arry.byteswap() if self.is_real: arry.tofile(self.file) else: self.write(array_tobytes(arry)) def read_sbyte(self): return unpack_sbyte(self.read(1))[0] def read_int(self): return unpack_int(self.read(_INT_SIZE))[0] def read_uint(self): return unpack_uint(self.read(_INT_SIZE))[0] def read_uint_le(self): return unpack_uint_le(self.read(_INT_SIZE))[0] def read_ushort(self): return unpack_ushort(self.read(_SHORT_SIZE))[0] def read_ushort_le(self): return unpack_ushort_le(self.read(_SHORT_SIZE))[0] def read_long(self): return unpack_long(self.read(_LONG_SIZE))[0] def read_ulong(self): return unpack_ulong(self.read(_LONG_SIZE))[0] def read_float(self): return unpack_float(self.read(_FLOAT_SIZE))[0] def read_array(self, typecode, length): a = array(typecode) if self.is_real: a.fromfile(self.file, length) else: array_frombytes(a, self.read(length * _SIZEMAP[typecode])) if IS_LITTLE: a.byteswap() return a def get(self, position, length): self.seek(position) return self.read(length) def get_byte(self, position): return unpack_byte(self.get(position, 1))[0] def get_sbyte(self, position): return unpack_sbyte(self.get(position, 1))[0] def get_int(self, position): return unpack_int(self.get(position, _INT_SIZE))[0] def get_uint(self, position): return unpack_uint(self.get(position, _INT_SIZE))[0] def get_ushort(self, position): return unpack_ushort(self.get(position, _SHORT_SIZE))[0] def get_long(self, position): return unpack_long(self.get(position, _LONG_SIZE))[0] def get_ulong(self, position): return unpack_ulong(self.get(position, _LONG_SIZE))[0] def get_float(self, position): return unpack_float(self.get(position, _FLOAT_SIZE))[0] def get_array(self, position, typecode, length): self.seek(position) return self.read_array(typecode, length) class BufferFile(StructFile): def __init__(self, buf, name=None, onclose=None): self._buf = buf self._name = name self.file = BytesIO(buf) self.onclose = onclose self.is_real = False self.is_closed = False def subset(self, position, length, name=None): name = name or self._name return BufferFile(self.get(position, length), name=name) def get(self, position, length): return bytes_type(self._buf[position:position + length]) def get_array(self, position, typecode, length): a = array(typecode) array_frombytes(a, self.get(position, length * _SIZEMAP[typecode])) if IS_LITTLE: a.byteswap() return a class ChecksumFile(StructFile): def __init__(self, *args, **kwargs): StructFile.__init__(self, *args, **kwargs) self._check = 0 self._crc32 = __import__("zlib").crc32 def __iter__(self): for line in self.file: self._check = self._crc32(line, self._check) yield line def seek(self, *args): raise Exception("Cannot seek on a ChecksumFile") def read(self, *args, **kwargs): b = self.file.read(*args, **kwargs) self._check = self._crc32(b, self._check) return b def write(self, b): self._check = self._crc32(b, self._check) self.file.write(b) def checksum(self): return self._check & 0xffffffff 
Whoosh-2.5.7/src/whoosh/formats.py0000644000076500000240000004050612254366350017222 0ustar mattstaff00000000000000# Copyright 2009 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ The classes in this module encode and decode posting information for a field. The field format essentially determines what information is stored about each occurance of a term. """ from collections import defaultdict from whoosh.analysis import unstopped, entoken from whoosh.compat import iteritems, dumps, loads, b from whoosh.system import emptybytes from whoosh.system import _INT_SIZE, _FLOAT_SIZE from whoosh.system import pack_uint, unpack_uint, pack_float, unpack_float # Format base class class Format(object): """Abstract base class representing a storage format for a field or vector. Format objects are responsible for writing and reading the low-level representation of a field. It controls what kind/level of information to store about the indexed fields. """ posting_size = -1 textual = True __inittypes__ = dict(field_boost=float) def __init__(self, field_boost=1.0, **options): """ :param field_boost: A constant boost factor to scale to the score of all queries matching terms in this field. """ self.field_boost = field_boost self.options = options def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.__dict__ == other.__dict__) def __repr__(self): return "%s(boost=%s)" % (self.__class__.__name__, self.field_boost) def fixed_value_size(self): if self.posting_size < 0: return None return self.posting_size def word_values(self, value, analyzer, **kwargs): """Takes the text value to be indexed and yields a series of ("tokentext", frequency, weight, valuestring) tuples, where frequency is the number of times "tokentext" appeared in the value, weight is the weight (a float usually equal to frequency in the absence of per-term boosts) and valuestring is encoded field-specific posting value for the token. 
For example, in a Frequency format, the value string would be the same as frequency; in a Positions format, the value string would encode a list of token positions at which "tokentext" occured. :param value: The unicode text to index. :param analyzer: The analyzer to use to process the text. """ raise NotImplementedError def supports(self, name): """Returns True if this format supports interpreting its posting value as 'name' (e.g. "frequency" or "positions"). """ return hasattr(self, "decode_" + name) def decoder(self, name): """Returns the bound method for interpreting value as 'name', where 'name' is for example "frequency" or "positions". This object must have a corresponding Format.decode_() method. """ return getattr(self, "decode_" + name) def decode_as(self, astype, valuestring): """Interprets the encoded value string as 'astype', where 'astype' is for example "frequency" or "positions". This object must have a corresponding decode_() method. """ return self.decoder(astype)(valuestring) # Concrete field classes # TODO: as a legacy thing most of these formats store the frequency but not the # weight in the value string, so if you use field or term boosts # postreader.value_as("weight") will not match postreader.weight() def tokens(value, analyzer, kwargs): if isinstance(value, (tuple, list)): gen = entoken(value, **kwargs) else: gen = analyzer(value, **kwargs) return unstopped(gen) class Existence(Format): """Only indexes whether a given term occurred in a given document; it does not store frequencies or positions. This is useful for fields that should be searchable but not scorable, such as file path. Supports: frequency, weight (always reports frequency = 1). """ posting_size = 0 __inittypes__ = dict(field_boost=float) def __init__(self, field_boost=1.0, **options): self.field_boost = field_boost self.options = options def word_values(self, value, analyzer, **kwargs): fb = self.field_boost wordset = set(t.text for t in tokens(value, analyzer, kwargs)) return ((w, 1, fb, emptybytes) for w in wordset) def encode(self, value): return emptybytes def decode_frequency(self, valuestring): return 1 def decode_weight(self, valuestring): return self.field_boost def combine(self, vs): return emptybytes class Frequency(Format): """Stores frequency information for each posting. Supports: frequency, weight. """ posting_size = _INT_SIZE __inittypes__ = dict(field_boost=float, boost_as_freq=bool) def __init__(self, field_boost=1.0, boost_as_freq=False, **options): """ :param field_boost: A constant boost factor to scale to the score of all queries matching terms in this field. """ assert isinstance(field_boost, float) self.field_boost = field_boost self.options = options def word_values(self, value, analyzer, **kwargs): fb = self.field_boost length = 0 freqs = defaultdict(int) weights = defaultdict(float) kwargs["boosts"] = True for t in tokens(value, analyzer, kwargs): length += 1 freqs[t.text] += 1 weights[t.text] += t.boost wvs = ((w, freq, weights[w] * fb, pack_uint(freq)) for w, freq in iteritems(freqs)) return wvs def decode_frequency(self, valuestring): return unpack_uint(valuestring)[0] def decode_weight(self, valuestring): freq = unpack_uint(valuestring)[0] return freq * self.field_boost def combine(self, vs): return pack_uint(sum(self.decode_value(v) for v in vs)) class Positions(Format): """Stores position information in each posting, to allow phrase searching and "near" queries. Supports: frequency, weight, positions, position_boosts (always reports position boost = 1.0). 
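    The encoded value is a packed position count followed by a pickled list
    of position deltas, so decoding reverses encoding (a sketch)::

        fmt = Positions()
        v = fmt.encode([1, 5, 7])
        assert fmt.decode_positions(v) == [1, 5, 7]
        assert fmt.decode_frequency(v) == 3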
""" def word_values(self, value, analyzer, **kwargs): fb = self.field_boost poses = defaultdict(list) weights = defaultdict(float) kwargs["positions"] = True kwargs["boosts"] = True for t in tokens(value, analyzer, kwargs): poses[t.text].append(t.pos) weights[t.text] += t.boost for w, poslist in iteritems(poses): value = self.encode(poslist) yield (w, len(poslist), weights[w] * fb, value) def encode(self, poslist): deltas = [] base = 0 for pos in poslist: deltas.append(pos - base) base = pos return pack_uint(len(deltas)) + dumps(deltas, -1) def decode_positions(self, valuestring): if not valuestring.endswith(b(".")): valuestring += b(".") codes = loads(valuestring[_INT_SIZE:]) position = 0 positions = [] for code in codes: position += code positions.append(position) return positions def decode_frequency(self, valuestring): return unpack_uint(valuestring[:_INT_SIZE])[0] def decode_weight(self, valuestring): return self.decode_frequency(valuestring) * self.field_boost def decode_position_boosts(self, valuestring): return [(pos, 1) for pos in self.decode_positions(valuestring)] def combine(self, vs): s = set() for v in vs: s.update(self.decode_positions(v)) return self.encode(sorted(s)) class Characters(Positions): """Stores token position and character start and end information for each posting. Supports: frequency, weight, positions, position_boosts (always reports position boost = 1.0), characters. """ def word_values(self, value, analyzer, **kwargs): fb = self.field_boost seen = defaultdict(list) weights = defaultdict(float) kwargs["positions"] = True kwargs["chars"] = True kwargs["boosts"] = True for t in tokens(value, analyzer, kwargs): seen[t.text].append((t.pos, t.startchar, t.endchar)) weights[t.text] += t.boost for w, poslist in iteritems(seen): value = self.encode(poslist) yield (w, len(poslist), weights[w] * fb, value) def encode(self, poslist): deltas = [] posbase = 0 charbase = 0 for pos, startchar, endchar in poslist: deltas.append((pos - posbase, startchar - charbase, endchar - startchar)) posbase = pos charbase = endchar return pack_uint(len(deltas)) + dumps(deltas, -1) def decode_characters(self, valuestring): if not valuestring.endswith(b(".")): valuestring += b(".") codes = loads(valuestring[_INT_SIZE:]) position = 0 endchar = 0 posns_chars = [] for code in codes: position = code[0] + position startchar = code[1] + endchar endchar = code[2] + startchar posns_chars.append((position, startchar, endchar)) return posns_chars def decode_positions(self, valuestring): if not valuestring.endswith(b(".")): valuestring += b(".") codes = loads(valuestring[_INT_SIZE:]) position = 0 posns = [] for code in codes: position = code[0] + position posns.append(position) return posns def combine(self, vs): s = {} for v in vs: for pos, sc, ec in self.decode_characters(v): if pos in s: old_sc, old_ec = pos[s] s[pos] = (min(sc, old_sc), max(ec, old_ec)) else: s[pos] = (sc, ec) poses = [(pos, s[pos][0], s[pos][1]) for pos in sorted(s.keys())] return self.encode(poses) class PositionBoosts(Positions): """A format that stores positions and per-position boost information in each posting. Supports: frequency, weight, positions, position_boosts. 
""" def word_values(self, value, analyzer, **kwargs): fb = self.field_boost seen = defaultdict(list) kwargs["positions"] = True kwargs["boosts"] = True for t in tokens(value, analyzer, kwargs): pos = t.pos boost = t.boost seen[t.text].append((pos, boost)) for w, poses in iteritems(seen): value = self.encode(poses) yield (w, len(poses), sum(p[1] for p in poses) * fb, value) def encode(self, poses): codes = [] base = 0 summedboost = 0 for pos, boost in poses: summedboost += boost codes.append((pos - base, boost)) base = pos return (pack_uint(len(poses)) + pack_float(summedboost) + dumps(codes, -1)) def decode_position_boosts(self, valuestring): if not valuestring.endswith(b(".")): valuestring += b(".") codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:]) position = 0 posns_boosts = [] for code in codes: position = code[0] + position posns_boosts.append((position, code[1])) return posns_boosts def decode_positions(self, valuestring): if not valuestring.endswith(b(".")): valuestring += b(".") codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:]) position = 0 posns = [] for code in codes: position = code[0] + position posns.append(position) return posns def decode_weight(self, v): summedboost = unpack_float(v[_INT_SIZE:_INT_SIZE + _FLOAT_SIZE])[0] return summedboost * self.field_boost def combine(self, vs): s = defaultdict(float) for v in vs: for pos, boost in self.decode_position_boosts(v): s[pos] += boost return self.encode(sorted(s.items())) class CharacterBoosts(Characters): """A format that stores positions, character start and end, and per-position boost information in each posting. Supports: frequency, weight, positions, position_boosts, characters, character_boosts. """ def word_values(self, value, analyzer, **kwargs): seen = defaultdict(list) kwargs["positions"] = True kwargs["chars"] = True kwargs["boosts"] = True for t in tokens(value, analyzer, kwargs): seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost)) for w, poses in iteritems(seen): value, summedboost = self.encode(poses) yield (w, len(poses), summedboost, value) def encode(self, poses): fb = self.field_boost # posns_chars_boosts = [(pos, startchar, endchar, boost), ...] 
codes = [] posbase = 0 charbase = 0 summedboost = 0 for pos, startchar, endchar, boost in poses: codes.append((pos - posbase, startchar - charbase, endchar - startchar, boost)) posbase = pos charbase = endchar summedboost += boost return ((pack_uint(len(poses)) + pack_float(summedboost * fb) + dumps(codes, -1)), summedboost) def decode_character_boosts(self, valuestring): if not valuestring.endswith(b(".")): valuestring += b(".") codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:]) position = 0 endchar = 0 posn_char_boosts = [] for code in codes: position = position + code[0] startchar = endchar + code[1] endchar = startchar + code[2] posn_char_boosts.append((position, startchar, endchar, code[3])) return posn_char_boosts def decode_positions(self, valuestring): return [item[0] for item in self.decode_character_boosts(valuestring)] def decode_characters(self, valuestring): return [(pos, startchar, endchar) for pos, startchar, endchar, _ in self.decode_character_boosts(valuestring)] def decode_position_boosts(self, valuestring): return [(pos, boost) for pos, _, _, boost in self.decode_character_boosts(valuestring)] def combine(self, vs): s = {} for v in vs: for pos, sc, ec, boost in self.decode_character_boosts(v): if pos in s: old_sc, old_ec, old_boost = pos[s] s[pos] = (min(sc, old_sc), max(ec, old_ec), old_boost + boost) else: s[pos] = (sc, ec, boost) poses = [(pos, sc, ec, boost) for pos, (sc, ec, boost) in sorted(s.items())] return self.encode(poses)[0] # encode() returns value, summedboost Whoosh-2.5.7/src/whoosh/highlight.py0000644000076500000240000007731412254366350017525 0ustar mattstaff00000000000000# Copyright 2008 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """The highlight module contains classes and functions for displaying short excerpts from hit documents in the search results you present to the user, with query terms highlighted. The highlighting system has four main elements. * **Fragmenters** chop up the original text into __fragments__, based on the locations of matched terms in the text. 
* **Scorers** assign a score to each fragment, allowing the system to rank the best fragments by whatever criterion. * **Order functions** control in what order the top-scoring fragments are presented to the user. For example, you can show the fragments in the order they appear in the document (FIRST) or show higher-scoring fragments first (SCORE) * **Formatters** turn the fragment objects into human-readable output, such as an HTML string. See :doc:`/highlight` for more information. """ from __future__ import division from collections import deque from heapq import nlargest from whoosh.compat import htmlescape from whoosh.analysis import Token # The default value for the maximum chars to examine when fragmenting DEFAULT_CHARLIMIT = 2 ** 15 # Fragment object def mkfrag(text, tokens, startchar=None, endchar=None, charsbefore=0, charsafter=0): """Returns a :class:`Fragment` object based on the :class:`analysis.Token` objects in ``tokens`. """ if startchar is None: startchar = tokens[0].startchar if tokens else 0 if endchar is None: endchar = tokens[-1].endchar if tokens else len(text) startchar = max(0, startchar - charsbefore) endchar = min(len(text), endchar + charsafter) return Fragment(text, tokens, startchar, endchar) class Fragment(object): """Represents a fragment (extract) from a hit document. This object is mainly used to keep track of the start and end points of the fragment and the "matched" character ranges inside; it does not contain the text of the fragment or do much else. The useful attributes are: ``Fragment.text`` The entire original text from which this fragment is taken. ``Fragment.matches`` An ordered list of objects representing the matched terms in the fragment. These objects have ``startchar`` and ``endchar`` attributes. ``Fragment.startchar`` The index of the first character in the fragment. ``Fragment.endchar`` The index of the last character in the fragment. ``Fragment.matched_terms`` A ``set`` of the ``text`` of the matched terms in the fragment (if available). """ def __init__(self, text, matches, startchar=0, endchar= -1): """ :param text: the source text of the fragment. :param matches: a list of objects which have ``startchar`` and ``endchar`` attributes, and optionally a ``text`` attribute. :param startchar: the index into ``text`` at which the fragment starts. The default is 0. :param endchar: the index into ``text`` at which the fragment ends. The default is -1, which is interpreted as the length of ``text``. """ self.text = text self.matches = matches if endchar == -1: endchar = len(text) self.startchar = startchar self.endchar = endchar self.matched_terms = set() for t in matches: if hasattr(t, "text"): self.matched_terms.add(t.text) def __repr__(self): return "" % (self.startchar, self.endchar, len(self.matches)) def __len__(self): return self.endchar - self.startchar def overlaps(self, fragment): sc = self.startchar ec = self.endchar fsc = fragment.startchar fec = fragment.endchar return (sc < fsc < ec) or (sc < fec < ec) def overlapped_length(self, fragment): sc = self.startchar ec = self.endchar fsc = fragment.startchar fec = fragment.endchar return max(ec, fec) - min(sc, fsc) def __lt__(self, other): return id(self) < id(other) # Tokenizing def set_matched_filter(tokens, termset): for t in tokens: t.matched = t.text in termset yield t # Fragmenters class Fragmenter(object): def must_retokenize(self): """Returns True if this fragmenter requires retokenized text. 
If this method returns True, the fragmenter's ``fragment_tokens`` method will be called with an iterator of ALL tokens from the text, with the tokens for matched terms having the ``matched`` attribute set to True. If this method returns False, the fragmenter's ``fragment_matches`` method will be called with a LIST of matching tokens. """ return True def fragment_tokens(self, text, all_tokens): """Yields :class:`Fragment` objects based on the tokenized text. :param text: the string being highlighted. :param all_tokens: an iterator of :class:`analysis.Token` objects from the string. """ raise NotImplementedError def fragment_matches(self, text, matched_tokens): """Yields :class:`Fragment` objects based on the text and the matched terms. :param text: the string being highlighted. :param matched_tokens: a list of :class:`analysis.Token` objects representing the term matches in the string. """ raise NotImplementedError class WholeFragmenter(Fragmenter): """Doesn't fragment the token stream. This object just returns the entire entire stream as one "fragment". This is useful if you want to highlight the entire text. Note that even if you use the `WholeFragmenter`, the highlight code will return no fragment if no terms matched in the given field. To return the whole fragment even in that case, call `highlights()` with `minscore=0`:: # Query where no terms match in the "text" field q = query.Term("tag", "new") r = mysearcher.search(q) r.fragmenter = highlight.WholeFragmenter() r.formatter = highlight.UppercaseFormatter() # Since no terms in the "text" field matched, we get no fragments back assert r[0].highlights("text") == "" # If we lower the minimum score to 0, we get a fragment even though it # has no matching terms assert r[0].highlights("text", minscore=0) == "This is the text field." """ def __init__(self, charlimit=DEFAULT_CHARLIMIT): self.charlimit = charlimit def fragment_tokens(self, text, tokens): charlimit = self.charlimit matches = [] for t in tokens: if charlimit and t.endchar > charlimit: break if t.matched: matches.append(t.copy()) return [Fragment(text, matches)] # Backwards compatiblity NullFragmeter = WholeFragmenter class SentenceFragmenter(Fragmenter): """Breaks the text up on sentence end punctuation characters (".", "!", or "?"). This object works by looking in the original text for a sentence end as the next character after each token's 'endchar'. When highlighting with this fragmenter, you should use an analyzer that does NOT remove stop words, for example:: sa = StandardAnalyzer(stoplist=None) """ def __init__(self, maxchars=200, sentencechars=".!?", charlimit=DEFAULT_CHARLIMIT): """ :param maxchars: The maximum number of characters allowed in a fragment. 
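        :param sentencechars: The characters treated as sentence-ending
            punctuation. The default is ".!?".
        :param charlimit: The maximum number of characters of text to examine
            when fragmenting. The default is ``DEFAULT_CHARLIMIT``.

        A typical setup (hedged; ``results`` is a Results object from a
        search)::

            results.fragmenter = SentenceFragmenter(maxchars=300)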
""" self.maxchars = maxchars self.sentencechars = frozenset(sentencechars) self.charlimit = charlimit def fragment_tokens(self, text, tokens): maxchars = self.maxchars sentencechars = self.sentencechars charlimit = self.charlimit textlen = len(text) # startchar of first token in the current sentence first = None # Buffer for matched tokens in the current sentence tks = [] endchar = None # Number of chars in the current sentence currentlen = 0 for t in tokens: startchar = t.startchar endchar = t.endchar if charlimit and endchar > charlimit: break if first is None: # Remember the startchar of the first token in a sentence first = startchar currentlen = 0 tlength = endchar - startchar currentlen += tlength if t.matched: tks.append(t.copy()) # If the character after the current token is end-of-sentence # punctuation, finish the sentence and reset if endchar < textlen and text[endchar] in sentencechars: # Don't break for two periods in a row (e.g. ignore "...") if (endchar + 1 < textlen and text[endchar + 1] in sentencechars): continue # If the sentence had matches and it's not too long, yield it # as a token if tks and currentlen <= maxchars: yield mkfrag(text, tks, startchar=first, endchar=endchar) # Reset the counts tks = [] first = None currentlen = 0 # If we get to the end of the text and there's still a sentence # in the buffer, yield it if tks: yield mkfrag(text, tks, startchar=first, endchar=endchar) class ContextFragmenter(Fragmenter): """Looks for matched terms and aggregates them with their surrounding context. """ def __init__(self, maxchars=200, surround=20, charlimit=DEFAULT_CHARLIMIT): """ :param maxchars: The maximum number of characters allowed in a fragment. :param surround: The number of extra characters of context to add both before the first matched term and after the last matched term. """ self.maxchars = maxchars self.surround = surround self.charlimit = charlimit def fragment_tokens(self, text, tokens): maxchars = self.maxchars surround = self.surround charlimit = self.charlimit # startchar of the first token in the fragment first = None # Stack of startchars firsts = deque() # Each time we see a matched token, we reset the countdown to finishing # the fragment. This also indicates whether we're currently inside a # fragment (< 0 not in fragment, >= 0 in fragment) countdown = -1 # Tokens in current fragment tks = [] endchar = None # Number of chars in the current fragment currentlen = 0 for t in tokens: startchar = t.startchar endchar = t.endchar tlength = endchar - startchar if charlimit and endchar > charlimit: break if countdown < 0 and not t.matched: # We're not in a fragment currently, so just maintain the # "charsbefore" buffer firsts.append(startchar) while firsts and endchar - firsts[0] > surround: firsts.popleft() elif currentlen + tlength > maxchars: # We're in a fragment, but adding this token would put us past # the maximum size. Zero the countdown so the code below will # cause the fragment to be emitted countdown = 0 elif t.matched: # Start/restart the countdown countdown = surround # Remember the first char of this fragment if first is None: if firsts: first = firsts[0] else: first = startchar # Add on unused front context countdown += surround tks.append(t.copy()) # If we're in a fragment... 
if countdown >= 0: # Update the counts currentlen += tlength countdown -= tlength # If the countdown is expired if countdown <= 0: # Finish the fragment yield mkfrag(text, tks, startchar=first, endchar=endchar) # Reset the counts tks = [] firsts = deque() first = None currentlen = 0 # If there's a fragment left over at the end, yield it if tks: yield mkfrag(text, tks, startchar=first, endchar=endchar) class PinpointFragmenter(Fragmenter): """This is a NON-RETOKENIZING fragmenter. It builds fragments from the positions of the matched terms. """ def __init__(self, maxchars=200, surround=20, autotrim=False, charlimit=DEFAULT_CHARLIMIT): """ :param maxchars: The maximum number of characters allowed in a fragment. :param surround: The number of extra characters of context to add both before the first matched term and after the last matched term. :param autotrim: automatically trims text before the first space and after the last space in the fragments, to try to avoid truncated words at the start and end. For short fragments or fragments with long runs between spaces this may give strange results. """ self.maxchars = maxchars self.surround = surround self.autotrim = autotrim self.charlimit = charlimit def must_retokenize(self): return False def fragment_tokens(self, text, tokens): matched = [t for t in tokens if t.matched] return self.fragment_matches(text, matched) @staticmethod def _autotrim(fragment): text = fragment.text startchar = fragment.startchar endchar = fragment.endchar firstspace = text.find(" ", startchar, endchar) if firstspace > 0: startchar = firstspace + 1 lastspace = text.rfind(" ", startchar, endchar) if lastspace > 0: endchar = lastspace if fragment.matches: startchar = min(startchar, fragment.matches[0].startchar) endchar = max(endchar, fragment.matches[-1].endchar) fragment.startchar = startchar fragment.endchar = endchar def fragment_matches(self, text, tokens): maxchars = self.maxchars surround = self.surround autotrim = self.autotrim charlimit = self.charlimit for i, t in enumerate(tokens): j = i left = t.startchar right = t.endchar if charlimit and right > charlimit: break currentlen = right - left while j < len(tokens) - 1 and currentlen < maxchars: next = tokens[j + 1] ec = next.endchar if ec - right <= surround and ec - left <= maxchars: j += 1 right = ec currentlen += (ec - next.startchar) else: break left = max(0, left - surround) right = min(len(text), right + surround) fragment = Fragment(text, tokens[i:j + 1], left, right) if autotrim: self._autotrim(fragment) yield fragment # Fragment scorers class FragmentScorer(object): pass class BasicFragmentScorer(FragmentScorer): def __call__(self, f): # Add up the boosts for the matched terms in this passage score = sum(t.boost for t in f.matches) # Favor diversity: multiply score by the number of separate # terms matched score *= (len(f.matched_terms) * 100) or 1 return score # Fragment sorters def SCORE(fragment): "Sorts higher scored passages first." return 1 def FIRST(fragment): "Sorts passages from earlier in the document first." return fragment.startchar def LONGER(fragment): "Sorts longer passages first." return 0 - len(fragment) def SHORTER(fragment): "Sort shorter passages first." return len(fragment) # Formatters def get_text(original, token, replace): """Convenience function for getting the text to use for a match when formatting. If ``replace`` is False, returns the part of ``original`` between ``token.startchar`` and ``token.endchar``. If ``replace`` is True, returns ``token.text``. 
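    For example (a sketch; ``tk`` stands for any object with ``startchar``,
    ``endchar`` and ``text`` attributes)::

        tk = Token(text=u"cat", startchar=4, endchar=7)
        get_text(u"the kitten", tk, False)  # -> u"kit"
        get_text(u"the kitten", tk, True)   # -> u"cat"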
""" if replace: return token.text else: return original[token.startchar:token.endchar] class Formatter(object): """Base class for formatters. For highlighters that return strings, it is usually only necessary to override :meth:`Formatter.format_token`. Use the :func:`get_text` function as a convenience to get the token text:: class MyFormatter(Formatter): def format_token(text, token, replace=False): ttext = get_text(text, token, replace) return "[%s]" % ttext """ between = "..." def _text(self, text): return text def format_token(self, text, token, replace=False): """Returns a formatted version of the given "token" object, which should have at least ``startchar`` and ``endchar`` attributes, and a ``text`` attribute if ``replace`` is True. :param text: the original fragment text being highlighted. :param token: an object having ``startchar`` and ``endchar`` attributes and optionally a ``text`` attribute (if ``replace`` is True). :param replace: if True, the original text between the token's ``startchar`` and ``endchar`` indices will be replaced with the value of the token's ``text`` attribute. """ raise NotImplementedError def format_fragment(self, fragment, replace=False): """Returns a formatted version of the given text, using the "token" objects in the given :class:`Fragment`. :param fragment: a :class:`Fragment` object representing a list of matches in the text. :param replace: if True, the original text corresponding to each match will be replaced with the value of the token object's ``text`` attribute. """ output = [] index = fragment.startchar text = fragment.text for t in fragment.matches: if t.startchar < index: continue if t.startchar > index: output.append(self._text(text[index:t.startchar])) output.append(self.format_token(text, t, replace)) index = t.endchar output.append(self._text(text[index:fragment.endchar])) out_string = "".join(output) return out_string def format(self, fragments, replace=False): """Returns a formatted version of the given text, using a list of :class:`Fragment` objects. """ formatted = [self.format_fragment(f, replace=replace) for f in fragments] return self.between.join(formatted) def __call__(self, text, fragments): # For backwards compatibility return self.format(fragments) class NullFormatter(Formatter): """Formatter that does not modify the string. """ def format_token(self, text, token, replace=False): return get_text(text, token, replace) class UppercaseFormatter(Formatter): """Returns a string in which the matched terms are in UPPERCASE. """ def __init__(self, between="..."): """ :param between: the text to add between fragments. """ self.between = between def format_token(self, text, token, replace=False): ttxt = get_text(text, token, replace) return ttxt.upper() class HtmlFormatter(Formatter): """Returns a string containing HTML formatting around the matched terms. This formatter wraps matched terms in an HTML element with two class names. The first class name (set with the constructor argument ``classname``) is the same for each match. The second class name (set with the constructor argument ``termclass`` is different depending on which term matched. This allows you to give different formatting (for example, different background colors) to the different terms in the excerpt. >>> hf = HtmlFormatter(tagname="span", classname="match", termclass="term") >>> hf(mytext, myfragments) "The template geometry is..." This object maintains a dictionary mapping terms to HTML class names (e.g. 
``term0`` and ``term1`` above), so that multiple excerpts will use the same class for the same term. If you want to re-use the same HtmlFormatter object with different searches, you should call HtmlFormatter.clear() between searches to clear the mapping. """ template = '<%(tag)s class=%(q)s%(cls)s%(tn)s%(q)s>%(t)s' def __init__(self, tagname="strong", between="...", classname="match", termclass="term", maxclasses=5, attrquote='"'): """ :param tagname: the tag to wrap around matching terms. :param between: the text to add between fragments. :param classname: the class name to add to the elements wrapped around matching terms. :param termclass: the class name prefix for the second class which is different for each matched term. :param maxclasses: the maximum number of term classes to produce. This limits the number of classes you have to define in CSS by recycling term class names. For example, if you set maxclasses to 3 and have 5 terms, the 5 terms will use the CSS classes ``term0``, ``term1``, ``term2``, ``term0``, ``term1``. """ self.between = between self.tagname = tagname self.classname = classname self.termclass = termclass self.attrquote = attrquote self.maxclasses = maxclasses self.seen = {} self.htmlclass = " ".join((self.classname, self.termclass)) def _text(self, text): return htmlescape(text, quote=False) def format_token(self, text, token, replace=False): seen = self.seen ttext = self._text(get_text(text, token, replace)) if ttext in seen: termnum = seen[ttext] else: termnum = len(seen) % self.maxclasses seen[ttext] = termnum return self.template % {"tag": self.tagname, "q": self.attrquote, "cls": self.htmlclass, "t": ttext, "tn": termnum} def clean(self): """Clears the dictionary mapping terms to HTML classnames. """ self.seen = {} class GenshiFormatter(Formatter): """Returns a Genshi event stream containing HTML formatting around the matched terms. """ def __init__(self, qname="strong", between="..."): """ :param qname: the QName for the tag to wrap around matched terms. :param between: the text to add between fragments. 
""" self.qname = qname self.between = between from genshi.core import START, END, TEXT # @UnresolvedImport from genshi.core import Attrs, Stream # @UnresolvedImport self.START, self.END, self.TEXT = START, END, TEXT self.Attrs, self.Stream = Attrs, Stream def _add_text(self, text, output): if output and output[-1][0] == self.TEXT: output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2]) else: output.append((self.TEXT, text, (None, -1, -1))) def format_token(self, text, token, replace=False): qn = self.qname txt = get_text(text, token, replace) return self.Stream([(self.START, (qn, self.Attrs()), (None, -1, -1)), (self.TEXT, txt, (None, -1, -1)), (self.END, qn, (None, -1, -1))]) def format_fragment(self, fragment, replace=False): output = [] index = fragment.startchar text = fragment.text for t in fragment.matches: if t.startchar > index: self._add_text(text[index:t.startchar], output) output.append((text, t, replace)) index = t.endchar if index < len(text): self._add_text(text[index:], output) return self.Stream(output) def format(self, fragments, replace=False): output = [] first = True for fragment in fragments: if not first: self._add_text(self.between, output) output += self.format_fragment(fragment, replace=replace) first = False return self.Stream(output) # Highlighting def top_fragments(fragments, count, scorer, order, minscore=1): scored_fragments = ((scorer(f), f) for f in fragments) scored_fragments = nlargest(count, scored_fragments) best_fragments = [sf for score, sf in scored_fragments if score >= minscore] best_fragments.sort(key=order) return best_fragments def highlight(text, terms, analyzer, fragmenter, formatter, top=3, scorer=None, minscore=1, order=FIRST, mode="query"): if scorer is None: scorer = BasicFragmentScorer() if type(fragmenter) is type: fragmenter = fragmenter() if type(formatter) is type: formatter = formatter() if type(scorer) is type: scorer = scorer() if scorer is None: scorer = BasicFragmentScorer() termset = frozenset(terms) tokens = analyzer(text, chars=True, mode=mode, removestops=False) tokens = set_matched_filter(tokens, termset) fragments = fragmenter.fragment_tokens(text, tokens) fragments = top_fragments(fragments, top, scorer, order, minscore) return formatter(text, fragments) class Highlighter(object): def __init__(self, fragmenter=None, scorer=None, formatter=None, always_retokenize=False, order=FIRST): self.fragmenter = fragmenter or ContextFragmenter() self.scorer = scorer or BasicFragmentScorer() self.formatter = formatter or HtmlFormatter(tagname="b") self.order = order self.always_retokenize = always_retokenize def can_load_chars(self, results, fieldname): # Is it possible to build a mapping between the matched terms/docs and # their start and end chars for "pinpoint" highlighting (ie not require # re-tokenizing text)? 
if self.always_retokenize: # No, we've been configured to always retokenize some text return False if not results.has_matched_terms(): # No, we don't know what the matched terms are yet return False if self.fragmenter.must_retokenize(): # No, the configured fragmenter doesn't support it return False # Maybe, if the field was configured to store characters field = results.searcher.schema[fieldname] return field.supports("characters") def _load_chars(self, results, fieldname, texts, to_bytes): # For each docnum, create a mapping of text -> [(startchar, endchar)] # for the matched terms results._char_cache[fieldname] = cache = {} sorted_ids = sorted(docnum for _, docnum in results.top_n) for docnum in sorted_ids: cache[docnum] = {} for text in texts: btext = to_bytes(text) m = results.searcher.postings(fieldname, btext) docset = set(results.termdocs[(fieldname, btext)]) for docnum in sorted_ids: if docnum in docset: m.skip_to(docnum) assert m.id() == docnum cache[docnum][text] = m.value_as("characters") def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1): results = hitobj.results schema = results.searcher.schema field = schema[fieldname] to_bytes = field.to_bytes from_bytes = field.from_bytes if text is None: if fieldname not in hitobj: raise KeyError("Field %r is not stored." % fieldname) text = hitobj[fieldname] # Get the terms searched for/matched in this field if results.has_matched_terms(): bterms = (term for term in hitobj.matched_terms() if term[0] == fieldname) else: bterms = results.query_terms(expand=True, fieldname=fieldname) # Convert bytes to unicode words = frozenset(from_bytes(term[1]) for term in bterms) # If we can do "pinpoint" highlighting... if self.can_load_chars(results, fieldname): # Build the docnum->[(startchar, endchar),] map if fieldname not in results._char_cache: self._load_chars(results, fieldname, words, to_bytes) # Grab the word->[(startchar, endchar)] map for this docnum cmap = results._char_cache[fieldname][hitobj.docnum] # A list of Token objects for matched words tokens = [] charlimit = self.fragmenter.charlimit for word in words: chars = cmap[word] for pos, startchar, endchar in chars: if charlimit and endchar > charlimit: break tokens.append(Token(text=word, pos=pos, startchar=startchar, endchar=endchar)) tokens.sort(key=lambda t: t.startchar) fragments = self.fragmenter.fragment_matches(text, tokens) else: # Retokenize the text analyzer = results.searcher.schema[fieldname].analyzer tokens = analyzer(text, positions=True, chars=True, mode="query", removestops=False) # Set Token.matched attribute for tokens that match a query term tokens = set_matched_filter(tokens, words) fragments = self.fragmenter.fragment_tokens(text, tokens) fragments = top_fragments(fragments, top, self.scorer, self.order, minscore=minscore) output = self.formatter.format(fragments) return output Whoosh-2.5.7/src/whoosh/idsets.py0000644000076500000240000003742612254366350017051 0ustar mattstaff00000000000000""" An implementation of an object that acts like a collection of on/off bits. 
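# --- Illustrative sketch (not part of the Whoosh source) --------------------
# How Highlighter.highlight_hit() above is normally reached from user code,
# via Hit.highlights().  "indexdir" and the stored "content" field are
# hypothetical and assumed to belong to an existing index.
from whoosh import highlight, index
from whoosh.qparser import QueryParser

ix = index.open_dir("indexdir")
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse(u"fox")
    results = searcher.search(query, terms=True)  # terms=True records matches
    results.formatter = highlight.HtmlFormatter(tagname="b")
    results.fragmenter = highlight.ContextFragmenter(maxchars=200, surround=40)
    for hit in results:
        # "content" must be a stored field (or pass text=... explicitly)
        print(hit.highlights("content"))
# -----------------------------------------------------------------------------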
""" import operator from array import array from bisect import bisect_left, bisect_right, insort from whoosh.compat import integer_types, izip, izip_longest, xrange from whoosh.util.numeric import bytes_for_bits # Number of '1' bits in each byte (0-255) _1SPERBYTE = array('B', [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8]) class DocIdSet(object): """Base class for a set of positive integers, implementing a subset of the built-in ``set`` type's interface with extra docid-related methods. This is a superclass for alternative set implementations to the built-in ``set`` which are more memory-efficient and specialized toward storing sorted lists of positive integers, though they will inevitably be slower than ``set`` for most operations since they're pure Python. """ def __eq__(self, other): for a, b in izip(self, other): if a != b: return False return True def __neq__(self, other): return not self.__eq__(other) def __len__(self): raise NotImplementedError def __iter__(self): raise NotImplementedError def __contains__(self): raise NotImplementedError def __or__(self, other): return self.union(other) def __and__(self, other): return self.intersection(other) def __sub__(self, other): return self.difference(other) def copy(self): raise NotImplementedError def add(self, n): raise NotImplementedError def discard(self, n): raise NotImplementedError def update(self, other): for n in other: self.add(n) def intersection_update(self, other): for n in self: if n not in other: self.discard(n) def difference_update(self, other): for n in other: self.discard(n) def invert_update(self, size): """Updates the set in-place to contain numbers in the range ``[0 - size)`` except numbers that are in this set. """ for i in xrange(size): if i in self: self.discard(i) else: self.add(i) def intersection(self, other): c = self.copy() c.intersection_update(other) return c def union(self, other): c = self.copy() c.update(other) return c def difference(self, other): c = self.copy() c.difference_update(other) return c def invert(self, size): c = self.copy() c.invert_update(size) return c def isdisjoint(self, other): a = self b = other if len(other) < len(self): a, b = other, self for num in a: if num in b: return False return True def before(self): """Returns the previous integer in the set before ``i``, or None. """ raise NotImplementedError def after(self): """Returns the next integer in the set after ``i``, or None. """ raise NotImplementedError def first(self): """Returns the first (lowest) integer in the set. """ raise NotImplementedError def last(self): """Returns the last (highest) integer in the set. 
""" raise NotImplementedError class BaseBitSet(DocIdSet): # Methods to override def byte_count(self): raise NotImplementedError def _get_byte(self, i): raise NotImplementedError def _iter_bytes(self): raise NotImplementedError # Base implementations def __len__(self): return sum(_1SPERBYTE[b] for b in self._iter_bytes()) def __iter__(self): base = 0 for byte in self._iter_bytes(): for i in xrange(8): if byte & (1 << i): yield base + i base += 8 def __nonzero__(self): return any(n for n in self._iter_bytes()) __bool__ = __nonzero__ def __contains__(self, i): bucket = i // 8 if bucket >= self.byte_count(): return False return bool(self._get_byte(bucket) & (1 << (i & 7))) def first(self): return self.after(-1) def last(self): return self.before(self.byte_count() * 8 + 1) def before(self, i): _get_byte = self._get_byte size = self.byte_count() * 8 if i <= 0: return None elif i >= size: i = size - 1 else: i -= 1 bucket = i // 8 while i >= 0: byte = _get_byte(bucket) if not byte: bucket -= 1 i = bucket * 8 + 7 continue if byte & (1 << (i & 7)): return i if i % 8 == 0: bucket -= 1 i -= 1 return None def after(self, i): _get_byte = self._get_byte size = self.byte_count() * 8 if i >= size: return None elif i < 0: i = 0 else: i += 1 bucket = i // 8 while i < size: byte = _get_byte(bucket) if not byte: bucket += 1 i = bucket * 8 continue if byte & (1 << (i & 7)): return i i += 1 if i % 8 == 0: bucket += 1 return None class OnDiskBitSet(BaseBitSet): """A DocIdSet backed by an array of bits on disk. >>> st = RamStorage() >>> f = st.create_file("test.bin") >>> bs = BitSet([1, 10, 15, 7, 2]) >>> bytecount = bs.to_disk(f) >>> f.close() >>> # ... >>> f = st.open_file("test.bin") >>> odbs = OnDiskBitSet(f, bytecount) >>> list(odbs) [1, 2, 7, 10, 15] """ def __init__(self, dbfile, basepos, bytecount): """ :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object to read from. :param basepos: the base position of the bytes in the given file. :param bytecount: the number of bytes to use for the bit array. """ self._dbfile = dbfile self._basepos = basepos self._bytecount = bytecount def __repr__(self): return "%s(%s, %d, %d)" % (self.__class__.__name__, self.dbfile, self._basepos, self.bytecount) def byte_count(self): return self._bytecount def _get_byte(self, n): return self._dbfile.get_byte(self._basepos + n) def _iter_bytes(self): dbfile = self._dbfile dbfile.seek(self._basepos) for _ in xrange(self._bytecount): yield dbfile.read_byte() class BitSet(BaseBitSet): """A DocIdSet backed by an array of bits. This can also be useful as a bit array (e.g. for a Bloom filter). It is much more memory efficient than a large built-in set of integers, but wastes memory for sparse sets. """ def __init__(self, source=None, size=0): """ :param maxsize: the maximum size of the bit array. :param source: an iterable of positive integers to add to this set. :param bits: an array of unsigned bytes ("B") to use as the underlying bit array. This is used by some of the object's methods. 
""" # If the source is a list, tuple, or set, we can guess the size if not size and isinstance(source, (list, tuple, set, frozenset)): size = max(source) bytecount = bytes_for_bits(size) self.bits = array("B", (0 for _ in xrange(bytecount))) if source: add = self.add for num in source: add(num) def __repr__(self): return "%s(%r)" % (self.__class__.__name__, list(self)) def byte_count(self): return len(self.bits) def _get_byte(self, n): return self.bits[n] def _iter_bytes(self): return iter(self.bits) def _trim(self): bits = self.bits last = len(self.bits) - 1 while last >= 0 and not bits[last]: last -= 1 del self.bits[last + 1:] def _resize(self, tosize): curlength = len(self.bits) newlength = bytes_for_bits(tosize) if newlength > curlength: self.bits.extend((0,) * (newlength - curlength)) elif newlength < curlength: del self.bits[newlength + 1:] def _zero_extra_bits(self, size): bits = self.bits spill = size - ((len(bits) - 1) * 8) if spill: mask = 2 ** spill - 1 bits[-1] = bits[-1] & mask def _logic(self, obj, op, other): objbits = obj.bits for i, (byte1, byte2) in enumerate(izip_longest(objbits, other.bits, fillvalue=0)): value = op(byte1, byte2) & 0xFF if i >= len(objbits): objbits.append(value) else: objbits[i] = value obj._trim() return obj def to_disk(self, dbfile): dbfile.write_array(self.bits) return len(self.bits) @classmethod def from_bytes(cls, bs): b = cls() b.bits = array("B", bs) return b @classmethod def from_disk(cls, dbfile, bytecount): return cls.from_bytes(dbfile.read_array("B", bytecount)) def copy(self): b = self.__class__() b.bits = array("B", iter(self.bits)) return b def clear(self): for i in xrange(len(self.bits)): self.bits[i] = 0 def add(self, i): bucket = i >> 3 if bucket >= len(self.bits): self._resize(i + 1) self.bits[bucket] |= 1 << (i & 7) def discard(self, i): bucket = i >> 3 self.bits[bucket] &= ~(1 << (i & 7)) def _resize_to_other(self, other): if isinstance(other, (list, tuple, set, frozenset)): maxbit = max(other) if maxbit // 8 > len(self.bits): self._resize(maxbit) def update(self, iterable): self._resize_to_other(iterable) add = self.add for i in iterable: add(i) def intersection_update(self, other): if isinstance(other, BitSet): return self._logic(self, operator.__and__, other) discard = self.discard for n in self: if n not in other: discard(n) def difference_update(self, other): if isinstance(other, BitSet): return self._logic(self, lambda x, y: x & ~y, other) discard = self.discard for n in other: discard(n) def invert_update(self, size): bits = self.bits for i in xrange(len(bits)): bits[i] = ~bits[i] & 0xFF self._zero_extra_bits(size) def union(self, other): if isinstance(other, BitSet): return self._logic(self.copy(), operator.__or__, other) b = self.copy() b.update(other) return b def intersection(self, other): if isinstance(other, BitSet): return self._logic(self.copy(), operator.__and__, other) return BitSet(source=(n for n in self if n in other)) def difference(self, other): if isinstance(other, BitSet): return self._logic(self.copy(), lambda x, y: x & ~y, other) return BitSet(source=(n for n in self if n not in other)) class SortedIntSet(DocIdSet): """A DocIdSet backed by a sorted array of integers. 
""" def __init__(self, source=None): if source: self.data = array("I", sorted(source)) else: self.data = array("I") def copy(self): sis = SortedIntSet() sis.data = array("I", self.data) return sis def size(self): return len(self.data) * self.data.itemsize def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self.data) def __len__(self): return len(self.data) def __iter__(self): return iter(self.data) def __nonzero__(self): return bool(self.data) __bool__ = __nonzero__ def __contains__(self, i): data = self.data if not data or i < data[0] or i > data[-1]: return False pos = bisect_left(data, i) if pos == len(data): return False return data[pos] == i def add(self, i): data = self.data if not data or i > data[-1]: data.append(i) else: mn = data[0] mx = data[-1] if i == mn or i == mx: return elif i > mx: data.append(i) elif i < mn: data.insert(0, i) else: pos = bisect_left(data, i) if data[pos] != i: data.insert(pos, i) def discard(self, i): data = self.data pos = bisect_left(data, i) if data[pos] == i: data.pop(pos) def clear(self): self.data = array("I") def update(self, other): add = self.add for i in other: add(i) def intersection_update(self, other): self.data = array("I", (num for num in self if num in other)) def difference_update(self, other): self.data = array("I", (num for num in self if num not in other)) def intersection(self, other): return SortedIntSet((num for num in self if num in other)) def difference(self, other): return SortedIntSet((num for num in self if num not in other)) def first(self): return self.data[0] def last(self): return self.data[-1] def before(self, i): data = self.data pos = bisect_left(data, i) if pos < 1: return None else: return data[pos - 1] def after(self, i): data = self.data if not data or i >= data[-1]: return None elif i < data[0]: return data[0] pos = bisect_right(data, i) return data[pos] class MultiIdSet(DocIdSet): """Wraps multiple SERIAL sub-DocIdSet objects and presents them as an aggregated, read-only set. """ def __init__(self, idsets, offsets): """ :param idsets: a list of DocIdSet objects. :param offsets: a list of offsets corresponding to the DocIdSet objects in ``idsets``. """ assert len(idsets) == len(offsets) self.idsets = idsets self.offsets = offsets def _document_set(self, n): offsets = self.offsets return max(bisect_left(offsets, n), len(self.offsets) - 1) def _set_and_docnum(self, n): setnum = self._document_set(n) offset = self.offsets[setnum] return self.idsets[setnum], n - offset def __len__(self): return sum(len(idset) for idset in self.idsets) def __iter__(self): for idset, offset in izip(self.idsets, self.offsets): for docnum in idset: yield docnum + offset def __contains__(self, item): idset, n = self._set_and_docnum(item) return n in idset Whoosh-2.5.7/src/whoosh/index.py0000644000076500000240000005677412254366350016674 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """Contains the main functions/classes for creating, maintaining, and using an index. """ from __future__ import division import os.path, re, sys from time import time, sleep from whoosh import __version__ from whoosh.legacy import toc_loaders from whoosh.compat import pickle, string_type from whoosh.fields import ensure_schema from whoosh.system import _INT_SIZE, _FLOAT_SIZE, _LONG_SIZE _DEF_INDEX_NAME = "MAIN" _CURRENT_TOC_VERSION = -111 # Exceptions class LockError(Exception): pass class IndexError(Exception): """Generic index error.""" class IndexVersionError(IndexError): """Raised when you try to open an index using a format that the current version of Whoosh cannot read. That is, when the index you're trying to open is either not backward or forward compatible with this version of Whoosh. """ def __init__(self, msg, version, release=None): Exception.__init__(self, msg) self.version = version self.release = release class OutOfDateError(IndexError): """Raised when you try to commit changes to an index which is not the latest generation. """ class EmptyIndexError(IndexError): """Raised when you try to work with an index that has no indexed terms. """ # Convenience functions def create_in(dirname, schema, indexname=None): """Convenience function to create an index in a directory. Takes care of creating a FileStorage object for you. :param dirname: the path string of the directory in which to create the index. :param schema: a :class:`whoosh.fields.Schema` object describing the index's fields. :param indexname: the name of the index to create; you only need to specify this if you are creating multiple indexes within the same storage object. :returns: :class:`Index` """ from whoosh.filedb.filestore import FileStorage if not indexname: indexname = _DEF_INDEX_NAME storage = FileStorage(dirname) return FileIndex.create(storage, schema, indexname) def open_dir(dirname, indexname=None, readonly=False, schema=None): """Convenience function for opening an index in a directory. Takes care of creating a FileStorage object for you. dirname is the filename of the directory in containing the index. indexname is the name of the index to create; you only need to specify this if you have multiple indexes within the same storage object. :param dirname: the path string of the directory in which to create the index. :param indexname: the name of the index to create; you only need to specify this if you have multiple indexes within the same storage object. 
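# --- Illustrative sketch (not part of the Whoosh source) --------------------
# Typical use of create_in() above together with open_dir().  The directory
# name, schema fields and document values are hypothetical.
import os
from whoosh.fields import ID, TEXT, Schema
from whoosh.index import create_in, open_dir

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")

schema = Schema(path=ID(stored=True), content=TEXT(stored=True))
ix = create_in("indexdir", schema)

writer = ix.writer()
writer.add_document(path=u"/a", content=u"alpha beta gamma")
writer.commit()

ix = open_dir("indexdir")  # reopen the same index later
print(ix.doc_count())      # -> 1
# -----------------------------------------------------------------------------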
""" from whoosh.filedb.filestore import FileStorage if indexname is None: indexname = _DEF_INDEX_NAME storage = FileStorage(dirname, readonly=readonly) return FileIndex(storage, schema=schema, indexname=indexname) def exists_in(dirname, indexname=None): """Returns True if dirname contains a Whoosh index. :param dirname: the file path of a directory. :param indexname: the name of the index. If None, the default index name is used. """ if os.path.exists(dirname): try: ix = open_dir(dirname, indexname=indexname) return ix.latest_generation() > -1 except EmptyIndexError: pass return False def exists(storage, indexname=None): """Deprecated; use ``storage.index_exists()``. :param storage: a store.Storage object. :param indexname: the name of the index. If None, the default index name is used. """ return storage.index_exists(indexname) def version_in(dirname, indexname=None): """Returns a tuple of (release_version, format_version), where release_version is the release version number of the Whoosh code that created the index -- e.g. (0, 1, 24) -- and format_version is the version number of the on-disk format used for the index -- e.g. -102. You should avoid attaching significance to the second number (the index version). This is simply a version number for the TOC file and probably should not have been exposed in a public interface. The best way to check if the current version of Whoosh can open an index is to actually try to open it and see if it raises a ``whoosh.index.IndexVersionError`` exception. Note that the release and format version are available as attributes on the Index object in Index.release and Index.version. :param dirname: the file path of a directory containing an index. :param indexname: the name of the index. If None, the default index name is used. :returns: ((major_ver, minor_ver, build_ver), format_ver) """ from whoosh.filedb.filestore import FileStorage storage = FileStorage(dirname) return version(storage, indexname=indexname) def version(storage, indexname=None): """Returns a tuple of (release_version, format_version), where release_version is the release version number of the Whoosh code that created the index -- e.g. (0, 1, 24) -- and format_version is the version number of the on-disk format used for the index -- e.g. -102. You should avoid attaching significance to the second number (the index version). This is simply a version number for the TOC file and probably should not have been exposed in a public interface. The best way to check if the current version of Whoosh can open an index is to actually try to open it and see if it raises a ``whoosh.index.IndexVersionError`` exception. Note that the release and format version are available as attributes on the Index object in Index.release and Index.version. :param storage: a store.Storage object. :param indexname: the name of the index. If None, the default index name is used. :returns: ((major_ver, minor_ver, build_ver), format_ver) """ try: if indexname is None: indexname = _DEF_INDEX_NAME ix = storage.open_index(indexname) return (ix.release, ix.version) except IndexVersionError: e = sys.exc_info()[1] return (None, e.version) # Index base class class Index(object): """Represents an indexed collection of documents. """ def close(self): """Closes any open resources held by the Index object itself. This may not close all resources being used everywhere, for example by a Searcher object. """ pass def add_field(self, fieldname, fieldspec): """Adds a field to the index's schema. 
:param fieldname: the name of the field to add. :param fieldspec: an instantiated :class:`whoosh.fields.FieldType` object. """ w = self.writer() w.add_field(fieldname, fieldspec) w.commit() def remove_field(self, fieldname): """Removes the named field from the index's schema. Depending on the backend implementation, this may or may not actually remove existing data for the field from the index. Optimizing the index should always clear out existing data for a removed field. """ w = self.writer() w.remove_field(fieldname) w.commit() def latest_generation(self): """Returns the generation number of the latest generation of this index, or -1 if the backend doesn't support versioning. """ return -1 def refresh(self): """Returns a new Index object representing the latest generation of this index (if this object is the latest generation, or the backend doesn't support versioning, returns self). :returns: :class:`Index` """ return self def up_to_date(self): """Returns True if this object represents the latest generation of this index. Returns False if this object is not the latest generation (that is, someone else has updated the index since you opened this object). """ return True def last_modified(self): """Returns the last modified time of the index, or -1 if the backend doesn't support last-modified times. """ return -1 def is_empty(self): """Returns True if this index is empty (that is, it has never had any documents successfully written to it. """ raise NotImplementedError def optimize(self): """Optimizes this index, if necessary. """ pass def doc_count_all(self): """Returns the total number of documents, DELETED OR UNDELETED, in this index. """ r = self.reader() try: return r.doc_count_all() finally: r.close() def doc_count(self): """Returns the total number of UNDELETED documents in this index. """ r = self.reader() try: return r.doc_count() finally: r.close() def searcher(self, **kwargs): """Returns a Searcher object for this index. Keyword arguments are passed to the Searcher object's constructor. :rtype: :class:`whoosh.searching.Searcher` """ from whoosh.searching import Searcher return Searcher(self.reader(), fromindex=self, **kwargs) def field_length(self, fieldname): """Returns the total length of the field across all documents. """ r = self.reader() try: return r.field_length(fieldname) finally: r.close() def max_field_length(self, fieldname): """Returns the maximum length of the field across all documents. """ r = self.reader() try: return r.max_field_length(fieldname) finally: r.close() def reader(self, reuse=None): """Returns an IndexReader object for this index. :param reuse: an existing reader. Some implementations may recycle resources from this existing reader to create the new reader. Note that any resources in the "recycled" reader that are not used by the new reader will be CLOSED, so you CANNOT use it afterward. :rtype: :class:`whoosh.reading.IndexReader` """ raise NotImplementedError def writer(self, **kwargs): """Returns an IndexWriter object for this index. :rtype: :class:`whoosh.writing.IndexWriter` """ raise NotImplementedError def delete_by_term(self, fieldname, text, searcher=None): w = self.writer() w.delete_by_term(fieldname, text, searcher=searcher) w.commit() def delete_by_query(self, q, searcher=None): w = self.writer() w.delete_by_query(q, searcher=searcher) w.commit() # Codec-based index implementation def clean_files(storage, indexname, gen, segments): # Attempts to remove unused index files (called when a new generation # is created). 
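# --- Illustrative sketch (not part of the Whoosh source) --------------------
# The Index convenience methods defined above (writer(), searcher(),
# delete_by_term(), delete_by_query()); the index, field names and values are
# hypothetical.
from whoosh.index import open_dir
from whoosh.query import Term

ix = open_dir("indexdir")
ix.delete_by_term("path", u"/a")                  # writer + delete + commit
ix.delete_by_query(Term("content", u"obsolete"))  # same, but with a query
with ix.searcher() as s:                          # wraps reader() in a Searcher
    print(s.doc_count(), ix.doc_count_all())
# -----------------------------------------------------------------------------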
If existing Index and/or reader objects have the files # open, they may not be deleted immediately (i.e. on Windows) but will # probably be deleted eventually by a later call to clean_files. current_segment_names = set(s.segment_id() for s in segments) tocpattern = TOC._pattern(indexname) segpattern = TOC._segment_pattern(indexname) todelete = set() for filename in storage: if filename.startswith("."): continue tocm = tocpattern.match(filename) segm = segpattern.match(filename) if tocm: if int(tocm.group(1)) != gen: todelete.add(filename) elif segm: name = segm.group(1) if name not in current_segment_names: todelete.add(filename) for filename in todelete: try: storage.delete_file(filename) except OSError: # Another process still has this file open, I guess pass class FileIndex(Index): def __init__(self, storage, schema=None, indexname=_DEF_INDEX_NAME): from whoosh.filedb.filestore import Storage if not isinstance(storage, Storage): raise ValueError("%r is not a Storage object" % storage) if not isinstance(indexname, string_type): raise ValueError("indexname %r is not a string" % indexname) if schema: schema = ensure_schema(schema) self.storage = storage self._schema = schema self.indexname = indexname # Try reading the TOC to see if it's possible TOC.read(self.storage, self.indexname, schema=self._schema) @classmethod def create(cls, storage, schema, indexname=_DEF_INDEX_NAME): TOC.create(storage, schema, indexname) return cls(storage, schema, indexname) def __repr__(self): return "%s(%r, %r)" % (self.__class__.__name__, self.storage, self.indexname) def close(self): pass # add_field # remove_field def latest_generation(self): return TOC._latest_generation(self.storage, self.indexname) # refresh # up_to_date def last_modified(self): gen = self.latest_generation() filename = TOC._filename(self.indexname, gen) return self.storage.file_modified(filename) def is_empty(self): return len(self._read_toc().segments) == 0 def optimize(self, **kwargs): w = self.writer(**kwargs) w.commit(optimize=True) # searcher def writer(self, procs=1, **kwargs): if procs > 1: from whoosh.multiproc import MpWriter return MpWriter(self, procs=procs, **kwargs) else: from whoosh.writing import SegmentWriter return SegmentWriter(self, **kwargs) def lock(self, name): """Returns a lock object that you can try to call acquire() on to lock the index. """ return self.storage.lock(self.indexname + "_" + name) def _read_toc(self): return TOC.read(self.storage, self.indexname, schema=self._schema) def _segments(self): return self._read_toc().segments def _current_schema(self): return self._read_toc().schema @property def schema(self): return self._current_schema() @property def release(self): return self._read_toc().release @property def version(self): return self._read_toc().version @classmethod def _reader(cls, storage, schema, segments, generation, reuse=None): # Returns a reader for the given segments, possibly reusing already # opened readers from whoosh.reading import SegmentReader, MultiReader, EmptyReader reusable = {} try: if len(segments) == 0: # This index has no segments! Return an EmptyReader object, # which simply returns empty or zero to every method return EmptyReader(schema) if reuse: # Put all atomic readers in a dictionary keyed by their # generation, so we can re-use them if them if possible readers = [r for r, _ in reuse.leaf_readers()] reusable = dict((r.generation(), r) for r in readers) # Make a function to open readers, which reuses reusable readers. 
# It removes any readers it reuses from the "reusable" dictionary, # so later we can close any readers left in the dictionary. def segreader(segment): segid = segment.segment_id() if segid in reusable: r = reusable[segid] del reusable[segid] return r else: return SegmentReader(storage, schema, segment, generation=generation) if len(segments) == 1: # This index has one segment, so return a SegmentReader object # for the segment return segreader(segments[0]) else: # This index has multiple segments, so create a list of # SegmentReaders for the segments, then composite them with a # MultiReader readers = [segreader(segment) for segment in segments] return MultiReader(readers, generation=generation) finally: for r in reusable.values(): r.close() def reader(self, reuse=None): retries = 10 while retries > 0: # Read the information from the TOC file try: info = self._read_toc() return self._reader(self.storage, info.schema, info.segments, info.generation, reuse=reuse) except IOError: # Presume that we got a "file not found error" because a writer # deleted one of the files just as we were trying to open it, # and so retry a few times before actually raising the # exception e = sys.exc_info()[1] retries -= 1 if retries <= 0: raise e sleep(0.05) # TOC class class TOC(object): """Object representing the state of the index after a commit. Essentially a container for the index's schema and the list of segment objects. """ def __init__(self, schema, segments, generation, version=_CURRENT_TOC_VERSION, release=__version__): self.schema = schema self.segments = segments self.generation = generation self.version = version self.release = release @classmethod def _filename(cls, indexname, gen): return "_%s_%s.toc" % (indexname, gen) @classmethod def _pattern(cls, indexname): return re.compile("^_%s_([0-9]+).toc$" % indexname) @classmethod def _segment_pattern(cls, indexname): return re.compile("(%s_[0-9a-z]+)[.][A-Za-z0-9_.]+" % indexname) @classmethod def _latest_generation(cls, storage, indexname): pattern = cls._pattern(indexname) mx = -1 for filename in storage: m = pattern.match(filename) if m: mx = max(int(m.group(1)), mx) return mx @classmethod def create(cls, storage, schema, indexname=_DEF_INDEX_NAME): schema = ensure_schema(schema) # Clear existing files prefix = "_%s_" % indexname for filename in storage: if filename.startswith(prefix): storage.delete_file(filename) # Write a TOC file with an empty list of segments toc = cls(schema, [], 0) toc.write(storage, indexname) @classmethod def read(cls, storage, indexname, gen=None, schema=None): if gen is None: gen = cls._latest_generation(storage, indexname) if gen < 0: raise EmptyIndexError("Index %r does not exist in %r" % (indexname, storage)) # Read the content of this index from the .toc file. 
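# --- Illustrative sketch (not part of the Whoosh source) --------------------
# The lower-level storage path that create_in() wraps: build a FileStorage and
# a FileIndex directly.  "indexdir" is hypothetical and must already exist.
from whoosh.fields import TEXT, Schema
from whoosh.filedb.filestore import FileStorage
from whoosh.index import FileIndex

storage = FileStorage("indexdir")
ix = FileIndex.create(storage, Schema(content=TEXT), "MAIN")
print(ix.latest_generation())  # -> 0 for a freshly created index
print(ix.is_empty())           # -> True until a writer commits documents
# -----------------------------------------------------------------------------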
tocfilename = cls._filename(indexname, gen) stream = storage.open_file(tocfilename) def check_size(name, target): sz = stream.read_varint() if sz != target: raise IndexError("Index was created on different architecture:" " saved %s = %s, this computer = %s" % (name, sz, target)) check_size("int", _INT_SIZE) check_size("long", _LONG_SIZE) check_size("float", _FLOAT_SIZE) if not stream.read_int() == -12345: raise IndexError("Number misread: byte order problem") version = stream.read_int() release = (stream.read_varint(), stream.read_varint(), stream.read_varint()) if version != _CURRENT_TOC_VERSION: if version in toc_loaders: loader = toc_loaders[version] schema, segments = loader(stream, gen, schema, version) else: raise IndexVersionError("Can't read format %s" % version, version) else: # If the user supplied a schema object with the constructor, don't # load the pickled schema from the saved index. if schema: stream.skip_string() else: schema = pickle.loads(stream.read_string()) schema = ensure_schema(schema) # Generation index_gen = stream.read_int() assert gen == index_gen _ = stream.read_int() # Unused segments = stream.read_pickle() stream.close() return cls(schema, segments, gen, version=version, release=release) def write(self, storage, indexname): schema = ensure_schema(self.schema) schema.clean() # Use a temporary file for atomic write. tocfilename = self._filename(indexname, self.generation) tempfilename = '%s.%s' % (tocfilename, time()) stream = storage.create_file(tempfilename) stream.write_varint(_INT_SIZE) stream.write_varint(_LONG_SIZE) stream.write_varint(_FLOAT_SIZE) stream.write_int(-12345) stream.write_int(_CURRENT_TOC_VERSION) for num in __version__[:3]: stream.write_varint(num) try: stream.write_string(pickle.dumps(schema, -1)) except pickle.PicklingError: # Try to narrow down the error to a single field for fieldname, field in schema.items(): try: pickle.dumps(field) except pickle.PicklingError: e = sys.exc_info()[1] raise pickle.PicklingError("%s %s=%r" % (e, fieldname, field)) # Otherwise, re-raise the original exception raise stream.write_int(self.generation) stream.write_int(0) # Unused stream.write_pickle(self.segments) stream.close() # Rename temporary file to the proper filename storage.rename_file(tempfilename, tocfilename, safe=True) Whoosh-2.5.7/src/whoosh/lang/0000755000076500000240000000000012277504634016115 5ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/lang/__init__.py0000644000076500000240000001032412254366350020222 0ustar mattstaff00000000000000# coding=utf-8 # Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. # Exceptions class NoStemmer(Exception): pass class NoStopWords(Exception): pass # Data and functions for language names languages = ("ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt", "ro", "ru", "es", "sv", "tr") aliases = { # By ISO 639-1 three letter codes "ara": "ar", "dan": "da", "nld": "nl", "eng": "en", "fin": "fi", "fra": "fr", "deu": "de", "hun": "hu", "ita": "it", "nor": "no", "por": "pt", "ron": "ro", "rus": "ru", "spa": "es", "swe": "sv", "tur": "tr", # By name in English "arabic": "ar", "danish": "da", "dutch": "nl", "english": "en", "finnish": "fi", "french": "fr", "german": "de", "hungarian": "hu", "italian": "it", "norwegian": "no", "portuguese": "pt", "romanian": "ro", "russian": "ru", "spanish": "es", "swedish": "sw", "turkish": "tr", # By name in own language "العربية": "ar", "dansk": "da", "nederlands": "nl", "suomi": "fi", "français": "fr", "deutsch": "de", "magyar": "hu", "italiano": "it", "norsk": "no", "português": "pt", "русский язык": "ru", "español": "es", "svenska": "sv", "türkçe": "tr", } def two_letter_code(name): if name in languages: return name if name in aliases: return aliases[name] return None # Getter functions def has_stemmer(lang): try: return bool(stemmer_for_language(lang)) except NoStemmer: return False def has_stopwords(lang): try: return bool(stopwords_for_language(lang)) except NoStopWords: return False def stemmer_for_language(lang): if lang == "en_porter": # Original porter stemming algorithm is several times faster than the # more correct porter2 algorithm in snowball package from .porter import stem as porter_stem return porter_stem tlc = two_letter_code(lang) if tlc == "ar": from .isri import ISRIStemmer return ISRIStemmer().stem from .snowball import classes as snowball_classes if tlc in snowball_classes: return snowball_classes[tlc]().stem raise NoStemmer("No stemmer available for %r" % lang) def stopwords_for_language(lang): from .stopwords import stoplists tlc = two_letter_code(lang) if tlc in stoplists: return stoplists[tlc] raise NoStopWords("No stop-word list available for %r" % lang) Whoosh-2.5.7/src/whoosh/lang/dmetaphone.py0000644000076500000240000004216012254366350020612 0ustar mattstaff00000000000000# coding= utf-8 # This script implements the Double Metaphone algorythm (c) 1998, 1999 by # Lawrence Philips. It was translated to Python from the C source written by # Kevin Atkinson (http://aspell.net/metaphone/) By Andrew Collins - January 12, # 2007 who claims no rights to this work. 
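# --- Illustrative sketch (not part of the Whoosh source) --------------------
# The language helpers defined above in whoosh.lang.  Lookups go through
# two_letter_code(), so ISO codes and English names both work (note, though,
# that the "swedish" alias above maps to "sw" rather than the "sv" code used
# by the stemmer and stop-word tables).
from whoosh.lang import (has_stemmer, stemmer_for_language,
                         stopwords_for_language, two_letter_code)

print(two_letter_code("german"))          # -> "de"
if has_stemmer("de"):
    stem = stemmer_for_language("de")
    print(stem(u"laufen"))                # German snowball stemmer
print(len(stopwords_for_language("en")))  # size of the English stop list
# -----------------------------------------------------------------------------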
# http://atomboy.isa-geek.com:8080/plone/Members/acoil/programing/double-metaphone import re from whoosh.compat import u vowels = frozenset("AEIOUY") slavo_germ_exp = re.compile("W|K|CZ|WITZ") silent_starts = re.compile("GN|KN|PN|WR|PS") def double_metaphone(text): text = text.upper() slavo_germanic = bool(slavo_germ_exp.search(text)) length = len(text) text = "--" + text + " " first = pos = 2 last = first + length - 1 primary = secondary = "" if silent_starts.match(text, pos): pos += 1 while pos < length + 2: ch = text[pos] if ch in vowels: # all init vowels now map to 'A' if pos != first: next = (None, 1) else: next = ("A", 1) elif ch == "B": #"-mb", e.g", "dumb", already skipped over... see 'M' below if text[pos + 1] == "B": next = ("P", 2) else: next = ("P", 1) elif ch == "C": # various germanic if (pos > (first + 1) and text[pos - 2] not in vowels and text[pos - 1:pos + 2] == 'ACH' and \ (text[pos + 2] not in ['I', 'E'] or text[pos - 2:pos + 4] in ['BACHER', 'MACHER'])): next = ('K', 2) # special case 'CAESAR' elif pos == first and text[first:first + 6] == 'CAESAR': next = ('S', 2) elif text[pos:pos + 4] == 'CHIA': # italian 'chianti' next = ('K', 2) elif text[pos:pos + 2] == 'CH': # find 'michael' if pos > first and text[pos:pos + 4] == 'CHAE': next = ('K', 'X', 2) elif pos == first and (text[pos + 1:pos + 6] in ['HARAC', 'HARIS'] or \ text[pos + 1:pos + 4] in ["HOR", "HYM", "HIA", "HEM"]) and text[first:first + 5] != 'CHORE': next = ('K', 2) # germanic, greek, or otherwise 'ch' for 'kh' sound elif text[first:first + 4] in ['VAN ', 'VON '] or text[first:first + 3] == 'SCH' \ or text[pos - 2:pos + 4] in ["ORCHES", "ARCHIT", "ORCHID"] \ or text[pos + 2] in ['T', 'S'] \ or ((text[pos - 1] in ["A", "O", "U", "E"] or pos == first) \ and text[pos + 2] in ["L", "R", "N", "M", "B", "H", "F", "V", "W", " "]): next = ('K', 1) else: if pos > first: if text[first:first + 2] == 'MC': next = ('K', 2) else: next = ('X', 'K', 2) else: next = ('X', 2) # e.g, 'czerny' elif text[pos:pos + 2] == 'CZ' and text[pos - 2:pos + 2] != 'WICZ': next = ('S', 'X', 2) # e.g., 'focaccia' elif text[pos + 1:pos + 4] == 'CIA': next = ('X', 3) # double 'C', but not if e.g. 'McClellan' elif text[pos:pos + 2] == 'CC' and not (pos == (first + 1) and text[first] == 'M'): # 'bellocchio' but not 'bacchus' if text[pos + 2] in ["I", "E", "H"] and text[pos + 2:pos + 4] != 'HU': # 'accident', 'accede' 'succeed' if (pos == (first + 1) and text[first] == 'A') or \ text[pos - 1:pos + 4] in ['UCCEE', 'UCCES']: next = ('KS', 3) # 'bacci', 'bertucci', other italian else: next = ('X', 3) else: next = ('K', 2) elif text[pos:pos + 2] in ["CK", "CG", "CQ"]: next = ('K', 'K', 2) elif text[pos:pos + 2] in ["CI", "CE", "CY"]: # italian vs. english if text[pos:pos + 3] in ["CIO", "CIE", "CIA"]: next = ('S', 'X', 2) else: next = ('S', 2) else: # name sent in 'mac caffrey', 'mac gregor if text[pos + 1:pos + 3] in [" C", " Q", " G"]: next = ('K', 3) else: if text[pos + 1] in ["C", "K", "Q"] and text[pos + 1:pos + 3] not in ["CE", "CI"]: next = ('K', 2) else: # default for 'C' next = ('K', 1) elif ch == u('\xc7'): next = ('S', 1) elif ch == 'D': if text[pos:pos + 2] == 'DG': if text[pos + 2] in ['I', 'E', 'Y']: # e.g. 
'edge' next = ('J', 3) else: next = ('TK', 2) elif text[pos:pos + 2] in ['DT', 'DD']: next = ('T', 2) else: next = ('T', 1) elif ch == 'F': if text[pos + 1] == 'F': next = ('F', 2) else: next = ('F', 1) elif ch == 'G': if text[pos + 1] == 'H': if pos > first and text[pos - 1] not in vowels: next = ('K', 2) elif pos < (first + 3): if pos == first: # 'ghislane', ghiradelli if text[pos + 2] == 'I': next = ('J', 2) else: next = ('K', 2) # Parker's rule (with some further refinements) - e.g., 'hugh' elif (pos > (first + 1) and text[pos - 2] in ['B', 'H', 'D']) \ or (pos > (first + 2) and text[pos - 3] in ['B', 'H', 'D']) \ or (pos > (first + 3) and text[pos - 4] in ['B', 'H']): next = (None, 2) else: # e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' if pos > (first + 2) and text[pos - 1] == 'U' \ and text[pos - 3] in ["C", "G", "L", "R", "T"]: next = ('F', 2) else: if pos > first and text[pos - 1] != 'I': next = ('K', 2) elif text[pos + 1] == 'N': if pos == (first + 1) and text[first] in vowels and not slavo_germanic: next = ('KN', 'N', 2) else: # not e.g. 'cagney' if text[pos + 2:pos + 4] != 'EY' and text[pos + 1] != 'Y' and not slavo_germanic: next = ('N', 'KN', 2) else: next = ('KN', 2) # 'tagliaro' elif text[pos + 1:pos + 3] == 'LI' and not slavo_germanic: next = ('KL', 'L', 2) # -ges-,-gep-,-gel-, -gie- at beginning elif pos == first and (text[pos + 1] == 'Y' \ or text[pos + 1:pos + 3] in ["ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER"]): next = ('K', 'J', 2) # -ger-, -gy- elif (text[pos + 1:pos + 2] == 'ER' or text[pos + 1] == 'Y') \ and text[first:first + 6] not in ["DANGER", "RANGER", "MANGER"] \ and text[pos - 1] not in ['E', 'I'] and text[pos - 1:pos + 2] not in ['RGY', 'OGY']: next = ('K', 'J', 2) # italian e.g, 'biaggi' elif text[pos + 1] in ['E', 'I', 'Y'] or text[pos - 1:pos + 3] in ["AGGI", "OGGI"]: # obvious germanic if text[first:first + 4] in ['VON ', 'VAN '] or text[first:first + 3] == 'SCH' \ or text[pos + 1:pos + 3] == 'ET': next = ('K', 2) else: # always soft if french ending if text[pos + 1:pos + 5] == 'IER ': next = ('J', 2) else: next = ('J', 'K', 2) elif text[pos + 1] == 'G': next = ('K', 2) else: next = ('K', 1) elif ch == 'H': # only keep if first & before vowel or btw. 2 vowels if (pos == first or text[pos - 1] in vowels) and text[pos + 1] in vowels: next = ('H', 2) else: # (also takes care of 'HH') next = (None, 1) elif ch == 'J': # obvious spanish, 'jose', 'san jacinto' if text[pos:pos + 4] == 'JOSE' or text[first:first + 4] == 'SAN ': if (pos == first and text[pos + 4] == ' ') or text[first:first + 4] == 'SAN ': next = ('H',) else: next = ('J', 'H') elif pos == first and text[pos:pos + 4] != 'JOSE': next = ('J', 'A') # Yankelovich/Jankelowicz else: # spanish pron. of e.g. 'bajador' if text[pos - 1] in vowels and not slavo_germanic \ and text[pos + 1] in ['A', 'O']: next = ('J', 'H') else: if pos == last: next = ('J', ' ') else: if text[pos + 1] not in ["L", "T", "K", "S", "N", "M", "B", "Z"] \ and text[pos - 1] not in ["S", "K", "L"]: next = ('J',) else: next = (None,) if text[pos + 1] == 'J': next = next + (2,) else: next = next + (1,) elif ch == 'K': if text[pos + 1] == 'K': next = ('K', 2) else: next = ('K', 1) elif ch == 'L': if text[pos + 1] == 'L': # spanish e.g. 
'cabrillo', 'gallegos' if (pos == (last - 2) and text[pos - 1:pos + 3] in ["ILLO", "ILLA", "ALLE"]) \ or ((text[last - 1:last + 1] in ["AS", "OS"] or text[last] in ["A", "O"]) \ and text[pos - 1:pos + 3] == 'ALLE'): next = ('L', '', 2) else: next = ('L', 2) else: next = ('L', 1) elif ch == 'M': if text[pos + 1:pos + 4] == 'UMB' \ and (pos + 1 == last or text[pos + 2:pos + 4] == 'ER') \ or text[pos + 1] == 'M': next = ('M', 2) else: next = ('M', 1) elif ch == 'N': if text[pos + 1] == 'N': next = ('N', 2) else: next = ('N', 1) elif ch == u('\xd1'): next = ('N', 1) elif ch == 'P': if text[pos + 1] == 'H': next = ('F', 2) elif text[pos + 1] in ['P', 'B']: # also account for "campbell", "raspberry" next = ('P', 2) else: next = ('P', 1) elif ch == 'Q': if text[pos + 1] == 'Q': next = ('K', 2) else: next = ('K', 1) elif ch == 'R': # french e.g. 'rogier', but exclude 'hochmeier' if pos == last and not slavo_germanic \ and text[pos - 2:pos] == 'IE' and text[pos - 4:pos - 2] not in ['ME', 'MA']: next = ('', 'R') else: next = ('R',) if text[pos + 1] == 'R': next = next + (2,) else: next = next + (1,) elif ch == 'S': # special cases 'island', 'isle', 'carlisle', 'carlysle' if text[pos - 1:pos + 2] in ['ISL', 'YSL']: next = (None, 1) # special case 'sugar-' elif pos == first and text[first:first + 5] == 'SUGAR': next = ('X', 'S', 1) elif text[pos:pos + 2] == 'SH': # germanic if text[pos + 1:pos + 5] in ["HEIM", "HOEK", "HOLM", "HOLZ"]: next = ('S', 2) else: next = ('X', 2) # italian & armenian elif text[pos:pos + 3] in ["SIO", "SIA"] or text[pos:pos + 4] == 'SIAN': if not slavo_germanic: next = ('S', 'X', 3) else: next = ('S', 3) # german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider' # also, -sz- in slavic language altho in hungarian it is pronounced 's' elif (pos == first and text[pos + 1] in ["M", "N", "L", "W"]) or text[pos + 1] == 'Z': next = ('S', 'X') if text[pos + 1] == 'Z': next = next + (2,) else: next = next + (1,) elif text[pos:pos + 2] == 'SC': # Schlesinger's rule if text[pos + 2] == 'H': # dutch origin, e.g. 'school', 'schooner' if text[pos + 3:pos + 5] in ["OO", "ER", "EN", "UY", "ED", "EM"]: # 'schermerhorn', 'schenker' if text[pos + 3:pos + 5] in ['ER', 'EN']: next = ('X', 'SK', 3) else: next = ('SK', 3) else: if pos == first and text[first + 3] not in vowels and text[first + 3] != 'W': next = ('X', 'S', 3) else: next = ('X', 3) elif text[pos + 2] in ['I', 'E', 'Y']: next = ('S', 3) else: next = ('SK', 3) # french e.g. 
'resnais', 'artois' elif pos == last and text[pos - 2:pos] in ['AI', 'OI']: next = ('', 'S', 1) else: next = ('S',) if text[pos + 1] in ['S', 'Z']: next = next + (2,) else: next = next + (1,) elif ch == 'T': if text[pos:pos + 4] == 'TION': next = ('X', 3) elif text[pos:pos + 3] in ['TIA', 'TCH']: next = ('X', 3) elif text[pos:pos + 2] == 'TH' or text[pos:pos + 3] == 'TTH': # special case 'thomas', 'thames' or germanic if text[pos + 2:pos + 4] in ['OM', 'AM'] or text[first:first + 4] in ['VON ', 'VAN '] \ or text[first:first + 3] == 'SCH': next = ('T', 2) else: next = ('0', 'T', 2) elif text[pos + 1] in ['T', 'D']: next = ('T', 2) else: next = ('T', 1) elif ch == 'V': if text[pos + 1] == 'V': next = ('F', 2) else: next = ('F', 1) elif ch == 'W': # can also be in middle of word if text[pos:pos + 2] == 'WR': next = ('R', 2) elif pos == first and (text[pos + 1] in vowels or text[pos:pos + 2] == 'WH'): # Wasserman should match Vasserman if text[pos + 1] in vowels: next = ('A', 'F', 1) else: next = ('A', 1) # Arnow should match Arnoff elif (pos == last and text[pos - 1] in vowels) \ or text[pos - 1:pos + 5] in ["EWSKI", "EWSKY", "OWSKI", "OWSKY"] \ or text[first:first + 3] == 'SCH': next = ('', 'F', 1) # polish e.g. 'filipowicz' elif text[pos:pos + 4] in ["WICZ", "WITZ"]: next = ('TS', 'FX', 4) else: # default is to skip it next = (None, 1) elif ch == 'X': # french e.g. breaux next = (None,) if not(pos == last and (text[pos - 3:pos] in ["IAU", "EAU"] \ or text[pos - 2:pos] in ['AU', 'OU'])): next = ('KS',) if text[pos + 1] in ['C', 'X']: next = next + (2,) else: next = next + (1,) elif ch == 'Z': # chinese pinyin e.g. 'zhao' if text[pos + 1] == 'H': next = ('J',) elif text[pos + 1:pos + 3] in ["ZO", "ZI", "ZA"] \ or (slavo_germanic and pos > first and text[pos - 1] != 'T'): next = ('S', 'TS') else: next = ('S',) if text[pos + 1] == 'Z': next = next + (2,) else: next = next + (1,) else: next = (None, 1) if len(next) == 2: if next[0]: primary += next[0] secondary += next[0] pos += next[1] elif len(next) == 3: if next[0]: primary += next[0] if next[1]: secondary += next[1] pos += next[2] if primary == secondary: return (primary, None) else: return (primary, secondary) Whoosh-2.5.7/src/whoosh/lang/isri.py0000644000076500000240000004122112254366350017431 0ustar mattstaff00000000000000# -*- coding: utf-8 -*- # # Natural Language Toolkit: The ISRI Arabic Stemmer # # Copyright (C) 2001-2012 NLTK Proejct # Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005) # Author: Hosam Algasaier # URL: # For license information, see LICENSE.TXT """ ISRI Arabic Stemmer The algorithm for this stemmer is described in: Taghva, K., Elkoury, R., and Coombs, J. 2005. Arabic Stemming without a root dictionary. Information Science Research Institute. University of Nevada, Las Vegas, USA. The Information Science Research Institute’s (ISRI) Arabic stemmer shares many features with the Khoja stemmer. However, the main difference is that ISRI stemmer does not use root dictionary. Also, if a root is not found, ISRI stemmer returned normalized form, rather than returning the original unmodified word. Additional adjustments were made to improve the algorithm: 1- Adding 60 stop words. 2- Adding the pattern (تفاعيل) to ISRI pattern set. 3- The step 2 in the original algorithm was normalizing all hamza. This step is discarded because it increases the word ambiguities and changes the original root. 
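# --- Illustrative sketch (not part of the Whoosh source) --------------------
# Calling the double_metaphone() function defined above.  It returns a
# (primary, secondary) tuple; secondary is None when both codes agree.  The
# codes in the comments are typical outputs, shown for orientation only.
from whoosh.lang.dmetaphone import double_metaphone

print(double_metaphone("Smith"))    # typically ('SM0', 'XMT')
print(double_metaphone("Schmidt"))  # typically ('XMT', 'SMT')
print(double_metaphone("cat"))      # -> ('KT', None): both codes agree
# -----------------------------------------------------------------------------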
""" from __future__ import unicode_literals import re class ISRIStemmer(object): ''' ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary. Information Science Research Institute. University of Nevada, Las Vegas, USA. A few minor modifications have been made to ISRI basic algorithm. See the source code of this module for more information. isri.stem(token) returns Arabic root for the given token. The ISRI Stemmer requires that all tokens have Unicode string types. If you use Python IDLE on Arabic Windows you have to decode text first using Arabic '1256' coding. ''' def __init__(self): self.stm = 'defult none' self.p3 = ['\u0643\u0627\u0644', '\u0628\u0627\u0644', '\u0648\u0644\u0644', '\u0648\u0627\u0644'] # length three prefixes self.p2 = ['\u0627\u0644', '\u0644\u0644'] # length two prefixes self.p1 = ['\u0644', '\u0628', '\u0641', '\u0633', '\u0648', '\u064a', '\u062a', '\u0646', '\u0627'] # length one prefixes self.s3 = ['\u062a\u0645\u0644', '\u0647\u0645\u0644', '\u062a\u0627\u0646', '\u062a\u064a\u0646', '\u0643\u0645\u0644'] # length three suffixes self.s2 = ['\u0648\u0646', '\u0627\u062a', '\u0627\u0646', '\u064a\u0646', '\u062a\u0646', '\u0643\u0645', '\u0647\u0646', '\u0646\u0627', '\u064a\u0627', '\u0647\u0627', '\u062a\u0645', '\u0643\u0646', '\u0646\u064a', '\u0648\u0627', '\u0645\u0627', '\u0647\u0645'] # length two suffixes self.s1 = ['\u0629', '\u0647', '\u064a', '\u0643', '\u062a', '\u0627', '\u0646'] # length one suffixes self.pr4 = {0: ['\u0645'], 1:['\u0627'], 2: ['\u0627', '\u0648', '\u064A'], 3:['\u0629']} # groups of length four patterns self.pr53 = {0: ['\u0627', '\u062a'], 1: ['\u0627', '\u064a', '\u0648'], 2: ['\u0627', '\u062a', '\u0645'], 3: ['\u0645', '\u064a', '\u062a'], 4: ['\u0645', '\u062a'], 5: ['\u0627', '\u0648'], 6: ['\u0627', '\u0645']} # Groups of length five patterns and length three roots self.re_short_vowels = re.compile(r'[\u064B-\u0652]') self.re_hamza = re.compile(r'[\u0621\u0624\u0626]') self.re_intial_hamza = re.compile(r'^[\u0622\u0623\u0625]') self.stop_words = ['\u064a\u0643\u0648\u0646', '\u0648\u0644\u064a\u0633', '\u0648\u0643\u0627\u0646', '\u0643\u0630\u0644\u0643', '\u0627\u0644\u062a\u064a', '\u0648\u0628\u064a\u0646', '\u0639\u0644\u064a\u0647\u0627', '\u0645\u0633\u0627\u0621', '\u0627\u0644\u0630\u064a', '\u0648\u0643\u0627\u0646\u062a', '\u0648\u0644\u0643\u0646', '\u0648\u0627\u0644\u062a\u064a', '\u062a\u0643\u0648\u0646', '\u0627\u0644\u064a\u0648\u0645', '\u0627\u0644\u0644\u0630\u064a\u0646', '\u0639\u0644\u064a\u0647', '\u0643\u0627\u0646\u062a', '\u0644\u0630\u0644\u0643', '\u0623\u0645\u0627\u0645', '\u0647\u0646\u0627\u0643', '\u0645\u0646\u0647\u0627', '\u0645\u0627\u0632\u0627\u0644', '\u0644\u0627\u0632\u0627\u0644', '\u0644\u0627\u064a\u0632\u0627\u0644', '\u0645\u0627\u064a\u0632\u0627\u0644', '\u0627\u0635\u0628\u062d', '\u0623\u0635\u0628\u062d', '\u0623\u0645\u0633\u0649', '\u0627\u0645\u0633\u0649', '\u0623\u0636\u062d\u0649', '\u0627\u0636\u062d\u0649', '\u0645\u0627\u0628\u0631\u062d', '\u0645\u0627\u0641\u062a\u0626', '\u0645\u0627\u0627\u0646\u0641\u0643', '\u0644\u0627\u0633\u064a\u0645\u0627', '\u0648\u0644\u0627\u064a\u0632\u0627\u0644', '\u0627\u0644\u062d\u0627\u0644\u064a', '\u0627\u0644\u064a\u0647\u0627', '\u0627\u0644\u0630\u064a\u0646', '\u0641\u0627\u0646\u0647', '\u0648\u0627\u0644\u0630\u064a', '\u0648\u0647\u0630\u0627', '\u0644\u0647\u0630\u0627', '\u0641\u0643\u0627\u0646', '\u0633\u062a\u0643\u0648\u0646', '\u0627\u0644\u064a\u0647', '\u064a\u0645\u0643\u0646', 
'\u0628\u0647\u0630\u0627', '\u0627\u0644\u0630\u0649'] def stem(self, token): """ Stemming a word token using the ISRI stemmer. """ self.stm = token self.norm(1) # remove diacritics which representing Arabic short vowels if self.stm in self.stop_words: return self.stm # exclude stop words from being processed self.pre32() # remove length three and length two prefixes in this order self.suf32() # remove length three and length two suffixes in this order self.waw() # remove connective ‘و’ if it precedes a word beginning with ‘و’ self.norm(2) # normalize initial hamza to bare alif if len(self.stm) <= 3: return self.stm # return stem if less than or equal to three if len(self.stm) == 4: # length 4 word self.pro_w4() return self.stm elif len(self.stm) == 5: # length 5 word self.pro_w53() self.end_w5() return self.stm elif len(self.stm) == 6: # length 6 word self.pro_w6() self.end_w6() return self.stm elif len(self.stm) == 7: # length 7 word self.suf1() if len(self.stm) == 7: self.pre1() if len(self.stm) == 6: self.pro_w6() self.end_w6() return self.stm return self.stm # if word length >7 , then no stemming def norm(self, num): """ normalization: num=1 normalize diacritics num=2 normalize initial hamza num=3 both 1&2 """ self.k = num if self.k == 1: self.stm = self.re_short_vowels.sub('', self.stm) return self.stm elif self.k == 2: self.stm = self.re_intial_hamza.sub(r'\u0627', self.stm) return self.stm elif self.k == 3: self.stm = self.re_short_vowels.sub('', self.stm) self.stm = self.re_intial_hamza.sub(r'\u0627', self.stm) return self.stm def pre32(self): """remove length three and length two prefixes in this order""" if len(self.stm) >= 6: for pre3 in self.p3: if self.stm.startswith(pre3): self.stm = self.stm[3:] return self.stm elif len(self.stm) >= 5: for pre2 in self.p2: if self.stm.startswith(pre2): self.stm = self.stm[2:] return self.stm def suf32(self): """remove length three and length two suffixes in this order""" if len(self.stm) >= 6: for suf3 in self.s3: if self.stm.endswith(suf3): self.stm = self.stm[:-3] return self.stm elif len(self.stm) >= 5: for suf2 in self.s2: if self.stm.endswith(suf2): self.stm = self.stm[:-2] return self.stm def waw(self): """remove connective ‘و’ if it precedes a word beginning with ‘و’ """ if (len(self.stm) >= 4) & (self.stm[:2] == '\u0648\u0648'): self.stm = self.stm[1:] return self.stm def pro_w4(self): """process length four patterns and extract length three roots""" if self.stm[0] in self.pr4[0]: # مفعل self.stm = self.stm[1:] return self.stm elif self.stm[1] in self.pr4[1]: # فاعل self.stm = self.stm[0] + self.stm[2:] return self.stm elif self.stm[2] in self.pr4[2]: # فعال - فعول - فعيل self.stm = self.stm[:2] + self.stm[3] return self.stm elif self.stm[3] in self.pr4[3]: # فعلة self.stm = self.stm[:-1] return self.stm else: self.suf1() # do - normalize short sufix if len(self.stm) == 4: self.pre1() # do - normalize short prefix return self.stm def pro_w53(self): """process length five patterns and extract length three roots""" if ((self.stm[2] in self.pr53[0]) & (self.stm[0] == '\u0627')): # افتعل - افاعل self.stm = self.stm[1] + self.stm[3:] return self.stm elif ((self.stm[3] in self.pr53[1]) & (self.stm[0] == '\u0645')): # مفعول - مفعال - مفعيل self.stm = self.stm[1:3] + self.stm[4] return self.stm elif ((self.stm[0] in self.pr53[2]) & (self.stm[4] == '\u0629')): # مفعلة - تفعلة - افعلة self.stm = self.stm[1:4] return self.stm elif ((self.stm[0] in self.pr53[3]) & (self.stm[2] == '\u062a')): # مفتعل - يفتعل - تفتعل self.stm = self.stm[1] + 
self.stm[3:] return self.stm elif ((self.stm[0] in self.pr53[4]) & (self.stm[2] == '\u0627')): #مفاعل - تفاعل self.stm = self.stm[1] + self.stm[3:] return self.stm elif ((self.stm[2] in self.pr53[5]) & (self.stm[4] == '\u0629')): # فعولة - فعالة self.stm = self.stm[:2] + self.stm[3] return self.stm elif ((self.stm[0] in self.pr53[6]) & (self.stm[1] == '\u0646')): # انفعل - منفعل self.stm = self.stm[2:] return self.stm elif ((self.stm[3] == '\u0627') & (self.stm[0] == '\u0627')): # افعال self.stm = self.stm[1:3] + self.stm[4] return self.stm elif ((self.stm[4] == '\u0646') & (self.stm[3] == '\u0627')): # فعلان self.stm = self.stm[:3] return self.stm elif ((self.stm[3] == '\u064a') & (self.stm[0] == '\u062a')): # تفعيل self.stm = self.stm[1:3] + self.stm[4] return self.stm elif ((self.stm[3] == '\u0648') & (self.stm[1] == '\u0627')): # فاعول self.stm = self.stm[0] + self.stm[2] + self.stm[4] return self.stm elif ((self.stm[2] == '\u0627') & (self.stm[1] == '\u0648')): # فواعل self.stm = self.stm[0] + self.stm[3:] return self.stm elif ((self.stm[3] == '\u0626') & (self.stm[2] == '\u0627')): # فعائل self.stm = self.stm[:2] + self.stm[4] return self.stm elif ((self.stm[4] == '\u0629') & (self.stm[1] == '\u0627')): # فاعلة self.stm = self.stm[0] + self.stm[2:4] return self.stm elif ((self.stm[4] == '\u064a') & (self.stm[2] == '\u0627')): # فعالي self.stm = self.stm[:2] + self.stm[3] return self.stm else: self.suf1() # do - normalize short sufix if len(self.stm) == 5: self.pre1() # do - normalize short prefix return self.stm def pro_w54(self): """process length five patterns and extract length four roots""" if (self.stm[0] in self.pr53[2]): #تفعلل - افعلل - مفعلل self.stm = self.stm[1:] return self.stm elif (self.stm[4] == '\u0629'): # فعللة self.stm = self.stm[:4] return self.stm elif (self.stm[2] == '\u0627'): # فعالل self.stm = self.stm[:2] + self.stm[3:] return self.stm def end_w5(self): """ending step (word of length five)""" if len(self.stm) == 3: return self.stm elif len(self.stm) == 4: self.pro_w4() return self.stm elif len(self.stm) == 5: self.pro_w54() return self.stm def pro_w6(self): """process length six patterns and extract length three roots""" if ((self.stm.startswith('\u0627\u0633\u062a')) or (self.stm.startswith('\u0645\u0633\u062a'))): # مستفعل - استفعل self.stm = self.stm[3:] return self.stm elif (self.stm[0] == '\u0645' and self.stm[3] == '\u0627' and self.stm[5] == '\u0629'): # مفعالة self.stm = self.stm[1:3] + self.stm[4] return self.stm elif (self.stm[0] == '\u0627' and self.stm[2] == '\u062a' and self.stm[4] == '\u0627'): # افتعال self.stm = self.stm[1] + self.stm[3] + self.stm[5] return self.stm elif (self.stm[0] == '\u0627' and self.stm[3] == '\u0648' and self.stm[2] == self.stm[4]): # افعوعل self.stm = self.stm[1] + self.stm[4:] return self.stm elif (self.stm[0] == '\u062a' and self.stm[2] == '\u0627' and self.stm[4] == '\u064a'): # تفاعيل new pattern self.stm = self.stm[1] + self.stm[3] + self.stm[5] return self.stm else: self.suf1() # do - normalize short sufix if len(self.stm) == 6: self.pre1() # do - normalize short prefix return self.stm def pro_w64(self): """process length six patterns and extract length four roots""" if (self.stm[0] and self.stm[4]) == '\u0627': # افعلال self.stm = self.stm[1:4] + self.stm[5] return self.stm elif (self.stm.startswith('\u0645\u062a')): # متفعلل self.stm = self.stm[2:] return self.stm def end_w6(self): """ending step (word of length six)""" if len(self.stm) == 3: return self.stm elif len(self.stm) == 5: self.pro_w53() 
self.end_w5() return self.stm elif len (self.stm) == 6: self.pro_w64() return self.stm def suf1(self): """normalize short sufix""" for sf1 in self.s1: if self.stm.endswith(sf1): self.stm = self.stm[:-1] return self.stm def pre1(self): """normalize short prefix""" for sp1 in self.p1: if self.stm.startswith(sp1): self.stm = self.stm[1:] return self.stm Whoosh-2.5.7/src/whoosh/lang/lovins.py0000644000076500000240000003054512254366350020004 0ustar mattstaff00000000000000"""This module implements the Lovins stemming algorithm. Use the ``stem()`` function:: stemmed_word = stem(word) """ from collections import defaultdict # Conditions def A(base): # A No restrictions on stem return True def B(base): # B Minimum stem length = 3 return len(base) > 2 def C(base): # C Minimum stem length = 4 return len(base) > 3 def D(base): # D Minimum stem length = 5 return len(base) > 4 def E(base): # E Do not remove ending after e return base[-1] != "e" def F(base): # F Minimum stem length = 3 and do not remove ending after e return len(base) > 2 and base[-1] != "e" def G(base): # G Minimum stem length = 3 and remove ending only after f return len(base) > 2 and base[-1] == "f" def H(base): # H Remove ending only after t or ll c1, c2 = base[-2:] return c2 == "t" or (c2 == "l" and c1 == "l") def I(base): # I Do not remove ending after o or e c = base[-1] return c != "o" and c != "e" def J(base): # J Do not remove ending after a or e c = base[-1] return c != "a" and c != "e" def K(base): # K Minimum stem length = 3 and remove ending only after l, i or u*e c = base[-1] cc = base[-3] return len(base) > 2 and (c == "l" or c == "i" or (c == "e" and cc == "u")) def L(base): # L Do not remove ending after u, x or s, unless s follows o c1, c2 = base[-2:] return c2 != "u" and c2 != "x" and (c2 != "s" or c1 == "o") def M(base): # M Do not remove ending after a, c, e or m c = base[-1] return c != "a" and c != "c" and c != "e" and c != "m" def N(base): # N Minimum stem length = 4 after s**, elsewhere = 3 return len(base) > 3 or (len(base) == 3 and base[-1] != "s") def O(base): # O Remove ending only after l or i c = base[-1] return c == "l" or c == "i" def P(base): # P Do not remove ending after c return base[-1] != "c" def Q(base): # Q Minimum stem length = 3 and do not remove ending after l or n c = base[-1] return len(base) > 2 and (c != "l" and c != "n") def R(base): # R Remove ending only after n or r c = base[-1] return c == "n" or c == "r" def S(base): # S Remove ending only after dr or t, unless t follows t l2 = base[-2] return l2 == "rd" or (base[-1] == "t" and l2 != "tt") def T(base): # T Remove ending only after s or t, unless t follows o c1, c2 = base[-2:] return c2 == "s" or (c2 == "t" and c1 != "o") def U(base): # U Remove ending only after l, m, n or r c = base[-1] return c == "l" or c == "m" or c == "n" or c == "r" def V(base): # V Remove ending only after c return base[-1] == "c" def W(base): # W Do not remove ending after s or u c = base[-1] return c != "s" and c != "u" def X(base): # X Remove ending only after l, i or u*e c = base[-1] cc = base[-3] return c == "l" or c == "i" or (c == "e" and cc == "u") def Y(base): # Y Remove ending only after in return base[-2:] == "in" def Z(base): # Z Do not remove ending after f return base[-1] != "f" def a(base): # a Remove ending only after d, f, ph, th, l, er, or, es or t c = base[-1] l2 = base[-2:] return (c == "d" or c == "f" or l2 == "ph" or l2 == "th" or c == "l" or l2 == "er" or l2 == "or" or l2 == "es" or c == "t") def b(base): # b Minimum stem length = 3 and 
do not remove ending after met or ryst return len(base) > 2 and not (base.endswith("met") or base.endswith("ryst")) def c(base): # c Remove ending only after l return base[-1] == "l" # Endings m = [None] * 12 m[11] = dict(( ("alistically", B), ("arizability", A), ("izationally", B))) m[10] = dict(( ("antialness", A), ("arisations", A), ("arizations", A), ("entialness", A))) m[9] = dict(( ("allically", C), ("antaneous", A), ("antiality", A), ("arisation", A), ("arization", A), ("ationally", B), ("ativeness", A), ("eableness", E), ("entations", A), ("entiality", A), ("entialize", A), ("entiation", A), ("ionalness", A), ("istically", A), ("itousness", A), ("izability", A), ("izational", A))) m[8] = dict(( ("ableness", A), ("arizable", A), ("entation", A), ("entially", A), ("eousness", A), ("ibleness", A), ("icalness", A), ("ionalism", A), ("ionality", A), ("ionalize", A), ("iousness", A), ("izations", A), ("lessness", A))) m[7] = dict(( ("ability", A), ("aically", A), ("alistic", B), ("alities", A), ("ariness", E), ("aristic", A), ("arizing", A), ("ateness", A), ("atingly", A), ("ational", B), ("atively", A), ("ativism", A), ("elihood", E), ("encible", A), ("entally", A), ("entials", A), ("entiate", A), ("entness", A), ("fulness", A), ("ibility", A), ("icalism", A), ("icalist", A), ("icality", A), ("icalize", A), ("ication", G), ("icianry", A), ("ination", A), ("ingness", A), ("ionally", A), ("isation", A), ("ishness", A), ("istical", A), ("iteness", A), ("iveness", A), ("ivistic", A), ("ivities", A), ("ization", F), ("izement", A), ("oidally", A), ("ousness", A))) m[6] = dict(( ("aceous", A), ("acious", B), ("action", G), ("alness", A), ("ancial", A), ("ancies", A), ("ancing", B), ("ariser", A), ("arized", A), ("arizer", A), ("atable", A), ("ations", B), ("atives", A), ("eature", Z), ("efully", A), ("encies", A), ("encing", A), ("ential", A), ("enting", C), ("entist", A), ("eously", A), ("ialist", A), ("iality", A), ("ialize", A), ("ically", A), ("icance", A), ("icians", A), ("icists", A), ("ifully", A), ("ionals", A), ("ionate", D), ("ioning", A), ("ionist", A), ("iously", A), ("istics", A), ("izable", E), ("lessly", A), ("nesses", A), ("oidism", A))) m[5] = dict(( ("acies", A), ("acity", A), ("aging", B), ("aical", A), ("alist", A), ("alism", B), ("ality", A), ("alize", A), ("allic", b), ("anced", B), ("ances", B), ("antic", C), ("arial", A), ("aries", A), ("arily", A), ("arity", B), ("arize", A), ("aroid", A), ("ately", A), ("ating", I), ("ation", B), ("ative", A), ("ators", A), ("atory", A), ("ature", E), ("early", Y), ("ehood", A), ("eless", A), ("elily", A), ("ement", A), ("enced", A), ("ences", A), ("eness", E), ("ening", E), ("ental", A), ("ented", C), ("ently", A), ("fully", A), ("ially", A), ("icant", A), ("ician", A), ("icide", A), ("icism", A), ("icist", A), ("icity", A), ("idine", I), ("iedly", A), ("ihood", A), ("inate", A), ("iness", A), ("ingly", B), ("inism", J), ("inity", c), ("ional", A), ("ioned", A), ("ished", A), ("istic", A), ("ities", A), ("itous", A), ("ively", A), ("ivity", A), ("izers", F), ("izing", F), ("oidal", A), ("oides", A), ("otide", A), ("ously", A))) m[4] = dict(( ("able", A), ("ably", A), ("ages", B), ("ally", B), ("ance", B), ("ancy", B), ("ants", B), ("aric", A), ("arly", K), ("ated", I), ("ates", A), ("atic", B), ("ator", A), ("ealy", Y), ("edly", E), ("eful", A), ("eity", A), ("ence", A), ("ency", A), ("ened", E), ("enly", E), ("eous", A), ("hood", A), ("ials", A), ("ians", A), ("ible", A), ("ibly", A), ("ical", A), ("ides", L), ("iers", A), ("iful", 
A), ("ines", M), ("ings", N), ("ions", B), ("ious", A), ("isms", B), ("ists", A), ("itic", H), ("ized", F), ("izer", F), ("less", A), ("lily", A), ("ness", A), ("ogen", A), ("ward", A), ("wise", A), ("ying", B), ("yish", A))) m[3] = dict(( ("acy", A), ("age", B), ("aic", A), ("als", b), ("ant", B), ("ars", O), ("ary", F), ("ata", A), ("ate", A), ("eal", Y), ("ear", Y), ("ely", E), ("ene", E), ("ent", C), ("ery", E), ("ese", A), ("ful", A), ("ial", A), ("ian", A), ("ics", A), ("ide", L), ("ied", A), ("ier", A), ("ies", P), ("ily", A), ("ine", M), ("ing", N), ("ion", Q), ("ish", C), ("ism", B), ("ist", A), ("ite", a), ("ity", A), ("ium", A), ("ive", A), ("ize", F), ("oid", A), ("one", R), ("ous", A))) m[2] = dict(( ("ae", A), ("al", b), ("ar", X), ("as", B), ("ed", E), ("en", F), ("es", E), ("ia", A), ("ic", A), ("is", A), ("ly", B), ("on", S), ("or", T), ("um", U), ("us", V), ("yl", R), ("s'", A), ("'s", A))) m[1] = dict(( ("a", A), ("e", A), ("i", A), ("o", A), ("s", W), ("y", B))) def remove_ending(word): length = len(word) el = 11 while el > 0: if length - el > 1: ending = word[length - el:] cond = m[el].get(ending) if cond: base = word[:length - el] if cond(base): return base el -= 1 return word _endings = (("iev", "ief"), ("uct", "uc"), ("iev", "ief"), ("uct", "uc"), ("umpt", "um"), ("rpt", "rb"), ("urs", "ur"), ("istr", "ister"), ("metr", "meter"), ("olv", "olut"), ("ul", "l", "aoi"), ("bex", "bic"), ("dex", "dic"), ("pex", "pic"), ("tex", "tic"), ("ax", "ac"), ("ex", "ec"), ("ix", "ic"), ("lux", "luc"), ("uad", "uas"), ("vad", "vas"), ("cid", "cis"), ("lid", "lis"), ("erid", "eris"), ("pand", "pans"), ("end", "ens", "s"), ("ond", "ons"), ("lud", "lus"), ("rud", "rus"), ("her", "hes", "pt"), ("mit", "mis"), ("ent", "ens", "m"), ("ert", "ers"), ("et", "es", "n"), ("yt", "ys"), ("yz", "ys")) # Hash the ending rules by the last letter of the target ending _endingrules = defaultdict(list) for rule in _endings: _endingrules[rule[0][-1]].append(rule) _doubles = frozenset(("dd", "gg", "ll", "mm", "nn", "pp", "rr", "ss", "tt")) def fix_ending(word): if word[-2:] in _doubles: word = word[:-1] for endingrule in _endingrules[word[-1]]: target, newend = endingrule[:2] if word.endswith(target): if len(endingrule) > 2: exceptafter = endingrule[2] c = word[0 - (len(target) + 1)] if c in exceptafter: return word return word[:0 - len(target)] + newend return word def stem(word): """Returns the stemmed version of the argument string. """ return fix_ending(remove_ending(word)) Whoosh-2.5.7/src/whoosh/lang/morph_en.py0000644000076500000240000013674412254366350020311 0ustar mattstaff00000000000000""" Contains the variations() function for expanding an English word into multiple variations by programatically adding and removing suffixes. Translated to Python from the ``com.sun.labs.minion.lexmorph.LiteMorph_en`` class of Sun's `Minion search engine `_. 
""" import re from whoosh.compat import xrange, iteritems # Rule exceptions exceptions = [ "a", "abandoner abandon abandons abandoned abandoning abandonings abandoners", "abdomen abdomens", "about", "above", "acid acids acidic acidity acidities", "across", "act acts acted acting actor actors", "ad ads", "add adds added adding addings addition additions adder adders", "advertise advertises advertised advertising advertiser advertisers advertisement advertisements advertisings", "after", "again", "against", "ago", "all", "almost", "along", "already", "also", "although", "alumna alumnae alumnus alumni", "always", "amen amens", "amidships", "amid amidst", "among amongst", "an", "analysis analyses", "and", "another other others", "antenna antennas antennae", "antitheses antithesis", "any", "anyone anybody", "anything", "appendix appendixes appendices", "apropos", "aquarium aquariums aquaria", "argument arguments argue argues argued arguing arguings arguer arguers", "arise arises arose arisen ariser arisers arising arisings", "around", "as", "asbestos", "at", "atlas atlases", "auger augers augered augering augerings augerer augerers", "augment augments augmented augmenting augmentings augmentation augmentations augmenter augmenters", "automata automaton automatons", "automation automating automate automates automated automatic", "avoirdupois", "awake awakes awoke awaked awoken awaker awakers awaking awakings awakening awakenings", "away", "awful awfully awfulness", "axis axes axises", "bacillus bacilli", "bacterium bacteria", "bad worse worst badly badness", "bas", "bases basis", "bases base based basing basings basely baseness basenesses basement basements baseless basic basics", "be am are is was were been being", "bear bears bore borne bearing bearings bearer bearers", "beat beats beaten beating beatings beater beaters", "because", "become becomes became becoming", "beef beefs beeves beefed beefing", "beer beers", "before", "begin begins began begun beginning beginnings beginner beginners", "behalf behalves", "being beings", "bend bends bent bending bendings bender benders", "bereave bereaves bereaved bereft bereaving bereavings bereavement bereavements", "beside besides", "best bests bested besting", "bet bets betting bettor bettors", "betimes", "between", "beyond", "bid bids bade bidden bidding biddings bidder bidders", "bier biers", "bind binds bound binding bindings binder binders", "bit bits", "bite bites bit bitten biting bitings biter biters", "blackfoot blackfeet", "bleed bleeds bled bleeding bleedings bleeder bleeders", "blow blows blew blown blowing blowings blower blowers", "bookshelf bookshelves", "both", "bound bounds bounded bounding boundings bounder bounders boundless", "bourgeois bourgeoisie", "bra bras", "brahman brahmans", "break breaks broke broken breaking breakings breaker breakers", "breed breeds bred breeding breedings breeder breeders", "bring brings brought bringing bringings bringer bringers", "build builds built building buildings builder builders", "bus buses bused bussed busing bussing busings bussings buser busers busser bussers", "buss busses bussed bussing bussings busser bussers", "but", "buy buys bought buying buyings buyer buyers", "by", "calf calves calved calving calvings calver calvers", "can cans canned canning cannings canner canners", "can could cannot", "canoes canoe canoed canoeing canoeings canoer canoers", "catch catches caught catching catchings catcher catchers", "cement cements cemented cementing cementings cementer cementers", "cent cents", 
"center centers centered centering centerings centerless", "child children childless childish childishly", "choose chooses chose chosen choosing choosings chooser choosers", "cling clings clung clinging clingings clinger clingers", "colloquium colloquia colloquiums", "come comes came coming comings comer comers", "comment comments commented commenting commentings commenter commenters", "compendium compendia compendiums", "complement complements complemented complementing complementings complementer complementers complementary", "compliment compliments complimented complimenting complimentings complimenter complimenters complimentary", "concerto concertos concerti", "condiment condiments", "corps", "cortex cortices cortexes cortical", "couscous", "creep creeps crept creeping creepings creeper creepers creepy", "crisis crises", "criterion criteria criterial", "cryptanalysis cryptanalyses", "curriculum curricula curriculums curricular", "datum data", "day days daily", "deal deals dealt dealing dealings dealer dealers", "decrement decrements decremented decrementing decrementings decrementer decrementers decremental", "deer deers", "demented dementia", "desideratum desiderata", "diagnosis diagnoses diagnose diagnosed diagnosing diagnostic", "dialysis dialyses", "dice dices diced dicing dicings dicer dicers", "die dice", "die dies died dying dyings", "dig digs dug digging diggings digger diggers", "dive dives diver divers dove dived diving divings", "divest divests divester divesters divested divesting divestings divestment divestments", "do does did done doing doings doer doers", "document documents documented documenting documentings documenter documenters documentation documentations documentary", "doe does", "dove doves", "downstairs", "dozen", "draw draws drew drawn drawing drawings drawer drawers", "drink drinks drank drunk drinking drinkings drinker drinkers", "drive drives drove driven driving drivings driver drivers driverless", "due dues duly", "during", "e", "each", "eager eagerer eagerest eagerly eagerness eagernesses", "early earlier earliest", "easement easements", "eat eats ate eaten eating eatings eater eaters", "effluvium effluvia", "either", "element elements elementary", "elf elves elfen", "ellipse ellipses elliptic elliptical elliptically", "ellipsis ellipses elliptic elliptical elliptically", "else", "embolus emboli embolic embolism", "emolument emoluments", "emphasis emphases", "employ employs employed employing employer employers employee employees employment employments employable", "enough", "equilibrium equilibria equilibriums", "erratum errata", "ever", "every", "everything", "exotic exotically exoticness exotica", "experiment experiments experimented experimenting experimentings experimenter experimenters experimentation experimental", "extra extras", "fall falls fell fallen falling fallings faller fallers", "far farther farthest", "fee fees feeless", "feed feeds fed feeding feedings feeder feeders", "feel feels felt feeling feelings feeler feelers", "ferment ferments fermented fermenting fermentings fermentation fermentations fermenter fermenters", "few fewer fewest", "fight fights fought fighting fightings fighter fighters", "figment figments", "filament filaments", "find finds found finding findings finder finders", "firmament firmaments", "flee flees fled fleeing fleeings", "fling flings flung flinging flingings flinger flingers", "floe floes", "fly flies flew flown flying flyings flier fliers flyer flyers", "focus foci focuses focused focusing focusses focussed 
focussing focuser focal", "foment foments fomented fomenting fomentings fomenter fomenters", "foot feet", "foot foots footed footing footer footers", "footing footings footer footers", "for", "forbid forbids forbade forbidden forbidding forbiddings forbidder forbidders", "foresee foresaw foreseen foreseeing foreseeings foreseer foreseers", "forest forests forester foresting forestation forestations", "forget forgets forgot forgotten forgetting forgettings forgetter forgetters forgetful", "forsake forsakes forsook forsaken forsaking forsakings forsaker forsakers", "found founds founded founding foundings founder founders", "fragment fragments fragmented fragmenting fragmentings fragmentation fragmentations fragmenter fragmenters", "free frees freer freest freed freeing freely freeness freenesses", "freeze freezes froze frozen freezing freezings freezer freezers", "from", "full fully fuller fullest", "fuller fullers full fulls fulled fulling fullings", "fungus fungi funguses fungal", "gallows", "ganglion ganglia ganglions ganglionic", "garment garments", "gas gasses gassed gassing gassings gasser gassers", "gas gases gasses gaseous gasless", "gel gels gelled gelling gellings geller gellers", "german germans germanic germany German Germans Germanic Germany", "get gets got gotten getting gettings getter getters", "give gives gave given giving givings giver givers", "gladiolus gladioli gladioluses gladiola gladiolas gladiolae", "glans glandes", "gluiness gluey glue glues glued gluing gluings gluer gluers", "go goes went gone going goings goer goers", "godchild godchildren", "good better best goodly goodness goodnesses", "goods", "goose geese", "goose gooses goosed goosing goosings gooser goosers", "grandchild grandchildren", "grind grinds ground grinding grindings grinder grinders", "ground grounds grounded grounding groundings grounder grounders groundless", "grow grows grew grown growing growings grower growers growth", "gum gums gummed gumming gummings gummer gummers", "half halves", "halve halves halved halving halvings halver halvers", "hang hangs hung hanged hanging hangings hanger hangers", "have has had having havings haver havers", "he him his himself", "hear hears heard hearing hearings hearer hearers", "here", "hide hides hid hidden hiding hidings hider hiders", "hippopotamus hippopotami hippopotamuses", "hold holds held holding holdings holder holders", "honorarium honoraria honorariums", "hoof hoofs hooves hoofed hoofing hoofer hoofers", "how", "hum hums hummed humming hummings hummer hummers", "hymen hymens hymenal", "hypotheses hypothesis hypothesize hypothesizes hypothesized hypothesizer hypothesizing hypothetical hypothetically", "i", "if iffy", "impediment impediments", "implement implements implemented implementing implementings implementation implementations implementer implementers", "imply implies implied implying implyings implier impliers", "in inner", "inclement", "increment increments incremented incrementing incrementings incrementer incrementers incremental incrementally", "index indexes indexed indexing indexings indexer indexers", "index indexes indices indexical indexicals", "indoor indoors", "instrument instruments instrumented instrumenting instrumentings instrumenter instrumenters instrumentation instrumentations instrumental", "integument integumentary", "into", "it its itself", "java", "july julys", "keep keeps kept keeping keepings keeper keepers", "knife knifes knifed knifing knifings knifer knifers", "knife knives", "know knows knew known knowing knowings 
knower knowers knowledge", "lament laments lamented lamenting lamentings lamentation lamentations lamenter lamenters lamentable lamentably", "larva larvae larvas larval", "late later latest lately lateness", "latter latterly", "lay lays laid laying layer layers", "layer layers layered layering layerings", "lead leads led leading leadings leader leaders leaderless", "leaf leafs leafed leafing leafings leafer leafers", "leaf leaves leafless", "leave leaves left leaving leavings leaver leavers", "lend lends lent lending lendings lender lenders", "less lesser least", "let lets letting lettings", "lie lies lay lain lying lier liers", "lie lies lied lying liar liars", "life lives lifeless", "light lights lit lighted lighting lightings lightly lighter lighters lightness lightnesses lightless", "likely likelier likeliest", "limen limens", "lineament lineaments", "liniment liniments", "live alive living", "live lives lived living livings", "liver livers", "loaf loafs loafed loafing loafings loafer loafers", "loaf loaves", "logic logics logical logically", "lose loses lost losing loser losers loss losses", "louse lice", "lumen lumens", "make makes made making makings maker makers", "man mans manned manning mannings", "man men", "manly manlier manliest manliness manful manfulness manhood", "manic manically", "manner manners mannered mannerly mannerless mannerful", "many", "matrix matrices matrixes", "may might", "maximum maxima maximums maximal maximize maximizes maximized maximizing", "mean means meant meaning meanings meaningless meaningful", "mean meaner meanest meanly meanness meannesses", "median medians medianly medial", "medium media mediums", "meet meets met meeting meetings", "memorandum memoranda memorandums", "mere merely", "metal metals metallic", "might mighty mightily", "millenium millennia milleniums millennial", "mine mines mined mining minings miner miners", "mine my our ours", "minimum minima minimums minimal", "minus minuses", "miscellaneous miscellanea miscellaneously miscellaneousness miscellany", "molest molests molested molesting molestings molester molesters", "moment moments", "monument monuments monumental", "more most", "mouse mice mouseless", "much", "multiply multiplies multiplier multipliers multiple multiples multiplying multiplyings multiplication multiplications", "mum mums mummed mumming mummings mummer mummers", "must musts", "neither", "nemeses nemesis", "neurosis neuroses neurotic neurotics", "nomen", "none", "nos no noes", "not", "nothing nothings nothingness", "now", "nowadays", "nucleus nuclei nucleuses nuclear", "number numbers numbered numbering numberings numberless", "nutriment nutriments nutrient nutrients nutrition nutritions", "oasis oases", "octopus octopi octopuses", "of", "off", "offer offers offered offering offerings offerer offerers offeror offerors", "often", "oftentimes", "ointment ointments", "omen omens", "on", "once", "only", "ornament ornaments ornamented ornamenting ornamentings ornamentation ornamenter ornamenters ornamental", "outdoor outdoors", "outlay outlays", "outlie outlies outlay outlied outlain outlying outlier outliers", "ovum ova", "ox oxen", "parentheses parenthesis", "parliament parliaments parliamentary", "passerby passer-by passersby passers-by", "past pasts", "pay pays paid paying payings payer payers payee payees payment payments", "per", "perhaps", "person persons people", "phenomenon phenomena phenomenal", "pi", "picnic picnics picnicker picnickers picnicked picnicking picnickings", "pigment pigments pigmented pigmenting 
pigmentings pigmenter pigmenters pigmentation pigmentations", "please pleases pleased pleasing pleasings pleaser pleasers pleasure pleasures pleasuring pleasurings pleasant pleasantly pleasureless pleasureful", "plus pluses plusses", "polyhedra polyhedron polyhedral", "priest priests priestly priestlier priestliest priestliness priestless", "prognosis prognoses", "prostheses prosthesis", "prove proves proved proving provings proofs proof prover provers provable", "psychosis psychoses psychotic psychotics", "qed", "quiz quizzes quizzed quizzing quizzings quizzer quizzers", "raiment", "rather", "re", "real really", "redo redoes redid redone redoing redoings redoer redoers", "regiment regiments regimented regimenting regimenter regimenters regimentation regimental", "rendezvous", "requiz requizzes requizzed requizzing requizzings requizzer requizzers", "ride rides rode ridden riding ridings rider riders rideless", "ring rings rang rung ringing ringings ringer ringers ringless", "rise rises rose risen rising risings riser risers", "rose roses", "rudiment rudiments rudimentary", "rum rums rummed rumming rummings rummer rummers", "run runs ran running runnings runner runners", "sacrament sacraments sacramental", "same sameness", "sans", "saw saws sawed sawn sawing sawings sawyer sawyers", "say says said saying sayings sayer sayers", "scarf scarfs scarves scarfless", "schema schemata schemas", "sediment sediments sedimentary sedimentation sedimentations", "see sees saw seen seeing seeings seer seers", "seek seeks sought seeking seekings seeker seekers", "segment segments segmented segmenting segmentings segmenter segmenters segmentation segmentations", "self selves selfless", "sell sells sold selling sellings seller sellers", "semen", "send sends sent sending sendings sender senders", "sentiment sentiments sentimental", "series", "set sets setting settings", "several severally", "sew sews sewed sewn sewing sewings sewer sewers", "sewer sewers sewerless", "shake shakes shook shaken shaking shakings shaker shakers", "shall should", "shaman shamans", "shave shaves shaved shaven shaving shavings shaver shavers shaveless", "she her hers herself", "sheaf sheaves sheafless", "sheep", "shelf shelves shelved shelfing shelvings shelver shelvers shelfless", "shine shines shined shone shining shinings shiner shiners shineless", "shoe shoes shoed shod shoeing shoeings shoer shoers shoeless", "shoot shoots shot shooting shootings shooter shooters", "shot shots", "show shows showed shown showing showings shower showers", "shower showers showery showerless", "shrink shrinks shrank shrunk shrinking shrinkings shrinker shrinkers shrinkable", "sideways", "simply simple simpler simplest", "since", "sing sings sang sung singing singings singer singers singable", "sink sinks sank sunk sinking sinkings sinker sinkers sinkable", "sit sits sat sitting sittings sitter sitters", "ski skis skied skiing skiings skier skiers skiless skiable", "sky skies", "slay slays slew slain slaying slayings slayer slayers", "sleep sleeps slept sleeping sleepings sleeper sleepers sleepless", "so", "some", "something", "sometime sometimes", "soon", "spa spas", "speak speaks spoke spoken speaking speakings speaker speakers", "species specie", "spectrum spectra spectrums", "speed speeds sped speeded speeding speedings speeder speeders", "spend spends spent spending spendings spender spenders spendable", "spin spins spun spinning spinnings spinner spinners", "spoke spokes", "spring springs sprang sprung springing springings springer springers 
springy springiness", "staff staffs staves staffed staffing staffings staffer staffers", "stand stands stood standing standings", "stasis stases", "steal steals stole stolen stealing stealings stealer stealers", "stick sticks stuck sticking stickings sticker stickers", "stigma stigmata stigmas stigmatize stigmatizes stigmatized stigmatizing", "stimulus stimuli", "sting stings stung stinging stingings stinger stingers", "stink stinks stank stunk stinking stinkings stinker stinkers", "stomach stomachs", "stratum strata stratums", "stride strides strode stridden striding stridings strider striders", "string strings strung stringing stringings stringer stringers stringless", "strive strives strove striven striving strivings striver strivers", "strum strums strummed strumming strummings strummer strummers strummable", "such", "suffer suffers suffered suffering sufferings sufferer sufferers sufferable", "suggest suggests suggested suggesting suggestings suggester suggesters suggestor suggestors suggestive suggestion suggestions suggestible suggestable", "sum sums summed summing summings summer summers", "summer summers summered summering summerings", "supplement supplements supplemented supplementing supplementings supplementation supplementer supplementers supplementary supplemental", "supply supplies supplied supplying supplyings supplier suppliers", "swear swears swore sworn swearing swearings swearer swearers", "sweep sweeps swept sweeping sweepings sweeper sweepers", "swell swells swelled swollen swelling swellings", "swim swims swam swum swimming swimmings swimmer swimmers swimable", "swine", "swing swings swung swinging swingings swinger swingers", "syllabus syllabi syllabuses", "symposium symposia symposiums", "synapse synapses", "synapsis synapses", "synopsis synopses", "synthesis syntheses", "tableau tableaux tableaus", "take takes took taken taking takings taker takers takable", "teach teaches taught teaching teachings teacher teachers teachable", "tear tears tore torn tearing tearings tearer tearers tearable", "tegument teguments", "tell tells told telling tellings teller tellers tellable", "temperament temperaments temperamental temperamentally", "tenement tenements", "the", "there theres", "theses thesis", "they them their theirs themselves", "thief thieves thieving thievings", "think thinks thought thinking thinker thinkers thinkable", "this that these those", "thought thoughts thougtful thoughtless", "throw throws threw thrown throwing throwings thrower throwers throwable", "tic tics", "tie ties tied tying tyings tier tiers tieable tieless", "tier tiers tiered tiering tierings tierer tierers", "to", "toe toes toed toeing toeings toer toers toeless", "together togetherness", "too", "tooth teeth toothless", "topaz topazes", "torment torments tormented tormenting tormentings tormenter tormenters tormentable", "toward towards", "tread treads trod trodden treading treadings treader treaders", "tread treads treadless retread retreads", "true truly trueness", "two twos", "u", "under", "underlay underlays underlaid underlaying underlayings underlayer underlayers", "underlie underlies underlay underlain underlying underlier underliers", "undo undoes undid undone undoing undoings undoer undoers undoable", "unrest unrestful", "until", "unto", "up", "upon", "upstairs", "use uses user users used using useful useless", "various variously", "vehement vehemently vehemence", "versus", "very", "visit visits visited visiting visitings visitor visitors", "vortex vortexes vortices", "wake wakes woke 
waked woken waking wakings waker wakers wakeful wakefulness wakefulnesses wakeable", "wear wears wore worn wearing wearings wearer wearers wearable", "weather weathers weathered weathering weatherly", "weave weaves wove woven weaving weavings weaver weavers weaveable", "weep weeps wept weeping weepings weeper weepers", "wharf wharfs wharves", "where wheres", "whereas whereases", "whether whethers", "while whiles whilst whiled whiling", "whiz whizzes whizzed whizzing whizzings whizzer whizzers", "who whom whos whose whoses", "why whys", "wife wives wifeless", "will wills willed willing willings willful", "will would", "win wins won winning winnings winner winners winnable", "wind winds wound winding windings winder winders windable", "wind winds windy windless", "with", "within", "without", "wolf wolves", "woman women womanless womanly", "wound wounds wounded wounding woundings", "write writes wrote written writing writings writer writers writeable", "yeses yes", "yet yets", "you your yours yourself" ] _exdict = {} for exlist in exceptions: for ex in exlist.split(" "): _exdict[ex] = exlist # Programmatic rules vowels = "aeiouy" cons = "bcdfghjklmnpqrstvwxyz" rules = ( # Words ending in S # (e.g., happiness, business) (r"[%s].*[%s](iness)" % (vowels, cons), "y,ies,ier,iers,iest,ied,ying,yings,ily,inesses,iment,iments,iless,iful"), # (e.g., baseless, shoeless) (r"[%s].*(eless)" % vowels, "e,es,er,ers,est,ed,ing,ings,eing,eings,ely,eness,enesses,ement,ements,eness,enesses,eful"), # (e.g., gutless, hatless, spotless) (r"[%s][%s][bdgklmnprt]?(less)" % (cons, vowels), ",s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,ful"), # (e.g., thoughtless, worthless) (r"[%s].*?(less)" % vowels, ",s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,ful"), # (e.g., baseness, toeness) (r"[%s].*(eness)" % vowels, "e,es,er,ers,est,ed,ing,ings,eing,eings,ely,enesses,ement,ements,eless,eful"), # (e.g., bluntness, grayness) (r"[%s].*(ness)" % vowels, ",s,er,ers,est,ed,ing,ings,ly,nesses,ment,ments,less,ful"), # (e.g., albatross, kiss) (r"[%s]ss" % vowels, "es,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., joyous, fractious, gaseous) (r"[%s].*(ous)" % vowels, "ly,ness"), # (e.g., tries, unties, jollies, beauties) (r"(ies)", "y,ie,yer,yers,ier,iers,iest,ied,ying,yings,yness,iness,ieness,ynesses,inesses,ienesses,iment,iement,iments,iements,yless,iless,ieless,yful,iful,ieful"), # (e.g., crisis, kinesis) (r"[%s].*(sis)" % vowels, "ses,sises,sisness,sisment,sisments,sisless,sisful"), # (e.g., bronchitis, bursitis) (r"[%s].*(is)" % vowels, "es,ness,ment,ments,less,ful"), (r"[%s].*[cs]h(es)" % vowels, ",e,er,ers,est,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ement,ments,ements,less,eless,ful,eful"), # (e.g., tokenizes) // adds British variations (r"[%s].*[%s](izes)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ise,iser,isers,ised,ising,isings,isation,isations"), # (e.g., tokenises) // British variant // ~expertise (r"[%s].*[%s](ises)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ise,iser,isers,ised,ising,isings,isation,isations"), # (e.g., aches, arches) (r"[%s].*[jsxz](es)" % vowels, ",e,er,ers,est,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ement,ments,ements,less,eless,ful,eful"), # (e.g., judges, abridges) (r"[%s].*dg(es)" % vowels, "e,er,ers,est,ed,ing,ings,ely,eness,enesses,ment,ments,ement,ements,eless,eful"), # (e.g., trees, races, likes, agrees) covers all other -es words (r"e(s)", ",*"), # (e.g., segments, 
bisegments, cosegments) (r"segment(s)", ",*"), # (e.g., pigments, depigments, repigments) (r"pigment(s)", ",*"), # (e.g., judgments, abridgments) (r"[%s].*dg(ments)" % vowels, "ment,*ments"), # (e.g., merriments, embodiments) -iment in turn will generate y and *y (redo y) (r"[%s].*[%s]iment(s)" % (vowels, cons), ",*"), # (e.g., atonements, entrapments) (r"[%s].*ment(s)" % vowels, ",*"), # (e.g., viewers, meters, traders, transfers) (r"[%s].*er(s)" % vowels, ",*"), # (e.g., unflags) polysyllables (r"[%s].*[%s][%s][bdglmnprt](s)" % (vowels, cons, vowels), ",*"), # (e.g., frogs) monosyllables (r"[%s][%s][bdglmnprt](s)" % (vowels, cons), ",*"), # (e.g., killings, muggings) (r"[%s].*ing(s)" % vowels, ",*"), # (e.g., hulls, tolls) (r"[%s].*ll(s)" % vowels, ",*"), # e.g., boas, polkas, spas) don't generate latin endings (r"a(s)", ",er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., beads, toads) (r"[%s].*[%s].*(s)" % (vowels, cons), ",*"), # (e.g., boas, zoos) (r"[%s].*[%s](s)" % (cons, vowels), ",er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., ss, sss, ssss) no vowel (vowel case is already handled above) (r"ss()", ""), # (e.g., cds, lcds, m-16s) no vowel (can be a plural noun, but not verb) (r"[%s].*[%s1234567890](s)" % (cons, cons), ""), # Words ending in E # (e.g., apple, so it doesn't include apply) (r"appl(e)", "es,er,ers,est,ed,ing,ings,ely,eness,enesses,ement,ements,eless,eful"), # (e.g., supple, so it doesn't include supply) (r"suppl(e)", "es,er,ers,est,ed,ing,ings,ely,eness,enesses,ement,ements,eless,eful"), # (e.g., able, abominable, fungible, table, enable, idle, subtle) (r"[%s].*[%s]l(e)" % (vowels, cons), "es,er,ers,est,ed,ing,ings,y,ely,eness,enesses,ement,ements,eless,eful"), # (e.g., bookie, magpie, vie) (r"(ie)", "ies,ier,iers,iest,ied,ying,yings,iely,ieness,ienesses,iement,iements,ieless,ieful"), # (e.g., dye, redye, redeye) (r"ye()", "s,r,rs,st,d,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., judge, abridge) (r"[%s].*dg(e)" % vowels, "es,er,ers,est,ed,ing,ings,ely,eness,enesses,ment,ments,less,ful,ement,ements,eless,eful"), # (e.g., true, due, imbue) (r"u(e)", "es,er,ers,est,ed,ing,ings,eing,eings,ly,ely,eness,enesses,ment,ments,less,ful,ement,ements,eless,eful"), # (e.g., tokenize) // adds British variations (r"[%s].*[%s](ize)" % (vowels, cons), "izes,izer,izers,ized,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"), # (e.g., tokenise) // British variant // ~expertise (r"[%s].*[%s](ise)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ised,ising,isings,isation,isations"), # (e.g., tree, agree, rage, horse, hoarse) (r"[%s].*[%s](e)" % (vowels, cons), "es,er,ers,est,ed,ing,ings,eing,eings,ely,eness,enesses,ement,ements,eless,eful"), # Words ending in -ED # (e.g., agreed, freed, decreed, treed) (r"ree(d)", "ds,der,ders,ded,ding,dings,dly,dness,dnesses,dment,dments,dless,dful,,*"), # (e.g., feed, seed, Xweed) (r"ee(d)", "ds,der,ders,ded,ding,dings,dly,dness,dnesses,dment,dments,dless,dful"), # (e.g., tried) (r"[%s](ied)" % cons, "y,ie,ies,ier,iers,iest,ying,yings,ily,yly,iness,yness,inesses,ynesses,iment,iments,iless,iful,yment,yments,yless,yful"), # (e.g., controlled, fulfilled, rebelled) (r"[%s].*[%s].*l(led)" % (vowels, cons), ",s,er,ers,est,ing,ings,ly,ness,nesses,ment,ments,less,ful,&,&s,&er,&ers,&est,&ing,&ings,&y,&ness,&nesses,&ment,&ments,&ful"), # (e.g., pulled, filled, fulled) (r"[%s].*l(led)" % vowels, 
"&,&s,&er,&ers,&est,&ing,&ings,&y,&ness,&nesses,&ment,&ments,&ful"), # (e.g., hissed, grossed) (r"[%s].*s(sed)" % vowels, "&,&es,&er,&ers,&est,&ing,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful"), # (e.g., hugged, trekked) (r"[%s][%s](?P[bdgklmnprt])((?P=ed1)ed)", ",s,&er,&ers,&est,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., tokenize) // adds British variations (r"[%s].*[%s](ized)" % (vowels, cons), "izes,izer,izers,ize,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"), # (e.g., tokenise) // British variant // ~expertise (r"[%s].*[%s](ized)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ise,ising,isings,isation,isations"), # (e.g., spoiled, tooled, tracked, roasted, atoned, abridged) (r"[%s].*(ed)" % vowels, ",e,s,es,er,ers,est,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ement,ments,ements,less,eless,ful,eful"), # (e.g., bed, sled) words with a single e as the only vowel (r"ed()", "s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"), # Words ending in -ER # (e.g., altimeter, ammeter, odometer, perimeter) (r"meter()", "s,er,ers,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., agreer, beer, budgeteer, engineer, freer) (r"eer()", "eers,eered,eering,eerings,eerly,eerness,eernesses,eerment,eerments,eerless,eerful,ee,ees,eest,eed,eeing,eeings,eely,eeness,eenesses,eement,eements,eeless,eeful,eerer,eerers,eerest"), # (e.g., acidifier, saltier) (r"[%s].*[%s](ier)" % (vowels, cons), "y,ie,ies,iest,ied,ying,yings,ily,yly,iness,yness,inesses,ynesses,yment,yments,yless,yful,iment,iments,iless,iful,iers,iered,iering,ierings,ierly,ierness,iernesses,ierment,ierments,ierless,ierful,ierer,ierers,ierest"), # (e.g., puller, filler, fuller) (r"[%s].*l(ler)" % vowels, "&,&s,&est,&ed,&ing,&ings,ly,lely,&ness,&nesses,&ment,&ments,&ful,&ers,&ered,&ering,&erings,&erly,&erness,&ernesses,&erments,&erless,&erful"), # (e.g., hisser, grosser) (r"[%s].*s(ser)" % vowels, "&,&es,&est,&ed,&ing,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful,&ers,&ered,&ering,&erings,&erly,&erness,&ernesses,&erment,&erments,&erless,&erful"), # (e.g., bigger, trekker, hitter) (r"[%s][%s](?P[bdgkmnprt])((?P=er1)er)" % (cons, vowels), "s,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful,&ers,&ered,&ering,&erings,&erly,&erness,&ernesses,&erments,&erless,&erful"), # (e.g., tokenize) // adds British variations (r"[%s].*[%s](izer)" % (vowels, cons), "izes,ize,izers,ized,izing,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"), # (e.g., tokenise) // British variant // ~expertise (r"[%s].*[%s](iser)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,ise,isers,ised,ising,isings,isation,isations"), #(e.g., actioner, atoner, icer, trader, accruer, churchgoer, prefer) (r"[%s].*(er)" % vowels, ",e,s,es,est,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ments,less,ful,ement,ements,eless,eful,ers,ered,erred,ering,erring,erings,errings,erly,erness,ernesses,erment,erments,erless,erful,erer,erers,erest,errer,errers,errest"), # Words ending in -EST # (e.g., sliest, happiest, wittiest) (r"[%s](iest)" % cons, "y,ies,ier,iers,ied,ying,yings,ily,yly,iness,yness,inesses,ynesses,iment,iments,iless,iful"), # (e.g., fullest) (r"[%s].*l(lest)" % vowels, "&,&s,&er,&ers,&ed,&ing,&ings,ly,&ness,&nesses,&ment,&ments,&ful"), # (e.g., grossest) (r"[%s].*s(sest)" % vowels, "&,&es,&er,&ers,&ed,&ing,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful"), # (e.g., biggest) 
(r"[%s][%s](?P[bdglmnprst])((?P=est1)est)" % (cons, vowels), ",s,&er,&ers,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., basest, archest, rashest) (r"[%s].*([cs]h|[jsxz])(est)" % vowels, "e,es,er,ers,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ments,less,ful,ement,ements,eless,eful,ests,ester,esters,ested,esting,estings,estly,estness,estnesses,estment,estments,estless,estful"), # (e.g., severest, Xinterest, merest) (r"er(est)", "e,es,er,ers,ed,eing,eings,ely,eness,enesses,ement,ements,eless,eful,ests,ester,esters,ested,esting,estings,estly,estness,estnesses,estment,estments,estless,estful"), # (e.g., slickest, coolest, ablest, amplest, protest, quest) (r"[%s].*(est)" % vowels, ",e,s,es,er,ers,ed,ing,ings,ly,ely,ness,eness,nesses,enesses,ment,ments,less,ful,ement,ements,eless,eful,ests,ester,esters,ested,esting,estings,estly,estness,estnesses,estment,estments,estless,estful"), # (e.g., rest, test) (r"est", "s,er,ers,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # Words ending in -FUL # (e.g., beautiful, plentiful) (r"[%s].*[%s](iful)" % (vowels, cons), "ifully,ifulness,*y"), # (e.g., hopeful, sorrowful) (r"[%s].*(ful)" % vowels, "fully,fulness,,*"), # Words ending in -ICAL (r"[%s].*(ical)" % vowels, "ic,ics,ically"), # Words ending in -IC (r"[%s].*(ic)" % vowels, "ics,ical,ically"), # Words ending in -ING # (e.g., dying, crying, supplying) (r"[%s](ying)" % cons, "yings,ie,y,ies,ier,iers,iest,ied,iely,yly,ieness,yness,ienesses,ynesses,iment,iments,iless,iful"), # (e.g., pulling, filling, fulling) (r"[%s].*l(ling)" % vowels, ",*,&,&s,&er,&ers,&est,&ed,&ings,&ness,&nesses,&ment,&ments,&ful"), # (e.g., hissing, grossing, processing) (r"[%s].*s(sing)" % vowels, "&,&s,&er,&ers,&est,&ed,&ings,&ly,&ness,&nesses,&ment,&ments,&less,&ful"), # (e.g., hugging, trekking) (r"[%s][%s](?P[bdgklmnprt])((?P=ing1)ing)" % (cons, vowels), ",s,&er,&ers,&est,&ed,&ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., freeing, agreeing) (r"eeing()", "ee,ees,eer,eers,eest,eed,eeings,eely,eeness,eenesses,eement,eements,eeless,eeful"), # (e.g., ageing, aweing) (r"[%s].*(eing)" % vowels, "e,es,er,ers,est,ed,eings,ely,eness,enesses,ement,ements,eless,eful"), # (e.g., toying, playing) (r"[%s].*y(ing)" % vowels, ",s,er,ers,est,ed,ings,ly,ingly,ness,nesses,ment,ments,less,ful"), # (e.g., editing, crediting, expediting, siting, exciting) (r"[%s].*[%s][eio]t(ing)" % (vowels, cons), ",*,*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), # (e.g., robing, siding, doling, translating, flaking) (r"[%s][%s][bdgklmt](ing)" % (cons, vowels), "*e,ings,inger,ingers,ingest,inged,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), # (e.g., tokenize) // adds British variations (r"[%s].*[%s](izing)" % (vowels, cons), "izes,izer,izers,ized,ize,izings,ization,izations,ise,ises,iser,isers,ised,ising,isings,isation,isations"), # (e.g., tokenise) // British variant // ~expertise (r"[%s].*[%s](ising)" % (vowels, cons), "ize,izes,izer,izers,ized,izing,izings,ization,izations,ises,iser,isers,ised,ise,isings,isation,isations"), # (e.g., icing, aging, achieving, amazing, housing) (r"[%s][cgsvz](ing)" % vowels, "*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), # (e.g., dancing, troubling, arguing, bluing, carving) (r"[%s][clsuv](ing)" % cons, "*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), # (e.g., charging, bulging) (r"[%s].*[lr]g(ing)" % vowels, 
"*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), # (e.g., farming, harping, interesting, bedspring, redwing) (r"[%s].*[%s][bdfjkmnpqrtwxz](ing)" % (vowels, cons), ",*,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), # (e.g., spoiling, reviling, autoing, egging, hanging, hingeing) (r"[%s].*(ing)" % vowels, ",*,*e,ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), # (e.g., wing, thing) monosyllables (r"(ing)", "ings,inger,ingers,ingest,inged,inging,ingings,ingly,ingness,ingnesses,ingment,ingments,ingless,ingful"), # -LEAF rules omitted # Words ending in -MAN # (e.g., policewomen, hatchetmen, dolmen) (r"(man)", "man,mens,mener,meners,menest,mened,mening,menings,menly,menness,mennesses,menless,menful"), # Words ending in -MENT # (e.g., segment, bisegment, cosegment, pigment, depigment, repigment) (r"segment|pigment", "s,ed,ing,ings,er,ers,ly,ness,nesses,less,ful"), # (e.g., judgment, abridgment) (r"[%s].*dg(ment)" % vowels, "*e"), # (e.g., merriment, embodiment) (r"[%s].*[%s](iment)" % (vowels, cons), "*y"), # (e.g., atonement, entrapment) (r"[%s].*[%s](ment)" % (vowels, cons), ",*"), # Words ending in -O # (e.g., taboo, rodeo) (r"[%s]o()" % vowels, "s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., tomato, bonito) (r"[%s].*o()" % vowels, "s,es,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # Words ending in -UM # (e.g., datum, quantum, tedium, strum, [oil]drum, vacuum) (r"[%s].*(um)" % vowels, "a,ums,umer,ummer,umers,ummers,umed,ummed,uming,umming,umings,ummings,umness,umments,umless,umful"), # Words ending in -Y # (e.g., ably, horribly, wobbly) (r"[%s].*b(ly)" % vowels, "le,les,ler,lers,lest,led,ling,lings,leness,lenesses,lement,lements,leless,leful"), # (e.g., happily, dizzily) (r"[%s].*[%s](ily)" % (vowels, cons), "y,ies,ier,iers,iest,ied,ying,yings,yness,iness,ynesses,inesses,iment,iments,iless,iful"), # (e.g., peaceful+ly) (r"[%s].*ful(ly)" % vowels, ",*"), # (e.g., fully, folly, coolly, fatally, dally) (r"[%s].*l(ly)" % vowels, ",*,lies,lier,liers,liest,lied,lying,lyings,liness,linesses,liment,liments,liless,liful,*l"), # (e.g., monopoly, Xcephaly, holy) (r"[%s](ly)" % vowels, "lies,lier,liers,liest,lied,lying,lyings,liness,linesses,liment,liments,liless,liful"), # (e.g., frequently, comely, deeply, apply, badly) (r"[%s].*(ly)" % vowels, ",*,lies,lier,liers,liest,lied,lying,lyings,liness,linesses,lyless,lyful"), # (e.g., happy, ply, spy, cry) (r"[%s](y)" % cons, "ies,ier,iers,iest,ied,ying,yings,ily,yness,iness,ynesses,inesses,iment,iments,iless,iful,yment,yments,yless,yful"), # (e.g., betray, gay, stay) (r"[%s]y()" % vowels, "s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # Root rules # (e.g., fix, arch, rash) (r"[%s].*(ch|sh|[jxz])()" % vowels, "es,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., unflag, open, besot) (r"[%s].*[%s][%s][bdglmnprt]()" % (vowels, cons, vowels), "s,er,ers,est,ed,ing,ings,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., bed, cop) (r"[%s][%s][bdglmnprt]()" % (cons, vowels), "s,&er,&ers,&est,&ed,&ing,&ings,ly,ness,nesses,ment,ments,less,ful"), # (e.g., schemata, automata) (r"[%s].*[%s][%s]ma(ta)" % (vowels, cons, vowels), ",s,tas,tum,tums,ton,tons,tic,tical"), # (e.g., chordata, data, errata, sonata, toccata) (r"[%s].*t(a)" % vowels, "as,ae,um,ums,on,ons,ic,ical"), # (e.g., polka, spa, schema, 
ova, polyhedra) (r"[%s].*[%s](a)" % (vowels, cons), "as,aed,aing,ae,ata,um,ums,on,ons,al,atic,atical"), # (e.g., full) (r"[%s].*ll()" % vowels, "s,er,ers,est,ed,ing,ings,y,ness,nesses,ment,ments,-less,ful"), # (e.g., spoon, rhythm) (r"[%s].*()", "s,er,ers,est,ed,ing,ings,ly,ness,nesses,ment,ments,less,ful"), ) # There are a limited number of named groups available in a single # regular expression, so we'll partition the list of rules into # smaller chunks. _partition_size = 20 _partitions = [] for p in xrange(0, len(rules) // _partition_size + 1): start = p * _partition_size end = (p + 1) * _partition_size pattern = "|".join("(?P<_g%s>%s)$" % (i, r[0]) for i, r in enumerate(rules[start:end])) _partitions.append(re.compile(pattern)) def variations(word): """Given an English word, returns a collection of morphological variations on the word by algorithmically adding and removing suffixes. The variation list may contain non-words (e.g. render -> renderment). >>> variations("pull") set(['pull', 'pullings', 'pullnesses', 'pullful', 'pullment', 'puller', ... ]) """ if word in _exdict: return _exdict[word].split(" ") for i, p in enumerate(_partitions): match = p.search(word) if match: # Get the named group that matched num = int([k for k, v in iteritems(match.groupdict()) if v is not None and k.startswith("_g")][0][2:]) # Get the positional groups for the matched group (all other # positional groups are None) groups = [g for g in match.groups() if g is not None] ending = groups[-1] root = word[:0 - len(ending)] if ending else word out = set((word,)) results = rules[i * _partition_size + num][1] for result in results.split(","): if result.startswith("&"): out.add(root + root[-1] + result[1:]) elif result.startswith("*"): out.union(variations(root + result[1:])) else: out.add(root + result) return set(out) return [word] if __name__ == '__main__': import time t = time.clock() s = variations("rendering") print(time.clock() - t) print(len(s)) Whoosh-2.5.7/src/whoosh/lang/paicehusk.py0000644000076500000240000001520412254366350020441 0ustar mattstaff00000000000000"""This module contains an object that implements the Paice-Husk stemming algorithm. If you just want to use the standard Paice-Husk stemming rules, use the module's ``stem()`` function:: stemmed_word = stem(word) If you want to use a custom rule set, read the rules into a string where the rules are separated by newlines, and instantiate the object with the string, then use the object's stem method to stem words:: stemmer = PaiceHuskStemmer(my_rules_string) stemmed_word = stemmer.stem(word) """ import re from collections import defaultdict class PaiceHuskStemmer(object): """Implements the Paice-Husk stemming algorithm. """ rule_expr = re.compile(r""" ^(?P\w+) (?P[*]?) (?P\d+) (?P\w*) (?P[.>]) """, re.UNICODE | re.VERBOSE) stem_expr = re.compile("^\w+", re.UNICODE) def __init__(self, ruletable): """ :param ruletable: a string containing the rule data, separated by newlines. 
""" self.rules = defaultdict(list) self.read_rules(ruletable) def read_rules(self, ruletable): rule_expr = self.rule_expr rules = self.rules for line in ruletable.split("\n"): line = line.strip() if not line: continue match = rule_expr.match(line) if match: ending = match.group("ending")[::-1] lastchar = ending[-1] intact = match.group("intact") == "*" num = int(match.group("num")) append = match.group("append") cont = match.group("cont") == ">" rules[lastchar].append((ending, intact, num, append, cont)) else: raise Exception("Bad rule: %r" % line) def first_vowel(self, word): vp = min([p for p in [word.find(v) for v in "aeiou"] if p > -1]) yp = word.find("y") if yp > 0 and yp < vp: return yp return vp def strip_prefix(self, word): for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega", "nano", "pico", "pseudo"): if word.startswith(prefix): return word[len(prefix):] return word def stem(self, word): """Returns a stemmed version of the argument string. """ rules = self.rules match = self.stem_expr.match(word) if not match: return word stem = self.strip_prefix(match.group(0)) is_intact = True continuing = True while continuing: pfv = self.first_vowel(stem) rulelist = rules.get(stem[-1]) if not rulelist: break continuing = False for ending, intact, num, append, cont in rulelist: if stem.endswith(ending): if intact and not is_intact: continue newlen = len(stem) - num + len(append) if ((pfv == 0 and newlen < 2) or (pfv > 0 and newlen < 3)): # If word starts with vowel, minimum stem length is 2. # If word starts with consonant, minimum stem length is # 3. continue is_intact = False stem = stem[:0 - num] + append continuing = cont break return stem # The default rules for the Paice-Husk stemming algorithm defaultrules = """ ai*2. { -ia > - if intact } a*1. { -a > - if intact } bb1. { -bb > -b } city3s. { -ytic > -ys } ci2> { -ic > - } cn1t> { -nc > -nt } dd1. { -dd > -d } dei3y> { -ied > -y } deec2ss. { -ceed > -cess } dee1. { -eed > -ee } de2> { -ed > - } dooh4> { -hood > - } e1> { -e > - } feil1v. { -lief > -liev } fi2> { -if > - } gni3> { -ing > - } gai3y. { -iag > -y } ga2> { -ag > - } gg1. { -gg > -g } ht*2. { -th > - if intact } hsiug5ct. { -guish > -ct } hsi3> { -ish > - } i*1. { -i > - if intact } i1y> { -i > -y } ji1d. { -ij > -id -- see nois4j> & vis3j> } juf1s. { -fuj > -fus } ju1d. { -uj > -ud } jo1d. { -oj > -od } jeh1r. { -hej > -her } jrev1t. { -verj > -vert } jsim2t. { -misj > -mit } jn1d. { -nj > -nd } j1s. { -j > -s } lbaifi6. { -ifiabl > - } lbai4y. { -iabl > -y } lba3> { -abl > - } lbi3. { -ibl > - } lib2l> { -bil > -bl } lc1. { -cl > c } lufi4y. { -iful > -y } luf3> { -ful > - } lu2. { -ul > - } lai3> { -ial > - } lau3> { -ual > - } la2> { -al > - } ll1. { -ll > -l } mui3. { -ium > - } mu*2. { -um > - if intact } msi3> { -ism > - } mm1. { -mm > -m } nois4j> { -sion > -j } noix4ct. { -xion > -ct } noi3> { -ion > - } nai3> { -ian > - } na2> { -an > - } nee0. { protect -een } ne2> { -en > - } nn1. { -nn > -n } pihs4> { -ship > - } pp1. { -pp > -p } re2> { -er > - } rae0. { protect -ear } ra2. { -ar > - } ro2> { -or > - } ru2> { -ur > - } rr1. { -rr > -r } rt1> { -tr > -t } rei3y> { -ier > -y } sei3y> { -ies > -y } sis2. { -sis > -s } si2> { -is > - } ssen4> { -ness > - } ss0. { protect -ss } suo3> { -ous > - } su*2. { -us > - if intact } s*1> { -s > - if intact } s0. { -s > -s } tacilp4y. { -plicat > -ply } ta2> { -at > - } tnem4> { -ment > - } tne3> { -ent > - } tna3> { -ant > - } tpir2b. { -ript > -rib } tpro2b. { -orpt > -orb } tcud1. { -duct > -duc } tpmus2. 
{ -sumpt > -sum } tpec2iv. { -cept > -ceiv } tulo2v. { -olut > -olv } tsis0. { protect -sist } tsi3> { -ist > - } tt1. { -tt > -t } uqi3. { -iqu > - } ugo1. { -ogu > -og } vis3j> { -siv > -j } vie0. { protect -eiv } vi2> { -iv > - } ylb1> { -bly > -bl } yli3y> { -ily > -y } ylp0. { protect -ply } yl2> { -ly > - } ygo1. { -ogy > -og } yhp1. { -phy > -ph } ymo1. { -omy > -om } ypo1. { -opy > -op } yti3> { -ity > - } yte3> { -ety > - } ytl2. { -lty > -l } yrtsi5. { -istry > - } yra3> { -ary > - } yro3> { -ory > - } yfi3. { -ify > - } ycn2t> { -ncy > -nt } yca3> { -acy > - } zi2> { -iz > - } zy1s. { -yz > -ys } """ # Make the standard rules available as a module-level function stem = PaiceHuskStemmer(defaultrules).stem Whoosh-2.5.7/src/whoosh/lang/phonetic.py0000644000076500000240000000645112254366350020302 0ustar mattstaff00000000000000#encoding: utf-8 """ This module contains quasi-phonetic encoders for words in different languages. """ import re from whoosh.compat import iteritems # This soundex implementation is adapted from the recipe here: # http://code.activestate.com/recipes/52213/ english_codes = '01230120022455012623010202' def soundex_en(word): # digits holds the soundex values for the alphabet r = "" if word: # Remember first character fc = None prevcode = None for char in word.lower(): c = ord(char) if c >= 97 and c <= 122: # a-z if not fc: fc = char code = english_codes[c - 97] # Don't append the code if it's the same as the previous if code != prevcode: r += code prevcode = code # Replace first digit with first alpha character r = fc + r[1:] return r # Quasi-phonetic coder for Spanish, translated to Python from Sebastian # Ferreyra's version here: # http://www.javalobby.org/java/forums/t16936.html _esp_codes = (("\\Aw?[uh]?([aeiou])", ""), ("c[eiéí]|z|ll|sh|ch|sch|cc|y[aeiouáéíóú]|ps|bs|x|j|g[eiéí]", "s"), ("[aeiouhwáéíóúü]+", ""), ("y", ""), ("ñ|gn", "n"), ("[dpc]t", "t"), ("c[aouáóú]|ck|q", "k"), ("v", "b"), ("d$", "t"), # Change a trailing d to a t ) _esp_codes = tuple((re.compile(pat), repl) for pat, repl in _esp_codes) def soundex_esp(word): word = word.lower() r = "" prevcode = None i = 0 while i < len(word): code = None for expr, ecode in _esp_codes: match = expr.match(word, i) if match: i = match.end() code = ecode break if code is None: code = word[i] i += 1 if code != prevcode: r += code prevcode = code return r # This version of soundex for Arabic is translated to Python from Tammam # Koujan's C# version here: # http://www.codeproject.com/KB/recipes/ArabicSoundex.aspx # Create a dictionary mapping arabic characters to digits _arabic_codes = {} for chars, code in iteritems({'\u0627\u0623\u0625\u0622\u062d\u062e\u0647\u0639\u063a\u0634\u0648\u064a': "0", '\u0641\u0628': "1", '\u062c\u0632\u0633\u0635\u0638\u0642\u0643': "2", '\u062a\u062b\u062f\u0630\u0636\u0637': "3", '\u0644': "4", '\u0645\u0646': "5", '\u0631': "6", }): for char in chars: _arabic_codes[char] = code def soundex_ar(word): if word[0] in "\u0627\u0623\u0625\u0622": word = word[1:] r = "0" prevcode = "0" if len(word) > 1: # Discard the first character for char in word[1:]: if char in _arabic_codes: code = _arabic_codes.get(char, "0") # Don't append the code if it's the same as the previous if code != prevcode: # If the code is a 0 (vowel), don't process it if code != "0": r += code prevcode = code return r Whoosh-2.5.7/src/whoosh/lang/porter.py0000755000076500000240000001030412254366350017777 0ustar mattstaff00000000000000""" Reimplementation of the `Porter stemming algorithm `_ in Python. 
In my quick tests, this implementation about 3.5 times faster than the seriously weird Python linked from the official page. """ import re # Suffix replacement lists _step2list = { "ational": "ate", "tional": "tion", "enci": "ence", "anci": "ance", "izer": "ize", "bli": "ble", "alli": "al", "entli": "ent", "eli": "e", "ousli": "ous", "ization": "ize", "ation": "ate", "ator": "ate", "alism": "al", "iveness": "ive", "fulness": "ful", "ousness": "ous", "aliti": "al", "iviti": "ive", "biliti": "ble", "logi": "log", } _step3list = { "icate": "ic", "ative": "", "alize": "al", "iciti": "ic", "ical": "ic", "ful": "", "ness": "", } _cons = "[^aeiou]" _vowel = "[aeiouy]" _cons_seq = "[^aeiouy]+" _vowel_seq = "[aeiou]+" # m > 0 _mgr0 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq) # m == 0 _meq1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + "(" + _vowel_seq + ")?$") # m > 1 _mgr1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + _vowel_seq + _cons_seq) # vowel in stem _s_v = re.compile("^(" + _cons_seq + ")?" + _vowel) # ??? _c_v = re.compile("^" + _cons_seq + _vowel + "[^aeiouwxy]$") # Patterns used in the rules _ed_ing = re.compile("^(.*)(ed|ing)$") _at_bl_iz = re.compile("(at|bl|iz)$") _step1b = re.compile("([^aeiouylsz])\\1$") _step2 = re.compile("^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$") _step3 = re.compile("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$") _step4_1 = re.compile("^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$") _step4_2 = re.compile("^(.+?)(s|t)(ion)$") _step5 = re.compile("^(.+?)e$") # Stemming function def stem(w): """Uses the Porter stemming algorithm to remove suffixes from English words. >>> stem("fundamentally") "fundament" """ if len(w) < 3: return w first_is_y = w[0] == "y" if first_is_y: w = "Y" + w[1:] # Step 1a if w.endswith("s"): if w.endswith("sses"): w = w[:-2] elif w.endswith("ies"): w = w[:-2] elif w[-2] != "s": w = w[:-1] # Step 1b if w.endswith("eed"): s = w[:-3] if _mgr0.match(s): w = w[:-1] else: m = _ed_ing.match(w) if m: stem = m.group(1) if _s_v.match(stem): w = stem if _at_bl_iz.match(w): w += "e" elif _step1b.match(w): w = w[:-1] elif _c_v.match(w): w += "e" # Step 1c if w.endswith("y"): stem = w[:-1] if _s_v.match(stem): w = stem + "i" # Step 2 m = _step2.match(w) if m: stem = m.group(1) suffix = m.group(2) if _mgr0.match(stem): w = stem + _step2list[suffix] # Step 3 m = _step3.match(w) if m: stem = m.group(1) suffix = m.group(2) if _mgr0.match(stem): w = stem + _step3list[suffix] # Step 4 m = _step4_1.match(w) if m: stem = m.group(1) if _mgr1.match(stem): w = stem else: m = _step4_2.match(w) if m: stem = m.group(1) + m.group(2) if _mgr1.match(stem): w = stem # Step 5 m = _step5.match(w) if m: stem = m.group(1) if _mgr1.match(stem) or (_meq1.match(stem) and not _c_v.match(stem)): w = stem if w.endswith("ll") and _mgr1.match(w): w = w[:-1] if first_is_y: w = "y" + w[1:] return w if __name__ == '__main__': print(stem("fundamentally")) Whoosh-2.5.7/src/whoosh/lang/porter2.py0000644000076500000240000002017212254366350020062 0ustar mattstaff00000000000000"""An implementation of the Porter2 stemming algorithm. See http://snowball.tartarus.org/algorithms/english/stemmer.html Adapted from pyporter2 by Michael Dirolf. This algorithm is more correct but (at least in this implementation) several times slower than the original porter algorithm as implemented in stemming.porter. 
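A quick example (illustrative):

>>> stem("running")
"run"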
""" import re r_exp = re.compile(r"[^aeiouy]*[aeiouy]+[^aeiouy](\w*)") ewss_exp1 = re.compile(r"^[aeiouy][^aeiouy]$") ewss_exp2 = re.compile(r".*[^aeiouy][aeiouy][^aeiouywxY]$") ccy_exp = re.compile(r"([aeiouy])y") s1a_exp = re.compile(r"[aeiouy].") s1b_exp = re.compile(r"[aeiouy]") def get_r1(word): # exceptional forms if word.startswith('gener') or word.startswith('arsen'): return 5 if word.startswith('commun'): return 6 # normal form match = r_exp.match(word) if match: return match.start(1) return len(word) def get_r2(word): match = r_exp.match(word, get_r1(word)) if match: return match.start(1) return len(word) def ends_with_short_syllable(word): if len(word) == 2: if ewss_exp1.match(word): return True if ewss_exp2.match(word): return True return False def is_short_word(word): if ends_with_short_syllable(word): if get_r1(word) == len(word): return True return False def remove_initial_apostrophe(word): if word.startswith("'"): return word[1:] return word def capitalize_consonant_ys(word): if word.startswith('y'): word = 'Y' + word[1:] return ccy_exp.sub('\g<1>Y', word) def step_0(word): if word.endswith("'s'"): return word[:-3] if word.endswith("'s"): return word[:-2] if word.endswith("'"): return word[:-1] return word def step_1a(word): if word.endswith('sses'): return word[:-4] + 'ss' if word.endswith('ied') or word.endswith('ies'): if len(word) > 4: return word[:-3] + 'i' else: return word[:-3] + 'ie' if word.endswith('us') or word.endswith('ss'): return word if word.endswith('s'): preceding = word[:-1] if s1a_exp.search(preceding): return preceding return word return word doubles = ('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt') def ends_with_double(word): for double in doubles: if word.endswith(double): return True return False def step_1b_helper(word): if word.endswith('at') or word.endswith('bl') or word.endswith('iz'): return word + 'e' if ends_with_double(word): return word[:-1] if is_short_word(word): return word + 'e' return word s1b_suffixes = ('ed', 'edly', 'ing', 'ingly') def step_1b(word, r1): if word.endswith('eedly'): if len(word) - 5 >= r1: return word[:-3] return word if word.endswith('eed'): if len(word) - 3 >= r1: return word[:-1] return word for suffix in s1b_suffixes: if word.endswith(suffix): preceding = word[:-len(suffix)] if s1b_exp.search(preceding): return step_1b_helper(preceding) return word return word def step_1c(word): if word.endswith('y') or word.endswith('Y') and len(word) > 1: if word[-2] not in 'aeiouy': if len(word) > 2: return word[:-1] + 'i' return word def step_2_helper(word, r1, end, repl, prev): if word.endswith(end): if len(word) - len(end) >= r1: if prev == []: return word[:-len(end)] + repl for p in prev: if word[:-len(end)].endswith(p): return word[:-len(end)] + repl return word return None s2_triples = (('ization', 'ize', []), ('ational', 'ate', []), ('fulness', 'ful', []), ('ousness', 'ous', []), ('iveness', 'ive', []), ('tional', 'tion', []), ('biliti', 'ble', []), ('lessli', 'less', []), ('entli', 'ent', []), ('ation', 'ate', []), ('alism', 'al', []), ('aliti', 'al', []), ('ousli', 'ous', []), ('iviti', 'ive', []), ('fulli', 'ful', []), ('enci', 'ence', []), ('anci', 'ance', []), ('abli', 'able', []), ('izer', 'ize', []), ('ator', 'ate', []), ('alli', 'al', []), ('bli', 'ble', []), ('ogi', 'og', ['l']), ('li', '', ['c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'])) def step_2(word, r1): for trip in s2_triples: attempt = step_2_helper(word, r1, trip[0], trip[1], trip[2]) if attempt: return attempt return word def 
step_3_helper(word, r1, r2, end, repl, r2_necessary): if word.endswith(end): if len(word) - len(end) >= r1: if not r2_necessary: return word[:-len(end)] + repl else: if len(word) - len(end) >= r2: return word[:-len(end)] + repl return word return None s3_triples = (('ational', 'ate', False), ('tional', 'tion', False), ('alize', 'al', False), ('icate', 'ic', False), ('iciti', 'ic', False), ('ative', '', True), ('ical', 'ic', False), ('ness', '', False), ('ful', '', False)) def step_3(word, r1, r2): for trip in s3_triples: attempt = step_3_helper(word, r1, r2, trip[0], trip[1], trip[2]) if attempt: return attempt return word s4_delete_list = ('al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement', 'ment', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize') def step_4(word, r2): for end in s4_delete_list: if word.endswith(end): if len(word) - len(end) >= r2: return word[:-len(end)] return word if word.endswith('sion') or word.endswith('tion'): if len(word) - 3 >= r2: return word[:-3] return word def step_5(word, r1, r2): if word.endswith('l'): if len(word) - 1 >= r2 and word[-2] == 'l': return word[:-1] return word if word.endswith('e'): if len(word) - 1 >= r2: return word[:-1] if len(word) - 1 >= r1 and not ends_with_short_syllable(word[:-1]): return word[:-1] return word def normalize_ys(word): return word.replace('Y', 'y') exceptional_forms = {'skis': 'ski', 'skies': 'sky', 'dying': 'die', 'lying': 'lie', 'tying': 'tie', 'idly': 'idl', 'gently': 'gentl', 'ugly': 'ugli', 'early': 'earli', 'only': 'onli', 'singly': 'singl', 'sky': 'sky', 'news': 'news', 'howe': 'howe', 'atlas': 'atlas', 'cosmos': 'cosmos', 'bias': 'bias', 'andes': 'andes'} exceptional_early_exit_post_1a = frozenset(['inning', 'outing', 'canning', 'herring', 'earring', 'proceed', 'exceed', 'succeed']) def stem(word): if len(word) <= 2: return word word = remove_initial_apostrophe(word) # handle some exceptional forms if word in exceptional_forms: return exceptional_forms[word] word = capitalize_consonant_ys(word) r1 = get_r1(word) r2 = get_r2(word) word = step_0(word) word = step_1a(word) # handle some more exceptional forms if word in exceptional_early_exit_post_1a: return word word = step_1b(word, r1) word = step_1c(word) word = step_2(word, r1) word = step_3(word, r1, r2) word = step_4(word, r2) word = step_5(word, r1, r2) word = normalize_ys(word) return word Whoosh-2.5.7/src/whoosh/lang/snowball/0000755000076500000240000000000012277504634017736 5ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/lang/snowball/__init__.py0000644000076500000240000000507612254366350022053 0ustar mattstaff00000000000000# Copyright (C) 2001-2012 NLTK Project # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# Natural Language Toolkit: Snowball Stemmer # # Copyright (C) 2001-2012 NLTK Project # Author: Peter Michael Stahl # Peter Ljunglof (revisions) # Algorithms: Dr Martin Porter # URL: # For license information, see LICENSE.TXT # HJ 2012/07/19 adapted from https://github.com/kmike/nltk.git (branch 2and3) # 2.0.1rc4-256-g45768f8 """ This module provides a port of the Snowball stemmers developed by Martin Porter. At the moment, this port is able to stem words from fourteen languages: Danish, Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish and Swedish. The algorithms have been developed by Martin Porter. These stemmers are called Snowball, because he invented a programming language with this name for creating new stemming algorithms. There is more information available at http://snowball.tartarus.org/ """ from .danish import DanishStemmer from .dutch import DutchStemmer from .english import EnglishStemmer from .finnish import FinnishStemmer from .french import FrenchStemmer from .german import GermanStemmer from .hungarian import HungarianStemmer from .italian import ItalianStemmer from .norwegian import NorwegianStemmer from .portugese import PortugueseStemmer from .romanian import RomanianStemmer from .russian import RussianStemmer from .spanish import SpanishStemmer from .swedish import SwedishStemmer # Map two-letter codes to stemming classes classes = {"da": DanishStemmer, "nl": DutchStemmer, "en": EnglishStemmer, "fi": FinnishStemmer, "fr": FrenchStemmer, "de": GermanStemmer, "hu": HungarianStemmer, "it": ItalianStemmer, "no": NorwegianStemmer, "pt": PortugueseStemmer, "ro": RomanianStemmer, "ru": RussianStemmer, "es": SpanishStemmer, "sv": SwedishStemmer, } Whoosh-2.5.7/src/whoosh/lang/snowball/bases.py0000644000076500000240000001141212254366350021400 0ustar mattstaff00000000000000# Base classes class _ScandinavianStemmer(object): """ This subclass encapsulates a method for defining the string region R1. It is used by the Danish, Norwegian, and Swedish stemmer. """ def _r1_scandinavian(self, word, vowels): """ Return the region R1 that is used by the Scandinavian stemmers. R1 is the region after the first non-vowel following a vowel, or is the null region at the end of the word if there is no such non-vowel. But then R1 is adjusted so that the region before it contains at least three letters. :param word: The word whose region R1 is determined. :type word: str or unicode :param vowels: The vowels of the respective language that are used to determine the region R1. :type vowels: unicode :return: the region R1 for the respective word. :rtype: unicode :note: This helper method is invoked by the respective stem method of the subclasses DanishStemmer, NorwegianStemmer, and SwedishStemmer. It is not to be invoked directly! """ r1 = "" for i in range(1, len(word)): if word[i] not in vowels and word[i - 1] in vowels: if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0: r1 = word[3:] elif len(word[:i + 1]) >= 3: r1 = word[i + 1:] else: return word break return r1 class _StandardStemmer(object): """ This subclass encapsulates two methods for defining the standard versions of the string regions R1, R2, and RV. """ def _r1r2_standard(self, word, vowels): """ Return the standard interpretations of the string regions R1 and R2. R1 is the region after the first non-vowel following a vowel, or is the null region at the end of the word if there is no such non-vowel. 
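(For example, with the English vowels "aeiouy", the word "beautiful" has R1 = "iful".)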
R2 is the region after the first non-vowel following a vowel in R1, or is the null region at the end of the word if there is no such non-vowel. :param word: The word whose regions R1 and R2 are determined. :type word: str or unicode :param vowels: The vowels of the respective language that are used to determine the regions R1 and R2. :type vowels: unicode :return: (r1,r2), the regions R1 and R2 for the respective word. :rtype: tuple :note: This helper method is invoked by the respective stem method of the subclasses DutchStemmer, FinnishStemmer, FrenchStemmer, GermanStemmer, ItalianStemmer, PortugueseStemmer, RomanianStemmer, and SpanishStemmer. It is not to be invoked directly! :note: A detailed description of how to define R1 and R2 can be found at http://snowball.tartarus.org/texts/r1r2.html """ r1 = "" r2 = "" for i in range(1, len(word)): if word[i] not in vowels and word[i - 1] in vowels: r1 = word[i + 1:] break for i in range(1, len(r1)): if r1[i] not in vowels and r1[i - 1] in vowels: r2 = r1[i + 1:] break return (r1, r2) def _rv_standard(self, word, vowels): """ Return the standard interpretation of the string region RV. If the second letter is a consonant, RV is the region after the next following vowel. If the first two letters are vowels, RV is the region after the next following consonant. Otherwise, RV is the region after the third letter. :param word: The word whose region RV is determined. :type word: str or unicode :param vowels: The vowels of the respective language that are used to determine the region RV. :type vowels: unicode :return: the region RV for the respective word. :rtype: unicode :note: This helper method is invoked by the respective stem method of the subclasses ItalianStemmer, PortugueseStemmer, RomanianStemmer, and SpanishStemmer. It is not to be invoked directly! """ rv = "" if len(word) >= 2: if word[1] not in vowels: for i in range(2, len(word)): if word[i] in vowels: rv = word[i + 1:] break elif word[:2] in vowels: for i in range(2, len(word)): if word[i] not in vowels: rv = word[i + 1:] break else: rv = word[3:] return rv Whoosh-2.5.7/src/whoosh/lang/snowball/danish.py0000644000076500000240000001002012254366350021543 0ustar mattstaff00000000000000from .bases import _ScandinavianStemmer from whoosh.compat import u class DanishStemmer(_ScandinavianStemmer): """ The Danish Snowball stemmer. :cvar __vowels: The Danish vowels. :type __vowels: unicode :cvar __consonants: The Danish consonants. :type __consonants: unicode :cvar __double_consonants: The Danish double consonants. :type __double_consonants: tuple :cvar __s_ending: Letters that may directly appear before a word final 's'. :type __s_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the Danish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/danish/stemmer.html """ # The language's vowels and other important characters are defined. 
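# (The \xE6, \xE5 and \xF8 escapes below are the Danish vowels usually transliterated as ae, aa and oe.)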
__vowels = u("aeiouy\xE6\xE5\xF8") __consonants = "bcdfghjklmnpqrstvwxz" __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj", "kk", "ll", "mm", "nn", "pp", "qq", "rr", "ss", "tt", "vv", "ww", "xx", "zz") __s_ending = u("abcdfghjklmnoprtvyz\xE5") # The different suffixes, divided into the algorithm's steps # and organized by length, are listed in tuples. __step1_suffixes = ("erendes", "erende", "hedens", "ethed", "erede", "heden", "heder", "endes", "ernes", "erens", "erets", "ered", "ende", "erne", "eren", "erer", "heds", "enes", "eres", "eret", "hed", "ene", "ere", "ens", "ers", "ets", "en", "er", "es", "et", "e", "s") __step2_suffixes = ("gd", "dt", "gt", "kt") __step3_suffixes = ("elig", u("l\xF8st"), "lig", "els", "ig") def stem(self, word): """ Stem a Danish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ # Every word is put into lower case for normalization. word = word.lower() # After this, the required regions are generated # by the respective helper method. r1 = self._r1_scandinavian(word, self.__vowels) # Then the actual stemming process starts. # Every new step is explicitly indicated # according to the descriptions on the Snowball website. # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "s": if word[-2] in self.__s_ending: word = word[:-1] r1 = r1[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): word = word[:-1] r1 = r1[:-1] break # STEP 3 if r1.endswith("igst"): word = word[:-2] r1 = r1[:-2] for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix == u("l\xF8st"): word = word[:-1] r1 = r1[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] if r1.endswith(self.__step2_suffixes): word = word[:-1] r1 = r1[:-1] break # STEP 4: Undouble for double_cons in self.__double_consonants: if word.endswith(double_cons) and len(word) > 3: word = word[:-1] break return word Whoosh-2.5.7/src/whoosh/lang/snowball/dutch.py0000644000076500000240000001406212254366350021416 0ustar mattstaff00000000000000from .bases import _StandardStemmer from whoosh.compat import u class DutchStemmer(_StandardStemmer): """ The Dutch Snowball stemmer. :cvar __vowels: The Dutch vowels. :type __vowels: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm. :type __step3b_suffixes: tuple :note: A detailed description of the Dutch stemming algorithm can be found under http://snowball.tartarus.org/algorithms/dutch/stemmer.html """ __vowels = u("aeiouy\xE8") __step1_suffixes = ("heden", "ene", "en", "se", "s") __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig") def stem(self, word): """ Stem a Dutch word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() step2_success = False # Vowel accents are removed. word = (word.replace(u("\xE4"), "a").replace(u("\xE1"), "a") .replace(u("\xEB"), "e").replace(u("\xE9"), "e") .replace(u("\xED"), "i").replace(u("\xEF"), "i") .replace(u("\xF6"), "o").replace(u("\xF3"), "o") .replace(u("\xFC"), "u").replace(u("\xFA"), "u")) # An initial 'y', a 'y' after a vowel, # and an 'i' between self.__vowels is put into upper case. # As from now these are treated as consonants. 
if word.startswith("y"): word = "".join(("Y", word[1:])) for i in range(1, len(word)): if word[i - 1] in self.__vowels and word[i] == "y": word = "".join((word[:i], "Y", word[i + 1:])) for i in range(1, len(word) - 1): if (word[i - 1] in self.__vowels and word[i] == "i" and word[i + 1] in self.__vowels): word = "".join((word[:i], "I", word[i + 1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) # R1 is adjusted so that the region before it # contains at least 3 letters. for i in range(1, len(word)): if word[i] not in self.__vowels and word[i - 1] in self.__vowels: if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0: r1 = word[3:] elif len(word[:i + 1]) == 0: return word break # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "heden": word = "".join((word[:-5], "heid")) r1 = "".join((r1[:-5], "heid")) if r2.endswith("heden"): r2 = "".join((r2[:-5], "heid")) elif (suffix in ("ene", "en") and not word.endswith("heden") and word[-len(suffix) - 1] not in self.__vowels and word[-len(suffix) - 3:-len(suffix)] != "gem"): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] if word.endswith(("kk", "dd", "tt")): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif (suffix in ("se", "s") and word[-len(suffix) - 1] not in self.__vowels and word[-len(suffix) - 1] != "j"): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 2 if r1.endswith("e") and word[-2] not in self.__vowels: step2_success = True word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] if word.endswith(("kk", "dd", "tt")): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] # STEP 3a if r2.endswith("heid") and word[-5] != "c": word = word[:-4] r1 = r1[:-4] r2 = r2[:-4] if (r1.endswith("en") and word[-3] not in self.__vowels and word[-5:-2] != "gem"): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] if word.endswith(("kk", "dd", "tt")): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] # STEP 3b: Derivational suffixes for suffix in self.__step3b_suffixes: if r2.endswith(suffix): if suffix in ("end", "ing"): word = word[:-3] r2 = r2[:-3] if r2.endswith("ig") and word[-3] != "e": word = word[:-2] else: if word.endswith(("kk", "dd", "tt")): word = word[:-1] elif suffix == "ig" and word[-3] != "e": word = word[:-2] elif suffix == "lijk": word = word[:-4] r1 = r1[:-4] if r1.endswith("e") and word[-2] not in self.__vowels: word = word[:-1] if word.endswith(("kk", "dd", "tt")): word = word[:-1] elif suffix == "baar": word = word[:-4] elif suffix == "bar" and step2_success: word = word[:-3] break # STEP 4: Undouble vowel if len(word) >= 4: if word[-1] not in self.__vowels and word[-1] != "I": if word[-3:-1] in ("aa", "ee", "oo", "uu"): if word[-4] not in self.__vowels: word = "".join((word[:-3], word[-3], word[-1])) # All occurrences of 'I' and 'Y' are put back into lower case. word = word.replace("I", "i").replace("Y", "y") return word Whoosh-2.5.7/src/whoosh/lang/snowball/english.py0000644000076500000240000004170312254366350021742 0ustar mattstaff00000000000000from .bases import _StandardStemmer from whoosh.compat import u class EnglishStemmer(_StandardStemmer): """ The English Snowball stemmer. :cvar __vowels: The English vowels. :type __vowels: unicode :cvar __double_consonants: The English double consonants. :type __double_consonants: tuple :cvar __li_ending: Letters that may directly appear before a word final 'li'. :type __li_ending: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. 
:type __step0_suffixes: tuple :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm. :type __step1a_suffixes: tuple :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm. :type __step1b_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. :type __step5_suffixes: tuple :cvar __special_words: A dictionary containing words which have to be stemmed specially. :type __special_words: dict :note: A detailed description of the English stemming algorithm can be found under http://snowball.tartarus.org/algorithms/english/stemmer.html """ __vowels = "aeiouy" __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") __li_ending = "cdeghkmnrt" __step0_suffixes = ("'s'", "'s", "'") __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s") __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed") __step2_suffixes = ('ization', 'ational', 'fulness', 'ousness', 'iveness', 'tional', 'biliti', 'lessli', 'entli', 'ation', 'alism', 'aliti', 'ousli', 'iviti', 'fulli', 'enci', 'anci', 'abli', 'izer', 'ator', 'alli', 'bli', 'ogi', 'li') __step3_suffixes = ('ational', 'tional', 'alize', 'icate', 'iciti', 'ative', 'ical', 'ness', 'ful') __step4_suffixes = ('ement', 'ance', 'ence', 'able', 'ible', 'ment', 'ant', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize', 'ion', 'al', 'er', 'ic') __step5_suffixes = ("e", "l") __special_words = {"skis": "ski", "skies": "sky", "dying": "die", "lying": "lie", "tying": "tie", "idly": "idl", "gently": "gentl", "ugly": "ugli", "early": "earli", "only": "onli", "singly": "singl", "sky": "sky", "news": "news", "howe": "howe", "atlas": "atlas", "cosmos": "cosmos", "bias": "bias", "andes": "andes", "inning": "inning", "innings": "inning", "outing": "outing", "outings": "outing", "canning": "canning", "cannings": "canning", "herring": "herring", "herrings": "herring", "earring": "earring", "earrings": "earring", "proceed": "proceed", "proceeds": "proceed", "proceeded": "proceed", "proceeding": "proceed", "exceed": "exceed", "exceeds": "exceed", "exceeded": "exceed", "exceeding": "exceed", "succeed": "succeed", "succeeds": "succeed", "succeeded": "succeed", "succeeding": "succeed"} def stem(self, word): """ Stem an English word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() if word in self.__special_words: return self.__special_words[word] # Map the different apostrophe characters to a single consistent one word = (word.replace(u("\u2019"), u("\x27")) .replace(u("\u2018"), u("\x27")) .replace(u("\u201B"), u("\x27"))) if word.startswith(u("\x27")): word = word[1:] if word.startswith("y"): word = "".join(("Y", word[1:])) for i in range(1, len(word)): if word[i - 1] in self.__vowels and word[i] == "y": word = "".join((word[:i], "Y", word[i + 1:])) step1a_vowel_found = False step1b_vowel_found = False r1 = "" r2 = "" if word.startswith(("gener", "commun", "arsen")): if word.startswith(("gener", "arsen")): r1 = word[5:] else: r1 = word[6:] for i in range(1, len(r1)): if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels: r2 = r1[i + 1:] break else: r1, r2 = self._r1r2_standard(word, self.__vowels) # STEP 0 for suffix in self.__step0_suffixes: if word.endswith(suffix): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 1a for suffix in self.__step1a_suffixes: if word.endswith(suffix): if suffix == "sses": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix in ("ied", "ies"): if len(word[:-len(suffix)]) > 1: word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] else: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif suffix == "s": for letter in word[:-2]: if letter in self.__vowels: step1a_vowel_found = True break if step1a_vowel_found: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] break # STEP 1b for suffix in self.__step1b_suffixes: if word.endswith(suffix): if suffix in ("eed", "eedly"): if r1.endswith(suffix): word = "".join((word[:-len(suffix)], "ee")) if len(r1) >= len(suffix): r1 = "".join((r1[:-len(suffix)], "ee")) else: r1 = "" if len(r2) >= len(suffix): r2 = "".join((r2[:-len(suffix)], "ee")) else: r2 = "" else: for letter in word[:-len(suffix)]: if letter in self.__vowels: step1b_vowel_found = True break if step1b_vowel_found: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] if word.endswith(("at", "bl", "iz")): word = "".join((word, "e")) r1 = "".join((r1, "e")) if len(word) > 5 or len(r1) >= 3: r2 = "".join((r2, "e")) elif word.endswith(self.__double_consonants): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif ((r1 == "" and len(word) >= 3 and word[-1] not in self.__vowels and word[-1] not in "wxY" and word[-2] in self.__vowels and word[-3] not in self.__vowels) or (r1 == "" and len(word) == 2 and word[0] in self.__vowels and word[1] not in self.__vowels)): word = "".join((word, "e")) if len(r1) > 0: r1 = "".join((r1, "e")) if len(r2) > 0: r2 = "".join((r2, "e")) break # STEP 1c if (len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels): word = "".join((word[:-1], "i")) if len(r1) >= 1: r1 = "".join((r1[:-1], "i")) else: r1 = "" if len(r2) >= 1: r2 = "".join((r2[:-1], "i")) else: r2 = "" # STEP 2 for suffix in self.__step2_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix == "tional": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix in ("enci", "anci", "abli"): word = "".join((word[:-1], "e")) if len(r1) >= 1: r1 = "".join((r1[:-1], "e")) else: r1 = "" if len(r2) >= 1: r2 = "".join((r2[:-1], "e")) else: r2 = "" elif suffix == "entli": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix in ("izer", "ization"): word = "".join((word[:-len(suffix)], "ize")) if len(r1) >= len(suffix): r1 = "".join((r1[:-len(suffix)], "ize")) else: r1 = "" if len(r2) >= len(suffix): r2 = "".join((r2[:-len(suffix)], "ize")) else: r2 = "" elif 
suffix in ("ational", "ation", "ator"): word = "".join((word[:-len(suffix)], "ate")) if len(r1) >= len(suffix): r1 = "".join((r1[:-len(suffix)], "ate")) else: r1 = "" if len(r2) >= len(suffix): r2 = "".join((r2[:-len(suffix)], "ate")) else: r2 = "e" elif suffix in ("alism", "aliti", "alli"): word = "".join((word[:-len(suffix)], "al")) if len(r1) >= len(suffix): r1 = "".join((r1[:-len(suffix)], "al")) else: r1 = "" if len(r2) >= len(suffix): r2 = "".join((r2[:-len(suffix)], "al")) else: r2 = "" elif suffix == "fulness": word = word[:-4] r1 = r1[:-4] r2 = r2[:-4] elif suffix in ("ousli", "ousness"): word = "".join((word[:-len(suffix)], "ous")) if len(r1) >= len(suffix): r1 = "".join((r1[:-len(suffix)], "ous")) else: r1 = "" if len(r2) >= len(suffix): r2 = "".join((r2[:-len(suffix)], "ous")) else: r2 = "" elif suffix in ("iveness", "iviti"): word = "".join((word[:-len(suffix)], "ive")) if len(r1) >= len(suffix): r1 = "".join((r1[:-len(suffix)], "ive")) else: r1 = "" if len(r2) >= len(suffix): r2 = "".join((r2[:-len(suffix)], "ive")) else: r2 = "e" elif suffix in ("biliti", "bli"): word = "".join((word[:-len(suffix)], "ble")) if len(r1) >= len(suffix): r1 = "".join((r1[:-len(suffix)], "ble")) else: r1 = "" if len(r2) >= len(suffix): r2 = "".join((r2[:-len(suffix)], "ble")) else: r2 = "" elif suffix == "ogi" and word[-4] == "l": word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif suffix in ("fulli", "lessli"): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "li" and word[-3] in self.__li_ending: word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] break # STEP 3 for suffix in self.__step3_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix == "tional": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "ational": word = "".join((word[:-len(suffix)], "ate")) if len(r1) >= len(suffix): r1 = "".join((r1[:-len(suffix)], "ate")) else: r1 = "" if len(r2) >= len(suffix): r2 = "".join((r2[:-len(suffix)], "ate")) else: r2 = "" elif suffix == "alize": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] elif suffix in ("icate", "iciti", "ical"): word = "".join((word[:-len(suffix)], "ic")) if len(r1) >= len(suffix): r1 = "".join((r1[:-len(suffix)], "ic")) else: r1 = "" if len(r2) >= len(suffix): r2 = "".join((r2[:-len(suffix)], "ic")) else: r2 = "" elif suffix in ("ful", "ness"): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] elif suffix == "ative" and r2.endswith(suffix): word = word[:-5] r1 = r1[:-5] r2 = r2[:-5] break # STEP 4 for suffix in self.__step4_suffixes: if word.endswith(suffix): if r2.endswith(suffix): if suffix == "ion": if word[-4] in "st": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 5 if r2.endswith("l") and word[-2] == "l": word = word[:-1] elif r2.endswith("e"): word = word[:-1] elif r1.endswith("e"): if len(word) >= 4 and (word[-2] in self.__vowels or word[-2] in "wxY" or word[-3] not in self.__vowels or word[-4] in self.__vowels): word = word[:-1] word = word.replace("Y", "y") return word Whoosh-2.5.7/src/whoosh/lang/snowball/finnish.py0000644000076500000240000002353212254366350021747 0ustar mattstaff00000000000000from .bases import _StandardStemmer from whoosh.compat import u class FinnishStemmer(_StandardStemmer): """ The Finnish Snowball stemmer. :cvar __vowels: The Finnish vowels. :type __vowels: unicode :cvar __restricted_vowels: A subset of the Finnish vowels. :type __restricted_vowels: unicode :cvar __long_vowels: The Finnish vowels in their long forms. 
:type __long_vowels: tuple :cvar __consonants: The Finnish consonants. :type __consonants: unicode :cvar __double_consonants: The Finnish double consonants. :type __double_consonants: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :note: A detailed description of the Finnish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/finnish/stemmer.html """ __vowels = u("aeiouy\xE4\xF6") __restricted_vowels = u("aeiou\xE4\xF6") __long_vowels = ("aa", "ee", "ii", "oo", "uu", u("\xE4\xE4"), u("\xF6\xF6")) __consonants = "bcdfghjklmnpqrstvwxz" __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj", "kk", "ll", "mm", "nn", "pp", "qq", "rr", "ss", "tt", "vv", "ww", "xx", "zz") __step1_suffixes = ('kaan', u('k\xE4\xE4n'), 'sti', 'kin', 'han', u('h\xE4n'), 'ko', u('k\xF6'), 'pa', u('p\xE4')) __step2_suffixes = ('nsa', u('ns\xE4'), 'mme', 'nne', 'si', 'ni', 'an', u('\xE4n'), 'en') __step3_suffixes = ('siin', 'tten', 'seen', 'han', 'hen', 'hin', 'hon', u('h\xE4n'), u('h\xF6n'), 'den', 'tta', u('tt\xE4'), 'ssa', u('ss\xE4'), 'sta', u('st\xE4'), 'lla', u('ll\xE4'), 'lta', u('lt\xE4'), 'lle', 'ksi', 'ine', 'ta', u('t\xE4'), 'na', u('n\xE4'), 'a', u('\xE4'), 'n') __step4_suffixes = ('impi', 'impa', u('imp\xE4'), 'immi', 'imma', u('imm\xE4'), 'mpi', 'mpa', u('mp\xE4'), 'mmi', 'mma', u('mm\xE4'), 'eja', u('ej\xE4')) def stem(self, word): """ Stem a Finnish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() step3_success = False r1, r2 = self._r1r2_standard(word, self.__vowels) # STEP 1: Particles etc. 
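# For example (illustrative): this step alone reduces "kirjakin" to "kirja".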
for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "sti": if suffix in r2: word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] else: if word[-len(suffix) - 1] in u("ntaeiouy\xE4\xF6"): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 2: Possessives for suffix in self.__step2_suffixes: if r1.endswith(suffix): if suffix == "si": if word[-3] != "k": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "ni": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] if word.endswith("kse"): word = "".join((word[:-3], "ksi")) if r1.endswith("kse"): r1 = "".join((r1[:-3], "ksi")) if r2.endswith("kse"): r2 = "".join((r2[:-3], "ksi")) elif suffix == "an": if (word[-4:-2] in ("ta", "na") or word[-5:-2] in ("ssa", "sta", "lla", "lta")): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == u("\xE4n"): if (word[-4:-2] in (u("t\xE4"), u("n\xE4")) or word[-5:-2] in (u("ss\xE4"), u("st\xE4"), u("ll\xE4"), u("lt\xE4"))): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "en": if word[-5:-2] in ("lle", "ine"): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] else: word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] break # STEP 3: Cases for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix in ("han", "hen", "hin", "hon", u("h\xE4n"), u("h\xF6n")): if ((suffix == "han" and word[-4] == "a") or (suffix == "hen" and word[-4] == "e") or (suffix == "hin" and word[-4] == "i") or (suffix == "hon" and word[-4] == "o") or (suffix == u("h\xE4n") and word[-4] == u("\xE4")) or (suffix == u("h\xF6n") and word[-4] == u("\xF6"))): word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] step3_success = True elif suffix in ("siin", "den", "tten"): if (word[-len(suffix) - 1] == "i" and word[-len(suffix) - 2] in self.__restricted_vowels): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] step3_success = True else: continue elif suffix == "seen": if word[-6:-4] in self.__long_vowels: word = word[:-4] r1 = r1[:-4] r2 = r2[:-4] step3_success = True else: continue elif suffix in ("a", u("\xE4")): if word[-2] in self.__vowels and word[-3] in self.__consonants: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] step3_success = True elif suffix in ("tta", u("tt\xE4")): if word[-4] == "e": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] step3_success = True elif suffix == "n": word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] step3_success = True if word[-2:] == "ie" or word[-2:] in self.__long_vowels: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] step3_success = True break # STEP 4: Other endings for suffix in self.__step4_suffixes: if r2.endswith(suffix): if suffix in ("mpi", "mpa", u("mp\xE4"), "mmi", "mma", u("mm\xE4")): if word[-5:-3] != "po": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 5: Plurals if step3_success and len(r1) >= 1 and r1[-1] in "ij": word = word[:-1] r1 = r1[:-1] elif (not step3_success and len(r1) >= 2 and r1[-1] == "t" and r1[-2] in self.__vowels): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] if r2.endswith("imma"): word = word[:-4] r1 = r1[:-4] elif r2.endswith("mma") and r2[-5:-3] != "po": word = word[:-3] r1 = r1[:-3] # STEP 6: Tidying up if r1[-2:] in self.__long_vowels: word = word[:-1] r1 = r1[:-1] if (len(r1) >= 2 and r1[-2] in self.__consonants and r1[-1] in u("a\xE4ei")): word = word[:-1] r1 = r1[:-1] if r1.endswith(("oj", "uj")): word = word[:-1] r1 = r1[:-1] if r1.endswith("jo"): word = 
word[:-1] r1 = r1[:-1] # If the word ends with a double consonant # followed by zero or more vowels, the last consonant is removed. for i in range(1, len(word)): if word[-i] in self.__vowels: continue else: if i == 1: if word[-i - 1:] in self.__double_consonants: word = word[:-1] else: if word[-i - 1:-i + 1] in self.__double_consonants: word = "".join((word[:-i], word[-i + 1:])) break return word Whoosh-2.5.7/src/whoosh/lang/snowball/french.py0000644000076500000240000003417512254366350021563 0ustar mattstaff00000000000000from .bases import _StandardStemmer from whoosh.compat import u class FrenchStemmer(_StandardStemmer): """ The French Snowball stemmer. :cvar __vowels: The French vowels. :type __vowels: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. :type __step2a_suffixes: tuple :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. :type __step2b_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :note: A detailed description of the French stemming algorithm can be found under http://snowball.tartarus.org/algorithms/french/stemmer.html """ __vowels = u("aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9") __step1_suffixes = ('issements', 'issement', 'atrices', 'atrice', 'ateurs', 'ations', 'logies', 'usions', 'utions', 'ements', 'amment', 'emment', 'ances', 'iqUes', 'ismes', 'ables', 'istes', 'ateur', 'ation', 'logie', 'usion', 'ution', 'ences', 'ement', 'euses', 'ments', 'ance', 'iqUe', 'isme', 'able', 'iste', 'ence', u('it\xE9s'), 'ives', 'eaux', 'euse', 'ment', 'eux', u('it\xE9'), 'ive', 'ifs', 'aux', 'if') __step2a_suffixes = ('issaIent', 'issantes', 'iraIent', 'issante', 'issants', 'issions', 'irions', 'issais', 'issait', 'issant', 'issent', 'issiez', 'issons', 'irais', 'irait', 'irent', 'iriez', 'irons', 'iront', 'isses', 'issez', u('\xEEmes'), u('\xEEtes'), 'irai', 'iras', 'irez', 'isse', 'ies', 'ira', u('\xEEt'), 'ie', 'ir', 'is', 'it', 'i') __step2b_suffixes = ('eraIent', 'assions', 'erions', 'assent', 'assiez', u('\xE8rent'), 'erais', 'erait', 'eriez', 'erons', 'eront', 'aIent', 'antes', 'asses', 'ions', 'erai', 'eras', 'erez', u('\xE2mes'), u('\xE2tes'), 'ante', 'ants', 'asse', u('\xE9es'), 'era', 'iez', 'ais', 'ait', 'ant', u('\xE9e'), u('\xE9s'), 'er', 'ez', u('\xE2t'), 'ai', 'as', u('\xE9'), 'a') __step4_suffixes = (u('i\xE8re'), u('I\xE8re'), 'ion', 'ier', 'Ier', 'e', u('\xEB')) def stem(self, word): """ Stem a French word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() step1_success = False rv_ending_found = False step2a_success = False step2b_success = False # Every occurrence of 'u' after 'q' is put into upper case. for i in range(1, len(word)): if word[i - 1] == "q" and word[i] == "u": word = "".join((word[:i], "U", word[i + 1:])) # Every occurrence of 'u' and 'i' # between vowels is put into upper case. # Every occurrence of 'y' preceded or # followed by a vowel is also put into upper case. 
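# For example (mechanical illustration): "jouer" becomes "joUer" and "payer" becomes "paYer" here.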
for i in range(1, len(word) - 1): if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i + 1:])) elif word[i] == "i": word = "".join((word[:i], "I", word[i + 1:])) if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels: if word[i] == "y": word = "".join((word[:i], "Y", word[i + 1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self.__rv_french(word, self.__vowels) # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix == "eaux": word = word[:-1] step1_success = True elif suffix in ("euse", "euses"): if suffix in r2: word = word[:-len(suffix)] step1_success = True elif suffix in r1: word = "".join((word[:-len(suffix)], "eux")) step1_success = True elif suffix in ("ement", "ements") and suffix in rv: word = word[:-len(suffix)] step1_success = True if word[-2:] == "iv" and "iv" in r2: word = word[:-2] if word[-2:] == "at" and "at" in r2: word = word[:-2] elif word[-3:] == "eus": if "eus" in r2: word = word[:-3] elif "eus" in r1: word = "".join((word[:-1], "x")) elif word[-3:] in ("abl", "iqU"): if "abl" in r2 or "iqU" in r2: word = word[:-3] elif word[-3:] in (u("i\xE8r"), u("I\xE8r")): if u("i\xE8r") in rv or u("I\xE8r") in rv: word = "".join((word[:-3], "i")) elif suffix == "amment" and suffix in rv: word = "".join((word[:-6], "ant")) rv = "".join((rv[:-6], "ant")) rv_ending_found = True elif suffix == "emment" and suffix in rv: word = "".join((word[:-6], "ent")) rv_ending_found = True elif (suffix in ("ment", "ments") and suffix in rv and not rv.startswith(suffix) and rv[rv.rindex(suffix) - 1] in self.__vowels): word = word[:-len(suffix)] rv = rv[:-len(suffix)] rv_ending_found = True elif suffix == "aux" and suffix in r1: word = "".join((word[:-2], "l")) step1_success = True elif (suffix in ("issement", "issements") and suffix in r1 and word[-len(suffix) - 1] not in self.__vowels): word = word[:-len(suffix)] step1_success = True elif suffix in ("ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes") and suffix in r2: word = word[:-len(suffix)] step1_success = True elif suffix in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations") and suffix in r2: word = word[:-len(suffix)] step1_success = True if word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) elif suffix in ("logie", "logies") and suffix in r2: word = "".join((word[:-len(suffix)], "log")) step1_success = True elif (suffix in ("usion", "ution", "usions", "utions") and suffix in r2): word = "".join((word[:-len(suffix)], "u")) step1_success = True elif suffix in ("ence", "ences") and suffix in r2: word = "".join((word[:-len(suffix)], "ent")) step1_success = True elif suffix in (u("it\xE9"), u("it\xE9s")) and suffix in r2: word = word[:-len(suffix)] step1_success = True if word[-4:] == "abil": if "abil" in r2: word = word[:-4] else: word = "".join((word[:-2], "l")) elif word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) elif word[-2:] == "iv": if "iv" in r2: word = word[:-2] elif (suffix in ("if", "ive", "ifs", "ives") and suffix in r2): word = word[:-len(suffix)] step1_success = True if word[-2:] == "at" and "at" in r2: word = word[:-2] if word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) break # STEP 2a: Verb suffixes beginning 'i' if not step1_success or rv_ending_found: for suffix in self.__step2a_suffixes: if word.endswith(suffix): 
if (suffix in rv and len(rv) > len(suffix) and rv[rv.rindex(suffix) - 1] not in self.__vowels): word = word[:-len(suffix)] step2a_success = True break # STEP 2b: Other verb suffixes if not step2a_success: for suffix in self.__step2b_suffixes: if rv.endswith(suffix): if suffix == "ions" and "ions" in r2: word = word[:-4] step2b_success = True elif suffix in ('eraIent', 'erions', u('\xE8rent'), 'erais', 'erait', 'eriez', 'erons', 'eront', 'erai', 'eras', 'erez', u('\xE9es'), 'era', 'iez', u('\xE9e'), u('\xE9s'), 'er', 'ez', u('\xE9')): word = word[:-len(suffix)] step2b_success = True elif suffix in ('assions', 'assent', 'assiez', 'aIent', 'antes', 'asses', u('\xE2mes'), u('\xE2tes'), 'ante', 'ants', 'asse', 'ais', 'ait', 'ant', u('\xE2t'), 'ai', 'as', 'a'): word = word[:-len(suffix)] rv = rv[:-len(suffix)] step2b_success = True if rv.endswith("e"): word = word[:-1] break # STEP 3 if step1_success or step2a_success or step2b_success: if word[-1] == "Y": word = "".join((word[:-1], "i")) elif word[-1] == u("\xE7"): word = "".join((word[:-1], "c")) # STEP 4: Residual suffixes else: if (len(word) >= 2 and word[-1] == "s" and word[-2] not in u("aiou\xE8s")): word = word[:-1] for suffix in self.__step4_suffixes: if word.endswith(suffix): if suffix in rv: if (suffix == "ion" and suffix in r2 and rv[-4] in "st"): word = word[:-3] elif suffix in ("ier", u("i\xE8re"), "Ier", u("I\xE8re")): word = "".join((word[:-len(suffix)], "i")) elif suffix == "e": word = word[:-1] elif suffix == u("\xEB") and word[-3:-1] == "gu": word = word[:-1] break # STEP 5: Undouble if word.endswith(("enn", "onn", "ett", "ell", "eill")): word = word[:-1] # STEP 6: Un-accent for i in range(1, len(word)): if word[-i] not in self.__vowels: i += 1 else: if i != 1 and word[-i] in (u("\xE9"), u("\xE8")): word = "".join((word[:-i], "e", word[-i + 1:])) break word = (word.replace("I", "i") .replace("U", "u") .replace("Y", "y")) return word def __rv_french(self, word, vowels): """ Return the region RV that is used by the French stemmer. If the word begins with two vowels, RV is the region after the third letter. Otherwise, it is the region after the first vowel not at the beginning of the word, or the end of the word if these positions cannot be found. (Exceptionally, u'par', u'col' or u'tap' at the beginning of a word is also taken to define RV as the region to their right.) :param word: The French word whose region RV is determined. :type word: str or unicode :param vowels: The French vowels that are used to determine the region RV. :type vowels: unicode :return: the region RV for the respective French word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass FrenchStemmer. It is not to be invoked directly! """ rv = "" if len(word) >= 2: if (word.startswith(("par", "col", "tap")) or (word[0] in vowels and word[1] in vowels)): rv = word[3:] else: for i in range(1, len(word)): if word[i] in vowels: rv = word[i + 1:] break return rv Whoosh-2.5.7/src/whoosh/lang/snowball/german.py0000644000076500000240000001234312254366350021560 0ustar mattstaff00000000000000from .bases import _StandardStemmer from whoosh.compat import u class GermanStemmer(_StandardStemmer): """ The German Snowball stemmer. :cvar __vowels: The German vowels. :type __vowels: unicode :cvar __s_ending: Letters that may directly appear before a word final 's'. :type __s_ending: unicode :cvar __st_ending: Letter that may directly appear before a word final 'st'. 
:type __st_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the German stemming algorithm can be found under http://snowball.tartarus.org/algorithms/german/stemmer.html """ __vowels = u("aeiouy\xE4\xF6\xFC") __s_ending = "bdfghklmnrt" __st_ending = "bdfghklmnt" __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s") __step2_suffixes = ("est", "en", "er", "st") __step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik") def stem(self, word): """ Stem a German word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() word = word.replace(u("\xDF"), "ss") # Every occurrence of 'u' and 'y' # between vowels is put into upper case. for i in range(1, len(word) - 1): if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i + 1:])) elif word[i] == "y": word = "".join((word[:i], "Y", word[i + 1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) # R1 is adjusted so that the region before it # contains at least 3 letters. for i in range(1, len(word)): if word[i] not in self.__vowels and word[i - 1] in self.__vowels: if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0: r1 = word[3:] elif len(word[:i + 1]) == 0: return word break # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if (suffix in ("en", "es", "e") and word[-len(suffix) - 4:-len(suffix)] == "niss"): word = word[:-len(suffix) - 1] r1 = r1[:-len(suffix) - 1] r2 = r2[:-len(suffix) - 1] elif suffix == "s": if word[-2] in self.__s_ending: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): if suffix == "st": if word[-3] in self.__st_ending and len(word[:-3]) >= 3: word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 3: Derivational suffixes for suffix in self.__step3_suffixes: if r2.endswith(suffix): if suffix in ("end", "ung"): if ("ig" in r2[-len(suffix) - 2:-len(suffix)] and "e" not in r2[-len(suffix) - 3:-len(suffix) - 2]): word = word[:-len(suffix) - 2] else: word = word[:-len(suffix)] elif (suffix in ("ig", "ik", "isch") and "e" not in r2[-len(suffix) - 1:-len(suffix)]): word = word[:-len(suffix)] elif suffix in ("lich", "heit"): if ("er" in r1[-len(suffix) - 2:-len(suffix)] or "en" in r1[-len(suffix) - 2:-len(suffix)]): word = word[:-len(suffix) - 2] else: word = word[:-len(suffix)] elif suffix == "keit": if "lich" in r2[-len(suffix) - 4:-len(suffix)]: word = word[:-len(suffix) - 4] elif "ig" in r2[-len(suffix) - 2:-len(suffix)]: word = word[:-len(suffix) - 2] else: word = word[:-len(suffix)] break # Umlaut accents are removed and # 'u' and 'y' are put back into lower case. 
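# (Illustrative trace: "h\xE4user" has been reduced to "h\xE4us" by the steps above and becomes "haus" here.)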
word = (word.replace(u("\xE4"), "a").replace(u("\xF6"), "o") .replace(u("\xFC"), "u").replace("U", "u") .replace("Y", "y")) return word Whoosh-2.5.7/src/whoosh/lang/snowball/hungarian.py0000644000076500000240000002665612254366350022277 0ustar mattstaff00000000000000from whoosh.compat import u class HungarianStemmer(object): """ The Hungarian Snowball stemmer. :cvar __vowels: The Hungarian vowels. :type __vowels: unicode :cvar __digraphs: The Hungarian digraphs. :type __digraphs: tuple :cvar __double_consonants: The Hungarian double consonants. :type __double_consonants: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. :type __step5_suffixes: tuple :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm. :type __step6_suffixes: tuple :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm. :type __step7_suffixes: tuple :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm. :type __step8_suffixes: tuple :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm. :type __step9_suffixes: tuple :note: A detailed description of the Hungarian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/hungarian/stemmer.html """ __vowels = u("aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB") __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs") __double_consonants = ("bb", "cc", "ccs", "dd", "ff", "gg", "ggy", "jj", "kk", "ll", "lly", "mm", "nn", "nny", "pp", "rr", "ss", "ssz", "tt", "tty", "vv", "zz", "zzs") __step1_suffixes = ("al", "el") __step2_suffixes = (u('k\xE9ppen'), u('onk\xE9nt'), u('enk\xE9nt'), u('ank\xE9nt'), u('k\xE9pp'), u('k\xE9nt'), 'ban', 'ben', 'nak', 'nek', 'val', 'vel', u('t\xF3l'), u('t\xF5l'), u('r\xF3l'), u('r\xF5l'), u('b\xF3l'), u('b\xF5l'), 'hoz', 'hez', u('h\xF6z'), u('n\xE1l'), u('n\xE9l'), u('\xE9rt'), 'kor', 'ba', 'be', 'ra', 're', 'ig', 'at', 'et', 'ot', u('\xF6t'), 'ul', u('\xFCl'), u('v\xE1'), u('v\xE9'), 'en', 'on', 'an', u('\xF6n'), 'n', 't') __step3_suffixes = (u("\xE1nk\xE9nt"), u("\xE1n"), u("\xE9n")) __step4_suffixes = ('astul', u('est\xFCl'), u('\xE1stul'), u('\xE9st\xFCl'), 'stul', u('st\xFCl')) __step5_suffixes = (u("\xE1"), u("\xE9")) __step6_suffixes = (u('ok\xE9'), u('\xF6k\xE9'), u('ak\xE9'), u('ek\xE9'), u('\xE1k\xE9'), u('\xE1\xE9i'), u('\xE9k\xE9'), u('\xE9\xE9i'), u('k\xE9'), u('\xE9i'), u('\xE9\xE9'), u('\xE9')) __step7_suffixes = (u('\xE1juk'), u('\xE9j\xFCk'), u('\xFCnk'), 'unk', 'juk', u('j\xFCk'), u('\xE1nk'), u('\xE9nk'), 'nk', 'uk', u('\xFCk'), 'em', 'om', 'am', 'od', 'ed', 'ad', u('\xF6d'), 'ja', 'je', u('\xE1m'), u('\xE1d'), u('\xE9m'), u('\xE9d'), 'm', 'd', 'a', 'e', 'o', u('\xE1'), u('\xE9')) __step8_suffixes = ('jaitok', 'jeitek', 'jaink', 'jeink', 'aitok', 'eitek', u('\xE1itok'), u('\xE9itek'), 'jaim', 'jeim', 'jaid', 'jeid', 'eink', 'aink', 'itek', 'jeik', 'jaik', u('\xE1ink'), u('\xE9ink'), 'aim', 'eim', 'aid', 'eid', 'jai', 'jei', 'ink', 'aik', 'eik', u('\xE1im'), u('\xE1id'), u('\xE1ik'), u('\xE9im'), u('\xE9id'), u('\xE9ik'), 'im', 'id', 'ai', 'ei', 'ik', u('\xE1i'), u('\xE9i'), 'i') 
__step9_suffixes = (u("\xE1k"), u("\xE9k"), u("\xF6k"), "ok", "ek", "ak", "k") def stem(self, word): """ Stem an Hungarian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs) # STEP 1: Remove instrumental case if r1.endswith(self.__step1_suffixes): for double_cons in self.__double_consonants: if word[-2 - len(double_cons):-2] == double_cons: word = "".join((word[:-4], word[-3])) if r1[-2 - len(double_cons):-2] == double_cons: r1 = "".join((r1[:-4], r1[-3])) break # STEP 2: Remove frequent cases for suffix in self.__step2_suffixes: if word.endswith(suffix): if r1.endswith(suffix): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] if r1.endswith(u("\xE1")): word = "".join((word[:-1], "a")) r1 = "".join((r1[:-1], "a")) elif r1.endswith(u("\xE9")): word = "".join((word[:-1], "e")) r1 = "".join((r1[:-1], "e")) break # STEP 3: Remove special cases for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix == u("\xE9n"): word = "".join((word[:-2], "e")) r1 = "".join((r1[:-2], "e")) else: word = "".join((word[:-len(suffix)], "a")) r1 = "".join((r1[:-len(suffix)], "a")) break # STEP 4: Remove other cases for suffix in self.__step4_suffixes: if r1.endswith(suffix): if suffix == u("\xE1stul"): word = "".join((word[:-5], "a")) r1 = "".join((r1[:-5], "a")) elif suffix == u("\xE9st\xFCl"): word = "".join((word[:-5], "e")) r1 = "".join((r1[:-5], "e")) else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 5: Remove factive case for suffix in self.__step5_suffixes: if r1.endswith(suffix): for double_cons in self.__double_consonants: if word[-1 - len(double_cons):-1] == double_cons: word = "".join((word[:-3], word[-2])) if r1[-1 - len(double_cons):-1] == double_cons: r1 = "".join((r1[:-3], r1[-2])) break # STEP 6: Remove owned for suffix in self.__step6_suffixes: if r1.endswith(suffix): if suffix in (u("\xE1k\xE9"), u("\xE1\xE9i")): word = "".join((word[:-3], "a")) r1 = "".join((r1[:-3], "a")) elif suffix in (u("\xE9k\xE9"), u("\xE9\xE9i"), u("\xE9\xE9")): word = "".join((word[:-len(suffix)], "e")) r1 = "".join((r1[:-len(suffix)], "e")) else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 7: Remove singular owner suffixes for suffix in self.__step7_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix in (u("\xE1nk"), u("\xE1juk"), u("\xE1m"), u("\xE1d"), u("\xE1")): word = "".join((word[:-len(suffix)], "a")) r1 = "".join((r1[:-len(suffix)], "a")) elif suffix in (u("\xE9nk"), u("\xE9j\xFCk"), u("\xE9m"), u("\xE9d"), u("\xE9")): word = "".join((word[:-len(suffix)], "e")) r1 = "".join((r1[:-len(suffix)], "e")) else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 8: Remove plural owner suffixes for suffix in self.__step8_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix in (u("\xE1im"), u("\xE1id"), u("\xE1i"), u("\xE1ink"), u("\xE1itok"), u("\xE1ik")): word = "".join((word[:-len(suffix)], "a")) r1 = "".join((r1[:-len(suffix)], "a")) elif suffix in (u("\xE9im"), u("\xE9id"), u("\xE9i"), u("\xE9ink"), u("\xE9itek"), u("\xE9ik")): word = "".join((word[:-len(suffix)], "e")) r1 = "".join((r1[:-len(suffix)], "e")) else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 9: Remove plural suffixes for suffix in self.__step9_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix == u("\xE1k"): word = "".join((word[:-2], 
"a")) elif suffix == u("\xE9k"): word = "".join((word[:-2], "e")) else: word = word[:-len(suffix)] break return word def __r1_hungarian(self, word, vowels, digraphs): """ Return the region R1 that is used by the Hungarian stemmer. If the word begins with a vowel, R1 is defined as the region after the first consonant or digraph (= two letters stand for one phoneme) in the word. If the word begins with a consonant, it is defined as the region after the first vowel in the word. If the word does not contain both a vowel and consonant, R1 is the null region at the end of the word. :param word: The Hungarian word whose region R1 is determined. :type word: str or unicode :param vowels: The Hungarian vowels that are used to determine the region R1. :type vowels: unicode :param digraphs: The digraphs that are used to determine the region R1. :type digraphs: tuple :return: the region R1 for the respective word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass HungarianStemmer. It is not to be invoked directly! """ r1 = "" if word[0] in vowels: for digraph in digraphs: if digraph in word[1:]: r1 = word[word.index(digraph[-1]) + 1:] return r1 for i in range(1, len(word)): if word[i] not in vowels: r1 = word[i + 1:] break else: for i in range(1, len(word)): if word[i] in vowels: r1 = word[i + 1:] break return r1 Whoosh-2.5.7/src/whoosh/lang/snowball/italian.py0000644000076500000240000002164512254366350021735 0ustar mattstaff00000000000000from .bases import _StandardStemmer from whoosh.compat import u class ItalianStemmer(_StandardStemmer): """ The Italian Snowball stemmer. :cvar __vowels: The Italian vowels. :type __vowels: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. :type __step0_suffixes: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
:type __step2_suffixes: tuple :note: A detailed description of the Italian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/italian/stemmer.html """ __vowels = u("aeiou\xE0\xE8\xEC\xF2\xF9") __step0_suffixes = ('gliela', 'gliele', 'glieli', 'glielo', 'gliene', 'sene', 'mela', 'mele', 'meli', 'melo', 'mene', 'tela', 'tele', 'teli', 'telo', 'tene', 'cela', 'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene', 'gli', 'ci', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi') __step1_suffixes = ('atrice', 'atrici', 'azione', 'azioni', 'uzione', 'uzioni', 'usione', 'usioni', 'amento', 'amenti', 'imento', 'imenti', 'amente', 'abile', 'abili', 'ibile', 'ibili', 'mente', 'atore', 'atori', 'logia', 'logie', 'anza', 'anze', 'iche', 'ichi', 'ismo', 'ismi', 'ista', 'iste', 'isti', u('ist\xE0'), u('ist\xE8'), u('ist\xEC'), 'ante', 'anti', 'enza', 'enze', 'ico', 'ici', 'ica', 'ice', 'oso', 'osi', 'osa', 'ose', u('it\xE0'), 'ivo', 'ivi', 'iva', 'ive') __step2_suffixes = ('erebbero', 'irebbero', 'assero', 'assimo', 'eranno', 'erebbe', 'eremmo', 'ereste', 'eresti', 'essero', 'iranno', 'irebbe', 'iremmo', 'ireste', 'iresti', 'iscano', 'iscono', 'issero', 'arono', 'avamo', 'avano', 'avate', 'eremo', 'erete', 'erono', 'evamo', 'evano', 'evate', 'iremo', 'irete', 'irono', 'ivamo', 'ivano', 'ivate', 'ammo', 'ando', 'asse', 'assi', 'emmo', 'enda', 'ende', 'endi', 'endo', 'erai', 'erei', 'Yamo', 'iamo', 'immo', 'irai', 'irei', 'isca', 'isce', 'isci', 'isco', 'ano', 'are', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', u('er\xE0'), 'ere', u('er\xF2'), 'ete', 'eva', 'evi', 'evo', u('ir\xE0'), 'ire', u('ir\xF2'), 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'ar', 'ir') def stem(self, word): """ Stem an Italian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() step1_success = False # All acute accents are replaced by grave accents. word = (word.replace(u("\xE1"), u("\xE0")) .replace(u("\xE9"), u("\xE8")) .replace(u("\xED"), u("\xEC")) .replace(u("\xF3"), u("\xF2")) .replace(u("\xFA"), u("\xF9"))) # Every occurrence of 'u' after 'q' # is put into upper case. for i in range(1, len(word)): if word[i - 1] == "q" and word[i] == "u": word = "".join((word[:i], "U", word[i + 1:])) # Every occurrence of 'u' and 'i' # between vowels is put into upper case. 
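# Clarifying note (added comment, not part of the original source): the
# upper-case "U"/"I" markers written by the loop above (for "u" after "q") and
# by the loop below (for "u" and "i" between vowels) keep these characters from
# being counted as vowels while the R1, R2 and RV regions are computed and
# suffixes are matched; they are mapped back to lower case at the end of
# stem(). For example, "quando" is carried through the algorithm as "qUando".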
for i in range(1, len(word) - 1): if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i + 1:])) elif word[i] == "i": word = "".join((word[:i], "I", word[i + 1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 0: Attached pronoun for suffix in self.__step0_suffixes: if rv.endswith(suffix): if rv[-len(suffix) - 4:-len(suffix)] in ("ando", "endo"): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] elif (rv[-len(suffix) - 2:-len(suffix)] in ("ar", "er", "ir")): word = "".join((word[:-len(suffix)], "e")) r1 = "".join((r1[:-len(suffix)], "e")) r2 = "".join((r2[:-len(suffix)], "e")) rv = "".join((rv[:-len(suffix)], "e")) break # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix == "amente" and r1.endswith(suffix): step1_success = True word = word[:-6] r2 = r2[:-6] rv = rv[:-6] if r2.endswith("iv"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] elif r2.endswith(("os", "ic")): word = word[:-2] rv = rv[:-2] elif r2 .endswith("abil"): word = word[:-4] rv = rv[:-4] elif (suffix in ("amento", "amenti", "imento", "imenti") and rv.endswith(suffix)): step1_success = True word = word[:-6] rv = rv[:-6] elif r2.endswith(suffix): step1_success = True if suffix in ("azione", "azioni", "atore", "atori"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] if r2.endswith("ic"): word = word[:-2] rv = rv[:-2] elif suffix in ("logia", "logie"): word = word[:-2] rv = word[:-2] elif suffix in ("uzione", "uzioni", "usione", "usioni"): word = word[:-5] rv = rv[:-5] elif suffix in ("enza", "enze"): word = "".join((word[:-2], "te")) rv = "".join((rv[:-2], "te")) elif suffix == u("it\xE0"): word = word[:-3] r2 = r2[:-3] rv = rv[:-3] if r2.endswith(("ic", "iv")): word = word[:-2] rv = rv[:-2] elif r2.endswith("abil"): word = word[:-4] rv = rv[:-4] elif suffix in ("ivo", "ivi", "iva", "ive"): word = word[:-3] r2 = r2[:-3] rv = rv[:-3] if r2.endswith("at"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("ic"): word = word[:-2] rv = rv[:-2] else: word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 2: Verb suffixes if not step1_success: for suffix in self.__step2_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 3a if rv.endswith(("a", "e", "i", "o", u("\xE0"), u("\xE8"), u("\xEC"), u("\xF2"))): word = word[:-1] rv = rv[:-1] if rv.endswith("i"): word = word[:-1] rv = rv[:-1] # STEP 3b if rv.endswith(("ch", "gh")): word = word[:-1] word = word.replace("I", "i").replace("U", "u") return word Whoosh-2.5.7/src/whoosh/lang/snowball/norwegian.py0000644000076500000240000000536312254366350022304 0ustar mattstaff00000000000000from .bases import _ScandinavianStemmer from whoosh.compat import u class NorwegianStemmer(_ScandinavianStemmer): """ The Norwegian Snowball stemmer. :cvar __vowels: The Norwegian vowels. :type __vowels: unicode :cvar __s_ending: Letters that may directly appear before a word final 's'. :type __s_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. 
:type __step3_suffixes: tuple :note: A detailed description of the Norwegian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/norwegian/stemmer.html """ __vowels = u("aeiouy\xE6\xE5\xF8") __s_ending = "bcdfghjlmnoprtvyz" __step1_suffixes = ("hetenes", "hetene", "hetens", "heter", "heten", "endes", "ande", "ende", "edes", "enes", "erte", "ede", "ane", "ene", "ens", "ers", "ets", "het", "ast", "ert", "en", "ar", "er", "as", "es", "et", "a", "e", "s") __step2_suffixes = ("dt", "vt") __step3_suffixes = ("hetslov", "eleg", "elig", "elov", "slov", "leg", "eig", "lig", "els", "lov", "ig") def stem(self, word): """ Stem a Norwegian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() r1 = self._r1_scandinavian(word, self.__vowels) # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix in ("erte", "ert"): word = "".join((word[:-len(suffix)], "er")) r1 = "".join((r1[:-len(suffix)], "er")) elif suffix == "s": if (word[-2] in self.__s_ending or (word[-2] == "k" and word[-3] not in self.__vowels)): word = word[:-1] r1 = r1[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): word = word[:-1] r1 = r1[:-1] break # STEP 3 for suffix in self.__step3_suffixes: if r1.endswith(suffix): word = word[:-len(suffix)] break return word Whoosh-2.5.7/src/whoosh/lang/snowball/portugese.py0000644000076500000240000002003512254366350022321 0ustar mattstaff00000000000000from .bases import _StandardStemmer from whoosh.compat import u class PortugueseStemmer(_StandardStemmer): """ The Portuguese Snowball stemmer. :cvar __vowels: The Portuguese vowels. :type __vowels: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. 
:type __step4_suffixes: tuple :note: A detailed description of the Portuguese stemming algorithm can be found under http://snowball.tartarus.org/algorithms/portuguese/stemmer.html """ __vowels = u("aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4") __step1_suffixes = ('amentos', 'imentos', 'uciones', 'amento', 'imento', 'adoras', 'adores', u('a\xE7o~es'), u('log\xEDas'), u('\xEAncias'), 'amente', 'idades', 'ismos', 'istas', 'adora', u('a\xE7a~o'), 'antes', u('\xE2ncia'), u('log\xEDa'), u('uci\xF3n'), u('\xEAncia'), 'mente', 'idade', 'ezas', 'icos', 'icas', 'ismo', u('\xE1vel'), u('\xEDvel'), 'ista', 'osos', 'osas', 'ador', 'ante', 'ivas', 'ivos', 'iras', 'eza', 'ico', 'ica', 'oso', 'osa', 'iva', 'ivo', 'ira') __step2_suffixes = (u('ar\xEDamos'), u('er\xEDamos'), u('ir\xEDamos'), u('\xE1ssemos'), u('\xEAssemos'), u('\xEDssemos'), u('ar\xEDeis'), u('er\xEDeis'), u('ir\xEDeis'), u('\xE1sseis'), u('\xE9sseis'), u('\xEDsseis'), u('\xE1ramos'), u('\xE9ramos'), u('\xEDramos'), u('\xE1vamos'), 'aremos', 'eremos', 'iremos', 'ariam', 'eriam', 'iriam', 'assem', 'essem', 'issem', 'ara~o', 'era~o', 'ira~o', 'arias', 'erias', 'irias', 'ardes', 'erdes', 'irdes', 'asses', 'esses', 'isses', 'astes', 'estes', 'istes', u('\xE1reis'), 'areis', u('\xE9reis'), 'ereis', u('\xEDreis'), 'ireis', u('\xE1veis'), u('\xEDamos'), 'armos', 'ermos', 'irmos', 'aria', 'eria', 'iria', 'asse', 'esse', 'isse', 'aste', 'este', 'iste', 'arei', 'erei', 'irei', 'aram', 'eram', 'iram', 'avam', 'arem', 'erem', 'irem', 'ando', 'endo', 'indo', 'adas', 'idas', u('ar\xE1s'), 'aras', u('er\xE1s'), 'eras', u('ir\xE1s'), 'avas', 'ares', 'eres', 'ires', u('\xEDeis'), 'ados', 'idos', u('\xE1mos'), 'amos', 'emos', 'imos', 'iras', 'ada', 'ida', u('ar\xE1'), 'ara', u('er\xE1'), 'era', u('ir\xE1'), 'ava', 'iam', 'ado', 'ido', 'ias', 'ais', 'eis', 'ira', 'ia', 'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es', 'is', 'eu', 'iu', 'ou') __step4_suffixes = ("os", "a", "i", "o", u("\xE1"), u("\xED"), u("\xF3")) def stem(self, word): """ Stem a Portuguese word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() step1_success = False step2_success = False word = (word.replace(u("\xE3"), "a~") .replace(u("\xF5"), "o~")) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix == "amente" and r1.endswith(suffix): step1_success = True word = word[:-6] r2 = r2[:-6] rv = rv[:-6] if r2.endswith("iv"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] elif r2.endswith(("os", "ic", "ad")): word = word[:-2] rv = rv[:-2] elif (suffix in ("ira", "iras") and rv.endswith(suffix) and word[-len(suffix) - 1:-len(suffix)] == "e"): step1_success = True word = "".join((word[:-len(suffix)], "ir")) rv = "".join((rv[:-len(suffix)], "ir")) elif r2.endswith(suffix): step1_success = True if suffix in (u("log\xEDa"), u("log\xEDas")): word = word[:-2] rv = rv[:-2] elif suffix in (u("uci\xF3n"), "uciones"): word = "".join((word[:-len(suffix)], "u")) rv = "".join((rv[:-len(suffix)], "u")) elif suffix in (u("\xEAncia"), u("\xEAncias")): word = "".join((word[:-len(suffix)], "ente")) rv = "".join((rv[:-len(suffix)], "ente")) elif suffix == "mente": word = word[:-5] r2 = r2[:-5] rv = rv[:-5] if r2.endswith(("ante", "avel", u("\xEDvel"))): word = word[:-4] rv = rv[:-4] elif suffix in ("idade", "idades"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] if r2.endswith(("ic", "iv")): word = word[:-2] rv = rv[:-2] elif r2.endswith("abil"): word = word[:-4] rv = rv[:-4] elif suffix in ("iva", "ivo", "ivas", "ivos"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] else: word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 2: Verb suffixes if not step1_success: for suffix in self.__step2_suffixes: if rv.endswith(suffix): step2_success = True word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 3 if step1_success or step2_success: if rv.endswith("i") and word[-2] == "c": word = word[:-1] rv = rv[:-1] ### STEP 4: Residual suffix if not step1_success and not step2_success: for suffix in self.__step4_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 5 if rv.endswith(("e", u("\xE9"), u("\xEA"))): word = word[:-1] rv = rv[:-1] if ((word.endswith("gu") and rv.endswith("u")) or (word.endswith("ci") and rv.endswith("i"))): word = word[:-1] elif word.endswith(u("\xE7")): word = "".join((word[:-1], "c")) word = word.replace("a~", u("\xE3")).replace("o~", u("\xF5")) return word Whoosh-2.5.7/src/whoosh/lang/snowball/romanian.py0000644000076500000240000002674012254366350022121 0ustar mattstaff00000000000000from .bases import _StandardStemmer from whoosh.compat import u class RomanianStemmer(_StandardStemmer): """ The Romanian Snowball stemmer. :cvar __vowels: The Romanian vowels. :type __vowels: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. :type __step0_suffixes: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. 
:type __step3_suffixes: tuple :note: A detailed description of the Romanian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/romanian/stemmer.html """ __vowels = u("aeiou\u0103\xE2\xEE") __step0_suffixes = ('iilor', 'ului', 'elor', 'iile', 'ilor', 'atei', u('a\u0163ie'), u('a\u0163ia'), 'aua', 'ele', 'iua', 'iei', 'ile', 'ul', 'ea', 'ii') __step1_suffixes = ('abilitate', 'abilitati', u('abilit\u0103\u0163i'), 'ibilitate', u('abilit\u0103i'), 'ivitate', 'ivitati', u('ivit\u0103\u0163i'), 'icitate', 'icitati', u('icit\u0103\u0163i'), 'icatori', u('ivit\u0103i'), u('icit\u0103i'), 'icator', u('a\u0163iune'), 'atoare', u('\u0103toare'), u('i\u0163iune'), 'itoare', 'iciva', 'icive', 'icivi', u('iciv\u0103'), 'icala', 'icale', 'icali', u('ical\u0103'), 'ativa', 'ative', 'ativi', u('ativ\u0103'), 'atori', u('\u0103tori'), 'itiva', 'itive', 'itivi', u('itiv\u0103'), 'itori', 'iciv', 'ical', 'ativ', 'ator', u('\u0103tor'), 'itiv', 'itor') __step2_suffixes = ('abila', 'abile', 'abili', u('abil\u0103'), 'ibila', 'ibile', 'ibili', u('ibil\u0103'), 'atori', 'itate', 'itati', u('it\u0103\u0163i'), 'abil', 'ibil', 'oasa', u('oas\u0103'), 'oase', 'anta', 'ante', 'anti', u('ant\u0103'), 'ator', u('it\u0103i'), 'iune', 'iuni', 'isme', 'ista', 'iste', 'isti', u('ist\u0103'), u('i\u015Fti'), 'ata', u('at\u0103'), 'ati', 'ate', 'uta', u('ut\u0103'), 'uti', 'ute', 'ita', u('it\u0103'), 'iti', 'ite', 'ica', 'ice', 'ici', u('ic\u0103'), 'osi', u('o\u015Fi'), 'ant', 'iva', 'ive', 'ivi', u('iv\u0103'), 'ism', 'ist', 'at', 'ut', 'it', 'ic', 'os', 'iv') __step3_suffixes = (u('seser\u0103\u0163i'), u('aser\u0103\u0163i'), u('iser\u0103\u0163i'), u('\xE2ser\u0103\u0163i'), u('user\u0103\u0163i'), u('seser\u0103m'), u('aser\u0103m'), u('iser\u0103m'), u('\xE2ser\u0103m'), u('user\u0103m'), u('ser\u0103\u0163i'), u('sese\u015Fi'), u('seser\u0103'), u('easc\u0103'), u('ar\u0103\u0163i'), u('ur\u0103\u0163i'), u('ir\u0103\u0163i'), u('\xE2r\u0103\u0163i'), u('ase\u015Fi'), u('aser\u0103'), u('ise\u015Fi'), u('iser\u0103'), u('\xe2se\u015Fi'), u('\xE2ser\u0103'), u('use\u015Fi'), u('user\u0103'), u('ser\u0103m'), 'sesem', 'indu', '\xE2ndu', u('eaz\u0103'), u('e\u015Fti'), u('e\u015Fte'), u('\u0103\u015Fti'), u('\u0103\u015Fte'), u('ea\u0163i'), u('ia\u0163i'), u('ar\u0103m'), u('ur\u0103m'), u('ir\u0103m'), u('\xE2r\u0103m'), 'asem', 'isem', '\xE2sem', 'usem', u('se\u015Fi'), u('ser\u0103'), 'sese', 'are', 'ere', 'ire', '\xE2re', 'ind', '\xE2nd', 'eze', 'ezi', 'esc', u('\u0103sc'), 'eam', 'eai', 'eau', 'iam', 'iai', 'iau', u('a\u015Fi'), u('ar\u0103'), u('u\u015Fi'), u('ur\u0103'), u('i\u015Fi'), u('ir\u0103'), u('\xE2\u015Fi'), u('\xe2r\u0103'), 'ase', 'ise', '\xE2se', 'use', u('a\u0163i'), u('e\u0163i'), u('i\u0163i'), u('\xe2\u0163i'), 'sei', 'ez', 'am', 'ai', 'au', 'ea', 'ia', 'ui', '\xE2i', u('\u0103m'), 'em', 'im', '\xE2m', 'se') def stem(self, word): """ Stem a Romanian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() step1_success = False step2_success = False for i in range(1, len(word) - 1): if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i + 1:])) elif word[i] == "i": word = "".join((word[:i], "I", word[i + 1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 0: Removal of plurals and other simplifications for suffix in self.__step0_suffixes: if word.endswith(suffix): if suffix in r1: if suffix in ("ul", "ului"): word = word[:-len(suffix)] if suffix in rv: rv = rv[:-len(suffix)] else: rv = "" elif (suffix == "aua" or suffix == "atei" or (suffix == "ile" and word[-5:-3] != "ab")): word = word[:-2] elif suffix in ("ea", "ele", "elor"): word = "".join((word[:-len(suffix)], "e")) if suffix in rv: rv = "".join((rv[:-len(suffix)], "e")) else: rv = "" elif suffix in ("ii", "iua", "iei", "iile", "iilor", "ilor"): word = "".join((word[:-len(suffix)], "i")) if suffix in rv: rv = "".join((rv[:-len(suffix)], "i")) else: rv = "" elif suffix in ("a\u0163ie", "a\u0163ia"): word = word[:-1] break # STEP 1: Reduction of combining suffixes while True: replacement_done = False for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix in r1: step1_success = True replacement_done = True if suffix in ("abilitate", "abilitati", "abilit\u0103i", "abilit\u0103\u0163i"): word = "".join((word[:-len(suffix)], "abil")) elif suffix == "ibilitate": word = word[:-5] elif suffix in ("ivitate", "ivitati", "ivit\u0103i", "ivit\u0103\u0163i"): word = "".join((word[:-len(suffix)], "iv")) elif suffix in ("icitate", "icitati", "icit\u0103i", "icit\u0103\u0163i", "icator", "icatori", "iciv", "iciva", "icive", "icivi", "iciv\u0103", "ical", "icala", "icale", "icali", "ical\u0103"): word = "".join((word[:-len(suffix)], "ic")) elif suffix in ("ativ", "ativa", "ative", "ativi", "ativ\u0103", "a\u0163iune", "atoare", "ator", "atori", "\u0103toare", "\u0103tor", "\u0103tori"): word = "".join((word[:-len(suffix)], "at")) if suffix in r2: r2 = "".join((r2[:-len(suffix)], "at")) elif suffix in ("itiv", "itiva", "itive", "itivi", "itiv\u0103", "i\u0163iune", "itoare", "itor", "itori"): word = "".join((word[:-len(suffix)], "it")) if suffix in r2: r2 = "".join((r2[:-len(suffix)], "it")) else: step1_success = False break if not replacement_done: break # STEP 2: Removal of standard suffixes for suffix in self.__step2_suffixes: if word.endswith(suffix): if suffix in r2: step2_success = True if suffix in ("iune", "iuni"): if word[-5] == "\u0163": word = "".join((word[:-5], "t")) elif suffix in ("ism", "isme", "ist", "ista", "iste", "isti", "ist\u0103", "i\u015Fti"): word = "".join((word[:-len(suffix)], "ist")) else: word = word[:-len(suffix)] break # STEP 3: Removal of verb suffixes if not step1_success and not step2_success: for suffix in self.__step3_suffixes: if word.endswith(suffix): if suffix in rv: if suffix in (u('seser\u0103\u0163i'), u('seser\u0103m'), u('ser\u0103\u0163i'), u('sese\u015Fi'), u('seser\u0103'), u('ser\u0103m'), 'sesem', u('se\u015Fi'), u('ser\u0103'), 'sese', u('a\u0163i'), u('e\u0163i'), u('i\u0163i'), u('\xE2\u0163i'), 'sei', u('\u0103m'), 'em', 'im', '\xE2m', 'se'): word = word[:-len(suffix)] rv = rv[:-len(suffix)] else: if (not rv.startswith(suffix) and rv[rv.index(suffix) - 1] not in "aeio\u0103\xE2\xEE"): word = word[:-len(suffix)] break # STEP 4: Removal of final vowel for suffix in ("ie", "a", "e", "i", "\u0103"): if word.endswith(suffix): if 
suffix in rv: word = word[:-len(suffix)] break word = word.replace("I", "i").replace("U", "u") return word Whoosh-2.5.7/src/whoosh/lang/snowball/russian.py0000644000076500000240000005065012254366350021776 0ustar mattstaff00000000000000from whoosh.compat import u class RussianStemmer(object): """ The Russian Snowball stemmer. :cvar __perfective_gerund_suffixes: Suffixes to be deleted. :type __perfective_gerund_suffixes: tuple :cvar __adjectival_suffixes: Suffixes to be deleted. :type __adjectival_suffixes: tuple :cvar __reflexive_suffixes: Suffixes to be deleted. :type __reflexive_suffixes: tuple :cvar __verb_suffixes: Suffixes to be deleted. :type __verb_suffixes: tuple :cvar __noun_suffixes: Suffixes to be deleted. :type __noun_suffixes: tuple :cvar __superlative_suffixes: Suffixes to be deleted. :type __superlative_suffixes: tuple :cvar __derivational_suffixes: Suffixes to be deleted. :type __derivational_suffixes: tuple :note: A detailed description of the Russian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/russian/stemmer.html """ __perfective_gerund_suffixes = ("ivshis'", "yvshis'", "vshis'", "ivshi", "yvshi", "vshi", "iv", "yv", "v") __adjectival_suffixes = ('ui^ushchi^ui^u', 'ui^ushchi^ai^a', 'ui^ushchimi', 'ui^ushchymi', 'ui^ushchego', 'ui^ushchogo', 'ui^ushchemu', 'ui^ushchomu', 'ui^ushchikh', 'ui^ushchykh', 'ui^ushchui^u', 'ui^ushchaia', 'ui^ushchoi^u', 'ui^ushchei^u', 'i^ushchi^ui^u', 'i^ushchi^ai^a', 'ui^ushchee', 'ui^ushchie', 'ui^ushchye', 'ui^ushchoe', 'ui^ushchei`', 'ui^ushchii`', 'ui^ushchyi`', 'ui^ushchoi`', 'ui^ushchem', 'ui^ushchim', 'ui^ushchym', 'ui^ushchom', 'i^ushchimi', 'i^ushchymi', 'i^ushchego', 'i^ushchogo', 'i^ushchemu', 'i^ushchomu', 'i^ushchikh', 'i^ushchykh', 'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u', 'i^ushchei^u', 'i^ushchee', 'i^ushchie', 'i^ushchye', 'i^ushchoe', 'i^ushchei`', 'i^ushchii`', 'i^ushchyi`', 'i^ushchoi`', 'i^ushchem', 'i^ushchim', 'i^ushchym', 'i^ushchom', 'shchi^ui^u', 'shchi^ai^a', 'ivshi^ui^u', 'ivshi^ai^a', 'yvshi^ui^u', 'yvshi^ai^a', 'shchimi', 'shchymi', 'shchego', 'shchogo', 'shchemu', 'shchomu', 'shchikh', 'shchykh', 'shchui^u', 'shchai^a', 'shchoi^u', 'shchei^u', 'ivshimi', 'ivshymi', 'ivshego', 'ivshogo', 'ivshemu', 'ivshomu', 'ivshikh', 'ivshykh', 'ivshui^u', 'ivshai^a', 'ivshoi^u', 'ivshei^u', 'yvshimi', 'yvshymi', 'yvshego', 'yvshogo', 'yvshemu', 'yvshomu', 'yvshikh', 'yvshykh', 'yvshui^u', 'yvshai^a', 'yvshoi^u', 'yvshei^u', 'vshi^ui^u', 'vshi^ai^a', 'shchee', 'shchie', 'shchye', 'shchoe', 'shchei`', 'shchii`', 'shchyi`', 'shchoi`', 'shchem', 'shchim', 'shchym', 'shchom', 'ivshee', 'ivshie', 'ivshye', 'ivshoe', 'ivshei`', 'ivshii`', 'ivshyi`', 'ivshoi`', 'ivshem', 'ivshim', 'ivshym', 'ivshom', 'yvshee', 'yvshie', 'yvshye', 'yvshoe', 'yvshei`', 'yvshii`', 'yvshyi`', 'yvshoi`', 'yvshem', 'yvshim', 'yvshym', 'yvshom', 'vshimi', 'vshymi', 'vshego', 'vshogo', 'vshemu', 'vshomu', 'vshikh', 'vshykh', 'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u', 'emi^ui^u', 'emi^ai^a', 'nni^ui^u', 'nni^ai^a', 'vshee', 'vshie', 'vshye', 'vshoe', 'vshei`', 'vshii`', 'vshyi`', 'vshoi`', 'vshem', 'vshim', 'vshym', 'vshom', 'emimi', 'emymi', 'emego', 'emogo', 'ememu', 'emomu', 'emikh', 'emykh', 'emui^u', 'emai^a', 'emoi^u', 'emei^u', 'nnimi', 'nnymi', 'nnego', 'nnogo', 'nnemu', 'nnomu', 'nnikh', 'nnykh', 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u', 'emee', 'emie', 'emye', 'emoe', 'emei`', 'emii`', 'emyi`', 'emoi`', 'emem', 'emim', 'emym', 'emom', 'nnee', 'nnie', 'nnye', 'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`', 'nnem', 
'nnim', 'nnym', 'nnom', 'i^ui^u', 'i^ai^a', 'imi', 'ymi', 'ego', 'ogo', 'emu', 'omu', 'ikh', 'ykh', 'ui^u', 'ai^a', 'oi^u', 'ei^u', 'ee', 'ie', 'ye', 'oe', 'ei`', 'ii`', 'yi`', 'oi`', 'em', 'im', 'ym', 'om') __reflexive_suffixes = ("si^a", "s'") __verb_suffixes = ("esh'", 'ei`te', 'ui`te', 'ui^ut', "ish'", 'ete', 'i`te', 'i^ut', 'nno', 'ila', 'yla', 'ena', 'ite', 'ili', 'yli', 'ilo', 'ylo', 'eno', 'i^at', 'uet', 'eny', "it'", "yt'", 'ui^u', 'la', 'na', 'li', 'em', 'lo', 'no', 'et', 'ny', "t'", 'ei`', 'ui`', 'il', 'yl', 'im', 'ym', 'en', 'it', 'yt', 'i^u', 'i`', 'l', 'n') __noun_suffixes = ('ii^ami', 'ii^akh', 'i^ami', 'ii^am', 'i^akh', 'ami', 'iei`', 'i^am', 'iem', 'akh', 'ii^u', "'i^u", 'ii^a', "'i^a", 'ev', 'ov', 'ie', "'e", 'ei', 'ii', 'ei`', 'oi`', 'ii`', 'em', 'am', 'om', 'i^u', 'i^a', 'a', 'e', 'i', 'i`', 'o', 'u', 'y', "'") __superlative_suffixes = ("ei`she", "ei`sh") __derivational_suffixes = ("ost'", "ost") def stem(self, word): """ Stem a Russian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ chr_exceeded = False for i in range(len(word)): if ord(word[i]) > 255: chr_exceeded = True break if chr_exceeded: word = self.__cyrillic_to_roman(word) step1_success = False adjectival_removed = False verb_removed = False undouble_success = False superlative_removed = False rv, r2 = self.__regions_russian(word) # Step 1 for suffix in self.__perfective_gerund_suffixes: if rv.endswith(suffix): if suffix in ("v", "vshi", "vshis'"): if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or rv[-len(suffix) - 1:-len(suffix)] == "a"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] step1_success = True break else: word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] step1_success = True break if not step1_success: for suffix in self.__reflexive_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] break for suffix in self.__adjectival_suffixes: if rv.endswith(suffix): if suffix in ('i^ushchi^ui^u', 'i^ushchi^ai^a', 'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u', 'i^ushchei^u', 'i^ushchimi', 'i^ushchymi', 'i^ushchego', 'i^ushchogo', 'i^ushchemu', 'i^ushchomu', 'i^ushchikh', 'i^ushchykh', 'shchi^ui^u', 'shchi^ai^a', 'i^ushchee', 'i^ushchie', 'i^ushchye', 'i^ushchoe', 'i^ushchei`', 'i^ushchii`', 'i^ushchyi`', 'i^ushchoi`', 'i^ushchem', 'i^ushchim', 'i^ushchym', 'i^ushchom', 'vshi^ui^u', 'vshi^ai^a', 'shchui^u', 'shchai^a', 'shchoi^u', 'shchei^u', 'emi^ui^u', 'emi^ai^a', 'nni^ui^u', 'nni^ai^a', 'shchimi', 'shchymi', 'shchego', 'shchogo', 'shchemu', 'shchomu', 'shchikh', 'shchykh', 'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u', 'shchee', 'shchie', 'shchye', 'shchoe', 'shchei`', 'shchii`', 'shchyi`', 'shchoi`', 'shchem', 'shchim', 'shchym', 'shchom', 'vshimi', 'vshymi', 'vshego', 'vshogo', 'vshemu', 'vshomu', 'vshikh', 'vshykh', 'emui^u', 'emai^a', 'emoi^u', 'emei^u', 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u', 'vshee', 'vshie', 'vshye', 'vshoe', 'vshei`', 'vshii`', 'vshyi`', 'vshoi`', 'vshem', 'vshim', 'vshym', 'vshom', 'emimi', 'emymi', 'emego', 'emogo', 'ememu', 'emomu', 'emikh', 'emykh', 'nnimi', 'nnymi', 'nnego', 'nnogo', 'nnemu', 'nnomu', 'nnikh', 'nnykh', 'emee', 'emie', 'emye', 'emoe', 'emei`', 'emii`', 'emyi`', 'emoi`', 'emem', 'emim', 'emym', 'emom', 'nnee', 'nnie', 'nnye', 'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`', 'nnem', 'nnim', 'nnym', 'nnom'): if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or rv[-len(suffix) - 
1:-len(suffix)] == "a"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] adjectival_removed = True break else: word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] adjectival_removed = True break if not adjectival_removed: for suffix in self.__verb_suffixes: if rv.endswith(suffix): if suffix in ("la", "na", "ete", "i`te", "li", "i`", "l", "em", "n", "lo", "no", "et", "i^ut", "ny", "t'", "esh'", "nno"): if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or rv[-len(suffix) - 1:-len(suffix)] == "a"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] verb_removed = True break else: word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] verb_removed = True break if not adjectival_removed and not verb_removed: for suffix in self.__noun_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] break # Step 2 if rv.endswith("i"): word = word[:-1] r2 = r2[:-1] # Step 3 for suffix in self.__derivational_suffixes: if r2.endswith(suffix): word = word[:-len(suffix)] break # Step 4 if word.endswith("nn"): word = word[:-1] undouble_success = True if not undouble_success: for suffix in self.__superlative_suffixes: if word.endswith(suffix): word = word[:-len(suffix)] superlative_removed = True break if word.endswith("nn"): word = word[:-1] if not undouble_success and not superlative_removed: if word.endswith("'"): word = word[:-1] if chr_exceeded: word = self.__roman_to_cyrillic(word) return word def __regions_russian(self, word): """ Return the regions RV and R2 which are used by the Russian stemmer. In any word, RV is the region after the first vowel, or the end of the word if it contains no vowel. R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel. R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel. :param word: The Russian word whose regions RV and R2 are determined. :type word: str or unicode :return: the regions RV and R2 for the respective Russian word. :rtype: tuple :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! """ r1 = "" r2 = "" rv = "" vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y") word = (word.replace("i^a", "A") .replace("i^u", "U") .replace("e`", "E")) for i in range(1, len(word)): if word[i] not in vowels and word[i - 1] in vowels: r1 = word[i + 1:] break for i in range(1, len(r1)): if r1[i] not in vowels and r1[i - 1] in vowels: r2 = r1[i + 1:] break for i in range(len(word)): if word[i] in vowels: rv = word[i + 1:] break r2 = (r2.replace("A", "i^a") .replace("U", "i^u") .replace("E", "e`")) rv = (rv.replace("A", "i^a") .replace("U", "i^u") .replace("E", "e`")) return (rv, r2) def __cyrillic_to_roman(self, word): """ Transliterate a Russian word into the Roman alphabet. A Russian word whose letters consist of the Cyrillic alphabet are transliterated into the Roman alphabet in order to ease the forthcoming stemming process. :param word: The word that is transliterated. :type word: unicode :return: the transliterated word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! 
""" word = (word.replace(u("\u0410"), "a").replace(u("\u0430"), "a") .replace(u("\u0411"), "b").replace(u("\u0431"), "b") .replace(u("\u0412"), "v").replace(u("\u0432"), "v") .replace(u("\u0413"), "g").replace(u("\u0433"), "g") .replace(u("\u0414"), "d").replace(u("\u0434"), "d") .replace(u("\u0415"), "e").replace(u("\u0435"), "e") .replace(u("\u0401"), "e").replace(u("\u0451"), "e") .replace(u("\u0416"), "zh").replace(u("\u0436"), "zh") .replace(u("\u0417"), "z").replace(u("\u0437"), "z") .replace(u("\u0418"), "i").replace(u("\u0438"), "i") .replace(u("\u0419"), "i`").replace(u("\u0439"), "i`") .replace(u("\u041A"), "k").replace(u("\u043A"), "k") .replace(u("\u041B"), "l").replace(u("\u043B"), "l") .replace(u("\u041C"), "m").replace(u("\u043C"), "m") .replace(u("\u041D"), "n").replace(u("\u043D"), "n") .replace(u("\u041E"), "o").replace(u("\u043E"), "o") .replace(u("\u041F"), "p").replace(u("\u043F"), "p") .replace(u("\u0420"), "r").replace(u("\u0440"), "r") .replace(u("\u0421"), "s").replace(u("\u0441"), "s") .replace(u("\u0422"), "t").replace(u("\u0442"), "t") .replace(u("\u0423"), "u").replace(u("\u0443"), "u") .replace(u("\u0424"), "f").replace(u("\u0444"), "f") .replace(u("\u0425"), "kh").replace(u("\u0445"), "kh") .replace(u("\u0426"), "t^s").replace(u("\u0446"), "t^s") .replace(u("\u0427"), "ch").replace(u("\u0447"), "ch") .replace(u("\u0428"), "sh").replace(u("\u0448"), "sh") .replace(u("\u0429"), "shch").replace(u("\u0449"), "shch") .replace(u("\u042A"), "''").replace(u("\u044A"), "''") .replace(u("\u042B"), "y").replace(u("\u044B"), "y") .replace(u("\u042C"), "'").replace(u("\u044C"), "'") .replace(u("\u042D"), "e`").replace(u("\u044D"), "e`") .replace(u("\u042E"), "i^u").replace(u("\u044E"), "i^u") .replace(u("\u042F"), "i^a").replace(u("\u044F"), "i^a")) return word def __roman_to_cyrillic(self, word): """ Transliterate a Russian word back into the Cyrillic alphabet. A Russian word formerly transliterated into the Roman alphabet in order to ease the stemming process, is transliterated back into the Cyrillic alphabet, its original form. :param word: The word that is transliterated. :type word: str or unicode :return: word, the transliterated word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! """ word = (word.replace("i^u", u("\u044E")).replace("i^a", u("\u044F")) .replace("shch", u("\u0449")).replace("kh", u("\u0445")) .replace("t^s", u("\u0446")).replace("ch", u("\u0447")) .replace("e`", u("\u044D")).replace("i`", u("\u0439")) .replace("sh", u("\u0448")).replace("k", u("\u043A")) .replace("e", u("\u0435")).replace("zh", u("\u0436")) .replace("a", u("\u0430")).replace("b", u("\u0431")) .replace("v", u("\u0432")).replace("g", u("\u0433")) .replace("d", u("\u0434")).replace("e", u("\u0435")) .replace("z", u("\u0437")).replace("i", u("\u0438")) .replace("l", u("\u043B")).replace("m", u("\u043C")) .replace("n", u("\u043D")).replace("o", u("\u043E")) .replace("p", u("\u043F")).replace("r", u("\u0440")) .replace("s", u("\u0441")).replace("t", u("\u0442")) .replace("u", u("\u0443")).replace("f", u("\u0444")) .replace("''", u("\u044A")).replace("y", u("\u044B")) .replace("'", u("\u044C"))) return word Whoosh-2.5.7/src/whoosh/lang/snowball/spanish.py0000644000076500000240000002534212254366350021757 0ustar mattstaff00000000000000from .bases import _StandardStemmer from whoosh.compat import u class SpanishStemmer(_StandardStemmer): """ The Spanish Snowball stemmer. 
:cvar __vowels: The Spanish vowels. :type __vowels: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. :type __step0_suffixes: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. :type __step2a_suffixes: tuple :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. :type __step2b_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the Spanish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/spanish/stemmer.html """ __vowels = u("aeiou\xE1\xE9\xED\xF3\xFA\xFC") __step0_suffixes = ("selas", "selos", "sela", "selo", "las", "les", "los", "nos", "me", "se", "la", "le", "lo") __step1_suffixes = ('amientos', 'imientos', 'amiento', 'imiento', 'aciones', 'uciones', 'adoras', 'adores', 'ancias', u('log\xEDas'), 'encias', 'amente', 'idades', 'anzas', 'ismos', 'ables', 'ibles', 'istas', 'adora', u('aci\xF3n'), 'antes', 'ancia', u('log\xEDa'), u('uci\xf3n'), 'encia', 'mente', 'anza', 'icos', 'icas', 'ismo', 'able', 'ible', 'ista', 'osos', 'osas', 'ador', 'ante', 'idad', 'ivas', 'ivos', 'ico', 'ica', 'oso', 'osa', 'iva', 'ivo') __step2a_suffixes = ('yeron', 'yendo', 'yamos', 'yais', 'yan', 'yen', 'yas', 'yes', 'ya', 'ye', 'yo', u('y\xF3')) __step2b_suffixes = (u('ar\xEDamos'), u('er\xEDamos'), u('ir\xEDamos'), u('i\xE9ramos'), u('i\xE9semos'), u('ar\xEDais'), 'aremos', u('er\xEDais'), 'eremos', u('ir\xEDais'), 'iremos', 'ierais', 'ieseis', 'asteis', 'isteis', u('\xE1bamos'), u('\xE1ramos'), u('\xE1semos'), u('ar\xEDan'), u('ar\xEDas'), u('ar\xE9is'), u('er\xEDan'), u('er\xEDas'), u('er\xE9is'), u('ir\xEDan'), u('ir\xEDas'), u('ir\xE9is'), 'ieran', 'iesen', 'ieron', 'iendo', 'ieras', 'ieses', 'abais', 'arais', 'aseis', u('\xE9amos'), u('ar\xE1n'), u('ar\xE1s'), u('ar\xEDa'), u('er\xE1n'), u('er\xE1s'), u('er\xEDa'), u('ir\xE1n'), u('ir\xE1s'), u('ir\xEDa'), 'iera', 'iese', 'aste', 'iste', 'aban', 'aran', 'asen', 'aron', 'ando', 'abas', 'adas', 'idas', 'aras', 'ases', u('\xEDais'), 'ados', 'idos', 'amos', 'imos', 'emos', u('ar\xE1'), u('ar\xE9'), u('er\xE1'), u('er\xE9'), u('ir\xE1'), u('ir\xE9'), 'aba', 'ada', 'ida', 'ara', 'ase', u('\xEDan'), 'ado', 'ido', u('\xEDas'), u('\xE1is'), u('\xE9is'), u('\xEDa'), 'ad', 'ed', 'id', 'an', u('i\xF3'), 'ar', 'er', 'ir', 'as', u('\xEDs'), 'en', 'es') __step3_suffixes = ("os", "a", "e", "o", u("\xE1"), u("\xE9"), u("\xED"), u("\xF3")) def stem(self, word): """ Stem a Spanish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() step1_success = False r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 0: Attached pronoun for suffix in self.__step0_suffixes: if word.endswith(suffix): if rv.endswith(suffix): if rv[:-len(suffix)].endswith((u("i\xE9ndo"), u("\xE1ndo"), u("\xE1r"), u("\xE9r"), u("\xEDr"))): word = (word[:-len(suffix)].replace(u("\xE1"), "a") .replace(u("\xE9"), "e") .replace(u("\xED"), "i")) r1 = (r1[:-len(suffix)].replace(u("\xE1"), "a") .replace(u("\xE9"), "e") .replace(u("\xED"), "i")) r2 = (r2[:-len(suffix)].replace(u("\xE1"), "a") .replace(u("\xE9"), "e") .replace(u("\xED"), "i")) rv = (rv[:-len(suffix)].replace(u("\xE1"), "a") .replace(u("\xE9"), "e") .replace(u("\xED"), "i")) elif rv[:-len(suffix)].endswith(("ando", "iendo", "ar", "er", "ir")): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] elif (rv[:-len(suffix)].endswith("yendo") and word[:-len(suffix)].endswith("uyendo")): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix == "amente" and r1.endswith(suffix): step1_success = True word = word[:-6] r2 = r2[:-6] rv = rv[:-6] if r2.endswith("iv"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] elif r2.endswith(("os", "ic", "ad")): word = word[:-2] rv = rv[:-2] elif r2.endswith(suffix): step1_success = True if suffix in ("adora", "ador", u("aci\xF3n"), "adoras", "adores", "aciones", "ante", "antes", "ancia", "ancias"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] if r2.endswith("ic"): word = word[:-2] rv = rv[:-2] elif suffix in (u("log\xEDa"), u("log\xEDas")): word = word.replace(suffix, "log") rv = rv.replace(suffix, "log") elif suffix in (u("uci\xF3n"), "uciones"): word = word.replace(suffix, "u") rv = rv.replace(suffix, "u") elif suffix in ("encia", "encias"): word = word.replace(suffix, "ente") rv = rv.replace(suffix, "ente") elif suffix == "mente": word = word[:-5] r2 = r2[:-5] rv = rv[:-5] if r2.endswith(("ante", "able", "ible")): word = word[:-4] rv = rv[:-4] elif suffix in ("idad", "idades"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] for pre_suff in ("abil", "ic", "iv"): if r2.endswith(pre_suff): word = word[:-len(pre_suff)] rv = rv[:-len(pre_suff)] elif suffix in ("ivo", "iva", "ivos", "ivas"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] else: word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 2a: Verb suffixes beginning 'y' if not step1_success: for suffix in self.__step2a_suffixes: if (rv.endswith(suffix) and word[-len(suffix) - 1:-len(suffix)] == "u"): word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 2b: Other verb suffixes for suffix in self.__step2b_suffixes: if rv.endswith(suffix): if suffix in ("en", "es", u("\xE9is"), "emos"): word = word[:-len(suffix)] rv = rv[:-len(suffix)] if word.endswith("gu"): word = word[:-1] if rv.endswith("gu"): rv = rv[:-1] else: word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 3: Residual suffix for suffix in self.__step3_suffixes: if rv.endswith(suffix): if suffix in ("e", u("\xE9")): word = word[:-len(suffix)] rv = rv[:-len(suffix)] if word[-2:] == "gu" and rv[-1] == "u": word = word[:-1] else: word = word[:-len(suffix)] break word = 
(word.replace(u("\xE1"), "a").replace(u("\xE9"), "e") .replace(u("\xED"), "i").replace(u("\xF3"), "o") .replace(u("\xFA"), "u")) return word Whoosh-2.5.7/src/whoosh/lang/snowball/swedish.py0000644000076500000240000000531012254366350021751 0ustar mattstaff00000000000000from .bases import _ScandinavianStemmer from whoosh.compat import u class SwedishStemmer(_ScandinavianStemmer): """ The Swedish Snowball stemmer. :cvar __vowels: The Swedish vowels. :type __vowels: unicode :cvar __s_ending: Letters that may directly appear before a word final 's'. :type __s_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the Swedish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/swedish/stemmer.html """ __vowels = u("aeiouy\xE4\xE5\xF6") __s_ending = "bcdfghjklmnoprtvy" __step1_suffixes = ("heterna", "hetens", "heter", "heten", "anden", "arnas", "ernas", "ornas", "andes", "andet", "arens", "arna", "erna", "orna", "ande", "arne", "aste", "aren", "ades", "erns", "ade", "are", "ern", "ens", "het", "ast", "ad", "en", "ar", "er", "or", "as", "es", "at", "a", "e", "s") __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt") __step3_suffixes = ("fullt", u("l\xF6st"), "els", "lig", "ig") def stem(self, word): """ Stem a Swedish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() r1 = self._r1_scandinavian(word, self.__vowels) # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "s": if word[-2] in self.__s_ending: word = word[:-1] r1 = r1[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): word = word[:-1] r1 = r1[:-1] break # STEP 3 for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix in ("els", "lig", "ig"): word = word[:-len(suffix)] elif suffix in ("fullt", u("l\xF6st")): word = word[:-1] break return word Whoosh-2.5.7/src/whoosh/lang/stopwords.py0000644000076500000240000004044712254366350020540 0ustar mattstaff00000000000000# coding=utf-8 from __future__ import unicode_literals # Stopwords Corpus # # This module contains lists of stop words for several languages. These # are high-frequency grammatical words which are usually ignored in text # retrieval applications. 
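# Usage sketch (added comment, not part of the original module; the token list
# below is purely illustrative):
#
#     from whoosh.lang.stopwords import stoplists
#
#     tokens = ["the", "quick", "brown", "fox"]
#     content_words = [t for t in tokens if t not in stoplists["en"]]
#     # content_words == ["quick", "brown", "fox"]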
# # They were obtained from: # anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ # ===== # This module was generated from the original files using the following script #import os.path #import textwrap # #names = os.listdir("stopwords") #for name in names: # f = open("stopwords/" + name) # wordls = [line.strip() for line in f] # words = " ".join(wordls) # print '"%s": frozenset(u"""' % name # print textwrap.fill(words, 72) # print '""".split())' # print stoplists = { "da": frozenset(""" og i jeg det at en den til er som på de med han af for ikke der var mig sig men et har om vi min havde ham hun nu over da fra du ud sin dem os op man hans hvor eller hvad skal selv her alle vil blev kunne ind når være dog noget ville jo deres efter ned skulle denne end dette mit også under have dig anden hende mine alt meget sit sine vor mod disse hvis din nogle hos blive mange ad bliver hendes været thi jer sådan """.split()), "nl": frozenset(""" de en van ik te dat die in een hij het niet zijn is was op aan met als voor had er maar om hem dan zou of wat mijn men dit zo door over ze zich bij ook tot je mij uit der daar haar naar heb hoe heeft hebben deze u want nog zal me zij nu ge geen omdat iets worden toch al waren veel meer doen toen moet ben zonder kan hun dus alles onder ja eens hier wie werd altijd doch wordt wezen kunnen ons zelf tegen na reeds wil kon niets uw iemand geweest andere """.split()), "en": frozenset(""" i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now """.split()), "fi": frozenset(""" olla olen olet on olemme olette ovat ole oli olisi olisit olisin olisimme olisitte olisivat olit olin olimme olitte olivat ollut olleet en et ei emme ette eivät minä minun minut minua minussa minusta minuun minulla minulta minulle sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle me meidän meidät meitä meissä meistä meihin meillä meiltä meille te teidän teidät teitä teissä teistä teihin teillä teiltä teille he heidän heidät heitä heissä heistä heihin heillä heiltä heille tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi se sen sitä siinä siitä siihen sillä siltä sille sinä siksi nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi mitkä joka jonka jota jossa josta johon jolla jolta jolle jona joksi jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi että ja jos koska kuin mutta niin sekä sillä tai vaan vai vaikka kanssa mukaan noin poikki yli 
kun niin nyt itse """.split()), "fr": frozenset(""" au aux avec ce ces dans de des du elle en et eux il je la le leur lui ma mais me même mes moi mon ne nos notre nous on ou par pas pour qu que qui sa se ses son sur ta te tes toi ton tu un une vos votre vous c d j l à m n s t y été étée étées étés étant étante étants étantes suis es est sommes êtes sont serai seras sera serons serez seront serais serait serions seriez seraient étais était étions étiez étaient fus fut fûmes fûtes furent sois soit soyons soyez soient fusse fusses fût fussions fussiez fussent ayant ayante ayantes ayants eu eue eues eus ai as avons avez ont aurai auras aura aurons aurez auront aurais aurait aurions auriez auraient avais avait avions aviez avaient eut eûmes eûtes eurent aie aies ait ayons ayez aient eusse eusses eût eussions eussiez eussent """.split()), "de": frozenset(""" aber alle allem allen aller alles als also am an ander andere anderem anderen anderer anderes anderm andern anderr anders auch auf aus bei bin bis bist da damit dann der den des dem die das daß derselbe derselben denselben desselben demselben dieselbe dieselben dasselbe dazu dein deine deinem deinen deiner deines denn derer dessen dich dir du dies diese diesem diesen dieser dieses doch dort durch ein eine einem einen einer eines einig einige einigem einigen einiger einiges einmal er ihn ihm es etwas euer eure eurem euren eurer eures für gegen gewesen hab habe haben hat hatte hatten hier hin hinter ich mich mir ihr ihre ihrem ihren ihrer ihres euch im in indem ins ist jede jedem jeden jeder jedes jene jenem jenen jener jenes jetzt kann kein keine keinem keinen keiner keines können könnte machen man manche manchem manchen mancher manches mein meine meinem meinen meiner meines mit muss musste nach nicht nichts noch nun nur ob oder ohne sehr sein seine seinem seinen seiner seines selbst sich sie ihnen sind so solche solchem solchen solcher solches soll sollte sondern sonst über um und uns unse unsem unsen unser unses unter viel vom von vor während war waren warst was weg weil weiter welche welchem welchen welcher welches wenn werde werden wie wieder will wir wird wirst wo wollen wollte würde würden zu zum zur zwar zwischen """.split()), "hu": frozenset(""" a ahogy ahol aki akik akkor alatt által általában amely amelyek amelyekben amelyeket amelyet amelynek ami amit amolyan amíg amikor át abban ahhoz annak arra arról az azok azon azt azzal azért aztán azután azonban bár be belül benne cikk cikkek cikkeket csak de e eddig egész egy egyes egyetlen egyéb egyik egyre ekkor el elég ellen elõ elõször elõtt elsõ én éppen ebben ehhez emilyen ennek erre ez ezt ezek ezen ezzel ezért és fel felé hanem hiszen hogy hogyan igen így illetve ill. 
ill ilyen ilyenkor ison ismét itt jó jól jobban kell kellett keresztül keressünk ki kívül között közül legalább lehet lehetett legyen lenne lenni lesz lett maga magát majd majd már más másik meg még mellett mert mely melyek mi mit míg miért milyen mikor minden mindent mindenki mindig mint mintha mivel most nagy nagyobb nagyon ne néha nekem neki nem néhány nélkül nincs olyan ott össze õ õk õket pedig persze rá s saját sem semmi sok sokat sokkal számára szemben szerint szinte talán tehát teljes tovább továbbá több úgy ugyanis új újabb újra után utána utolsó vagy vagyis valaki valami valamint való vagyok van vannak volt voltam voltak voltunk vissza vele viszont volna """.split()), "it": frozenset(""" ad al allo ai agli all agl alla alle con col coi da dal dallo dai dagli dall dagl dalla dalle di del dello dei degli dell degl della delle in nel nello nei negli nell negl nella nelle su sul sullo sui sugli sull sugl sulla sulle per tra contro io tu lui lei noi voi loro mio mia miei mie tuo tua tuoi tue suo sua suoi sue nostro nostra nostri nostre vostro vostra vostri vostre mi ti ci vi lo la li le gli ne il un uno una ma ed se perché anche come dov dove che chi cui non più quale quanto quanti quanta quante quello quelli quella quelle questo questi questa queste si tutto tutti a c e i l o ho hai ha abbiamo avete hanno abbia abbiate abbiano avrò avrai avrà avremo avrete avranno avrei avresti avrebbe avremmo avreste avrebbero avevo avevi aveva avevamo avevate avevano ebbi avesti ebbe avemmo aveste ebbero avessi avesse avessimo avessero avendo avuto avuta avuti avute sono sei è siamo siete sia siate siano sarò sarai sarà saremo sarete saranno sarei saresti sarebbe saremmo sareste sarebbero ero eri era eravamo eravate erano fui fosti fu fummo foste furono fossi fosse fossimo fossero essendo faccio fai facciamo fanno faccia facciate facciano farò farai farà faremo farete faranno farei faresti farebbe faremmo fareste farebbero facevo facevi faceva facevamo facevate facevano feci facesti fece facemmo faceste fecero facessi facesse facessimo facessero facendo sto stai sta stiamo stanno stia stiate stiano starò starai starà staremo starete staranno starei staresti starebbe staremmo stareste starebbero stavo stavi stava stavamo stavate stavano stetti stesti stette stemmo steste stettero stessi stesse stessimo stessero stando """.split()), "no": frozenset(""" og i jeg det at en et den til er som på de med han av ikke ikkje der så var meg seg men ett har om vi min mitt ha hadde hun nå over da ved fra du ut sin dem oss opp man kan hans hvor eller hva skal selv sjøl her alle vil bli ble blei blitt kunne inn når være kom noen noe ville dere som deres kun ja etter ned skulle denne for deg si sine sitt mot å meget hvorfor dette disse uten hvordan ingen din ditt blir samme hvilken hvilke sånn inni mellom vår hver hvem vors hvis både bare enn fordi før mange også slik vært være båe begge siden dykk dykkar dei deira deires deim di då eg ein eit eitt elles honom hjå ho hoe henne hennar hennes hoss hossen ikkje ingi inkje korleis korso kva kvar kvarhelst kven kvi kvifor me medan mi mine mykje no nokon noka nokor noko nokre si sia sidan so somt somme um upp vere vore verte vort varte vart """.split()), "pt": frozenset(""" de a o que e do da em um para com não uma os no se na por mais as dos como mas ao ele das à seu sua ou quando muito nos já eu também só pelo pela até isso ela entre depois sem mesmo aos seus quem nas me esse eles você essa num nem suas meu às minha numa pelos elas qual nós lhe deles essas esses pelas 
este dele tu te vocês vos lhes meus minhas teu tua teus tuas nosso nossa nossos nossas dela delas esta estes estas aquele aquela aqueles aquelas isto aquilo estou está estamos estão estive esteve estivemos estiveram estava estávamos estavam estivera estivéramos esteja estejamos estejam estivesse estivéssemos estivessem estiver estivermos estiverem hei há havemos hão houve houvemos houveram houvera houvéramos haja hajamos hajam houvesse houvéssemos houvessem houver houvermos houverem houverei houverá houveremos houverão houveria houveríamos houveriam sou somos são era éramos eram fui foi fomos foram fora fôramos seja sejamos sejam fosse fôssemos fossem for formos forem serei será seremos serão seria seríamos seriam tenho tem temos tém tinha tínhamos tinham tive teve tivemos tiveram tivera tivéramos tenha tenhamos tenham tivesse tivéssemos tivessem tiver tivermos tiverem terei terá teremos terão teria teríamos teriam """.split()), "ru": frozenset(""" и в во не что он на я с со как а то все она так его но да ты к у же вы за бы по только ее мне было вот от меня еще нет о из ему теперь когда даже ну вдруг ли если уже или ни быть был него до вас нибудь опять уж вам ведь там потом себя ничего ей может они тут где есть надо ней для мы тебя их чем была сам чтоб без будто чего раз тоже себе под будет ж тогда кто этот того потому этого какой совсем ним здесь этом один почти мой тем чтобы нее сейчас были куда зачем всех никогда можно при наконец два об другой хоть после над больше тот через эти нас про всего них какая много разве три эту моя впрочем хорошо свою этой перед иногда лучше чуть том нельзя такой им более всегда конечно всю между """.split()), "es": frozenset(""" de la que el en y a los del se las por un para con no una su al lo como más pero sus le ya o este sí porque esta entre cuando muy sin sobre también me hasta hay donde quien desde todo nos durante todos uno les ni contra otros ese eso ante ellos e esto mí antes algunos qué unos yo otro otras otra él tanto esa estos mucho quienes nada muchos cual poco ella estar estas algunas algo nosotros mi mis tú te ti tu tus ellas nosotras vosostros vosostras os mío mía míos mías tuyo tuya tuyos tuyas suyo suya suyos suyas nuestro nuestra nuestros nuestras vuestro vuestra vuestros vuestras esos esas estoy estás está estamos estáis están esté estés estemos estéis estén estaré estarás estará estaremos estaréis estarán estaría estarías estaríamos estaríais estarían estaba estabas estábamos estabais estaban estuve estuviste estuvo estuvimos estuvisteis estuvieron estuviera estuvieras estuviéramos estuvierais estuvieran estuviese estuvieses estuviésemos estuvieseis estuviesen estando estado estada estados estadas estad he has ha hemos habéis han haya hayas hayamos hayáis hayan habré habrás habrá habremos habréis habrán habría habrías habríamos habríais habrían había habías habíamos habíais habían hube hubiste hubo hubimos hubisteis hubieron hubiera hubieras hubiéramos hubierais hubieran hubiese hubieses hubiésemos hubieseis hubiesen habiendo habido habida habidos habidas soy eres es somos sois son sea seas seamos seáis sean seré serás será seremos seréis serán sería serías seríamos seríais serían era eras éramos erais eran fui fuiste fue fuimos fuisteis fueron fuera fueras fuéramos fuerais fueran fuese fueses fuésemos fueseis fuesen sintiendo sentido sentida sentidos sentidas siente sentid tengo tienes tiene tenemos tenéis tienen tenga tengas tengamos tengáis tengan tendré tendrás tendrá tendremos tendréis tendrán tendría tendrías tendríamos tendríais 
tendrían tenía tenías teníamos teníais tenían tuve tuviste tuvo tuvimos tuvisteis tuvieron tuviera tuvieras tuviéramos tuvierais tuvieran tuviese tuvieses tuviésemos tuvieseis tuviesen teniendo tenido tenida tenidos tenidas tened """.split()), "sv": frozenset(""" och det att i en jag hon som han på den med var sig för så till är men ett om hade de av icke mig du henne då sin nu har inte hans honom skulle hennes där min man ej vid kunde något från ut när efter upp vi dem vara vad över än dig kan sina här ha mot alla under någon eller allt mycket sedan ju denna själv detta åt utan varit hur ingen mitt ni bli blev oss din dessa några deras blir mina samma vilken er sådan vår blivit dess inom mellan sådant varför varje vilka ditt vem vilket sitta sådana vart dina vars vårt våra ert era vilkas """.split()), "tr": frozenset(""" acaba ama aslında az bazı belki biri birkaç birşey biz bu çok çünkü da daha de defa diye eğer en gibi hem hep hepsi her hiç için ile ise kez ki kim mı mu mü nasıl ne neden nerde nerede nereye niçin niye o sanki şey siz şu tüm ve veya ya yani """.split()), } Whoosh-2.5.7/src/whoosh/lang/wordnet.py0000644000076500000240000002146712254366350020157 0ustar mattstaff00000000000000# Copyright 2009 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """This module contains low-level functions and a high-level class for parsing the prolog file "wn_s.pl" from the WordNet prolog download into an object suitable for looking up synonyms and performing query expansion. http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz """ from collections import defaultdict from whoosh.compat import iterkeys, text_type from whoosh.fields import Schema, ID, STORED def parse_file(f): """Parses the WordNet wn_s.pl prolog file and returns two dictionaries: word2nums and num2words. 
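    A rough usage sketch, assuming a local copy of wn_s.pl from the download
    URL above (the membership check simply mirrors the "hail" synonym
    examples later in this module):

        >>> f = open("wn_s.pl")
        >>> word2nums, num2words = parse_file(f)
        >>> "hail" in word2nums
        True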
""" word2nums = defaultdict(list) num2words = defaultdict(list) for line in f: if not line.startswith("s("): continue line = line[2:] num = int(line[:line.find(",")]) qt = line.find("'") line = line[qt + 1:] qt = line.find("'") word = line[:qt].lower() if not word.isalpha(): continue word2nums[word].append(num) num2words[num].append(word) return word2nums, num2words def make_index(storage, indexname, word2nums, num2words): """Creates a Whoosh index in the given storage object containing synonyms taken from word2nums and num2words. Returns the Index object. """ schema = Schema(word=ID, syns=STORED) ix = storage.create_index(schema, indexname=indexname) w = ix.writer() for word in iterkeys(word2nums): syns = synonyms(word2nums, num2words, word) w.add_document(word=text_type(word), syns=syns) w.commit() return ix def synonyms(word2nums, num2words, word): """Uses the word2nums and num2words dicts to look up synonyms for the given word. Returns a list of synonym strings. """ keys = word2nums[word] syns = set() for key in keys: syns = syns.union(num2words[key]) if word in syns: syns.remove(word) return sorted(syns) class Thesaurus(object): """Represents the WordNet synonym database, either loaded into memory from the wn_s.pl Prolog file, or stored on disk in a Whoosh index. This class allows you to parse the prolog file "wn_s.pl" from the WordNet prolog download into an object suitable for looking up synonyms and performing query expansion. http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz To load a Thesaurus object from the wn_s.pl file... >>> t = Thesaurus.from_filename("wn_s.pl") To save the in-memory Thesaurus to a Whoosh index... >>> from whoosh.filedb.filestore import FileStorage >>> fs = FileStorage("index") >>> t.to_storage(fs) To load a Thesaurus object from a Whoosh index... >>> t = Thesaurus.from_storage(fs) The Thesaurus object is thus usable in two ways: * Parse the wn_s.pl file into memory (Thesaurus.from_*) and then look up synonyms in memory. This has a startup cost for parsing the file, and uses quite a bit of memory to store two large dictionaries, however synonym look-ups are very fast. * Parse the wn_s.pl file into memory (Thesaurus.from_filename) then save it to an index (to_storage). From then on, open the thesaurus from the saved index (Thesaurus.from_storage). This has a large cost for storing the index, but after that it is faster to open the Thesaurus (than re-parsing the file) but slightly slower to look up synonyms. Here are timings for various tasks on my (fast) Windows machine, which might give an idea of relative costs for in-memory vs. on-disk. ================================================ ================ Task Approx. time (s) ================================================ ================ Parsing the wn_s.pl file 1.045 Saving to an on-disk index 13.084 Loading from an on-disk index 0.082 Look up synonyms for "light" (in memory) 0.0011 Look up synonyms for "light" (loaded from disk) 0.0028 ================================================ ================ Basically, if you can afford spending the memory necessary to parse the Thesaurus and then cache it, it's faster. Otherwise, use an on-disk index. """ def __init__(self): self.w2n = None self.n2w = None self.searcher = None @classmethod def from_file(cls, fileobj): """Creates a Thesaurus object from the given file-like object, which should contain the WordNet wn_s.pl file. 
>>> f = open("wn_s.pl") >>> t = Thesaurus.from_file(f) >>> t.synonyms("hail") ['acclaim', 'come', 'herald'] """ thes = cls() thes.w2n, thes.n2w = parse_file(fileobj) return thes @classmethod def from_filename(cls, filename): """Creates a Thesaurus object from the given filename, which should contain the WordNet wn_s.pl file. >>> t = Thesaurus.from_filename("wn_s.pl") >>> t.synonyms("hail") ['acclaim', 'come', 'herald'] """ f = open(filename, "rb") try: return cls.from_file(f) finally: f.close() @classmethod def from_storage(cls, storage, indexname="THES"): """Creates a Thesaurus object from the given storage object, which should contain an index created by Thesaurus.to_storage(). >>> from whoosh.filedb.filestore import FileStorage >>> fs = FileStorage("index") >>> t = Thesaurus.from_storage(fs) >>> t.synonyms("hail") ['acclaim', 'come', 'herald'] :param storage: A :class:`whoosh.store.Storage` object from which to load the index. :param indexname: A name for the index. This allows you to store multiple indexes in the same storage object. """ thes = cls() index = storage.open_index(indexname=indexname) thes.searcher = index.searcher() return thes def to_storage(self, storage, indexname="THES"): """Creates am index in the given storage object from the synonyms loaded from a WordNet file. >>> from whoosh.filedb.filestore import FileStorage >>> fs = FileStorage("index") >>> t = Thesaurus.from_filename("wn_s.pl") >>> t.to_storage(fs) :param storage: A :class:`whoosh.store.Storage` object in which to save the index. :param indexname: A name for the index. This allows you to store multiple indexes in the same storage object. """ if not self.w2n or not self.n2w: raise Exception("No synonyms loaded") make_index(storage, indexname, self.w2n, self.n2w) def synonyms(self, word): """Returns a list of synonyms for the given word. >>> thesaurus.synonyms("hail") ['acclaim', 'come', 'herald'] """ word = word.lower() if self.searcher: return self.searcher.document(word=word)["syns"] else: return synonyms(self.w2n, self.n2w, word) if __name__ == "__main__": from whoosh.filedb.filestore import FileStorage st = FileStorage("c:/testindex") # th = Thesaurus.from_filename("c:/wordnet/wn_s.pl") # # th.to_storage(st) # # t = clock() # print th.synonyms("light") # print(clock() - t) th = Thesaurus.from_storage(st) print(th.synonyms("hail")) Whoosh-2.5.7/src/whoosh/legacy.py0000644000076500000240000000660312254366350017013 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module contains code for maintaining backwards compatibility with old index formats. """ from whoosh.util.loading import RenamingUnpickler def load_110_toc(stream, gen, schema, version): # Between version -110 and version -111, I reorganized the modules and # changed the implementation of the NUMERIC field, so we have to change the # classes the unpickler tries to load if we need to read an old schema # Read the length of the pickled schema picklen = stream.read_varint() if schema: # If the user passed us a schema, use it and skip the one on disk stream.seek(picklen, 1) else: # Remap the old classes and functions to their moved versions as we # unpickle the schema scuts = {"wf": "whoosh.fields", "wsn": "whoosh.support.numeric", "wcw2": "whoosh.codec.whoosh2"} objmap = {"%(wf)s.NUMERIC": "%(wcw2)s.OLD_NUMERIC", "%(wf)s.DATETIME": "%(wcw2)s.OLD_DATETIME", "%(wsn)s.int_to_text": "%(wcw2)s.int_to_text", "%(wsn)s.text_to_int": "%(wcw2)s.text_to_int", "%(wsn)s.long_to_text": "%(wcw2)s.long_to_text", "%(wsn)s.text_to_long": "%(wcw2)s.text_to_long", "%(wsn)s.float_to_text": "%(wcw2)s.float_to_text", "%(wsn)s.text_to_float": "%(wcw2)s.text_to_float", } ru = RenamingUnpickler(stream, objmap, shortcuts=scuts) schema = ru.load() # Read the generation number index_gen = stream.read_int() assert gen == index_gen # Unused number _ = stream.read_int() # Unpickle the list of segment objects segments = stream.read_pickle() return schema, segments # Map TOC version numbers to functions to load that version toc_loaders = {-110: load_110_toc} # Map segment class names to functions to load the segment segment_loaders = {} Whoosh-2.5.7/src/whoosh/matching/0000755000076500000240000000000012277504634016766 5ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/matching/__init__.py0000644000076500000240000000321612254366350021075 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.matching.mcore import * from whoosh.matching.binary import * from whoosh.matching.wrappers import * from whoosh.matching.combo import * Whoosh-2.5.7/src/whoosh/matching/binary.py0000644000076500000240000005760412254366350020634 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.matching import mcore class BiMatcher(mcore.Matcher): """Base class for matchers that combine the results of two sub-matchers in some way. """ def __init__(self, a, b): super(BiMatcher, self).__init__() self.a = a self.b = b def reset(self): self.a.reset() self.b.reset() def __repr__(self): return "%s(%r, %r)" % (self.__class__.__name__, self.a, self.b) def children(self): return [self.a, self.b] def copy(self): return self.__class__(self.a.copy(), self.b.copy()) def depth(self): return 1 + max(self.a.depth(), self.b.depth()) def skip_to(self, id): if not self.is_active(): raise mcore.ReadTooFar ra = self.a.skip_to(id) rb = self.b.skip_to(id) return ra or rb def supports_block_quality(self): return (self.a.supports_block_quality() and self.b.supports_block_quality()) def supports(self, astype): return self.a.supports(astype) and self.b.supports(astype) class AdditiveBiMatcher(BiMatcher): """Base class for binary matchers where the scores of the sub-matchers are added together. 
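    A rough doctest sketch using hypothetical document IDs and weights, via
    the UnionMatcher subclass defined later in this module (ListMatcher is
    the synthetic matcher from whoosh.matching.mcore):

        >>> from whoosh.matching.mcore import ListMatcher
        >>> from whoosh.matching.binary import UnionMatcher
        >>> um = UnionMatcher(ListMatcher([1], weights=[2.0]),
        ...                   ListMatcher([1], weights=[3.0]))
        >>> um.weight()
        5.0
        >>> u2 = UnionMatcher(ListMatcher([1, 3]), ListMatcher([2, 3]))
        >>> list(u2.all_ids())
        [1, 2, 3]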
""" def max_quality(self): q = 0.0 if self.a.is_active(): q += self.a.max_quality() if self.b.is_active(): q += self.b.max_quality() return q def block_quality(self): bq = 0.0 if self.a.is_active(): bq += self.a.block_quality() if self.b.is_active(): bq += self.b.block_quality() return bq def weight(self): return (self.a.weight() + self.b.weight()) def score(self): return (self.a.score() + self.b.score()) def __eq__(self, other): return self.__class__ is type(other) def __lt__(self, other): return type(other) is self.__class__ def __ne__(self, other): return not self.__eq__(other) def __gt__(self, other): return not (self.__lt__(other) or self.__eq__(other)) def __le__(self, other): return self.__eq__(other) or self.__lt__(other) def __ge__(self, other): return self.__eq__(other) or self.__gt__(other) class UnionMatcher(AdditiveBiMatcher): """Matches the union (OR) of the postings in the two sub-matchers. """ _id = None def replace(self, minquality=0): a = self.a b = self.b a_active = a.is_active() b_active = b.is_active() # If neither sub-matcher on its own has a high enough max quality to # contribute, convert to an intersection matcher if minquality and a_active and b_active: a_max = a.max_quality() b_max = b.max_quality() if a_max < minquality and b_max < minquality: return IntersectionMatcher(a, b).replace(minquality) elif a_max < minquality: return AndMaybeMatcher(b, a) elif b_max < minquality: return AndMaybeMatcher(a, b) # If one or both of the sub-matchers are inactive, convert if not (a_active or b_active): return mcore.NullMatcher() elif not a_active: return b.replace(minquality) elif not b_active: return a.replace(minquality) a = a.replace(minquality - b.max_quality() if minquality else 0) b = b.replace(minquality - a.max_quality() if minquality else 0) # If one of the sub-matchers changed, return a new union if a is not self.a or b is not self.b: return self.__class__(a, b) else: self._id = None return self def is_active(self): return self.a.is_active() or self.b.is_active() def skip_to(self, id): self._id = None ra = rb = False if self.a.is_active(): ra = self.a.skip_to(id) if self.b.is_active(): rb = self.b.skip_to(id) return ra or rb def id(self): _id = self._id if _id is not None: return _id a = self.a b = self.b if not a.is_active(): _id = b.id() elif not b.is_active(): _id = a.id() else: _id = min(a.id(), b.id()) self._id = _id return _id # Using sets is faster in most cases, but could potentially use a lot of # memory. Comment out this method override to not use sets. 
#def all_ids(self): # return iter(sorted(set(self.a.all_ids()) | set(self.b.all_ids()))) def next(self): self._id = None a = self.a b = self.b a_active = a.is_active() b_active = b.is_active() # Shortcut when one matcher is inactive if not (a_active or b_active): raise mcore.ReadTooFar elif not a_active: return b.next() elif not b_active: return a.next() a_id = a.id() b_id = b.id() ar = br = None # After all that, here's the actual implementation if a_id <= b_id: ar = a.next() if b_id <= a_id: br = b.next() return ar or br def spans(self): if not self.a.is_active(): return self.b.spans() if not self.b.is_active(): return self.a.spans() id_a = self.a.id() id_b = self.b.id() if id_a < id_b: return self.a.spans() elif id_b < id_a: return self.b.spans() else: return sorted(set(self.a.spans()) | set(self.b.spans())) def weight(self): a = self.a b = self.b if not a.is_active(): return b.weight() if not b.is_active(): return a.weight() id_a = a.id() id_b = b.id() if id_a < id_b: return a.weight() elif id_b < id_a: return b.weight() else: return (a.weight() + b.weight()) def score(self): a = self.a b = self.b if not a.is_active(): return b.score() if not b.is_active(): return a.score() id_a = a.id() id_b = b.id() if id_a < id_b: return a.score() elif id_b < id_a: return b.score() else: return (a.score() + b.score()) def skip_to_quality(self, minquality): self._id = None a = self.a b = self.b if not (a.is_active() or b.is_active()): raise mcore.ReadTooFar # Short circuit if one matcher is inactive if not a.is_active(): return b.skip_to_quality(minquality) elif not b.is_active(): return a.skip_to_quality(minquality) skipped = 0 aq = a.block_quality() bq = b.block_quality() while a.is_active() and b.is_active() and aq + bq <= minquality: if aq < bq: skipped += a.skip_to_quality(minquality - bq) aq = a.block_quality() else: skipped += b.skip_to_quality(minquality - aq) bq = b.block_quality() return skipped class DisjunctionMaxMatcher(UnionMatcher): """Matches the union (OR) of two sub-matchers. Where both sub-matchers match the same posting, returns the weight/score of the higher-scoring posting. """ # TODO: this class inherits from AdditiveBiMatcher (through UnionMatcher) # but it does not add the scores of the sub-matchers together (it # overrides all methods that perform addition). Need to clean up the # inheritance. 
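    # A rough illustration of how this differs from UnionMatcher, using
    # hypothetical document IDs and weights (ListMatcher comes from
    # whoosh.matching.mcore):
    #
    #   >>> a = ListMatcher([1], weights=[2.0])
    #   >>> b = ListMatcher([1], weights=[3.0])
    #   >>> DisjunctionMaxMatcher(a, b).score()  # max of the two, not 2.0 + 3.0
    #   3.0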
def __init__(self, a, b, tiebreak=0.0): super(DisjunctionMaxMatcher, self).__init__(a, b) self.tiebreak = tiebreak def copy(self): return self.__class__(self.a.copy(), self.b.copy(), tiebreak=self.tiebreak) def replace(self, minquality=0): a = self.a b = self.b a_active = a.is_active() b_active = b.is_active() # DisMax takes the max of the sub-matcher qualities instead of adding # them, so we need special logic here if minquality and a_active and b_active: a_max = a.max_quality() b_max = b.max_quality() if a_max < minquality and b_max < minquality: # If neither sub-matcher has a high enough max quality to # contribute, return an inactive matcher return mcore.NullMatcher() elif b_max < minquality: # If the b matcher can't contribute, return a return a.replace(minquality) elif a_max < minquality: # If the a matcher can't contribute, return b return b.replace(minquality) if not (a_active or b_active): return mcore.NullMatcher() elif not a_active: return b.replace(minquality) elif not b_active: return a.replace(minquality) # We CAN pass the minquality down here, since we don't add the two # scores together a = a.replace(minquality) b = b.replace(minquality) a_active = a.is_active() b_active = b.is_active() # It's kind of tedious to check for inactive sub-matchers all over # again here after we replace them, but it's probably better than # returning a replacement with an inactive sub-matcher if not (a_active and b_active): return mcore.NullMatcher() elif not a_active: return b elif not b_active: return a elif a is not self.a or b is not self.b: # If one of the sub-matchers changed, return a new DisMax return self.__class__(a, b) else: return self def score(self): if not self.a.is_active(): return self.b.score() elif not self.b.is_active(): return self.a.score() else: return max(self.a.score(), self.b.score()) def max_quality(self): return max(self.a.max_quality(), self.b.max_quality()) def block_quality(self): return max(self.a.block_quality(), self.b.block_quality()) def skip_to_quality(self, minquality): a = self.a b = self.b # Short circuit if one matcher is inactive if not a.is_active(): sk = b.skip_to_quality(minquality) return sk elif not b.is_active(): return a.skip_to_quality(minquality) skipped = 0 aq = a.block_quality() bq = b.block_quality() while a.is_active() and b.is_active() and max(aq, bq) <= minquality: if aq <= minquality: skipped += a.skip_to_quality(minquality) aq = a.block_quality() if bq <= minquality: skipped += b.skip_to_quality(minquality) bq = b.block_quality() return skipped class IntersectionMatcher(AdditiveBiMatcher): """Matches the intersection (AND) of the postings in the two sub-matchers. 
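    A rough sketch with hypothetical document IDs (ListMatcher is the
    synthetic matcher from whoosh.matching.mcore):

        >>> from whoosh.matching.mcore import ListMatcher
        >>> im = IntersectionMatcher(ListMatcher([1, 2, 4]),
        ...                          ListMatcher([2, 3, 4]))
        >>> im.id()
        2
        >>> list(im.all_ids())
        [2, 4]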
""" def __init__(self, a, b): super(IntersectionMatcher, self).__init__(a, b) self._find_first() def reset(self): self.a.reset() self.b.reset() self._find_first() def _find_first(self): if (self.a.is_active() and self.b.is_active() and self.a.id() != self.b.id()): self._find_next() def replace(self, minquality=0): a = self.a b = self.b a_active = a.is_active() b_active = b.is_active() if not (a_active and b_active): # Intersection matcher requires that both sub-matchers be active return mcore.NullMatcher() if minquality: a_max = a.max_quality() b_max = b.max_quality() if a_max + b_max < minquality: # If the combined quality of the sub-matchers can't contribute, # return an inactive matcher return mcore.NullMatcher() # Require that the replacements be able to contribute results # higher than the minquality a_min = minquality - b_max b_min = minquality - a_max else: a_min = b_min = 0 a = a.replace(a_min) b = b.replace(b_min) a_active = a.is_active() b_active = b.is_active() if not (a_active or b_active): return mcore.NullMatcher() elif not a_active: return b elif not b_active: return a elif a is not self.a or b is not self.b: return self.__class__(a, b) else: return self def is_active(self): return self.a.is_active() and self.b.is_active() def _find_next(self): a = self.a b = self.b a_id = a.id() b_id = b.id() assert a_id != b_id r = False while a.is_active() and b.is_active() and a_id != b_id: if a_id < b_id: ra = a.skip_to(b_id) if not a.is_active(): return r = r or ra a_id = a.id() else: rb = b.skip_to(a_id) if not b.is_active(): return r = r or rb b_id = b.id() return r def id(self): return self.a.id() # Using sets is faster in some cases, but could potentially use a lot of # memory def all_ids(self): return iter(sorted(set(self.a.all_ids()) & set(self.b.all_ids()))) def skip_to(self, id): if not self.is_active(): raise mcore.ReadTooFar ra = self.a.skip_to(id) rb = self.b.skip_to(id) if self.is_active(): rn = False if self.a.id() != self.b.id(): rn = self._find_next() return ra or rb or rn def skip_to_quality(self, minquality): a = self.a b = self.b minquality = minquality skipped = 0 aq = a.block_quality() bq = b.block_quality() while a.is_active() and b.is_active() and aq + bq <= minquality: if aq < bq: # If the block quality of A is less than B, skip A ahead until # it can contribute at least the balance of the required min # quality when added to B sk = a.skip_to_quality(minquality - bq) skipped += sk if not sk and a.is_active(): # The matcher couldn't skip ahead for some reason, so just # advance and try again a.next() else: # And vice-versa sk = b.skip_to_quality(minquality - aq) skipped += sk if not sk and b.is_active(): b.next() if not a.is_active() or not b.is_active(): # One of the matchers is exhausted break if a.id() != b.id(): # We want to always leave in a state where the matchers are at # the same document, so call _find_next() to sync them self._find_next() # Get the block qualities at the new matcher positions aq = a.block_quality() bq = b.block_quality() return skipped def next(self): if not self.is_active(): raise mcore.ReadTooFar # We must assume that the ids are equal whenever next() is called (they # should have been made equal by _find_next), so advance them both ar = self.a.next() if self.is_active(): nr = self._find_next() return ar or nr def spans(self): return sorted(set(self.a.spans()) | set(self.b.spans())) class AndNotMatcher(BiMatcher): """Matches the postings in the first sub-matcher that are NOT present in the second sub-matcher. 
""" def __init__(self, a, b): super(AndNotMatcher, self).__init__(a, b) self._find_first() def reset(self): self.a.reset() self.b.reset() self._find_first() def _find_first(self): if (self.a.is_active() and self.b.is_active() and self.a.id() == self.b.id()): self._find_next() def is_active(self): return self.a.is_active() def _find_next(self): pos = self.a neg = self.b if not neg.is_active(): return pos_id = pos.id() r = False if neg.id() < pos_id: neg.skip_to(pos_id) while pos.is_active() and neg.is_active() and pos_id == neg.id(): nr = pos.next() if not pos.is_active(): break r = r or nr pos_id = pos.id() neg.skip_to(pos_id) return r def supports_block_quality(self): return self.a.supports_block_quality() def replace(self, minquality=0): if not self.a.is_active(): # The a matcher is required, so if it's inactive, return an # inactive matcher return mcore.NullMatcher() elif (minquality and self.a.max_quality() < minquality): # If the quality of the required matcher isn't high enough to # contribute, return an inactive matcher return mcore.NullMatcher() elif not self.b.is_active(): # If the prohibited matcher is inactive, convert to just the # required matcher return self.a.replace(minquality) a = self.a.replace(minquality) b = self.b.replace() if a is not self.a or b is not self.b: # If one of the sub-matchers was replaced, return a new AndNot return self.__class__(a, b) else: return self def max_quality(self): return self.a.max_quality() def block_quality(self): return self.a.block_quality() def skip_to_quality(self, minquality): skipped = self.a.skip_to_quality(minquality) self._find_next() return skipped def id(self): return self.a.id() def next(self): if not self.a.is_active(): raise mcore.ReadTooFar ar = self.a.next() nr = False if self.a.is_active() and self.b.is_active(): nr = self._find_next() return ar or nr def skip_to(self, id): if not self.a.is_active(): raise mcore.ReadTooFar if id < self.a.id(): return self.a.skip_to(id) if self.b.is_active(): self.b.skip_to(id) self._find_next() def weight(self): return self.a.weight() def score(self): return self.a.score() def supports(self, astype): return self.a.supports(astype) def value(self): return self.a.value() def value_as(self, astype): return self.a.value_as(astype) class AndMaybeMatcher(AdditiveBiMatcher): """Matches postings in the first sub-matcher, and if the same posting is in the second sub-matcher, adds their scores. 
""" def __init__(self, a, b): AdditiveBiMatcher.__init__(self, a, b) self._first_b() def reset(self): self.a.reset() self.b.reset() self._first_b() def _first_b(self): a = self.a b = self.b if a.is_active() and b.is_active() and a.id() != b.id(): b.skip_to(a.id()) def is_active(self): return self.a.is_active() def id(self): return self.a.id() def next(self): if not self.a.is_active(): raise mcore.ReadTooFar ar = self.a.next() br = False if self.a.is_active() and self.b.is_active(): br = self.b.skip_to(self.a.id()) return ar or br def skip_to(self, id): if not self.a.is_active(): raise mcore.ReadTooFar ra = self.a.skip_to(id) rb = False if self.a.is_active() and self.b.is_active(): rb = self.b.skip_to(id) return ra or rb def replace(self, minquality=0): a = self.a b = self.b a_active = a.is_active() b_active = b.is_active() if not a_active: return mcore.NullMatcher() elif minquality and b_active: if a.max_quality() + b.max_quality() < minquality: # If the combined max quality of the sub-matchers isn't high # enough to possibly contribute, return an inactive matcher return mcore.NullMatcher() elif a.max_quality() < minquality: # If the max quality of the main sub-matcher isn't high enough # to ever contribute without the optional sub- matcher, change # into an IntersectionMatcher return IntersectionMatcher(self.a, self.b) elif not b_active: return a.replace(minquality) new_a = a.replace(minquality - b.max_quality()) new_b = b.replace(minquality - a.max_quality()) if new_a is not a or new_b is not b: # If one of the sub-matchers changed, return a new AndMaybe return self.__class__(new_a, new_b) else: return self def skip_to_quality(self, minquality): a = self.a b = self.b minquality = minquality if not a.is_active(): raise mcore.ReadTooFar if not b.is_active(): return a.skip_to_quality(minquality) skipped = 0 aq = a.block_quality() bq = b.block_quality() while a.is_active() and b.is_active() and aq + bq <= minquality: if aq < bq: skipped += a.skip_to_quality(minquality - bq) aq = a.block_quality() else: skipped += b.skip_to_quality(minquality - aq) bq = b.block_quality() return skipped def weight(self): if self.a.id() == self.b.id(): return self.a.weight() + self.b.weight() else: return self.a.weight() def score(self): if self.b.is_active() and self.a.id() == self.b.id(): return self.a.score() + self.b.score() else: return self.a.score() def supports(self, astype): return self.a.supports(astype) def value(self): return self.a.value() def value_as(self, astype): return self.a.value_as(astype) Whoosh-2.5.7/src/whoosh/matching/combo.py0000644000076500000240000002330612254366350020437 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from __future__ import division from array import array from whoosh.compat import xrange from whoosh.matching import mcore class CombinationMatcher(mcore.Matcher): def __init__(self, submatchers, boost=1.0): self._submatchers = submatchers self._boost = boost def supports_block_quality(self): return all(m.supports_block_quality() for m in self._submatchers) def max_quality(self): return max(m.max_quality() for m in self._submatchers if m.is_active()) * self._boost def supports(self, astype): return all(m.supports(astype) for m in self._submatchers) def children(self): return iter(self._submatchers) def score(self): return sum(m.score() for m in self._submatchers) * self._boost class PreloadedUnionMatcher(CombinationMatcher): """Instead of marching the sub-matchers along in parallel, this matcher pre-reads the scores for EVERY MATCHING DOCUMENT, trading memory for speed. This is faster than the implementation using a binary tree of :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just because of less overhead), but it doesn't allow getting information about the "current" document other than the score, because there isn't really a current document, just an array of scores. 
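    A rough sketch with hypothetical document IDs and doccount; every
    matching document's score is read up front into the internal array:

        >>> from whoosh.matching.mcore import ListMatcher
        >>> pm = PreloadedUnionMatcher([ListMatcher([1, 3]),
        ...                             ListMatcher([2, 3])], doccount=4)
        >>> list(pm.all_ids())
        [1, 2, 3]
        >>> pm.skip_to(3)
        >>> pm.id(), pm.score()
        (3, 2.0)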
""" def __init__(self, submatchers, doccount, boost=1.0, scored=True): CombinationMatcher.__init__(self, submatchers, boost=boost) self._doccount = doccount a = array("d") active = [subm for subm in self._submatchers if subm.is_active()] if active: offset = self._docnum = min(m.id() for m in active) for m in active: while m.is_active(): if scored: score = m.score() * boost else: score = boost docnum = m.id() place = docnum - offset if len(a) <= place: a.extend(0 for _ in xrange(place - len(a) + 1)) a[place] += score m.next() self._a = a self._offset = offset else: self._docnum = 0 self._offset = 0 self._a = a def is_active(self): return self._docnum - self._offset < len(self._a) def id(self): return self._docnum def score(self): return self._a[self._docnum - self._offset] def next(self): a = self._a offset = self._offset place = self._docnum - offset place += 1 while place < len(a) and a[place] == 0: place += 1 self._docnum = place + offset def max_quality(self): return max(self._a[self._docnum - self._offset:]) def block_quality(self): return self.max_quality() def skip_to(self, docnum): if docnum < self._docnum: return self._docnum = docnum i = docnum - self._offset if i < len(self._a) and self._a[i] == 0: self.next() def skip_to_quality(self, minquality): a = self._a offset = self._offset place = self._docnum - offset skipped = 0 while place < len(a) and a[place] <= minquality: place += 1 skipped = 1 self._docnum = place + offset return skipped def supports(self, astype): # This matcher doesn't support any posting values return False def all_ids(self): a = self._a offset = self._offset place = self._docnum - offset while place < len(a): if a[place] > 0: yield place + offset place += 1 class ArrayUnionMatcher(CombinationMatcher): """Instead of marching the sub-matchers along in parallel, this matcher pre-reads the scores for a large block of documents at a time from each matcher, accumulating the scores in an array. This is faster than the implementation using a binary tree of :class:`~whoosh.matching.binary.UnionMatcher` objects (possibly just because of less overhead), but it doesn't allow getting information about the "current" document other than the score, because there isn't really a current document, just an array of scores. 
""" def __init__(self, submatchers, doccount, boost=1.0, scored=True, partsize=2048): CombinationMatcher.__init__(self, submatchers, boost=boost) self._scored = scored self._doccount = doccount if not partsize: partsize = doccount self._partsize = partsize self._a = array("d", (0 for _ in xrange(self._partsize))) self._docnum = self._min_id() self._read_part() def __repr__(self): return ("%s(%r, boost=%f, scored=%r, partsize=%d)" % (self.__class__.__name__, self._submatchers, self._boost, self._scored, self._partsize)) def _min_id(self): active = [subm for subm in self._submatchers if subm.is_active()] if active: return min(subm.id() for subm in active) else: return self._doccount def _read_part(self): scored = self._scored boost = self._boost limit = min(self._docnum + self._partsize, self._doccount) offset = self._docnum a = self._a # Clear the array for i in xrange(self._partsize): a[i] = 0 # Add the scores from the submatchers into the array for m in self._submatchers: while m.is_active() and m.id() < limit: i = m.id() - offset if scored: a[i] += m.score() * boost else: a[i] = 1 m.next() self._offset = offset self._limit = limit def _find_next(self): a = self._a docnum = self._docnum offset = self._offset limit = self._limit while docnum < limit: if a[docnum - offset] > 0: break docnum += 1 if docnum == limit: self._docnum = self._min_id() self._read_part() else: self._docnum = docnum def supports(self, astype): # This matcher doesn't support any posting values return False def is_active(self): return self._docnum < self._doccount def max_quality(self): return max(m.max_quality() for m in self._submatchers) def block_quality(self): return max(self._a) def skip_to(self, docnum): if docnum < self._offset: # We've already passed it return elif docnum < self._limit: # It's in the current part self._docnum = docnum self._find_next() return # Advance all active submatchers submatchers = self._submatchers active = False for subm in submatchers: if subm.is_active(): subm.skip_to(docnum) if any(subm.is_active() for subm in submatchers): # Rebuffer self._docnum = self._min_id() self._read_part() else: self._docnum = self._doccount def skip_to_quality(self, minquality): skipped = 0 while self.is_active() and self.block_quality() <= minquality: skipped += 1 self._docnum = self._limit self._read_part() if self.is_active(): self._find_next() return skipped def id(self): return self._docnum def all_ids(self): doccount = self._doccount docnum = self._docnum offset = self._offset limit = self._limit a = self._a while docnum < doccount: if a[docnum - offset] > 0: yield docnum docnum += 1 if docnum == limit: self._docnum = docnum self._read_part() offset = self._offset limit = self._limit def next(self): self._docnum += 1 return self._find_next() def score(self): return self._a[self._docnum - self._offset] Whoosh-2.5.7/src/whoosh/matching/mcore.py0000644000076500000240000004463112254366350020451 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module contains "matcher" classes. Matchers deal with posting lists. The most basic matcher, which reads the list of postings for a term, will be provided by the backend implementation (for example, :class:`whoosh.filedb.filepostings.FilePostingReader`). The classes in this module provide additional functionality, such as combining the results of two matchers, or modifying the results of a matcher. You do not need to deal with the classes in this module unless you need to write your own Matcher implementation to provide some new functionality. These classes are not instantiated by the user. They are usually created by a :class:`~whoosh.query.Query` object's :meth:`~whoosh.query.Query.matcher()` method, which returns the appropriate matcher to implement the query (for example, the :class:`~whoosh.query.Or` query's :meth:`~whoosh.query.Or.matcher()` method returns a :py:class:`~whoosh.matching.UnionMatcher` object). Certain backends support "quality" optimizations. These backends have the ability to skip ahead if it knows the current block of postings can't contribute to the top N documents. If the matcher tree and backend support these optimizations, the matcher's :meth:`Matcher.supports_block_quality()` method will return ``True``. """ import sys from itertools import repeat from whoosh.compat import izip, xrange from whoosh.compat import abstractmethod # Exceptions class ReadTooFar(Exception): """Raised when :meth:`~whoosh.matching.Matcher.next()` or :meth:`~whoosh.matching.Matcher.skip_to()` are called on an inactive matcher. """ class NoQualityAvailable(Exception): """Raised when quality methods are called on a matcher that does not support block quality optimizations. """ # Classes class Matcher(object): """Base class for all matchers. """ @abstractmethod def is_active(self): """Returns True if this matcher is still "active", that is, it has not yet reached the end of the posting list. """ raise NotImplementedError @abstractmethod def reset(self): """Returns to the start of the posting list. Note that reset() may not do what you expect after you call :meth:`Matcher.replace()`, since this can mean calling reset() not on the original matcher, but on an optimized replacement. """ raise NotImplementedError def term(self): """Returns a ``("fieldname", "termtext")`` tuple for the term this matcher matches, or None if this matcher is not a term matcher. """ return None def term_matchers(self): """Returns an iterator of term matchers in this tree. 
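        A rough sketch using hypothetical ("fieldname", "text") terms attached
        to ListMatcher objects and combined with a UnionMatcher:

            >>> from whoosh.matching.mcore import ListMatcher
            >>> from whoosh.matching.binary import UnionMatcher
            >>> a = ListMatcher([1, 2], term=("content", "alfa"))
            >>> b = ListMatcher([2, 3], term=("content", "bravo"))
            >>> [m.term() for m in UnionMatcher(a, b).term_matchers()]
            [('content', 'alfa'), ('content', 'bravo')]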
""" if self.term() is not None: yield self else: for cm in self.children(): for m in cm.term_matchers(): yield m def matching_terms(self, id=None): """Returns an iterator of ``("fieldname", "termtext")`` tuples for the **currently matching** term matchers in this tree. """ if not self.is_active(): return if id is None: id = self.id() elif id != self.id(): return t = self.term() if t is None: for c in self.children(): for t in c.matching_terms(id): yield t else: yield t def is_leaf(self): return not bool(self.children()) def children(self): """Returns an (possibly empty) list of the submatchers of this matcher. """ return [] def replace(self, minquality=0): """Returns a possibly-simplified version of this matcher. For example, if one of the children of a UnionMatcher is no longer active, calling this method on the UnionMatcher will return the other child. """ return self @abstractmethod def copy(self): """Returns a copy of this matcher. """ raise NotImplementedError def depth(self): """Returns the depth of the tree under this matcher, or 0 if this matcher does not have any children. """ return 0 def supports_block_quality(self): """Returns True if this matcher supports the use of ``quality`` and ``block_quality``. """ return False def max_quality(self): """Returns the maximum possible quality measurement for this matcher, according to the current weighting algorithm. Raises ``NoQualityAvailable`` if the matcher or weighting do not support quality measurements. """ raise NoQualityAvailable(self.__class__) def block_quality(self): """Returns a quality measurement of the current block of postings, according to the current weighting algorithm. Raises ``NoQualityAvailable`` if the matcher or weighting do not support quality measurements. """ raise NoQualityAvailable(self.__class__) @abstractmethod def id(self): """Returns the ID of the current posting. """ raise NotImplementedError def all_ids(self): """Returns a generator of all IDs in the matcher. What this method returns for a matcher that has already read some postings (whether it only yields the remaining postings or all postings from the beginning) is undefined, so it's best to only use this method on fresh matchers. """ i = 0 m = self while m.is_active(): yield m.id() m.next() i += 1 if i == 10: m = m.replace() i = 0 def all_items(self): """Returns a generator of all (ID, encoded value) pairs in the matcher. What this method returns for a matcher that has already read some postings (whether it only yields the remaining postings or all postings from the beginning) is undefined, so it's best to only use this method on fresh matchers. """ i = 0 m = self while self.is_active(): yield (m.id(), m.value()) m.next() i += 1 if i == 10: m = m.replace() i = 0 def items_as(self, astype): """Returns a generator of all (ID, decoded value) pairs in the matcher. What this method returns for a matcher that has already read some postings (whether it only yields the remaining postings or all postings from the beginning) is undefined, so it's best to only use this method on fresh matchers. """ while self.is_active(): yield (self.id(), self.value_as(astype)) self.next() @abstractmethod def value(self): """Returns the encoded value of the current posting. """ raise NotImplementedError @abstractmethod def supports(self, astype): """Returns True if the field's format supports the named data type, for example 'frequency' or 'characters'. 
""" raise NotImplementedError("supports not implemented in %s" % self.__class__) @abstractmethod def value_as(self, astype): """Returns the value(s) of the current posting as the given type. """ raise NotImplementedError("value_as not implemented in %s" % self.__class__) def spans(self): """Returns a list of :class:`~whoosh.query.spans.Span` objects for the matches in this document. Raises an exception if the field being searched does not store positions. """ from whoosh.query.spans import Span if self.supports("characters"): return [Span(pos, startchar=startchar, endchar=endchar) for pos, startchar, endchar in self.value_as("characters")] elif self.supports("positions"): return [Span(pos) for pos in self.value_as("positions")] else: raise Exception("Field does not support spans") def skip_to(self, id): """Moves this matcher to the first posting with an ID equal to or greater than the given ID. """ while self.is_active() and self.id() < id: self.next() def skip_to_quality(self, minquality): """Moves this matcher to the next block with greater than the given minimum quality value. """ raise NotImplementedError(self.__class__.__name__) @abstractmethod def next(self): """Moves this matcher to the next posting. """ raise NotImplementedError(self.__class__.__name__) def weight(self): """Returns the weight of the current posting. """ return self.value_as("weight") @abstractmethod def score(self): """Returns the score of the current posting. """ raise NotImplementedError(self.__class__.__name__) def __eq__(self, other): return self.__class__ is type(other) def __lt__(self, other): return type(other) is self.__class__ def __ne__(self, other): return not self.__eq__(other) def __gt__(self, other): return not (self.__lt__(other) or self.__eq__(other)) def __le__(self, other): return self.__eq__(other) or self.__lt__(other) def __ge__(self, other): return self.__eq__(other) or self.__gt__(other) # Simple intermediate classes class ConstantScoreMatcher(Matcher): def __init__(self, score=1.0): self._score = score def supports_block_quality(self): return True def max_quality(self): return self._score def block_quality(self): return self._score def skip_to_quality(self, minquality): if minquality >= self._score: self.go_inactive() def score(self): return self._score # Null matcher class NullMatcherClass(Matcher): """Matcher with no postings which is never active. """ def __call__(self): return self def __repr__(self): return "" def supports_block_quality(self): return True def max_quality(self): return 0 def block_quality(self): return 0 def skip_to_quality(self, minquality): return 0 def is_active(self): return False def reset(self): pass def all_ids(self): return [] def copy(self): return self # Singleton instance NullMatcher = NullMatcherClass() class ListMatcher(Matcher): """Synthetic matcher backed by a list of IDs. """ def __init__(self, ids, weights=None, values=None, format=None, scorer=None, position=0, all_weights=None, term=None, terminfo=None): """ :param ids: a list of doc IDs. :param weights: a list of weights corresponding to the list of IDs. If this argument is not supplied, a list of 1.0 values is used. :param values: a list of encoded values corresponding to the list of IDs. :param format: a :class:`whoosh.formats.Format` object representing the format of the field. :param scorer: a :class:`whoosh.scoring.BaseScorer` object for scoring the postings. :param term: a ``("fieldname", "text")`` tuple, or None if this is not a term matcher. 
""" self._ids = ids self._weights = weights self._all_weights = all_weights self._values = values self._i = position self._format = format self._scorer = scorer self._term = term self._terminfo = terminfo def __repr__(self): return "<%s>" % self.__class__.__name__ def is_active(self): return self._i < len(self._ids) def reset(self): self._i = 0 def skip_to(self, id): if not self.is_active(): raise ReadTooFar if id < self.id(): return while self._i < len(self._ids) and self._ids[self._i] < id: self._i += 1 def term(self): return self._term def copy(self): return self.__class__(self._ids, self._weights, self._values, self._format, self._scorer, self._i, self._all_weights) def replace(self, minquality=0): if not self.is_active(): return NullMatcher() elif minquality and self.max_quality() < minquality: return NullMatcher() else: return self def supports_block_quality(self): return (self._scorer is not None and self._scorer.supports_block_quality()) def max_quality(self): # This matcher treats all postings in the list as one "block", so the # block quality is the same as the quality of the entire list if self._scorer: return self._scorer.block_quality(self) else: return self.block_max_weight() def block_quality(self): return self._scorer.block_quality(self) def skip_to_quality(self, minquality): while self._i < len(self._ids) and self.block_quality() <= minquality: self._i += 1 return 0 def id(self): return self._ids[self._i] def all_ids(self): return iter(self._ids) def all_items(self): values = self._values if values is None: values = repeat('') return izip(self._ids, values) def value(self): if self._values: v = self._values[self._i] if isinstance(v, list): # This object supports "values" that are actually lists of # value strings. This is to support combining the results of # several different matchers into a single ListMatcher (see the # TOO_MANY_CLAUSES functionality of MultiTerm). We combine the # values here instead of combining them first and then making # the ListMatcher to avoid wasting time combining values if the # consumer never asks for them. 
assert len(v) > 0 if len(v) == 1: v = v[0] else: v = self._format.combine(v) # Replace the list with the computed value string self._values[self._i] = v return v else: return '' def value_as(self, astype): decoder = self._format.decoder(astype) return decoder(self.value()) def supports(self, astype): return self._format.supports(astype) def next(self): self._i += 1 def weight(self): if self._all_weights: return self._all_weights elif self._weights: return self._weights[self._i] else: return 1.0 def block_min_length(self): return self._terminfo.min_length() def block_max_length(self): return self._terminfo.max_length() def block_max_weight(self): if self._all_weights: return self._all_weights elif self._weights: return max(self._weights) elif self._terminfo is not None: return self._terminfo.max_weight() else: return 1.0 def score(self): if self._scorer: return self._scorer.score(self) else: return self.weight() # Term/vector leaf posting matcher middleware class LeafMatcher(Matcher): # Subclasses need to set # self.scorer -- a Scorer object or None # self.format -- Format object for the posting values def __repr__(self): return "%s(%r, %s)" % (self.__class__.__name__, self.term(), self.is_active()) def term(self): return self._term def items_as(self, astype): decoder = self.format.decoder(astype) for id, value in self.all_items(): yield (id, decoder(value)) def supports(self, astype): return self.format.supports(astype) def value_as(self, astype): decoder = self.format.decoder(astype) return decoder(self.value()) def spans(self): from whoosh.query.spans import Span if self.supports("characters"): return [Span(pos, startchar=startchar, endchar=endchar) for pos, startchar, endchar in self.value_as("characters")] elif self.supports("positions"): return [Span(pos) for pos in self.value_as("positions")] else: raise Exception("Field does not support positions (%r)" % self.term()) def supports_block_quality(self): return self.scorer and self.scorer.supports_block_quality() def max_quality(self): return self.scorer.max_quality() def block_quality(self): return self.scorer.block_quality(self) def score(self): return self.scorer.score(self) Whoosh-2.5.7/src/whoosh/matching/wrappers.py0000644000076500000240000004204612277504454021211 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from __future__ import division from whoosh.compat import xrange from whoosh.matching import mcore class WrappingMatcher(mcore.Matcher): """Base class for matchers that wrap sub-matchers. """ def __init__(self, child, boost=1.0): self.child = child self.boost = boost def __repr__(self): return "%s(%r, boost=%s)" % (self.__class__.__name__, self.child, self.boost) def copy(self): kwargs = {} if hasattr(self, "boost"): kwargs["boost"] = self.boost return self.__class__(self.child.copy(), **kwargs) def depth(self): return 1 + self.child.depth() def _replacement(self, newchild): return self.__class__(newchild, boost=self.boost) def replace(self, minquality=0): # Replace the child matcher r = self.child.replace(minquality) if r is not self.child: # If the child changed, return a new wrapper on the new child return self._replacement(r) else: return self def id(self): return self.child.id() def all_ids(self): return self.child.all_ids() def is_active(self): return self.child.is_active() def reset(self): self.child.reset() def children(self): return [self.child] def supports(self, astype): return self.child.supports(astype) def value(self): return self.child.value() def value_as(self, astype): return self.child.value_as(astype) def spans(self): return self.child.spans() def skip_to(self, id): return self.child.skip_to(id) def next(self): self.child.next() def supports_block_quality(self): return self.child.supports_block_quality() def skip_to_quality(self, minquality): return self.child.skip_to_quality(minquality / self.boost) def max_quality(self): return self.child.max_quality() * self.boost def block_quality(self): return self.child.block_quality() * self.boost def weight(self): return self.child.weight() * self.boost def score(self): return self.child.score() * self.boost class MultiMatcher(mcore.Matcher): """Serializes the results of a list of sub-matchers. """ def __init__(self, matchers, idoffsets, scorer=None, current=0): """ :param matchers: a list of Matcher objects. :param idoffsets: a list of offsets corresponding to items in the ``matchers`` list. 
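For example (a sketch using synthetic ListMatchers; real MultiMatchers
are normally built by multi-segment readers):

>>> from whoosh.matching import ListMatcher
>>> a = ListMatcher([0, 1, 4])
>>> b = ListMatcher([0, 2])
>>> mm = MultiMatcher([a, b], [0, 10])
>>> list(mm.all_ids())
[0, 1, 4, 10, 12]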
""" self.matchers = matchers self.offsets = idoffsets self.scorer = scorer self.current = current self._next_matcher() def __repr__(self): return "%s(%r, %r, current=%s)" % (self.__class__.__name__, self.matchers, self.offsets, self.current) def is_active(self): return self.current < len(self.matchers) def reset(self): for mr in self.matchers: mr.reset() self.current = 0 def children(self): return [self.matchers[self.current]] def _next_matcher(self): matchers = self.matchers while (self.current < len(matchers) and not matchers[self.current].is_active()): self.current += 1 def copy(self): return self.__class__([mr.copy() for mr in self.matchers], self.offsets, current=self.current) def depth(self): if self.is_active(): return 1 + max(mr.depth() for mr in self.matchers[self.current:]) else: return 0 def replace(self, minquality=0): m = self if minquality: # Skip sub-matchers that don't have a high enough max quality to # contribute while (m.is_active() and m.matchers[m.current].max_quality() < minquality): m = self.__class__(self.matchers, self.offsets, self.scorer, m.current + 1) m._next_matcher() if not m.is_active(): return mcore.NullMatcher() # TODO: Possible optimization: if the last matcher is current, replace # this with the last matcher, but wrap it with a matcher that adds the # offset. Have to check whether that's actually faster, though. return m def id(self): current = self.current return self.matchers[current].id() + self.offsets[current] def all_ids(self): offsets = self.offsets for i, mr in enumerate(self.matchers): for id in mr.all_ids(): yield id + offsets[i] def spans(self): return self.matchers[self.current].spans() def supports(self, astype): return self.matchers[self.current].supports(astype) def value(self): return self.matchers[self.current].value() def value_as(self, astype): return self.matchers[self.current].value_as(astype) def next(self): if not self.is_active(): raise mcore.ReadTooFar self.matchers[self.current].next() if not self.matchers[self.current].is_active(): self._next_matcher() def skip_to(self, id): if not self.is_active(): raise mcore.ReadTooFar if id <= self.id(): return matchers = self.matchers offsets = self.offsets r = False while self.current < len(matchers) and id > self.id(): mr = matchers[self.current] sr = mr.skip_to(id - offsets[self.current]) r = sr or r if mr.is_active(): break self._next_matcher() return r def supports_block_quality(self): return all(mr.supports_block_quality() for mr in self.matchers[self.current:]) def max_quality(self): return max(m.max_quality() for m in self.matchers[self.current:]) def block_quality(self): return self.matchers[self.current].block_quality() def weight(self): return self.matchers[self.current].weight() def score(self): return self.scorer.score(self) def ExcludeMatcher(child, excluded, boost=1.0): return FilterMatcher(child, excluded, exclude=True, boost=boost) class FilterMatcher(WrappingMatcher): """Filters the postings from the wrapped based on whether the IDs are present in or absent from a set. """ def __init__(self, child, ids, exclude=False, boost=1.0): """ :param child: the child matcher. :param ids: a set of IDs to filter by. :param exclude: by default, only IDs from the wrapped matcher that are **in** the set are used. If this argument is True, only IDs from the wrapped matcher that are **not in** the set are used. 
""" super(FilterMatcher, self).__init__(child) self._ids = ids self._exclude = exclude self.boost = boost self._find_next() def __repr__(self): return "%s(%r, %r, %r, boost=%s)" % (self.__class__.__name__, self.child, self._ids, self._exclude, self.boost) def reset(self): self.child.reset() self._find_next() def copy(self): return self.__class__(self.child.copy(), self._ids, self._exclude, boost=self.boost) def _replacement(self, newchild): return self.__class__(newchild, self._ids, exclude=self._exclude, boost=self.boost) def _find_next(self): child = self.child ids = self._ids r = False if self._exclude: while child.is_active() and child.id() in ids: r = child.next() or r else: while child.is_active() and child.id() not in ids: r = child.next() or r return r def next(self): self.child.next() self._find_next() def skip_to(self, id): self.child.skip_to(id) self._find_next() def all_ids(self): ids = self._ids if self._exclude: return (id for id in self.child.all_ids() if id not in ids) else: return (id for id in self.child.all_ids() if id in ids) def all_items(self): ids = self._ids if self._exclude: return (item for item in self.child.all_items() if item[0] not in ids) else: return (item for item in self.child.all_items() if item[0] in ids) class InverseMatcher(WrappingMatcher): """Synthetic matcher, generates postings that are NOT present in the wrapped matcher. """ def __init__(self, child, limit, missing=None, weight=1.0, id=0): super(InverseMatcher, self).__init__(child) self.limit = limit self._weight = weight self.missing = missing or (lambda id: False) self._id = id self._find_next() def copy(self): return self.__class__(self.child.copy(), self.limit, weight=self._weight, missing=self.missing, id=self._id) def _replacement(self, newchild): return self.__class__(newchild, self.limit, missing=self.missing, weight=self._weight, id=self._id) def is_active(self): return self._id < self.limit def reset(self): self.child.reset() self._id = 0 self._find_next() def supports_block_quality(self): return False def _find_next(self): child = self.child missing = self.missing # If the current docnum isn't missing and the child matcher is # exhausted (so we don't have to worry about skipping its matches), we # don't have to do anything if not child.is_active() and not missing(self._id): return # Skip missing documents while self._id < self.limit and missing(self._id): self._id += 1 # Catch the child matcher up to where this matcher is if child.is_active() and child.id() < self._id: child.skip_to(self._id) # While self._id is missing or is in the child matcher, increase it while child.is_active() and self._id < self.limit: if missing(self._id): self._id += 1 continue if self._id == child.id(): self._id += 1 child.next() continue break def id(self): return self._id def all_ids(self): return mcore.Matcher.all_ids(self) def next(self): if self._id >= self.limit: raise mcore.ReadTooFar self._id += 1 self._find_next() def skip_to(self, id): if self._id >= self.limit: raise mcore.ReadTooFar if id < self._id: return self._id = id self._find_next() def weight(self): return self._weight def score(self): return self._weight class RequireMatcher(WrappingMatcher): """Matches postings that are in both sub-matchers, but only uses scores from the first. 
""" def __init__(self, a, b): from whoosh.matching.binary import IntersectionMatcher self.a = a self.b = b WrappingMatcher.__init__(self, IntersectionMatcher(a, b)) def copy(self): return self.__class__(self.a.copy(), self.b.copy()) def supports_block_quality(self): return self.a.supports_block_quality() def replace(self, minquality=0): if not self.child.is_active(): # If one of the sub-matchers is inactive, go inactive return mcore.NullMatcher() elif minquality and self.a.max_quality() < minquality: # If the required matcher doesn't have a high enough max quality # to possibly contribute, return an inactive matcher return mcore.NullMatcher() new_a = self.a.replace(minquality) new_b = self.b.replace() if not new_a.is_active(): return mcore.NullMatcher() elif new_a is not self.a or new_b is not self.b: # If one of the sub-matchers changed, return a new Require return self.__class__(new_a, self.b) else: return self def max_quality(self): return self.a.max_quality() def block_quality(self): return self.a.block_quality() def skip_to_quality(self, minquality): skipped = self.a.skip_to_quality(minquality) self.child._find_next() return skipped def weight(self): return self.a.weight() def score(self): return self.a.score() def supports(self, astype): return self.a.supports(astype) def value(self): return self.a.value() def value_as(self, astype): return self.a.value_as(astype) class ConstantScoreWrapperMatcher(WrappingMatcher): def __init__(self, child, score=1.0): WrappingMatcher.__init__(self, child) self._score = score def copy(self): return self.__class__(self.child.copy(), score=self._score) def _replacement(self, newchild): return self.__class__(newchild, score=self._score) def max_quality(self): return self._score def block_quality(self): return self._score def score(self): return self._score class SingleTermMatcher(WrappingMatcher): """Makes a tree of matchers act as if they were a matcher for a single term for the purposes of "what terms are matching?" questions. """ def __init__(self, child, term): WrappingMatcher.__init__(self, child) self._term = term def term(self): return self._term def replace(self, minquality=0): return self class CoordMatcher(WrappingMatcher): """Modifies the computed score to penalize documents that don't match all terms in the matcher tree. Because this matcher modifies the score, it may give unexpected results when compared to another matcher returning the unmodified score. """ def __init__(self, child, scale=1.0): WrappingMatcher.__init__(self, child) self._termcount = len(list(child.term_matchers())) self._maxqual = child.max_quality() self._scale = scale def _replacement(self, newchild): return self.__class__(newchild, scale=self._scale) def _sqr(self, score, matching): # This is the "SQR" (Short Query Ranking) function used by Apple's old # V-twin search library, described in the paper "V-Twin: A Lightweight # Engine for Interactive Use". 
# # http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.56.1916 # score - document score using the current weighting function # matching - number of matching terms in the current document termcount = self._termcount # Number of terms in this tree scale = self._scale # Scaling factor sqr = ((score + ((matching - 1) / (termcount - scale) ** 2)) * ((termcount - 1) / termcount)) return sqr def max_quality(self): return self._sqr(self.child.max_quality(), self._termcount) def block_quality(self): return self._sqr(self.child.block_quality(), self._termcount) def score(self): child = self.child score = child.score() matching = 0 for _ in child.matching_terms(child.id()): matching += 1 return self._sqr(score, matching) Whoosh-2.5.7/src/whoosh/multiproc.py0000644000076500000240000003531212254366350017564 0ustar mattstaff00000000000000# Copyright 2011 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from __future__ import with_statement import os from multiprocessing import Process, Queue, cpu_count from whoosh.compat import xrange, iteritems, pickle from whoosh.codec import base from whoosh.writing import PostingPool, SegmentWriter from whoosh.externalsort import imerge from whoosh.util import random_name def finish_subsegment(writer, k=64): # Tell the pool to finish up the current file writer.pool.save() # Tell the pool to merge any and all runs in the pool until there # is only one run remaining. "k" is an optional parameter passed # from the parent which sets the maximum number of files to open # while reducing. 
writer.pool.reduce_to(1, k) # The filename of the single remaining run runname = writer.pool.runs[0] # The indexed field names fieldnames = writer.pool.fieldnames # The segment object (parent can use this to re-open the files created # by the sub-writer) segment = writer._partial_segment() return runname, fieldnames, segment # Multiprocessing Writer class SubWriterTask(Process): # This is a Process object that takes "jobs" off a job Queue, processes # them, and when it's done, puts a summary of its work on a results Queue def __init__(self, storage, indexname, jobqueue, resultqueue, kwargs, multisegment): Process.__init__(self) self.storage = storage self.indexname = indexname self.jobqueue = jobqueue self.resultqueue = resultqueue self.kwargs = kwargs self.multisegment = multisegment self.running = True def run(self): # This is the main loop of the process. OK, so the way this works is # kind of brittle and stupid, but I had to figure out how to use the # multiprocessing module, work around bugs, and address performance # issues, so there is at least some reasoning behind some of this # The "parent" task farms individual documents out to the subtasks for # indexing. You could pickle the actual documents and put them in the # queue, but that is not very performant. Instead, we assume the tasks # share a filesystem and use that to pass the information around. The # parent task writes a certain number of documents to a file, then puts # the filename on the "job queue". A subtask gets the filename off the # queue and reads through the file processing the documents. jobqueue = self.jobqueue resultqueue = self.resultqueue multisegment = self.multisegment # Open a placeholder object representing the index ix = self.storage.open_index(self.indexname) # Open a writer for the index. The _lk=False parameter means to not try # to lock the index (the parent object that started me takes care of # locking the index) writer = self.writer = SegmentWriter(ix, _lk=False, **self.kwargs) # If the parent task calls cancel() on me, it will set self.running to # False, so I'll notice the next time through the loop while self.running: # Take an object off the job queue jobinfo = jobqueue.get() # If the object is None, it means the parent task wants me to # finish up if jobinfo is None: break # The object from the queue is a tuple of (filename, # number_of_docs_in_file). Pass those two pieces of information as # arguments to _process_file(). self._process_file(*jobinfo) if not self.running: # I was cancelled, so I'll cancel my underlying writer writer.cancel() else: if multisegment: # Actually finish the segment and return it with no run runname = None fieldnames = writer.pool.fieldnames segment = writer._finalize_segment() else: # Merge all runs in the writer's pool into one run, close the # segment, and return the run name and the segment k = self.kwargs.get("k", 64) runname, fieldnames, segment = finish_subsegment(writer, k) # Put the results (the run filename and the segment object) on the # result queue resultqueue.put((runname, fieldnames, segment), timeout=5) def _process_file(self, filename, doc_count): # This method processes a "job file" written out by the parent task. A # job file is a series of pickled (code, arguments) tuples. 
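# For reference, the producer side is MpWriter._enqueue() below, which
# writes each job file roughly like this (a sketch of the real code):
#
#     with self.temp_storage().create_file(filename).raw_file() as f:
#         for item in docbuffer:      # item is a (code, args) tuple
#             pickle.dump(item, f, -1)
#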
Currently # the only command codes is 0=add_document writer = self.writer tempstorage = writer.temp_storage() load = pickle.load with tempstorage.open_file(filename).raw_file() as f: for _ in xrange(doc_count): # Load the next pickled tuple from the file code, args = load(f) assert code == 0 writer.add_document(**args) # Remove the job file tempstorage.delete_file(filename) def cancel(self): self.running = False class MpWriter(SegmentWriter): def __init__(self, ix, procs=None, batchsize=100, subargs=None, multisegment=False, **kwargs): # This is the "main" writer that will aggregate the results created by # the sub-tasks SegmentWriter.__init__(self, ix, **kwargs) self.procs = procs or cpu_count() # The maximum number of documents in each job file submitted to the # sub-tasks self.batchsize = batchsize # You can use keyword arguments or the "subargs" argument to pass # keyword arguments to the sub-writers self.subargs = subargs if subargs else kwargs # If multisegment is True, don't merge the segments created by the # sub-writers, just add them directly to the TOC self.multisegment = multisegment # A list to hold the sub-task Process objects self.tasks = [] # A queue to pass the filenames of job files to the sub-tasks self.jobqueue = Queue(self.procs * 4) # A queue to get back the final results of the sub-tasks self.resultqueue = Queue() # A buffer for documents before they are flushed to a job file self.docbuffer = [] self._grouping = 0 self._added_sub = False def _new_task(self): task = SubWriterTask(self.storage, self.indexname, self.jobqueue, self.resultqueue, self.subargs, self.multisegment) self.tasks.append(task) task.start() return task def _enqueue(self): # Flush the documents stored in self.docbuffer to a file and put the # filename on the job queue docbuffer = self.docbuffer dump = pickle.dump length = len(docbuffer) filename = "%s.doclist" % random_name() with self.temp_storage().create_file(filename).raw_file() as f: for item in docbuffer: dump(item, f, -1) if len(self.tasks) < self.procs: self._new_task() jobinfo = (filename, length) self.jobqueue.put(jobinfo) self.docbuffer = [] def cancel(self): try: for task in self.tasks: task.cancel() finally: SegmentWriter.cancel(self) def start_group(self): self._grouping += 1 def end_group(self): if not self._grouping: raise Exception("Unbalanced end_group") self._grouping -= 1 def add_document(self, **fields): # Add the document to the docbuffer self.docbuffer.append((0, fields)) # If the buffer is full, flush it to the job queue if not self._grouping and len(self.docbuffer) >= self.batchsize: self._enqueue() self._added_sub = True def _read_and_renumber_run(self, path, offset): # Note that SortingPool._read_run() automatically deletes the run file # when it's finished gen = self.pool._read_run(path) # If offset is 0, just return the items unchanged if not offset: return gen else: # Otherwise, add the offset to each docnum return ((fname, text, docnum + offset, weight, value) for fname, text, docnum, weight, value in gen) def commit(self, mergetype=None, optimize=None, merge=None): if self._added_sub: # If documents have been added to sub-writers, use the parallel # merge commit code self._commit(mergetype, optimize, merge) else: # Otherwise, just do a regular-old commit SegmentWriter.commit(self, mergetype=mergetype, optimize=optimize, merge=merge) def _commit(self, mergetype, optimize, merge): # Index the remaining documents in the doc buffer if self.docbuffer: self._enqueue() # Tell the tasks to finish for task in self.tasks: 
self.jobqueue.put(None) # Merge existing segments finalsegments = self._merge_segments(mergetype, optimize, merge) # Wait for the subtasks to finish for task in self.tasks: task.join() # Pull a (run_file_name, fieldnames, segment) tuple off the result # queue for each sub-task, representing the final results of the task results = [] for task in self.tasks: results.append(self.resultqueue.get(timeout=5)) if self.multisegment: # If we're not merging the segments, we don't care about the runname # and fieldnames in the results... just pull out the segments and # add them to the list of final segments finalsegments += [s for _, _, s in results] if self._added: finalsegments.append(self._finalize_segment()) else: self._close_segment() assert self.perdocwriter.is_closed else: # Merge the posting sources from the sub-writers and my # postings into this writer self._merge_subsegments(results, mergetype) self._close_segment() self._assemble_segment() finalsegments.append(self.get_segment()) assert self.perdocwriter.is_closed self._commit_toc(finalsegments) self._finish() def _merge_subsegments(self, results, mergetype): schema = self.schema schemanames = set(schema.names()) storage = self.storage codec = self.codec sources = [] # If information was added to this writer the conventional (e.g. # through add_reader or merging segments), add it as an extra source if self._added: sources.append(self.pool.iter_postings()) pdrs = [] for runname, fieldnames, segment in results: fieldnames = set(fieldnames) | schemanames pdr = codec.per_document_reader(storage, segment) pdrs.append(pdr) basedoc = self.docnum docmap = self.write_per_doc(fieldnames, pdr) assert docmap is None items = self._read_and_renumber_run(runname, basedoc) sources.append(items) # Create a MultiLengths object combining the length files from the # subtask segments self.perdocwriter.close() pdrs.insert(0, self.per_document_reader()) mpdr = base.MultiPerDocumentReader(pdrs) try: # Merge the iterators into the field writer self.fieldwriter.add_postings(schema, mpdr, imerge(sources)) finally: mpdr.close() self._added = True class SerialMpWriter(MpWriter): # A non-parallel version of the MpWriter for testing purposes def __init__(self, ix, procs=None, batchsize=100, subargs=None, **kwargs): SegmentWriter.__init__(self, ix, **kwargs) self.procs = procs or cpu_count() self.batchsize = batchsize self.subargs = subargs if subargs else kwargs self.tasks = [SegmentWriter(ix, _lk=False, **self.subargs) for _ in xrange(self.procs)] self.pointer = 0 self._added_sub = False def add_document(self, **fields): self.tasks[self.pointer].add_document(**fields) self.pointer = (self.pointer + 1) % len(self.tasks) self._added_sub = True def _commit(self, mergetype, optimize, merge): # Pull a (run_file_name, segment) tuple off the result queue for each # sub-task, representing the final results of the task # Merge existing segments finalsegments = self._merge_segments(mergetype, optimize, merge) results = [] for writer in self.tasks: results.append(finish_subsegment(writer)) self._merge_subsegments(results, mergetype) self._close_segment() self._assemble_segment() finalsegments.append(self.get_segment()) self._commit_toc(finalsegments) self._finish() # For compatibility with old multiproc module class MultiSegmentWriter(MpWriter): def __init__(self, *args, **kwargs): MpWriter.__init__(self, *args, **kwargs) self.multisegment = True Whoosh-2.5.7/src/whoosh/qparser/0000755000076500000240000000000012277504634016651 5ustar 
mattstaff00000000000000Whoosh-2.5.7/src/whoosh/qparser/__init__.py0000644000076500000240000000315012254366350020755 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.qparser.default import * from whoosh.qparser.plugins import * from whoosh.qparser.syntax import * Whoosh-2.5.7/src/whoosh/qparser/common.py0000644000076500000240000000461712254366764020530 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module contains common utility objects/functions for the other query parser modules. 
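For example, ``get_single_text`` (defined below) runs a field's
query-time analysis on a string and returns the first token text
(a sketch):

>>> from whoosh import fields
>>> t = get_single_text(fields.TEXT(), u"Hello There")  # t == u"hello"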
""" import sys from whoosh.compat import string_type class QueryParserError(Exception): def __init__(self, cause, msg=None): super(QueryParserError, self).__init__(str(cause)) self.cause = cause def get_single_text(field, text, **kwargs): """Returns the first token from an analyzer's output. """ for t in field.process_text(text, mode="query", **kwargs): return t def attach(q, stxnode): if q: try: q.startchar = stxnode.startchar q.endchar = stxnode.endchar except AttributeError: raise AttributeError("Can't set attribute on %s" % q.__class__.__name__) return q def print_debug(level, msg, out=sys.stderr): if level: out.write("%s%s\n" % (" " * (level - 1), msg)) Whoosh-2.5.7/src/whoosh/qparser/dateparse.py0000644000076500000240000010000312254366350021161 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. import re import sys from datetime import datetime, timedelta from whoosh.compat import string_type, iteritems from whoosh.qparser import plugins, syntax from whoosh.qparser.taggers import Tagger from whoosh.support.relativedelta import relativedelta from whoosh.util.text import rcompile from whoosh.util.times import adatetime, timespan from whoosh.util.times import fill_in, is_void, relative_days from whoosh.util.times import TimeError class DateParseError(Exception): "Represents an error in parsing date text." # Utility functions def print_debug(level, msg, *args): if level > 0: print((" " * (level - 1)) + (msg % args)) # Parser element objects class Props(object): """A dumb little object that just puts copies a dictionary into attibutes so I can use dot syntax instead of square bracket string item lookup and save a little bit of typing. Used by :class:`Regex`. """ def __init__(self, **args): self.__dict__ = args def __repr__(self): return repr(self.__dict__) def get(self, key, default=None): return self.__dict__.get(key, default) class ParserBase(object): """Base class for date parser elements. 
""" def to_parser(self, e): if isinstance(e, string_type): return Regex(e) else: return e def parse(self, text, dt, pos=0, debug=-9999): raise NotImplementedError def date_from(self, text, dt=None, pos=0, debug=-9999): if dt is None: dt = datetime.now() d, pos = self.parse(text, dt, pos, debug + 1) return d class MultiBase(ParserBase): """Base class for date parser elements such as Sequence and Bag that have sub-elements. """ def __init__(self, elements, name=None): """ :param elements: the sub-elements to match. :param name: a name for this element (for debugging purposes only). """ self.elements = [self.to_parser(e) for e in elements] self.name = name def __repr__(self): return "%s<%s>%r" % (self.__class__.__name__, self.name or '', self.elements) class Sequence(MultiBase): """Merges the dates parsed by a sequence of sub-elements. """ def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", name=None, progressive=False): """ :param elements: the sequence of sub-elements to parse. :param sep: a separator regular expression to match between elements, or None to not have separators. :param name: a name for this element (for debugging purposes only). :param progressive: if True, elements after the first do not need to match. That is, for elements (a, b, c) and progressive=True, the sequence matches like ``a[b[c]]``. """ super(Sequence, self).__init__(elements, name) self.sep_pattern = sep if sep: self.sep_expr = rcompile(sep, re.IGNORECASE) else: self.sep_expr = None self.progressive = progressive def parse(self, text, dt, pos=0, debug=-9999): d = adatetime() first = True foundall = False failed = False print_debug(debug, "Seq %s sep=%r text=%r", self.name, self.sep_pattern, text[pos:]) for e in self.elements: print_debug(debug, "Seq %s text=%r", self.name, text[pos:]) if self.sep_expr and not first: print_debug(debug, "Seq %s looking for sep", self.name) m = self.sep_expr.match(text, pos) if m: pos = m.end() else: print_debug(debug, "Seq %s didn't find sep", self.name) break print_debug(debug, "Seq %s trying=%r at=%s", self.name, e, pos) try: at, newpos = e.parse(text, dt, pos=pos, debug=debug + 1) except TimeError: failed = True break print_debug(debug, "Seq %s result=%r", self.name, at) if not at: break pos = newpos print_debug(debug, "Seq %s adding=%r to=%r", self.name, at, d) try: d = fill_in(d, at) except TimeError: print_debug(debug, "Seq %s Error in fill_in", self.name) failed = True break print_debug(debug, "Seq %s filled date=%r", self.name, d) first = False else: foundall = True if not failed and (foundall or (not first and self.progressive)): print_debug(debug, "Seq %s final=%r", self.name, d) return (d, pos) else: print_debug(debug, "Seq %s failed", self.name) return (None, None) class Combo(Sequence): """Parses a sequence of elements in order and combines the dates parsed by the sub-elements somehow. The default behavior is to accept two dates from the sub-elements and turn them into a range. """ def __init__(self, elements, fn=None, sep="(\\s+|\\s*,\\s*)", min=2, max=2, name=None): """ :param elements: the sequence of sub-elements to parse. :param fn: a function to run on all dates found. It should return a datetime, adatetime, or timespan object. If this argument is None, the default behavior accepts two dates and returns a timespan. :param sep: a separator regular expression to match between elements, or None to not have separators. :param min: the minimum number of dates required from the sub-elements. :param max: the maximum number of dates allowed from the sub-elements. 
:param name: a name for this element (for debugging purposes only). """ super(Combo, self).__init__(elements, sep=sep, name=name) self.fn = fn self.min = min self.max = max def parse(self, text, dt, pos=0, debug=-9999): dates = [] first = True print_debug(debug, "Combo %s sep=%r text=%r", self.name, self.sep_pattern, text[pos:]) for e in self.elements: if self.sep_expr and not first: print_debug(debug, "Combo %s looking for sep at %r", self.name, text[pos:]) m = self.sep_expr.match(text, pos) if m: pos = m.end() else: print_debug(debug, "Combo %s didn't find sep", self.name) return (None, None) print_debug(debug, "Combo %s trying=%r", self.name, e) try: at, pos = e.parse(text, dt, pos, debug + 1) except TimeError: at, pos = None, None print_debug(debug, "Combo %s result=%r", self.name, at) if at is None: return (None, None) first = False if is_void(at): continue if len(dates) == self.max: print_debug(debug, "Combo %s length > %s", self.name, self.max) return (None, None) dates.append(at) print_debug(debug, "Combo %s dates=%r", self.name, dates) if len(dates) < self.min: print_debug(debug, "Combo %s length < %s", self.name, self.min) return (None, None) return (self.dates_to_timespan(dates), pos) def dates_to_timespan(self, dates): if self.fn: return self.fn(dates) elif len(dates) == 2: return timespan(dates[0], dates[1]) else: raise DateParseError("Don't know what to do with %r" % (dates,)) class Choice(MultiBase): """Returns the date from the first of its sub-elements that matches. """ def parse(self, text, dt, pos=0, debug=-9999): print_debug(debug, "Choice %s text=%r", self.name, text[pos:]) for e in self.elements: print_debug(debug, "Choice %s trying=%r", self.name, e) try: d, newpos = e.parse(text, dt, pos, debug + 1) except TimeError: d, newpos = None, None if d: print_debug(debug, "Choice %s matched", self.name) return (d, newpos) print_debug(debug, "Choice %s no match", self.name) return (None, None) class Bag(MultiBase): """Parses its sub-elements in any order and merges the dates. """ def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", onceper=True, requireall=False, allof=None, anyof=None, name=None): """ :param elements: the sub-elements to parse. :param sep: a separator regular expression to match between elements, or None to not have separators. :param onceper: only allow each element to match once. :param requireall: if True, the sub-elements can match in any order, but they must all match. :param allof: a list of indexes into the list of elements. When this argument is not None, this element matches only if all the indicated sub-elements match. :param allof: a list of indexes into the list of elements. When this argument is not None, this element matches only if any of the indicated sub-elements match. :param name: a name for this element (for debugging purposes only). 
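For example, the ``English`` parser below combines its time and date
elements as ``Bag((time, dmy))`` so that "3pm tomorrow" and
"tomorrow 3pm" both parse. A sketch:

>>> from datetime import datetime
>>> e = English()
>>> d = e.datetime.date_from("3pm tomorrow", dt=datetime(2012, 5, 1))
>>> # d is an adatetime for 3 PM on 2012-05-02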
""" super(Bag, self).__init__(elements, name) self.sep_expr = rcompile(sep, re.IGNORECASE) self.onceper = onceper self.requireall = requireall self.allof = allof self.anyof = anyof def parse(self, text, dt, pos=0, debug=-9999): first = True d = adatetime() seen = [False] * len(self.elements) while True: newpos = pos print_debug(debug, "Bag %s text=%r", self.name, text[pos:]) if not first: print_debug(debug, "Bag %s looking for sep", self.name) m = self.sep_expr.match(text, pos) if m: newpos = m.end() else: print_debug(debug, "Bag %s didn't find sep", self.name) break for i, e in enumerate(self.elements): print_debug(debug, "Bag %s trying=%r", self.name, e) try: at, xpos = e.parse(text, dt, newpos, debug + 1) except TimeError: at, xpos = None, None print_debug(debug, "Bag %s result=%r", self.name, at) if at: if self.onceper and seen[i]: return (None, None) d = fill_in(d, at) newpos = xpos seen[i] = True break else: break pos = newpos if self.onceper and all(seen): break first = False if (not any(seen) or (self.allof and not all(seen[pos] for pos in self.allof)) or (self.anyof and not any(seen[pos] for pos in self.anyof)) or (self.requireall and not all(seen))): return (None, None) print_debug(debug, "Bag %s final=%r", self.name, d) return (d, pos) class Optional(ParserBase): """Wraps a sub-element to indicate that the sub-element is optional. """ def __init__(self, element): self.element = self.to_parser(element) def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self.element) def parse(self, text, dt, pos=0, debug=-9999): try: d, pos = self.element.parse(text, dt, pos, debug + 1) except TimeError: d, pos = None, None if d: return (d, pos) else: return (adatetime(), pos) class ToEnd(ParserBase): """Wraps a sub-element and requires that the end of the sub-element's match be the end of the text. """ def __init__(self, element): self.element = element def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self.element) def parse(self, text, dt, pos=0, debug=-9999): try: d, pos = self.element.parse(text, dt, pos, debug + 1) except TimeError: d, pos = None, None if d and pos == len(text): return (d, pos) else: return (None, None) class Regex(ParserBase): """Matches a regular expression and maps named groups in the pattern to datetime attributes using a function or overridden method. There are two points at which you can customize the behavior of this class, either by supplying functions to the initializer or overriding methods. * The ``modify`` function or ``modify_props`` method takes a ``Props`` object containing the named groups and modifies its values (in place). * The ``fn`` function or ``props_to_date`` method takes a ``Props`` object and the base datetime and returns an adatetime/datetime. 
""" fn = None modify = None def __init__(self, pattern, fn=None, modify=None): self.pattern = pattern self.expr = rcompile(pattern, re.IGNORECASE) self.fn = fn self.modify = modify def __repr__(self): return "<%r>" % (self.pattern,) def parse(self, text, dt, pos=0, debug=-9999): m = self.expr.match(text, pos) if not m: return (None, None) props = self.extract(m) self.modify_props(props) try: d = self.props_to_date(props, dt) except TimeError: d = None if d: return (d, m.end()) else: return (None, None) def extract(self, match): d = match.groupdict() for key, value in iteritems(d): try: value = int(value) d[key] = value except (ValueError, TypeError): pass return Props(**d) def modify_props(self, props): if self.modify: self.modify(props) def props_to_date(self, props, dt): if self.fn: return self.fn(props, dt) else: args = {} for key in adatetime.units: args[key] = props.get(key) return adatetime(**args) class Month(Regex): def __init__(self, *patterns): self.patterns = patterns self.exprs = [rcompile(pat, re.IGNORECASE) for pat in self.patterns] self.pattern = ("(?P" + "|".join("(%s)" % pat for pat in self.patterns) + ")") self.expr = rcompile(self.pattern, re.IGNORECASE) def modify_props(self, p): text = p.month for i, expr in enumerate(self.exprs): m = expr.match(text) if m: p.month = i + 1 break class PlusMinus(Regex): def __init__(self, years, months, weeks, days, hours, minutes, seconds): rel_years = "((?P[0-9]+) *(%s))?" % years rel_months = "((?P[0-9]+) *(%s))?" % months rel_weeks = "((?P[0-9]+) *(%s))?" % weeks rel_days = "((?P[0-9]+) *(%s))?" % days rel_hours = "((?P[0-9]+) *(%s))?" % hours rel_mins = "((?P[0-9]+) *(%s))?" % minutes rel_secs = "((?P[0-9]+) *(%s))?" % seconds self.pattern = ("(?P
<dir>
[+-]) *%s *%s *%s *%s *%s *%s *%s(?=(\\W|$))" % (rel_years, rel_months, rel_weeks, rel_days, rel_hours, rel_mins, rel_secs)) self.expr = rcompile(self.pattern, re.IGNORECASE) def props_to_date(self, p, dt): if p.dir == "-": dir = -1 else: dir = 1 delta = relativedelta(years=(p.get("years") or 0) * dir, months=(p.get("months") or 0) * dir, weeks=(p.get("weeks") or 0) * dir, days=(p.get("days") or 0) * dir, hours=(p.get("hours") or 0) * dir, minutes=(p.get("mins") or 0) * dir, seconds=(p.get("secs") or 0) * dir) return dt + delta class Daynames(Regex): def __init__(self, next, last, daynames): self.next_pattern = next self.last_pattern = last self._dayname_exprs = tuple(rcompile(pat, re.IGNORECASE) for pat in daynames) dn_pattern = "|".join(daynames) self.pattern = ("(?P%s|%s) +(?P%s)(?=(\\W|$))" % (next, last, dn_pattern)) self.expr = rcompile(self.pattern, re.IGNORECASE) def props_to_date(self, p, dt): if re.match(p.dir, self.last_pattern): dir = -1 else: dir = 1 for daynum, expr in enumerate(self._dayname_exprs): m = expr.match(p.day) if m: break current_daynum = dt.weekday() days_delta = relative_days(current_daynum, daynum, dir) d = dt.date() + timedelta(days=days_delta) return adatetime(year=d.year, month=d.month, day=d.day) class Time12(Regex): def __init__(self): self.pattern = ("(?P[1-9]|10|11|12)(:(?P[0-5][0-9])" "(:(?P[0-5][0-9])(\\.(?P[0-9]{1,5}))?)?)?" "\\s*(?Pam|pm)(?=(\\W|$))") self.expr = rcompile(self.pattern, re.IGNORECASE) def props_to_date(self, p, dt): isam = p.ampm.lower().startswith("a") if p.hour == 12: if isam: hr = 0 else: hr = 12 else: hr = p.hour if not isam: hr += 12 return adatetime(hour=hr, minute=p.mins, second=p.secs, microsecond=p.usecs) # Top-level parser classes class DateParser(object): """Base class for locale-specific parser classes. """ day = Regex("(?P([123][0-9])|[1-9])(?=(\\W|$))(?!=:)", lambda p, dt: adatetime(day=p.day)) year = Regex("(?P[0-9]{4})(?=(\\W|$))", lambda p, dt: adatetime(year=p.year)) time24 = Regex("(?P([0-1][0-9])|(2[0-3])):(?P[0-5][0-9])" "(:(?P[0-5][0-9])(\\.(?P[0-9]{1,5}))?)?" 
"(?=(\\W|$))", lambda p, dt: adatetime(hour=p.hour, minute=p.mins, second=p.secs, microsecond=p.usecs)) time12 = Time12() def __init__(self): simple_year = "(?P[0-9]{4})" simple_month = "(?P[0-1][0-9])" simple_day = "(?P[0-3][0-9])" simple_hour = "(?P([0-1][0-9])|(2[0-3]))" simple_minute = "(?P[0-5][0-9])" simple_second = "(?P[0-5][0-9])" simple_usec = "(?P[0-9]{6})" tup = (simple_year, simple_month, simple_day, simple_hour, simple_minute, simple_second, simple_usec) simple_seq = Sequence(tup, sep="[- .:/]*", name="simple", progressive=True) self.simple = Sequence((simple_seq, "(?=(\\s|$))"), sep='') self.setup() def setup(self): raise NotImplementedError # def get_parser(self): return self.all def parse(self, text, dt, pos=0, debug=-9999): parser = self.get_parser() d, newpos = parser.parse(text, dt, pos=pos, debug=debug) if isinstance(d, (adatetime, timespan)): d = d.disambiguated(dt) return (d, newpos) def date_from(self, text, basedate=None, pos=0, debug=-9999, toend=True): if basedate is None: basedate = datetime.utcnow() parser = self.get_parser() if toend: parser = ToEnd(parser) d = parser.date_from(text, basedate, pos=pos, debug=debug) if isinstance(d, (adatetime, timespan)): d = d.disambiguated(basedate) return d class English(DateParser): day = Regex("(?P([123][0-9])|[1-9])(st|nd|rd|th)?(?=(\\W|$))", lambda p, dt: adatetime(day=p.day)) def setup(self): self.plusdate = PlusMinus("years|year|yrs|yr|ys|y", "months|month|mons|mon|mos|mo", "weeks|week|wks|wk|ws|w", "days|day|dys|dy|ds|d", "hours|hour|hrs|hr|hs|h", "minutes|minute|mins|min|ms|m", "seconds|second|secs|sec|s") self.dayname = Daynames("next", "last", ("monday|mon|mo", "tuesday|tues|tue|tu", "wednesday|wed|we", "thursday|thur|thu|th", "friday|fri|fr", "saturday|sat|sa", "sunday|sun|su")) midnight_l = lambda p, dt: adatetime(hour=0, minute=0, second=0, microsecond=0) midnight = Regex("midnight", midnight_l) noon_l = lambda p, dt: adatetime(hour=12, minute=0, second=0, microsecond=0) noon = Regex("noon", noon_l) now = Regex("now", lambda p, dt: dt) self.time = Choice((self.time12, self.time24, midnight, noon, now), name="time") def tomorrow_to_date(p, dt): d = dt.date() + timedelta(days=+1) return adatetime(year=d.year, month=d.month, day=d.day) tomorrow = Regex("tomorrow", tomorrow_to_date) def yesterday_to_date(p, dt): d = dt.date() + timedelta(days=-1) return adatetime(year=d.year, month=d.month, day=d.day) yesterday = Regex("yesterday", yesterday_to_date) thisyear = Regex("this year", lambda p, dt: adatetime(year=dt.year)) thismonth = Regex("this month", lambda p, dt: adatetime(year=dt.year, month=dt.month)) today = Regex("today", lambda p, dt: adatetime(year=dt.year, month=dt.month, day=dt.day)) self.month = Month("january|jan", "february|febuary|feb", "march|mar", "april|apr", "may", "june|jun", "july|jul", "august|aug", "september|sept|sep", "october|oct", "november|nov", "december|dec") # If you specify a day number you must also specify a month... 
this # Choice captures that constraint self.dmy = Choice((Sequence((self.day, self.month, self.year), name="dmy"), Sequence((self.month, self.day, self.year), name="mdy"), Sequence((self.year, self.month, self.day), name="ymd"), Sequence((self.year, self.day, self.month), name="ydm"), Sequence((self.day, self.month), name="dm"), Sequence((self.month, self.day), name="md"), Sequence((self.month, self.year), name="my"), self.month, self.year, self.dayname, tomorrow, yesterday, thisyear, thismonth, today, now, ), name="date") self.datetime = Bag((self.time, self.dmy), name="datetime") self.bundle = Choice((self.plusdate, self.datetime, self.simple), name="bundle") self.torange = Combo((self.bundle, "to", self.bundle), name="torange") self.all = Choice((self.torange, self.bundle), name="all") # QueryParser plugin class DateParserPlugin(plugins.Plugin): """Adds more powerful parsing of DATETIME fields. >>> parser.add_plugin(DateParserPlugin()) >>> parser.parse(u"date:'last tuesday'") """ def __init__(self, basedate=None, dateparser=None, callback=None, free=False, free_expr="([A-Za-z][A-Za-z_0-9]*):([^^]+)"): """ :param basedate: a datetime object representing the current time against which to measure relative dates. If you do not supply this argument, the plugin uses ``datetime.utcnow()``. :param dateparser: an instance of :class:`whoosh.qparser.dateparse.DateParser`. If you do not supply this argument, the plugin automatically uses :class:`whoosh.qparser.dateparse.English`. :param callback: a callback function for parsing errors. This allows you to provide feedback to the user about problems parsing dates. :param remove: if True, unparseable dates are removed from the token stream instead of being replaced with ErrorToken. :param free: if True, this plugin will install a filter early in the parsing process and try to find undelimited dates such as ``date:last tuesday``. Note that allowing this could result in normal query words accidentally being parsed as dates sometimes. 
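A typical setup (a sketch; ``myschema`` is assumed to contain a DATETIME
field named "date"):

>>> from datetime import datetime
>>> from whoosh import qparser
>>> qp = qparser.QueryParser("content", myschema)
>>> qp.add_plugin(DateParserPlugin(basedate=datetime(2010, 9, 20)))
>>> q = qp.parse(u"date:'last tuesday'")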
""" self.basedate = basedate if dateparser is None: dateparser = English() self.dateparser = dateparser self.callback = callback self.free = free self.freeexpr = free_expr def taggers(self, parser): if self.free: # If we're tokenizing, we have to go before the FieldsPlugin return [(DateTagger(self, self.freeexpr), -1)] else: return () def filters(self, parser): # Run the filter after the FieldsPlugin assigns field names return [(self.do_dates, 110)] def errorize(self, message, node): if self.callback: self.callback(message) return syntax.ErrorNode(message, node) def text_to_dt(self, node): text = node.text try: dt = self.dateparser.date_from(text, self.basedate) if dt is None: return self.errorize(text, node) else: n = DateTimeNode(node.fieldname, dt, node.boost) except DateParseError: e = sys.exc_info()[1] n = self.errorize(e, node) n.startchar = node.startchar n.endchar = node.endchar return n def range_to_dt(self, node): start = end = None dp = self.dateparser.get_parser() if node.start: start = dp.date_from(node.start, self.basedate) if start is None: return self.errorize(node.start, node) if node.end: end = dp.date_from(node.end, self.basedate) if end is None: return self.errorize(node.end, node) if start and end: ts = timespan(start, end).disambiguated(self.basedate) start, end = ts.start, ts.end elif start: start = start.disambiguated(self.basedate) if isinstance(start, timespan): start = start.start elif end: end = end.disambiguated(self.basedate) if isinstance(end, timespan): end = end.end drn = DateRangeNode(node.fieldname, start, end, boost=node.boost) drn.startchar = node.startchar drn.endchar = node.endchar return drn def do_dates(self, parser, group): schema = parser.schema if not schema: return group from whoosh.fields import DATETIME datefields = frozenset(fieldname for fieldname, field in parser.schema.items() if isinstance(field, DATETIME)) for i, node in enumerate(group): if node.has_fieldname: fname = node.fieldname or parser.fieldname else: fname = None if isinstance(node, syntax.GroupNode): group[i] = self.do_dates(parser, node) elif fname in datefields: if node.has_text: group[i] = self.text_to_dt(node) elif isinstance(node, syntax.RangeNode): group[i] = self.range_to_dt(node) return group class DateTimeNode(syntax.SyntaxNode): has_fieldname = True has_boost = True def __init__(self, fieldname, dt, boost=1.0): self.fieldname = fieldname self.dt = dt self.boost = 1.0 def r(self): return repr(self.dt) def query(self, parser): from whoosh import query fieldname = self.fieldname or parser.fieldname field = parser.schema[fieldname] dt = self.dt if isinstance(self.dt, datetime): btext = field.to_bytes(dt) return query.Term(fieldname, btext, boost=self.boost) elif isinstance(self.dt, timespan): return query.DateRange(fieldname, dt.start, dt.end, boost=self.boost) else: raise Exception("Unknown time object: %r" % dt) class DateRangeNode(syntax.SyntaxNode): has_fieldname = True has_boost = True def __init__(self, fieldname, start, end, boost=1.0): self.fieldname = fieldname self.start = start self.end = end self.boost = 1.0 def r(self): return "%r-%r" % (self.start, self.end) def query(self, parser): from whoosh import query fieldname = self.fieldname or parser.fieldname return query.DateRange(fieldname, self.start, self.end, boost=self.boost) class DateTagger(Tagger): def __init__(self, plugin, expr): self.plugin = plugin self.expr = rcompile(expr, re.IGNORECASE) def match(self, parser, text, pos): from whoosh.fields import DATETIME match = self.expr.match(text, pos) if 
match: fieldname = match.group(1) dtext = match.group(2) if parser.schema and fieldname in parser.schema: field = parser.schema[fieldname] if isinstance(field, DATETIME): plugin = self.plugin dateparser = plugin.dateparser basedate = plugin.basedate d, newpos = dateparser.parse(dtext, basedate) if d: node = DateTimeNode(fieldname, d) node.startchar = match.start() node.endchar = newpos + match.start(2) return node Whoosh-2.5.7/src/whoosh/qparser/default.py0000644000076500000240000004105412254366350020647 0ustar mattstaff00000000000000# Copyright 2011 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. import sys from whoosh import query from whoosh.compat import text_type from whoosh.qparser import syntax from whoosh.qparser.common import print_debug, QueryParserError # Query parser object class QueryParser(object): """A hand-written query parser built on modular plug-ins. The default configuration implements a powerful fielded query language similar to Lucene's. You can use the ``plugins`` argument when creating the object to override the default list of plug-ins, and/or use ``add_plugin()`` and/or ``remove_plugin_class()`` to change the plug-ins included in the parser. >>> from whoosh import qparser >>> parser = qparser.QueryParser("content", schema) >>> parser.remove_plugin_class(qparser.WildcardPlugin) >>> parser.add_plugin(qparser.PrefixPlugin()) >>> parser.parse(u"hello there") And([Term("content", u"hello"), Term("content", u"there")]) """ def __init__(self, fieldname, schema, plugins=None, termclass=query.Term, phraseclass=query.Phrase, group=syntax.AndGroup): """ :param fieldname: the default field -- the parser uses this as the field for any terms without an explicit field. :param schema: a :class:`whoosh.fields.Schema` object to use when parsing. The appropriate fields in the schema will be used to tokenize terms/phrases before they are turned into query objects. You can specify None for the schema to create a parser that does not analyze the text of the query, usually for testing purposes. :param plugins: a list of plugins to use. 
WhitespacePlugin is automatically included, do not put it in this list. This overrides the default list of plugins. Classes in the list will be automatically instantiated. :param termclass: the query class to use for individual search terms. The default is :class:`whoosh.query.Term`. :param phraseclass: the query class to use for phrases. The default is :class:`whoosh.query.Phrase`. :param group: the default grouping. ``AndGroup`` makes terms required by default. ``OrGroup`` makes terms optional by default. """ self.fieldname = fieldname self.schema = schema self.termclass = termclass self.phraseclass = phraseclass self.group = group self.plugins = [] if not plugins: plugins = self.default_set() self._add_ws_plugin() self.add_plugins(plugins) def default_set(self): """Returns the default list of plugins to use. """ from whoosh.qparser import plugins return [plugins.WhitespacePlugin(), plugins.SingleQuotePlugin(), plugins.FieldsPlugin(), plugins.WildcardPlugin(), plugins.PhrasePlugin(), plugins.RangePlugin(), plugins.GroupPlugin(), plugins.OperatorsPlugin(), plugins.BoostPlugin(), plugins.EveryPlugin(), ] def add_plugins(self, pins): """Adds the given list of plugins to the list of plugins in this parser. """ for pin in pins: self.add_plugin(pin) def add_plugin(self, pin): """Adds the given plugin to the list of plugins in this parser. """ if isinstance(pin, type): pin = pin() self.plugins.append(pin) def _add_ws_plugin(self): from whoosh.qparser.plugins import WhitespacePlugin self.add_plugin(WhitespacePlugin()) def remove_plugin(self, pi): """Removes the given plugin object from the list of plugins in this parser. """ self.plugins.remove(pi) def remove_plugin_class(self, cls): """Removes any plugins of the given class from this parser. """ self.plugins = [pi for pi in self.plugins if not isinstance(pi, cls)] def replace_plugin(self, plugin): """Removes any plugins of the class of the given plugin and then adds it. This is a convenience method to keep from having to call ``remove_plugin_class`` followed by ``add_plugin`` each time you want to reconfigure a default plugin. >>> qp = qparser.QueryParser("content", schema) >>> qp.replace_plugin(qparser.NotPlugin("(^| )-")) """ self.remove_plugin_class(plugin.__class__) self.add_plugin(plugin) def _priorized(self, methodname): # methodname is "taggers" or "filters". Returns a priorized list of # tagger objects or filter functions. items_and_priorities = [] for plugin in self.plugins: # Call either .taggers() or .filters() on the plugin method = getattr(plugin, methodname) for item in method(self): items_and_priorities.append(item) # Sort the list by priority (lower priority runs first) items_and_priorities.sort(key=lambda x: x[1]) # Return the sorted list without the priorities return [item for item, _ in items_and_priorities] def multitoken_query(self, spec, texts, fieldname, termclass, boost): """Returns a query for multiple texts. This method implements the intention specified in the field's ``multitoken_query`` attribute, which specifies what to do when strings that look like single terms to the parser turn out to yield multiple tokens when analyzed. :param spec: a string describing how to join the text strings into a query. This is usually the value of the field's ``multitoken_query`` attribute. :param texts: a list of token strings. :param fieldname: the name of the field. :param termclass: the query class to use for single terms. :param boost: the original term's boost in the query string, should be applied to the returned query object. 
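        As an illustration (the field name is hypothetical), if analyzing
        the string "full-text" yields the tokens ["full", "text"], then
        ``spec="or"`` produces roughly::

            Or([Term("content", u"full"), Term("content", u"text")])

        ``spec="phrase"`` produces a single Phrase query, and
        ``spec="first"`` keeps only ``Term("content", u"full")``.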
""" spec = spec.lower() if spec == "first": # Throw away all but the first token return termclass(fieldname, texts[0], boost=boost) elif spec == "phrase": # Turn the token into a phrase return self.phraseclass(fieldname, texts, boost=boost) else: if spec == "default": qclass = self.group.qclass elif spec == "and": qclass = query.And elif spec == "or": qclass = query.Or else: raise QueryParserError("Unknown multitoken_query value %r" % spec) return qclass([termclass(fieldname, t, boost=boost) for t in texts]) def term_query(self, fieldname, text, termclass, boost=1.0, tokenize=True, removestops=True): """Returns the appropriate query object for a single term in the query string. """ if self.schema and fieldname in self.schema: field = self.schema[fieldname] # If this field type wants to parse queries itself, let it do so # and return early if field.self_parsing(): try: q = field.parse_query(fieldname, text, boost=boost) return q except: e = sys.exc_info()[1] return query.error_query(e) # Otherwise, ask the field to process the text into a list of # tokenized strings texts = list(field.process_text(text, mode="query", tokenize=tokenize, removestops=removestops)) # If the analyzer returned more than one token, use the field's # multitoken_query attribute to decide what query class, if any, to # use to put the tokens together if len(texts) > 1: return self.multitoken_query(field.multitoken_query, texts, fieldname, termclass, boost) # It's possible field.process_text() will return an empty list (for # example, on a stop word) if not texts: return None text = texts[0] return termclass(fieldname, text, boost=boost) def taggers(self): """Returns a priorized list of tagger objects provided by the parser's currently configured plugins. """ return self._priorized("taggers") def filters(self): """Returns a priorized list of filter functions provided by the parser's currently configured plugins. """ return self._priorized("filters") def tag(self, text, pos=0, debug=False): """Returns a group of syntax nodes corresponding to the given text, created by matching the Taggers provided by the parser's plugins. :param text: the text to tag. :param pos: the position in the text to start tagging at. """ # The list out output tags stack = [] # End position of the previous match prev = pos # Priorized list of taggers provided by the parser's plugins taggers = self.taggers() if debug: print_debug(debug, "Taggers: %r" % taggers) # Define a function that will make a WordNode from the "interstitial" # text between matches def inter(startchar, endchar): n = syntax.WordNode(text[startchar:endchar]) n.startchar = startchar n.endchar = endchar return n while pos < len(text): node = None # Try each tagger to see if it matches at the current position for tagger in taggers: node = tagger.match(self, text, pos) if node is not None: if node.endchar <= pos: raise Exception("Token %r did not move cursor forward." 
" (%r, %s)" % (tagger, text, pos)) if prev < pos: tween = inter(prev, pos) if debug: print_debug(debug, "Tween: %r" % tween) stack.append(tween) if debug: print_debug(debug, "Tagger: %r at %s: %r" % (tagger, pos, node)) stack.append(node) prev = pos = node.endchar break if not node: # No taggers matched, move forward pos += 1 # If there's unmatched text left over on the end, put it in a WordNode if prev < len(text): stack.append(inter(prev, len(text))) # Wrap the list of nodes in a group node group = self.group(stack) if debug: print_debug(debug, "Tagged group: %r" % group) return group def filterize(self, nodes, debug=False): """Takes a group of nodes and runs the filters provided by the parser's plugins. """ # Call each filter in the priorized list of plugin filters if debug: print_debug(debug, "Pre-filtered group: %r" % nodes) for f in self.filters(): if debug: print_debug(debug, "..Applying: %r" % f) nodes = f(self, nodes) if debug: print_debug(debug, "..Result: %r" % nodes) if nodes is None: raise Exception("Filter %r did not return anything" % f) return nodes def process(self, text, pos=0, debug=False): """Returns a group of syntax nodes corresponding to the given text, tagged by the plugin Taggers and filtered by the plugin filters. :param text: the text to tag. :param pos: the position in the text to start tagging at. """ nodes = self.tag(text, pos=pos, debug=debug) nodes = self.filterize(nodes, debug=debug) return nodes def parse(self, text, normalize=True, debug=False): """Parses the input string and returns a :class:`whoosh.query.Query` object/tree. :param text: the unicode string to parse. :param normalize: whether to call normalize() on the query object/tree before returning it. This should be left on unless you're trying to debug the parser output. :rtype: :class:`whoosh.query.Query` """ if not isinstance(text, text_type): text = text.decode("latin1") nodes = self.process(text, debug=debug) if debug: print_debug(debug, "Syntax tree: %r" % nodes) q = nodes.query(self) if not q: q = query.NullQuery if debug: print_debug(debug, "Pre-normalized query: %r" % q) if normalize: q = q.normalize() if debug: print_debug(debug, "Normalized query: %r" % q) return q def parse_(self, text, normalize=True): pass # Premade parser configurations def MultifieldParser(fieldnames, schema, fieldboosts=None, **kwargs): """Returns a QueryParser configured to search in multiple fields. Instead of assigning unfielded clauses to a default field, this parser transforms them into an OR clause that searches a list of fields. For example, if the list of multi-fields is "f1", "f2" and the query string is "hello there", the class will parse "(f1:hello OR f2:hello) (f1:there OR f2:there)". This is very useful when you have two textual fields (e.g. "title" and "content") you want to search by default. :param fieldnames: a list of field names to search. :param fieldboosts: an optional dictionary mapping field names to boosts. """ from whoosh.qparser.plugins import MultifieldPlugin p = QueryParser(None, schema, **kwargs) mfp = MultifieldPlugin(fieldnames, fieldboosts=fieldboosts) p.add_plugin(mfp) return p def SimpleParser(fieldname, schema, **kwargs): """Returns a QueryParser configured to support only +, -, and phrase syntax. 
""" from whoosh.qparser import plugins pins = [plugins.WhitespacePlugin, plugins.PlusMinusPlugin, plugins.PhrasePlugin] return QueryParser(fieldname, schema, plugins=pins, **kwargs) def DisMaxParser(fieldboosts, schema, tiebreak=0.0, **kwargs): """Returns a QueryParser configured to support only +, -, and phrase syntax, and which converts individual terms into DisjunctionMax queries across a set of fields. :param fieldboosts: a dictionary mapping field names to boosts. """ from whoosh.qparser import plugins mfp = plugins.MultifieldPlugin(list(fieldboosts.keys()), fieldboosts=fieldboosts, group=syntax.DisMaxGroup) pins = [plugins.WhitespacePlugin, plugins.PlusMinusPlugin, plugins.PhrasePlugin, mfp] return QueryParser(None, schema, plugins=pins, **kwargs) Whoosh-2.5.7/src/whoosh/qparser/plugins.py0000644000076500000240000014167712254366350020720 0ustar mattstaff00000000000000# Copyright 2011 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. import copy from whoosh import query from whoosh.compat import u from whoosh.compat import iteritems, xrange from whoosh.qparser import syntax from whoosh.qparser.common import attach from whoosh.qparser.taggers import RegexTagger, FnTagger from whoosh.util.text import rcompile class Plugin(object): """Base class for parser plugins. """ def taggers(self, parser): """Should return a list of ``(Tagger, priority)`` tuples to add to the syntax the parser understands. Lower priorities run first. """ return () def filters(self, parser): """Should return a list of ``(filter_function, priority)`` tuples to add to parser. Lower priority numbers run first. Filter functions will be called with ``(parser, groupnode)`` and should return a group node. """ return () class TaggingPlugin(RegexTagger): """A plugin that also acts as a Tagger, to avoid having an extra Tagger class for simple cases. A TaggingPlugin object should have a ``priority`` attribute and either a ``nodetype`` attribute or a ``create()`` method. If the subclass doesn't override ``create()``, the base class will call ``self.nodetype`` with the Match object's named groups as keyword arguments. 
""" priority = 0 def __init__(self, expr=None): self.expr = rcompile(expr or self.expr) def taggers(self, parser): return [(self, self.priority)] def filters(self, parser): return () def create(self, parser, match): # Groupdict keys can be unicode sometimes apparently? Convert them to # str for use as keyword arguments. This should be Py3-safe. kwargs = dict((str(k), v) for k, v in iteritems(match.groupdict())) return self.nodetype(**kwargs) class WhitespacePlugin(TaggingPlugin): """Tags whitespace and removes it at priority 500. Depending on whether your plugin's filter wants to see where whitespace was in the original query, it should run with priority lower than 500 (before removal of whitespace) or higher than 500 (after removal of whitespace). """ nodetype = syntax.Whitespace priority = 100 def __init__(self, expr=r"\s+"): TaggingPlugin.__init__(self, expr) def filters(self, parser): return [(self.remove_whitespace, 500)] def remove_whitespace(self, parser, group): newgroup = group.empty_copy() for node in group: if isinstance(node, syntax.GroupNode): newgroup.append(self.remove_whitespace(parser, node)) elif not node.is_ws(): newgroup.append(node) return newgroup class SingleQuotePlugin(TaggingPlugin): """Adds the ability to specify single "terms" containing spaces by enclosing them in single quotes. """ expr = r"(^|(?<=\W))'(?P.*?)'(?=\s|\]|[)}]|$)" nodetype = syntax.WordNode class PrefixPlugin(TaggingPlugin): """Adds the ability to specify prefix queries by ending a term with an asterisk. This plugin is useful if you want the user to be able to create prefix but not wildcard queries (for performance reasons). If you are including the wildcard plugin, you should not include this plugin as well. >>> qp = qparser.QueryParser("content", myschema) >>> qp.remove_plugin_class(qparser.WildcardPlugin) >>> qp.add_plugin(qparser.PrefixPlugin()) >>> q = qp.parse("pre*") """ class PrefixNode(syntax.TextNode): qclass = query.Prefix def r(self): return "%r*" % self.text expr = "(?P[^ \t\r\n*]+)[*](?= |$|\\))" nodetype = PrefixNode class WildcardPlugin(TaggingPlugin): # \u055E = Armenian question mark # \u061F = Arabic question mark # \u1367 = Ethiopic question mark qmarks = u("?\u055E\u061F\u1367") expr = "(?P[*%s])" % qmarks def filters(self, parser): # Run early, but definitely before multifield plugin return [(self.do_wildcards, 50)] def do_wildcards(self, parser, group): i = 0 while i < len(group): node = group[i] if isinstance(node, self.WildcardNode): if i < len(group) - 1 and group[i + 1].is_text(): nextnode = group.pop(i + 1) node.text += nextnode.text if i > 0 and group[i - 1].is_text(): prevnode = group.pop(i - 1) node.text = prevnode.text + node.text else: i += 1 else: if isinstance(node, syntax.GroupNode): self.do_wildcards(parser, node) i += 1 for i in xrange(len(group)): node = group[i] if isinstance(node, self.WildcardNode): text = node.text if len(text) > 1 and not any(qm in text for qm in self.qmarks): if text.find("*") == len(text) - 1: newnode = PrefixPlugin.PrefixNode(text[:-1]) newnode.startchar = node.startchar newnode.endchar = node.endchar group[i] = newnode return group class WildcardNode(syntax.TextNode): # Note that this node inherits tokenize = False from TextNode, # so the text in this node will not be analyzed... just passed # straight to the query qclass = query.Wildcard def r(self): return "Wild %r" % self.text nodetype = WildcardNode class RegexPlugin(TaggingPlugin): """Adds the ability to specify regular expression term queries. 
The default syntax for a regular expression term is ``r"termexpr"``. >>> qp = qparser.QueryParser("content", myschema) >>> qp.add_plugin(qparser.RegexPlugin()) >>> q = qp.parse('foo title:r"bar+"') """ class RegexNode(syntax.TextNode): qclass = query.Regex def r(self): return "Regex %r" % self.text expr = 'r"(?P[^"]*)"' nodetype = RegexNode class BoostPlugin(TaggingPlugin): """Adds the ability to boost clauses of the query using the circumflex. >>> qp = qparser.QueryParser("content", myschema) >>> q = qp.parse("hello there^2") """ expr = "\\^(?P[0-9]*(\\.[0-9]+)?)($|(?=[ \t\r\n)]))" class BoostNode(syntax.SyntaxNode): def __init__(self, original, boost): self.original = original self.boost = boost def r(self): return "^ %s" % self.boost def create(self, parser, match): # Override create so we can grab group 0 original = match.group(0) try: boost = float(match.group("boost")) except ValueError: # The text after the ^ wasn't a valid number, so turn it into a # word node = syntax.WordNode(original) else: node = self.BoostNode(original, boost) return node def filters(self, parser): return [(self.clean_boost, 0), (self.do_boost, 510)] def clean_boost(self, parser, group): """This filter finds any BoostNodes in positions where they can't boost the previous node (e.g. at the very beginning, after whitespace, or after another BoostNode) and turns them into WordNodes. """ bnode = self.BoostNode for i, node in enumerate(group): if isinstance(node, bnode): if (not i or not group[i - 1].has_boost): group[i] = syntax.to_word(node) return group def do_boost(self, parser, group): """This filter finds BoostNodes and applies the boost to the previous node. """ newgroup = group.empty_copy() for node in group: if isinstance(node, syntax.GroupNode): node = self.do_boost(parser, node) elif isinstance(node, self.BoostNode): if (newgroup and newgroup[-1].has_boost): # Apply the BoostNode's boost to the previous node newgroup[-1].set_boost(node.boost) # Skip adding the BoostNode to the new group continue else: node = syntax.to_word(node) newgroup.append(node) return newgroup class GroupPlugin(Plugin): """Adds the ability to group clauses using parentheses. """ # Marker nodes for open and close bracket class OpenBracket(syntax.SyntaxNode): def r(self): return "(" class CloseBracket(syntax.SyntaxNode): def r(self): return ")" def __init__(self, openexpr="[(]", closeexpr="[)]"): self.openexpr = openexpr self.closeexpr = closeexpr def taggers(self, parser): return [(FnTagger(self.openexpr, self.OpenBracket, "openB"), 0), (FnTagger(self.closeexpr, self.CloseBracket, "closeB"), 0)] def filters(self, parser): return [(self.do_groups, 0)] def do_groups(self, parser, group): """This filter finds open and close bracket markers in a flat group and uses them to organize the nodes into a hierarchy. 
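        For example (default field name assumed to be "content"),
        parentheses in the query string produce a nested group, which in
        turn becomes a nested query, roughly:

        >>> qp = qparser.QueryParser("content", None)
        >>> qp.parse(u"alfa (bravo OR charlie)")
        And([Term("content", u"alfa"), Or([Term("content", u"bravo"), Term("content", u"charlie")])])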
""" ob, cb = self.OpenBracket, self.CloseBracket # Group hierarchy stack stack = [parser.group()] for node in group: if isinstance(node, ob): # Open bracket: push a new level of hierarchy on the stack stack.append(parser.group()) elif isinstance(node, cb): # Close bracket: pop the current level of hierarchy and append # it to the previous level if len(stack) > 1: last = stack.pop() stack[-1].append(last) else: # Anything else: add it to the current level of hierarchy stack[-1].append(node) top = stack[0] # If the parens were unbalanced (more opens than closes), just take # whatever levels of hierarchy were left on the stack and tack them on # the end of the top-level if len(stack) > 1: for ls in stack[1:]: top.extend(ls) if len(top) == 1 and isinstance(top[0], syntax.GroupNode): boost = top.boost top = top[0] top.boost = boost return top class EveryPlugin(TaggingPlugin): expr = "[*]:[*]" priority = -1 def create(self, parser, match): return self.EveryNode() class EveryNode(syntax.SyntaxNode): def r(self): return "*:*" def query(self, parser): return query.Every() class FieldsPlugin(TaggingPlugin): """Adds the ability to specify the field of a clause. """ class FieldnameTagger(RegexTagger): def create(self, parser, match): return syntax.FieldnameNode(match.group("text"), match.group(0)) def __init__(self, expr=r"(?P\w+|[*]):", remove_unknown=True): """ :param expr: the regular expression to use for tagging fields. :param remove_unknown: if True, converts field specifications for fields that aren't in the schema into regular text. """ self.expr = expr self.removeunknown = remove_unknown def taggers(self, parser): return [(self.FieldnameTagger(self.expr), 0)] def filters(self, parser): return [(self.do_fieldnames, 100)] def do_fieldnames(self, parser, group): """This filter finds FieldnameNodes in the tree and applies their fieldname to the next node. 
""" fnclass = syntax.FieldnameNode if self.removeunknown and parser.schema: # Look for field nodes that aren't in the schema and convert them # to text schema = parser.schema newgroup = group.empty_copy() prev_field_node = None for node in group: if isinstance(node, fnclass) and node.fieldname not in schema: prev_field_node = node continue elif prev_field_node: # If prev_field_node is not None, it contains a field node # that appeared before this node but isn't in the schema, # so we'll convert it to text here if node.has_text: node.text = prev_field_node.original + node.text else: newgroup.append(syntax.to_word(prev_field_node)) prev_field_node = None newgroup.append(node) if prev_field_node: newgroup.append(syntax.to_word(prev_field_node)) group = newgroup newgroup = group.empty_copy() # Iterate backwards through the stream, looking for field-able objects # with field nodes in front of them i = len(group) while i > 0: i -= 1 node = group[i] if isinstance(node, fnclass): # If we see a fieldname node, it must not have been in front # of something fieldable, since we would have already removed # it (since we're iterating backwards), so convert it to text node = syntax.to_word(node) elif isinstance(node, syntax.GroupNode): node = self.do_fieldnames(parser, node) if i > 0 and not node.is_ws() and isinstance(group[i - 1], fnclass): node.set_fieldname(group[i - 1].fieldname, override=False) i -= 1 newgroup.append(node) newgroup.reverse() return newgroup class FuzzyTermPlugin(TaggingPlugin): """Adds syntax to the query parser to create "fuzzy" term queries, which match any term within a certain "edit distance" (number of inserted, deleted, or transposed characters) by appending a tilde (``~``) and an optional maximum edit distance to a term. If you don't specify an explicit maximum edit distance, the default is 1. >>> qp = qparser.QueryParser("content", myschema) >>> qp.add_plugin(qparser.FuzzyTermPlugin()) >>> q = qp.parse("Stephen~2 Colbert") For example, the following query creates a :class:`whoosh.query.FuzzyTerm` query with a maximum edit distance of 1:: bob~ The following creates a fuzzy term query with a maximum edit distance of 2:: bob~2 The maximum edit distance can only be a single digit. Note that edit distances greater than 2 can take an extremely long time and are generally not useful. You can specify a prefix length using ``~n/m``. For example, to allow a maximum edit distance of 2 and require a prefix match of 3 characters:: johannson~2/3 To specify a prefix with the default edit distance:: johannson~/3 """ expr = rcompile(""" (?<=\\S) # Only match right after non-space ~ # Initial tilde (?P[0-9])? # Optional maxdist (/ # Optional prefix slash (?P[1-9][0-9]*) # prefix )? 
# (end prefix group) """, verbose=True) class FuzzinessNode(syntax.SyntaxNode): def __init__(self, maxdist, prefixlength, original): self.maxdist = maxdist self.prefixlength = prefixlength self.original = original def __repr__(self): return "<~%d/%d>" % (self.maxdist, self.prefixlength) class FuzzyTermNode(syntax.TextNode): qclass = query.FuzzyTerm def __init__(self, wordnode, maxdist, prefixlength): self.fieldname = wordnode.fieldname self.text = wordnode.text self.boost = wordnode.boost self.startchar = wordnode.startchar self.endchar = wordnode.endchar self.maxdist = maxdist self.prefixlength = prefixlength def r(self): return "%r ~%d/%d" % (self.text, self.maxdist, self.prefixlength) def query(self, parser): # Use the superclass's query() method to create a FuzzyTerm query # (it looks at self.qclass), just because it takes care of some # extra checks and attributes q = syntax.TextNode.query(self, parser) # Set FuzzyTerm-specific attributes q.maxdist = self.maxdist q.prefixlength = self.prefixlength return q def create(self, parser, match): mdstr = match.group("maxdist") maxdist = int(mdstr) if mdstr else 1 pstr = match.group("prefix") prefixlength = int(pstr) if pstr else 0 return self.FuzzinessNode(maxdist, prefixlength, match.group(0)) def filters(self, parser): return [(self.do_fuzzyterms, 0)] def do_fuzzyterms(self, parser, group): newgroup = group.empty_copy() i = 0 while i < len(group): node = group[i] if i < len(group) - 1 and isinstance(node, syntax.WordNode): nextnode = group[i + 1] if isinstance(nextnode, self.FuzzinessNode): node = self.FuzzyTermNode(node, nextnode.maxdist, nextnode.prefixlength) i += 1 if isinstance(node, self.FuzzinessNode): node = syntax.to_word(node) if isinstance(node, syntax.GroupNode): node = self.do_fuzzyterms(parser, node) newgroup.append(node) i += 1 return newgroup class FunctionPlugin(TaggingPlugin): """Adds an abitrary "function call" syntax to the query parser to allow advanced and extensible query functionality. This is unfinished and experimental. """ expr = rcompile(""" [#](?P[A-Za-z_][A-Za-z0-9._]*) # function name ( # optional args \\[ # inside square brackets (?P.*?) \\] )? """, verbose=True) class FunctionNode(syntax.SyntaxNode): has_fieldname = False has_boost = True merging = False def __init__(self, name, fn, args, kwargs): self.name = name self.fn = fn self.args = args self.kwargs = kwargs self.nodes = [] self.boost = None def __repr__(self): return "#%s<%r>(%r)" % (self.name, self.args, self.nodes) def query(self, parser): qs = [n.query(parser) for n in self.nodes] kwargs = self.kwargs if "boost" not in kwargs and self.boost is not None: kwargs["boost"] = self.boost # TODO: If this call raises an exception, return an error query return self.fn(qs, *self.args, **self.kwargs) def __init__(self, fns): """ :param fns: a dictionary mapping names to functions that return a query. 
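        Sketch of wiring up a function (the function and field names are
        made up for illustration; this plugin is experimental)::

            from whoosh import query

            def make_term(qs, fieldname, text):
                # qs is the list of sub-queries from a following group
                # (unused in this simple example)
                return query.Term(fieldname, text)

            qp = qparser.QueryParser("content", myschema)
            qp.add_plugin(FunctionPlugin({"term": make_term}))
            q = qp.parse(u"#term[title, hello]")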
""" self.fns = fns def create(self, parser, match): name = match.group("name") if name in self.fns: fn = self.fns[name] argstring = match.group("args") if argstring: args, kwargs = self._parse_args(argstring) else: args = () kwargs = {} return self.FunctionNode(name, fn, args, kwargs) def _parse_args(self, argstring): args = [] kwargs = {} parts = argstring.split(",") for part in parts: if "=" in part: name, value = part.split("=", 1) # Wrap with str() because Python 2.5 can't handle unicode kws name = str(name.strip()) else: name = None value = part value = value.strip() if value.startswith("'") and value.endswith("'"): value = value[1:-1] if name: kwargs[name] = value else: args.append(value) return args, kwargs def filters(self, parser): return [(self.do_functions, 600)] def do_functions(self, parser, group): newgroup = group.empty_copy() i = 0 while i < len(group): node = group[i] if (isinstance(node, self.FunctionNode) and i < len(group) - 1 and isinstance(group[i + 1], syntax.GroupNode)): nextnode = group[i + 1] node.nodes = list(self.do_functions(parser, nextnode)) if nextnode.boost != 1: node.set_boost(nextnode.boost) i += 1 elif isinstance(node, syntax.GroupNode): node = self.do_functions(parser, node) newgroup.append(node) i += 1 return newgroup class PhrasePlugin(Plugin): """Adds the ability to specify phrase queries inside double quotes. """ # Didn't use TaggingPlugin because I need to add slop parsing at some # point # Expression used to find words if a schema isn't available wordexpr = rcompile(r'\S+') class PhraseNode(syntax.TextNode): def __init__(self, text, textstartchar, slop=1): syntax.TextNode.__init__(self, text) self.textstartchar = textstartchar self.slop = slop def r(self): return "%s %r~%s" % (self.__class__.__name__, self.text, self.slop) def apply(self, fn): return self.__class__(self.type, [fn(node) for node in self.nodes], slop=self.slop, boost=self.boost) def query(self, parser): text = self.text fieldname = self.fieldname or parser.fieldname # We want to process the text of the phrase into "words" (tokens), # and also record the startchar and endchar of each word sc = self.textstartchar if parser.schema and fieldname in parser.schema: field = parser.schema[fieldname] if field.analyzer: # We have a field with an analyzer, so use it to parse # the phrase into tokens tokens = field.tokenize(text, mode="query", chars=True) words = [] char_ranges = [] for t in tokens: words.append(t.text) char_ranges.append((sc + t.startchar, sc + t.endchar)) else: # We have a field but it doesn't have a format object, # for some reason (it's self-parsing?), so use process_text # to get the texts (we won't know the start/end chars) words = list(field.process_text(text, mode="query")) char_ranges = [(None, None)] * len(words) else: # We're parsing without a schema, so just use the default # regular expression to break the text into words words = [] char_ranges = [] for match in PhrasePlugin.wordexpr.finditer(text): words.append(match.group(0)) char_ranges.append((sc + match.start(), sc + match.end())) qclass = parser.phraseclass q = qclass(fieldname, words, slop=self.slop, boost=self.boost, char_ranges=char_ranges) return attach(q, self) class PhraseTagger(RegexTagger): def create(self, parser, match): text = match.group("text") textstartchar = match.start("text") slopstr = match.group("slop") slop = int(slopstr) if slopstr else 1 return PhrasePlugin.PhraseNode(text, textstartchar, slop) def __init__(self, expr='"(?P.*?)"(~(?P[1-9][0-9]*))?'): self.expr = expr def taggers(self, 
parser): return [(self.PhraseTagger(self.expr), 0)] class SequencePlugin(Plugin): """Adds the ability to group arbitrary queries inside double quotes to produce a query matching the individual sub-queries in sequence. To enable this plugin, first remove the default PhrasePlugin, then add this plugin:: qp = qparser.QueryParser("field", my_schema) qp.remove_plugin_class(qparser.PhrasePlugin) qp.add_plugin(qparser.SequencePlugin()) This enables parsing "phrases" such as:: "(jon OR john OR jonathan~1) smith*" """ def __init__(self, expr='["](~(?P[1-9][0-9]*))?'): """ :param expr: a regular expression for the marker at the start and end of a phrase. The default is the double-quotes character. """ self.expr = expr class SequenceNode(syntax.GroupNode): qclass = query.Sequence class QuoteNode(syntax.MarkerNode): def __init__(self, slop=None): self.slop = int(slop) if slop else 1 def taggers(self, parser): return [(FnTagger(self.expr, self.QuoteNode, "quote"), 0)] def filters(self, parser): return [(self.do_quotes, 550)] def do_quotes(self, parser, group): # New group to copy nodes into newgroup = group.empty_copy() # Buffer for sequence nodes; when it's None, it means we're not in # a sequence seq = None # Start copying nodes from group to newgroup. When we find a quote # node, start copying nodes into the buffer instead. When we find # the next (end) quote, put the buffered nodes into a SequenceNode # and add it to newgroup. for node in group: if isinstance(node, syntax.GroupNode): # Recurse node = self.do_quotes(parser, node) if isinstance(node, self.QuoteNode): if seq is None: # Start a new sequence seq = [] else: # End the current sequence sn = self.SequenceNode(seq, slop=node.slop) newgroup.append(sn) seq = None elif seq is None: # Not in a sequence, add directly newgroup.append(node) else: # In a sequence, add it to the buffer seq.append(node) # We can end up with buffered nodes if there was an unbalanced quote; # just add the buffered nodes directly to newgroup if seq is not None: newgroup.extend(seq) return newgroup class RangePlugin(Plugin): """Adds the ability to specify term ranges. """ expr = rcompile(r""" (?P\{|\[) # Open paren (?P ('[^']*?'\s+) # single-quoted | # or ([^\]}]+?(?=[Tt][Oo])) # everything until "to" )? [Tt][Oo] # "to" (?P (\s+'[^']*?') # single-quoted | # or ([^\]}]+?) # everything until "]" or "}" )? (?P}|]) # Close paren """, verbose=True) class RangeTagger(RegexTagger): def __init__(self, expr, excl_start, excl_end): self.expr = expr self.excl_start = excl_start self.excl_end = excl_end def create(self, parser, match): start = match.group("start") end = match.group("end") if start: # Strip the space before the "to" start = start.rstrip() # Strip single quotes if start.startswith("'") and start.endswith("'"): start = start[1:-1] if end: # Strip the space before the "to" end = end.lstrip() # Strip single quotes if end.startswith("'") and end.endswith("'"): end = end[1:-1] # What kind of open and close brackets were used? startexcl = match.group("open") == self.excl_start endexcl = match.group("close") == self.excl_end rn = syntax.RangeNode(start, end, startexcl, endexcl) return rn def __init__(self, expr=None, excl_start="{", excl_end="}"): self.expr = expr or self.expr self.excl_start = excl_start self.excl_end = excl_end def taggers(self, parser): tagger = self.RangeTagger(self.expr, self.excl_start, self.excl_end) return [(tagger, 1)] class OperatorsPlugin(Plugin): """By default, adds the AND, OR, ANDNOT, ANDMAYBE, and NOT operators to the parser syntax. 
This plugin scans the token stream for subclasses of :class:`Operator` and calls their :meth:`Operator.make_group` methods to allow them to manipulate the stream. There are two levels of configuration available. The first level is to change the regular expressions of the default operators, using the ``And``, ``Or``, ``AndNot``, ``AndMaybe``, and/or ``Not`` keyword arguments. The keyword value can be a pattern string or a compiled expression, or None to remove the operator:: qp = qparser.QueryParser("content", schema) cp = qparser.OperatorsPlugin(And="&", Or="\\|", AndNot="&!", AndMaybe="&~", Not=None) qp.replace_plugin(cp) You can also specify a list of ``(OpTagger, priority)`` pairs as the first argument to the initializer to use custom operators. See :ref:`custom-op` for more information on this. """ class OpTagger(RegexTagger): def __init__(self, expr, grouptype, optype=syntax.InfixOperator, leftassoc=True, memo=""): RegexTagger.__init__(self, expr) self.grouptype = grouptype self.optype = optype self.leftassoc = leftassoc self.memo = memo def __repr__(self): return "<%s %r (%s)>" % (self.__class__.__name__, self.expr.pattern, self.memo) def create(self, parser, match): return self.optype(match.group(0), self.grouptype, self.leftassoc) def __init__(self, ops=None, clean=False, And=r"(?<=\s)AND(?=\s)", Or=r"(?<=\s)OR(?=\s)", AndNot=r"(?<=\s)ANDNOT(?=\s)", AndMaybe=r"(?<=\s)ANDMAYBE(?=\s)", Not=r"(^|(?<=(\s|[()])))NOT(?=\s)", Require=r"(^|(?<=\s))REQUIRE(?=\s)"): if ops: ops = list(ops) else: ops = [] if not clean: ot = self.OpTagger if Not: ops.append((ot(Not, syntax.NotGroup, syntax.PrefixOperator, memo="not"), 0)) if And: ops.append((ot(And, syntax.AndGroup, memo="and"), 0)) if Or: ops.append((ot(Or, syntax.OrGroup, memo="or"), 0)) if AndNot: ops.append((ot(AndNot, syntax.AndNotGroup, memo="anot"), -5)) if AndMaybe: ops.append((ot(AndMaybe, syntax.AndMaybeGroup, memo="amaybe"), -5)) if Require: ops.append((ot(Require, syntax.RequireGroup, memo="req"), 0)) self.ops = ops def taggers(self, parser): return self.ops def filters(self, parser): return [(self.do_operators, 600)] def do_operators(self, parser, group): """This filter finds PrefixOperator, PostfixOperator, and InfixOperator nodes in the tree and calls their logic to rearrange the nodes. """ for tagger, _ in self.ops: # Get the operators created by the configured taggers optype = tagger.optype gtype = tagger.grouptype # Left-associative infix operators are replaced left-to-right, and # right-associative infix operators are replaced right-to-left. # Most of the work is done in the different implementations of # Operator.replace_self(). if tagger.leftassoc: i = 0 while i < len(group): t = group[i] if isinstance(t, optype) and t.grouptype is gtype: i = t.replace_self(parser, group, i) else: i += 1 else: i = len(group) - 1 while i >= 0: t = group[i] if isinstance(t, optype): i = t.replace_self(parser, group, i) i -= 1 # Descend into the groups and recursively call do_operators for i, t in enumerate(group): if isinstance(t, syntax.GroupNode): group[i] = self.do_operators(parser, t) return group # class PlusMinusPlugin(Plugin): """Adds the ability to use + and - in a flat OR query to specify required and prohibited terms. This is the basis for the parser configuration returned by ``SimpleParser()``. 
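    For example, with the configuration returned by ``SimpleParser()``,
    the query string::

        render +shading -banding

    is interpreted roughly as "must match ``shading``, should also match
    ``render`` if possible, and must not match ``banding``".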
""" # Marker nodes for + and - class Plus(syntax.MarkerNode): pass class Minus(syntax.MarkerNode): pass def __init__(self, plusexpr="\\+", minusexpr="-"): self.plusexpr = plusexpr self.minusexpr = minusexpr def taggers(self, parser): return [(FnTagger(self.plusexpr, self.Plus, "plus"), 0), (FnTagger(self.minusexpr, self.Minus, "minus"), 0)] def filters(self, parser): return [(self.do_plusminus, 510)] def do_plusminus(self, parser, group): """This filter sorts nodes in a flat group into "required", "optional", and "banned" subgroups based on the presence of plus and minus nodes. """ required = syntax.AndGroup() optional = syntax.OrGroup() banned = syntax.OrGroup() # Which group to put the next node we see into next = optional for node in group: if isinstance(node, self.Plus): # +: put the next node in the required group next = required elif isinstance(node, self.Minus): # -: put the next node in the banned group next = banned else: # Anything else: put it in the appropriate group next.append(node) # Reset to putting things in the optional group by default next = optional group = optional if required: group = syntax.AndMaybeGroup([required, group]) if banned: group = syntax.AndNotGroup([group, banned]) return group class GtLtPlugin(TaggingPlugin): """Allows the user to use greater than/less than symbols to create range queries:: a:>100 b:<=z c:>=-1.4 d:``, ``<``, ``>=``, ``<=``, ``=>``, and ``=<`` after a field specifier. The field specifier is required. You cannot do the following:: >100 This plugin requires the FieldsPlugin and RangePlugin to work. """ class GtLtNode(syntax.SyntaxNode): def __init__(self, rel): self.rel = rel def __repr__(self): return "(%s)" % self.rel expr = r"(?P(<=|>=|<|>|=<|=>))" nodetype = GtLtNode def filters(self, parser): # Run before the fields filter removes FilenameNodes at priority 100. return [(self.do_gtlt, 99)] def do_gtlt(self, parser, group): """This filter translate FieldnameNode/GtLtNode pairs into RangeNodes. """ fname = syntax.FieldnameNode newgroup = group.empty_copy() i = 0 lasti = len(group) - 1 while i < len(group): node = group[i] # If this is a GtLtNode... if isinstance(node, self.GtLtNode): # If it's not the last node in the group... if i < lasti: prevnode = newgroup[-1] nextnode = group[i + 1] # If previous was a fieldname and next node has text if isinstance(prevnode, fname) and nextnode.has_text: # Make the next node into a range based on the symbol newgroup.append(self.make_range(nextnode, node.rel)) # Skip the next node i += 1 else: # If it's not a GtLtNode, add it to the filtered group newgroup.append(node) i += 1 return newgroup def make_range(self, node, rel): text = node.text if rel == "<": n = syntax.RangeNode(None, text, False, True) elif rel == ">": n = syntax.RangeNode(text, None, True, False) elif rel == "<=" or rel == "=<": n = syntax.RangeNode(None, text, False, False) elif rel == ">=" or rel == "=>": n = syntax.RangeNode(text, None, False, False) return n.set_range(node.startchar, node.endchar) class MultifieldPlugin(Plugin): """Converts any unfielded terms into OR clauses that search for the term in a specified list of fields. >>> qp = qparser.QueryParser(None, myschema) >>> qp.add_plugin(qparser.MultifieldPlugin(["a", "b"]) >>> qp.parse("alfa c:bravo") And([Or([Term("a", "alfa"), Term("b", "alfa")]), Term("c", "bravo")]) This plugin is the basis for the ``MultifieldParser``. """ def __init__(self, fieldnames, fieldboosts=None, group=syntax.OrGroup): """ :param fieldnames: a list of fields to search. 
:param fieldboosts: an optional dictionary mapping field names to a boost to use for that field. :param group: the group to use to relate the fielded terms to each other. """ self.fieldnames = fieldnames self.boosts = fieldboosts or {} self.group = group def filters(self, parser): # Run after the fields filter applies explicit fieldnames (at priority # 100) return [(self.do_multifield, 110)] def do_multifield(self, parser, group): for i, node in enumerate(group): if isinstance(node, syntax.GroupNode): # Recurse inside groups group[i] = self.do_multifield(parser, node) elif node.has_fieldname and node.fieldname is None: # For an unfielded node, create a new group containing fielded # versions of the node for each configured "multi" field. newnodes = [] for fname in self.fieldnames: newnode = copy.copy(node) newnode.set_fieldname(fname) newnode.set_boost(self.boosts.get(fname, 1.0)) newnodes.append(newnode) group[i] = self.group(newnodes) return group class FieldAliasPlugin(Plugin): """Adds the ability to use "aliases" of fields in the query string. This plugin is useful for allowing users of languages that can't be represented in ASCII to use field names in their own language, and translate them into the "real" field names, which must be valid Python identifiers. >>> # Allow users to use 'body' or 'text' to refer to the 'content' field >>> parser.add_plugin(FieldAliasPlugin({"content": ["body", "text"]})) >>> parser.parse("text:hello") Term("content", "hello") """ def __init__(self, fieldmap): self.fieldmap = fieldmap self.reverse = {} for key, values in iteritems(fieldmap): for value in values: self.reverse[value] = key def filters(self, parser): # Run before fields plugin at 100 return [(self.do_aliases, 90)] def do_aliases(self, parser, group): for i, node in enumerate(group): if isinstance(node, syntax.GroupNode): group[i] = self.do_aliases(parser, node) elif node.has_fieldname and node.fieldname is not None: fname = node.fieldname if fname in self.reverse: node.set_fieldname(self.reverse[fname], override=True) return group class CopyFieldPlugin(Plugin): """Looks for basic syntax nodes (terms, prefixes, wildcards, phrases, etc.) occurring in a certain field and replaces it with a group (by default OR) containing the original token and the token copied to a new field. For example, the query:: hello name:matt could be automatically converted by ``CopyFieldPlugin({"name", "author"})`` to:: hello (name:matt OR author:matt) This is useful where one field was indexed with a differently-analyzed copy of another, and you want the query to search both fields. You can specify a different group type with the ``group`` keyword. You can also specify ``group=None``, in which case the copied node is inserted "inline" next to the original, instead of in a new group:: hello name:matt author:matt """ def __init__(self, map, group=syntax.OrGroup, mirror=False): """ :param map: a dictionary mapping names of fields to copy to the names of the destination fields. :param group: the type of group to create in place of the original token. You can specify ``group=None`` to put the copied node "inline" next to the original node instead of in a new group. :param two_way: if True, the plugin copies both ways, so if the user specifies a query in the 'toname' field, it will be copied to the 'fromname' field. 
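        Sketch of typical usage (assumes a schema containing both "name"
        and "author" fields):

        >>> qp = qparser.QueryParser("content", myschema)
        >>> qp.add_plugin(CopyFieldPlugin({"name": "author"}))
        >>> qp.parse(u"name:matt")
        Or([Term("name", u"matt"), Term("author", u"matt")])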
""" self.map = map self.group = group if mirror: # Add in reversed mappings map.update(dict((v, k) for k, v in iteritems(map))) def filters(self, parser): # Run after the fieldname filter (100) but before multifield (110) return [(self.do_copyfield, 109)] def do_copyfield(self, parser, group): map = self.map newgroup = group.empty_copy() for node in group: if isinstance(node, syntax.GroupNode): # Recurse into groups node = self.do_copyfield(parser, node) elif node.has_fieldname: fname = node.fieldname or parser.fieldname if fname in map: newnode = copy.copy(node) newnode.set_fieldname(map[fname], override=True) if self.group is None: newgroup.append(node) newgroup.append(newnode) else: newgroup.append(self.group([node, newnode])) continue newgroup.append(node) return newgroup class PseudoFieldPlugin(Plugin): """This is an advanced plugin that lets you define "pseudo-fields" the user can use in their queries. When the parser encounters one of these fields, it runs a given function on the following node in the abstract syntax tree. Unfortunately writing the transform function(s) requires knowledge of the parser's abstract syntax tree classes. A transform function takes a :class:`whoosh.qparser.SyntaxNode` and returns a :class:`~whoosh.qparser.SyntaxNode` (or None if the node should be removed instead of transformed). Some things you can do in the transform function:: from whoosh import qparser def my_xform_fn(node): # Is this a text node? if node.has_text: # Change the node's text node.text = node.text + "foo" # Change the node into a prefix query node = qparser.PrefixPlugin.PrefixNode(node.text) # Set the field the node should search in node.set_fieldname("title") return node else: # If the pseudo-field wasn't applied to a text node (e.g. # it preceded a group, as in ``pfield:(a OR b)`` ), remove the # node. Alternatively you could just ``return node`` here to # leave the non-text node intact. return None In the following example, if the user types ``regex:foo.bar``, the function transforms the text in the pseudo-field "regex" into a regular expression query in the "content" field:: from whoosh import qparser def regex_maker(node): if node.has_text: node = qparser.RegexPlugin.RegexNode(node.text) node.set_fieldname("content") return node qp = qparser.QueryParser("content", myindex.schema) qp.add_plugin(qparser.PseudoFieldPlugin({"regex": regex_maker})) q = qp.parse("alfa regex:br.vo") The name of the "pseudo" field can be the same as an actual field. Imagine the schema has a field named ``reverse``, and you want the user to be able to type ``reverse:foo`` and transform it to ``reverse:(foo OR oof)``:: def rev_text(node): if node.has_text: # Create a word node for the reversed text revtext = node.text[::-1] # Reverse the text rnode = qparser.WordNode(revtext) # Put the original node and the reversed node in an OrGroup group = qparser.OrGroup([node, rnode]) # Need to set the fieldname here because the PseudoFieldPlugin # removes the field name syntax group.set_fieldname("reverse") return group qp = qparser.QueryParser("content", myindex.schema) qp.add_plugin(qparser.PseudoFieldPlugin({"reverse": rev_text})) q = qp.parse("alfa reverse:bravo") Note that transforming the query like this can potentially really confuse the spell checker! This plugin works as a filter, so it can only operate on the query after it has been parsed into an abstract syntax tree. For parsing control (i.e. to give a pseudo-field its own special syntax), you would need to write your own parsing plugin. 
""" def __init__(self, xform_map): """ :param xform_map: a dictionary mapping psuedo-field names to transform functions. The function should take a :class:`whoosh.qparser.SyntaxNode` as an argument, and return a :class:`~whoosh.qparser.SyntaxNode`. If the function returns None, the node will be removed from the query. """ self.xform_map = xform_map def filters(self, parser): # Run before the fieldname filter (100) return [(self.do_pseudofield, 99)] def do_pseudofield(self, parser, group): xform_map = self.xform_map newgroup = group.empty_copy() xform_next = None for node in group: if isinstance(node, syntax.GroupNode): node = self.do_pseudofield(parser, node) elif (isinstance(node, syntax.FieldnameNode) and node.fieldname in xform_map): xform_next = xform_map[node.fieldname] continue if xform_next: newnode = xform_next(node) xform_next = None if newnode is None: continue else: newnode.set_range(node.startchar, node.endchar) node = newnode newgroup.append(node) return newgroup Whoosh-2.5.7/src/whoosh/qparser/syntax.py0000644000076500000240000004374212254366350020557 0ustar mattstaff00000000000000# Copyright 2011 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. import sys, weakref from whoosh import query from whoosh.qparser.common import get_single_text, QueryParserError, attach class SyntaxNode(object): """Base class for nodes that make up the abstract syntax tree (AST) of a parsed user query string. The AST is an intermediate step, generated from the query string, then converted into a :class:`whoosh.query.Query` tree by calling the ``query()`` method on the nodes. Instances have the following required attributes: ``has_fieldname`` True if this node has a ``fieldname`` attribute. ``has_text`` True if this node has a ``text`` attribute ``has_boost`` True if this node has a ``boost`` attribute. ``startchar`` The character position in the original text at which this node started. ``endchar`` The character position in the original text at which this node ended. 
""" has_fieldname = False has_text = False has_boost = False _parent = None def __repr__(self): r = "<" if self.has_fieldname: r += "%r:" % self.fieldname r += self.r() if self.has_boost and self.boost != 1.0: r += " ^%s" % self.boost r += ">" return r def r(self): """Returns a basic representation of this node. The base class's ``__repr__`` method calls this, then does the extra busy work of adding fieldname and boost where appropriate. """ return "%s %r" % (self.__class__.__name__, self.__dict__) def apply(self, fn): return self def accept(self, fn): def fn_wrapper(n): return fn(n.apply(fn_wrapper)) return fn_wrapper(self) def query(self, parser): """Returns a :class:`whoosh.query.Query` instance corresponding to this syntax tree node. """ raise NotImplementedError(self.__class__.__name__) def is_ws(self): """Returns True if this node is ignorable whitespace. """ return False def is_text(self): return False def set_fieldname(self, name, override=False): """Sets the fieldname associated with this node. If ``override`` is False (the default), the fieldname will only be replaced if this node does not already have a fieldname set. For nodes that don't have a fieldname, this is a no-op. """ if not self.has_fieldname: return if self.fieldname is None or override: self.fieldname = name return self def set_boost(self, boost): """Sets the boost associated with this node. For nodes that don't have a boost, this is a no-op. """ if not self.has_boost: return self.boost = boost return self def set_range(self, startchar, endchar): """Sets the character range associated with this node. """ self.startchar = startchar self.endchar = endchar return self # Navigation methods def parent(self): if self._parent: return self._parent() def next_sibling(self): p = self.parent() if p: return p.node_after(self) def prev_sibling(self): p = self.parent() if p: return p.node_before(self) def bake(self, parent): self._parent = weakref.ref(parent) class MarkerNode(SyntaxNode): """Base class for nodes that only exist to mark places in the tree. """ def r(self): return self.__class__.__name__ class Whitespace(MarkerNode): """Abstract syntax tree node for ignorable whitespace. """ def r(self): return " " def is_ws(self): return True class FieldnameNode(SyntaxNode): """Abstract syntax tree node for field name assignments. """ has_fieldname = True def __init__(self, fieldname, original): self.fieldname = fieldname self.original = original def __repr__(self): return "<%r:>" % self.fieldname class GroupNode(SyntaxNode): """Base class for abstract syntax tree node types that group together sub-nodes. Instances have the following attributes: ``merging`` True if side-by-side instances of this group can be merged into a single group. ``qclass`` If a subclass doesn't override ``query()``, the base class will simply wrap this class around the queries returned by the subnodes. This class implements a number of list methods for operating on the subnodes. 
""" has_boost = True merging = True qclass = None def __init__(self, nodes=None, boost=1.0, **kwargs): self.nodes = nodes or [] self.boost = boost self.kwargs = kwargs def r(self): return "%s %s" % (self.__class__.__name__, ", ".join(repr(n) for n in self.nodes)) @property def startchar(self): if not self.nodes: return None return self.nodes[0].startchar @property def endchar(self): if not self.nodes: return None return self.nodes[-1].endchar def apply(self, fn): return self.__class__(self.type, [fn(node) for node in self.nodes], boost=self.boost, **self.kwargs) def query(self, parser): subs = [] for node in self.nodes: subq = node.query(parser) if subq is not None: subs.append(subq) q = self.qclass(subs, boost=self.boost, **self.kwargs) return attach(q, self) def empty_copy(self): """Returns an empty copy of this group. This is used in the common pattern where a filter creates an new group and then adds nodes from the input group to it if they meet certain criteria, then returns the new group:: def remove_whitespace(parser, group): newgroup = group.empty_copy() for node in group: if not node.is_ws(): newgroup.append(node) return newgroup """ c = self.__class__(**self.kwargs) if self.has_boost: c.boost = self.boost if self.has_fieldname: c.fieldname = self.fieldname if self.has_text: c.text = self.text return c def set_fieldname(self, name, override=False): SyntaxNode.set_fieldname(self, name, override=override) for node in self.nodes: node.set_fieldname(name, override=override) def set_range(self, startchar, endchar): for node in self.nodes: node.set_range(startchar, endchar) return self # List-like methods def __nonzero__(self): return bool(self.nodes) __bool__ = __nonzero__ def __iter__(self): return iter(self.nodes) def __len__(self): return len(self.nodes) def __getitem__(self, n): return self.nodes.__getitem__(n) def __setitem__(self, n, v): self.nodes.__setitem__(n, v) def __delitem__(self, n): self.nodes.__delitem__(n) def insert(self, n, v): self.nodes.insert(n, v) def append(self, v): self.nodes.append(v) def extend(self, vs): self.nodes.extend(vs) def pop(self, *args, **kwargs): return self.nodes.pop(*args, **kwargs) def reverse(self): self.nodes.reverse() def index(self, v): return self.nodes.index(v) # Navigation methods def bake(self, parent): SyntaxNode.bake(self, parent) for node in self.nodes: node.bake(self) def node_before(self, n): try: i = self.nodes.index(n) except ValueError: return if i > 0: return self.nodes[i - 1] def node_after(self, n): try: i = self.nodes.index(n) except ValueError: return if i < len(self.nodes) - 2: return self.nodes[i + 1] class BinaryGroup(GroupNode): """Intermediate base class for group nodes that have two subnodes and whose ``qclass`` initializer takes two arguments instead of a list. """ merging = False has_boost = False def query(self, parser): assert len(self.nodes) == 2 qa = self.nodes[0].query(parser) qb = self.nodes[1].query(parser) if qa is None and qb is None: q = query.NullQuery elif qa is None: q = qb elif qb is None: q = qa else: q = self.qclass(self.nodes[0].query(parser), self.nodes[1].query(parser)) return attach(q, self) class Wrapper(GroupNode): """Intermediate base class for nodes that wrap a single sub-node. 
""" merging = False def query(self, parser): q = self.nodes[0].query(parser) if q: return attach(self.qclass(q), self) class ErrorNode(SyntaxNode): def __init__(self, message, node=None): self.message = message self.node = node def r(self): return "ERR %r %r" % (self.node, self.message) @property def startchar(self): return self.node.startchar @property def endchar(self): return self.node.endchar def query(self, parser): if self.node: q = self.node.query(parser) else: q = query.NullQuery return attach(query.error_query(self.message, q), self) class AndGroup(GroupNode): qclass = query.And class OrGroup(GroupNode): qclass = query.Or @classmethod def factory(cls, scale=1.0): def maker(nodes=None, **kwargs): return cls(nodes=nodes, scale=scale, **kwargs) return maker class DisMaxGroup(GroupNode): qclass = query.DisjunctionMax class OrderedGroup(GroupNode): qclass = query.Ordered class AndNotGroup(BinaryGroup): qclass = query.AndNot class AndMaybeGroup(BinaryGroup): qclass = query.AndMaybe class RequireGroup(BinaryGroup): qclass = query.Require class NotGroup(Wrapper): qclass = query.Not class RangeNode(SyntaxNode): """Syntax node for range queries. """ has_fieldname = True def __init__(self, start, end, startexcl, endexcl): self.start = start self.end = end self.startexcl = startexcl self.endexcl = endexcl self.boost = 1.0 self.fieldname = None self.kwargs = {} def r(self): b1 = "{" if self.startexcl else "[" b2 = "}" if self.endexcl else "]" return "%s%r %r%s" % (b1, self.start, self.end, b2) def query(self, parser): fieldname = self.fieldname or parser.fieldname start = self.start end = self.end if parser.schema and fieldname in parser.schema: field = parser.schema[fieldname] if field.self_parsing(): try: q = field.parse_range(fieldname, start, end, self.startexcl, self.endexcl, boost=self.boost) if q is not None: return attach(q, self) except QueryParserError: e = sys.exc_info()[1] return attach(query.error_query(e), self) if start: start = get_single_text(field, start, tokenize=False, removestops=False) if end: end = get_single_text(field, end, tokenize=False, removestops=False) q = query.TermRange(fieldname, start, end, self.startexcl, self.endexcl, boost=self.boost) return attach(q, self) class TextNode(SyntaxNode): """Intermediate base class for basic nodes that search for text, such as term queries, wildcards, prefixes, etc. Instances have the following attributes: ``qclass`` If a subclass does not override ``query()``, the base class will use this class to construct the query. ``tokenize`` If True and the subclass does not override ``query()``, the node's text will be tokenized before constructing the query ``removestops`` If True and the subclass does not override ``query()``, and the field's analyzer has a stop word filter, stop words will be removed from the text before constructing the query. """ has_fieldname = True has_text = True has_boost = True qclass = None tokenize = False removestops = False def __init__(self, text): self.fieldname = None self.text = text self.boost = 1.0 def r(self): return "%s %r" % (self.__class__.__name__, self.text) def is_text(self): return True def query(self, parser): fieldname = self.fieldname or parser.fieldname termclass = self.qclass or parser.termclass q = parser.term_query(fieldname, self.text, termclass, boost=self.boost, tokenize=self.tokenize, removestops=self.removestops) return attach(q, self) class WordNode(TextNode): """Syntax node for term queries. 
""" tokenize = True removestops = True def r(self): return repr(self.text) # Operators class Operator(SyntaxNode): """Base class for PrefixOperator, PostfixOperator, and InfixOperator. Operators work by moving the nodes they apply to (e.g. for prefix operator, the previous node, for infix operator, the nodes on either side, etc.) into a group node. The group provides the code for what to do with the nodes. """ def __init__(self, text, grouptype, leftassoc=True): """ :param text: the text of the operator in the query string. :param grouptype: the type of group to create in place of the operator and the node(s) it operates on. :param leftassoc: for infix opeators, whether the operator is left associative. use ``leftassoc=False`` for right-associative infix operators. """ self.text = text self.grouptype = grouptype self.leftassoc = leftassoc def r(self): return "OP %r" % self.text def replace_self(self, parser, group, position): """Called with the parser, a group, and the position at which the operator occurs in that group. Should return a group with the operator replaced by whatever effect the operator has (e.g. for an infix op, replace the op and the nodes on either side with a sub-group). """ raise NotImplementedError class PrefixOperator(Operator): def replace_self(self, parser, group, position): length = len(group) del group[position] if position < length - 1: group[position] = self.grouptype([group[position]]) return position class PostfixOperator(Operator): def replace_self(self, parser, group, position): del group[position] if position > 0: group[position - 1] = self.grouptype([group[position - 1]]) return position class InfixOperator(Operator): def replace_self(self, parser, group, position): la = self.leftassoc gtype = self.grouptype merging = gtype.merging if position > 0 and position < len(group) - 1: left = group[position - 1] right = group[position + 1] # The first two clauses check whether the "strong" side is already # a group of the type we are going to create. If it is, we just # append the "weak" side to the "strong" side instead of creating # a new group inside the existing one. This is necessary because # we can quickly run into Python's recursion limit otherwise. if merging and la and isinstance(left, gtype): left.append(right) del group[position:position + 2] elif merging and not la and isinstance(right, gtype): right.insert(0, left) del group[position - 1:position + 1] return position - 1 else: # Replace the operator and the two surrounding objects group[position - 1:position + 2] = [gtype([left, right])] else: del group[position] return position # Functions def to_word(n): node = WordNode(n.original) node.startchar = n.startchar node.endchar = n.endchar return node Whoosh-2.5.7/src/whoosh/qparser/taggers.py0000644000076500000240000000705012254366350020655 0ustar mattstaff00000000000000# Copyright 2011 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.util.text import rcompile # Tagger objects class Tagger(object): """Base class for taggers, objects which match syntax in the query string and translate it into a :class:`whoosh.qparser.syntax.SyntaxNode` object. """ def match(self, parser, text, pos): """This method should see if this tagger matches the query string at the given position. If it matches, it should return :param parser: the :class:`whoosh.qparser.default.QueryParser` object. :param text: the text being parsed. :param pos: the position in the text at which the tagger should try to match. """ raise NotImplementedError class RegexTagger(Tagger): """Tagger class that uses regular expressions to match the query string. Subclasses should override ``create()`` instead of ``match()``. """ def __init__(self, expr): self.expr = rcompile(expr) def match(self, parser, text, pos): match = self.expr.match(text, pos) if match: node = self.create(parser, match) if node is not None: node = node.set_range(match.start(), match.end()) return node def create(self, parser, match): """When the regular expression matches, this method is called to translate the regex match object into a syntax node. :param parser: the :class:`whoosh.qparser.default.QueryParser` object. :param match: the regex match object. """ raise NotImplementedError class FnTagger(RegexTagger): """Tagger that takes a regular expression and a class or function, and for matches calls the class/function with the regex match's named groups as keyword arguments. """ def __init__(self, expr, fn, memo=""): RegexTagger.__init__(self, expr) self.fn = fn self.memo = memo def __repr__(self): return "<%s %r (%s)>" % (self.__class__.__name__, self.expr, self.memo) def create(self, parser, match): return self.fn(**match.groupdict()) Whoosh-2.5.7/src/whoosh/query/0000755000076500000240000000000012277504634016341 5ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/query/__init__.py0000644000076500000240000000346312254366350020454 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.query.qcore import * from whoosh.query.terms import * from whoosh.query.compound import * from whoosh.query.positional import * from whoosh.query.ranges import * from whoosh.query.wrappers import * from whoosh.query.nested import * from whoosh.query.qcolumns import * from whoosh.query.spans import * Whoosh-2.5.7/src/whoosh/query/compound.py0000644000076500000240000005303012254366350020534 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from __future__ import division from whoosh import matching from whoosh.compat import text_type, u from whoosh.compat import xrange from whoosh.query import qcore from whoosh.util import make_binary_tree, make_weighted_tree class CompoundQuery(qcore.Query): """Abstract base class for queries that combine or manipulate the results of multiple sub-queries . 
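    Subclasses such as ``And`` and ``Or`` can be composed directly; a small
    illustrative sketch (the ``content`` field is just an example)::

        from whoosh import query

        q = query.And([query.Term("content", u"render"),
                       query.Or([query.Term("content", u"shade"),
                                 query.Term("content", u"texture")])])
        print(q)        # (content:render AND (content:shade OR content:texture))
        q = q.normalize()   # simplify/merge the tree where possible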
""" def __init__(self, subqueries, boost=1.0): for subq in subqueries: if not isinstance(subq, qcore.Query): raise qcore.QueryError("%r is not a query" % subq) self.subqueries = subqueries self.boost = boost def __repr__(self): r = "%s(%r" % (self.__class__.__name__, self.subqueries) if hasattr(self, "boost") and self.boost != 1: r += ", boost=%s" % self.boost r += ")" return r def __unicode__(self): r = u("(") r += self.JOINT.join([text_type(s) for s in self.subqueries]) r += u(")") return r __str__ = __unicode__ def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.subqueries == other.subqueries and self.boost == other.boost) def __getitem__(self, i): return self.subqueries.__getitem__(i) def __len__(self): return len(self.subqueries) def __iter__(self): return iter(self.subqueries) def __hash__(self): h = hash(self.__class__.__name__) ^ hash(self.boost) for q in self.subqueries: h ^= hash(q) return h def is_leaf(self): return False def children(self): return iter(self.subqueries) def apply(self, fn): return self.__class__([fn(q) for q in self.subqueries], boost=self.boost) def field(self): if self.subqueries: f = self.subqueries[0].field() if all(q.field() == f for q in self.subqueries[1:]): return f def estimate_size(self, ixreader): est = sum(q.estimate_size(ixreader) for q in self.subqueries) return min(est, ixreader.doc_count()) def estimate_min_size(self, ixreader): from whoosh.query import Not subs = self.subqueries qs = [(q, q.estimate_min_size(ixreader)) for q in subs if not isinstance(q, Not)] pos = [minsize for q, minsize in qs if minsize > 0] if pos: neg = [q.estimate_size(ixreader) for q in subs if isinstance(q, Not)] size = min(pos) - sum(neg) if size > 0: return size return 0 def normalize(self): from whoosh.query import Every, TermRange, NumericRange # Normalize subqueries and merge nested instances of this class subqueries = [] for s in self.subqueries: s = s.normalize() if isinstance(s, self.__class__): subqueries += [ss.with_boost(ss.boost * s.boost) for ss in s] else: subqueries.append(s) # If every subquery is Null, this query is Null if all(q is qcore.NullQuery for q in subqueries): return qcore.NullQuery # If there's an unfielded Every inside, then this query is Every if any((isinstance(q, Every) and q.fieldname is None) for q in subqueries): return Every() # Merge ranges and Everys everyfields = set() i = 0 while i < len(subqueries): q = subqueries[i] f = q.field() if f in everyfields: subqueries.pop(i) continue if isinstance(q, (TermRange, NumericRange)): j = i + 1 while j < len(subqueries): if q.overlaps(subqueries[j]): qq = subqueries.pop(j) q = q.merge(qq, intersect=self.intersect_merge) else: j += 1 q = subqueries[i] = q.normalize() if isinstance(q, Every): everyfields.add(q.fieldname) i += 1 # Eliminate duplicate queries subqs = [] seenqs = set() for s in subqueries: if not isinstance(s, Every) and s.field() in everyfields: continue if s in seenqs: continue seenqs.add(s) subqs.append(s) # Remove NullQuerys subqs = [q for q in subqs if q is not qcore.NullQuery] if not subqs: return qcore.NullQuery if len(subqs) == 1: sub = subqs[0] if not (self.boost == 1.0 and sub.boost == 1.0): sub = sub.with_boost(sub.boost * self.boost) return sub return self.__class__(subqs, boost=self.boost) def simplify(self, ixreader): subs = self.subqueries if subs: q = self.__class__([subq.simplify(ixreader) for subq in subs], boost=self.boost).normalize() else: q = qcore.NullQuery return q def matcher(self, searcher, context=None): # This method does 
a little sanity checking and then passes the info # down to the _matcher() method which subclasses must implement subs = self.subqueries if not subs: return matching.NullMatcher() if len(subs) == 1: m = subs[0].matcher(searcher, context) else: m = self._matcher(subs, searcher, context) return m def _matcher(self, subs, searcher, context): # Subclasses must implement this method raise NotImplementedError def _tree_matcher(self, subs, mcls, searcher, context, q_weight_fn, **kwargs): # q_weight_fn is a function which is called on each query and returns a # "weight" value which is used to build a huffman-like matcher tree. If # q_weight_fn is None, an order-preserving binary tree is used instead. # Create a matcher from the list of subqueries subms = [q.matcher(searcher, context) for q in subs] if len(subms) == 1: m = subms[0] elif q_weight_fn is None: m = make_binary_tree(mcls, subms, **kwargs) else: w_subms = [(q_weight_fn(q), m) for q, m in zip(subs, subms)] m = make_weighted_tree(mcls, w_subms, **kwargs) # If this query had a boost, add a wrapping matcher to apply the boost if self.boost != 1.0: m = matching.WrappingMatcher(m, self.boost) return m class And(CompoundQuery): """Matches documents that match ALL of the subqueries. >>> And([Term("content", u"render"), ... Term("content", u"shade"), ... Not(Term("content", u"texture"))]) >>> # You can also do this >>> Term("content", u"render") & Term("content", u"shade") """ # This is used by the superclass's __unicode__ method. JOINT = " AND " intersect_merge = True def requires(self): s = set() for q in self.subqueries: s |= q.requires() return s def estimate_size(self, ixreader): return min(q.estimate_size(ixreader) for q in self.subqueries) def _matcher(self, subs, searcher, context): r = searcher.reader() q_weight_fn = lambda q: 0 - q.estimate_size(r) return self._tree_matcher(subs, matching.IntersectionMatcher, searcher, context, q_weight_fn) class Or(CompoundQuery): """Matches documents that match ANY of the subqueries. >>> Or([Term("content", u"render"), ... And([Term("content", u"shade"), Term("content", u"texture")]), ... Not(Term("content", u"network"))]) >>> # You can also do this >>> Term("content", u"render") | Term("content", u"shade") """ # This is used by the superclass's __unicode__ method. JOINT = " OR " intersect_merge = False TOO_MANY_CLAUSES = 1024 # For debugging: set the array_type property to control matcher selection AUTO_MATCHER = 0 # Use automatic heuristics to choose matcher DEFAULT_MATCHER = 1 # Use a binary tree of UnionMatchers SPLIT_MATCHER = 2 # Use a different strategy for short and long queries ARRAY_MATCHER = 3 # Use a matcher that pre-loads docnums and scores matcher_type = AUTO_MATCHER def __init__(self, subqueries, boost=1.0, minmatch=0, scale=None): """ :param subqueries: a list of :class:`Query` objects to search for. :param boost: a boost factor to apply to the scores of all matching documents. :param minmatch: not yet implemented. :param scale: a scaling factor for a "coordination bonus". If this value is not None, it should be a floating point number greater than 0 and less than 1. The scores of the matching documents are boosted/penalized based on the number of query terms that matched in the document. This number scales the effect of the bonuses. 
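        A short sketch of using ``scale`` (the field name is only an
        example)::

            from whoosh import query

            # Documents matching more of the three terms get a small
            # extra bonus on top of their normal score
            q = query.Or([query.Term("content", u"render"),
                          query.Term("content", u"shade"),
                          query.Term("content", u"texture")],
                         scale=0.9)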
""" CompoundQuery.__init__(self, subqueries, boost=boost) self.minmatch = minmatch self.scale = scale def __unicode__(self): r = u("(") r += (self.JOINT).join([text_type(s) for s in self.subqueries]) r += u(")") if self.minmatch: r += u(">%s") % self.minmatch return r __str__ = __unicode__ def normalize(self): norm = CompoundQuery.normalize(self) if norm.__class__ is self.__class__: norm.minmatch = self.minmatch norm.scale = self.scale return norm def requires(self): if len(self.subqueries) == 1: return self.subqueries[0].requires() else: return set() def _matcher(self, subs, searcher, context): needs_current = context.needs_current if context else True weighting = context.weighting if context else None matcher_type = self.matcher_type if matcher_type == self.AUTO_MATCHER: dc = searcher.doc_count_all() if (len(subs) < self.TOO_MANY_CLAUSES and (needs_current or self.scale or len(subs) == 2 or dc > 5000)): # If the parent matcher needs the current match, or there's just # two sub-matchers, use the standard binary tree of Unions matcher_type = self.DEFAULT_MATCHER else: # For small indexes, or too many clauses, just preload all # matches matcher_type = self.ARRAY_MATCHER if matcher_type == self.DEFAULT_MATCHER: # Implementation of Or that creates a binary tree of Union matchers cls = DefaultOr elif matcher_type == self.SPLIT_MATCHER: # Hybrid of pre-loading small queries and a binary tree of union # matchers for big queries cls = SplitOr elif matcher_type == self.ARRAY_MATCHER: # Implementation that pre-loads docnums and scores into an array cls = PreloadedOr else: raise ValueError("Unknown matcher_type %r" % self.matcher_type) return cls(subs, boost=self.boost, minmatch=self.minmatch, scale=self.scale).matcher(searcher, context) class DefaultOr(Or): JOINT = " dOR " def _matcher(self, subs, searcher, context): reader = searcher.reader() q_weight_fn = lambda q: q.estimate_size(reader) m = self._tree_matcher(subs, matching.UnionMatcher, searcher, context, q_weight_fn) # If a scaling factor was given, wrap the matcher in a CoordMatcher to # alter scores based on term coordination if self.scale and any(m.term_matchers()): m = matching.CoordMatcher(m, scale=self.scale) return m class SplitOr(Or): JOINT = " sOr " SPLIT_DOC_LIMIT = 8000 def matcher(self, searcher, context=None): from whoosh import collectors # Get the subqueries subs = self.subqueries if not subs: return matching.NullMatcher() elif len(subs) == 1: return subs[0].matcher(searcher, context) # Sort the subqueries into "small" and "big" queries based on their # estimated size. This works best for term queries. 
reader = searcher.reader() smallqs = [] bigqs = [] for q in subs: size = q.estimate_size(reader) if size <= self.SPLIT_DOC_LIMIT: smallqs.append(q) else: bigqs.append(q) # Build a pre-scored matcher for the small queries minscore = 0 smallmatcher = None if smallqs: smallmatcher = DefaultOr(smallqs).matcher(searcher, context) smallmatcher = matching.ArrayMatcher(smallmatcher, context.limit) minscore = smallmatcher.limit_quality() if bigqs: # Get a matcher for the big queries m = DefaultOr(bigqs).matcher(searcher, context) # Add the prescored matcher for the small queries if smallmatcher: m = matching.UnionMatcher(m, smallmatcher) # Set the minimum score based on the prescored matcher m.set_min_quality(minscore) elif smallmatcher: # If there are no big queries, just return the prescored matcher m = smallmatcher else: m = matching.NullMatcher() return m class PreloadedOr(Or): JOINT = " pOR " def _matcher(self, subs, searcher, context): if context: scored = context.weighting is not None else: scored = True ms = [sub.matcher(searcher, context) for sub in subs] doccount = searcher.doc_count_all() am = matching.ArrayUnionMatcher(ms, doccount, boost=self.boost, scored=scored) return am class DisjunctionMax(CompoundQuery): """Matches all documents that match any of the subqueries, but scores each document using the maximum score from the subqueries. """ def __init__(self, subqueries, boost=1.0, tiebreak=0.0): CompoundQuery.__init__(self, subqueries, boost=boost) self.tiebreak = tiebreak def __unicode__(self): r = u("DisMax(") r += " ".join(sorted(text_type(s) for s in self.subqueries)) r += u(")") if self.tiebreak: r += u("~") + text_type(self.tiebreak) return r __str__ = __unicode__ def normalize(self): norm = CompoundQuery.normalize(self) if norm.__class__ is self.__class__: norm.tiebreak = self.tiebreak return norm def requires(self): if len(self.subqueries) == 1: return self.subqueries[0].requires() else: return set() def _matcher(self, subs, searcher, context): r = searcher.reader() q_weight_fn = lambda q: q.estimate_size(r) return self._tree_matcher(subs, matching.DisjunctionMaxMatcher, searcher, context, q_weight_fn, tiebreak=self.tiebreak) # Boolean queries class BinaryQuery(CompoundQuery): """Base class for binary queries (queries which are composed of two sub-queries). Subclasses should set the ``matcherclass`` attribute or override ``matcher()``, and may also need to override ``normalize()``, ``estimate_size()``, and/or ``estimate_min_size()``. """ boost = 1.0 def __init__(self, a, b): self.a = a self.b = b self.subqueries = (a, b) def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.a == other.a and self.b == other.b) def __hash__(self): return (hash(self.__class__.__name__) ^ hash(self.a) ^ hash(self.b)) def apply(self, fn): return self.__class__(fn(self.a), fn(self.b)) def field(self): f = self.a.field() if self.b.field() == f: return f def with_boost(self, boost): return self.__class__(self.a.with_boost(boost), self.b.with_boost(boost)) def normalize(self): a = self.a.normalize() b = self.b.normalize() if a is qcore.NullQuery and b is qcore.NullQuery: return qcore.NullQuery elif a is qcore.NullQuery: return b elif b is qcore.NullQuery: return a return self.__class__(a, b) def matcher(self, searcher, context=None): return self.matcherclass(self.a.matcher(searcher, context), self.b.matcher(searcher, context)) class AndNot(BinaryQuery): """Binary boolean query of the form 'a ANDNOT b', where documents that match b are removed from the matches for a. 
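    For example (field and terms are illustrative only)::

        from whoosh import query

        # Documents containing "render" but not "texture"
        q = query.AndNot(query.Term("content", u"render"),
                         query.Term("content", u"texture"))
        print(q)    # (content:render ANDNOT content:texture)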
""" JOINT = " ANDNOT " def with_boost(self, boost): return self.__class__(self.a.with_boost(boost), self.b) def normalize(self): a = self.a.normalize() b = self.b.normalize() if a is qcore.NullQuery: return qcore.NullQuery elif b is qcore.NullQuery: return a return self.__class__(a, b) def requires(self): return self.a.requires() def matcher(self, searcher, context=None): scoredm = self.a.matcher(searcher, context) notm = self.b.matcher(searcher, searcher.boolean_context()) return matching.AndNotMatcher(scoredm, notm) class Otherwise(BinaryQuery): """A binary query that only matches the second clause if the first clause doesn't match any documents. """ JOINT = " OTHERWISE " def matcher(self, searcher, context=None): m = self.a.matcher(searcher, context) if not m.is_active(): m = self.b.matcher(searcher, context) return m class Require(BinaryQuery): """Binary query returns results from the first query that also appear in the second query, but only uses the scores from the first query. This lets you filter results without affecting scores. """ JOINT = " REQUIRE " matcherclass = matching.RequireMatcher def requires(self): return self.a.requires() | self.b.requires() def estimate_size(self, ixreader): return self.b.estimate_size(ixreader) def estimate_min_size(self, ixreader): return self.b.estimate_min_size(ixreader) def with_boost(self, boost): return self.__class__(self.a.with_boost(boost), self.b) def normalize(self): a = self.a.normalize() b = self.b.normalize() if a is qcore.NullQuery or b is qcore.NullQuery: return qcore.NullQuery return self.__class__(a, b) def docs(self, searcher): return And(self.subqueries).docs(searcher) def matcher(self, searcher, context=None): scoredm = self.a.matcher(searcher, context) requiredm = self.b.matcher(searcher, searcher.boolean_context()) return matching.AndNotMatcher(scoredm, requiredm) class AndMaybe(BinaryQuery): """Binary query takes results from the first query. If and only if the same document also appears in the results from the second query, the score from the second query will be added to the score from the first query. """ JOINT = " ANDMAYBE " matcherclass = matching.AndMaybeMatcher def normalize(self): a = self.a.normalize() b = self.b.normalize() if a is qcore.NullQuery: return qcore.NullQuery if b is qcore.NullQuery: return a return self.__class__(a, b) def requires(self): return self.a.requires() def estimate_min_size(self, ixreader): return self.subqueries[0].estimate_min_size(ixreader) def docs(self, searcher): return self.subqueries[0].docs(searcher) def BooleanQuery(required, should, prohibited): return AndNot(AndMaybe(And(required), Or(should)), Or(prohibited)).normalize() Whoosh-2.5.7/src/whoosh/query/nested.py0000644000076500000240000003434612254366350020203 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh import matching from whoosh.compat import text_type, u, xrange from whoosh.query import qcore from whoosh.query.wrappers import WrappingQuery class NestedParent(WrappingQuery): """A query that allows you to search for "nested" documents, where you can index (possibly multiple levels of) "parent" and "child" documents using the :meth:`~whoosh.writing.IndexWriter.group` and/or :meth:`~whoosh.writing.IndexWriter.start_group` methods of a :class:`whoosh.writing.IndexWriter` to indicate that hierarchically related documents should be kept together:: schema = fields.Schema(type=fields.ID, text=fields.TEXT(stored=True)) with ix.writer() as w: # Say we're indexing chapters (type=chap) and each chapter has a # number of paragraphs (type=p) with w.group(): w.add_document(type="chap", text="Chapter 1") w.add_document(type="p", text="Able baker") w.add_document(type="p", text="Bright morning") with w.group(): w.add_document(type="chap", text="Chapter 2") w.add_document(type="p", text="Car trip") w.add_document(type="p", text="Dog eared") w.add_document(type="p", text="Every day") with w.group(): w.add_document(type="chap", text="Chapter 3") w.add_document(type="p", text="Fine day") The ``NestedParent`` query wraps two sub-queries: the "parent query" matches a class of "parent documents". The "sub query" matches nested documents you want to find. For each "sub document" the "sub query" finds, this query acts as if it found the corresponding "parent document". >>> with ix.searcher() as s: ... r = s.search(query.Term("text", "day")) ... for hit in r: ... print(hit["text"]) ... Chapter 2 Chapter 3 """ def __init__(self, parents, subq, per_parent_limit=None, score_fn=sum): """ :param parents: a query, DocIdSet object, or Results object representing the documents you want to use as the "parent" documents. Where the sub-query matches, the corresponding document in these results will be returned as the match. :param subq: a query matching the information you want to find. :param per_parent_limit: a maximum number of "sub documents" to search per parent. The default is None, meaning no limit. :param score_fn: a function to use to combine the scores of matching sub-documents to calculate the score returned for the parent document. The default is ``sum``, that is, add up the scores of the sub-documents. 
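        A minimal construction sketch, reusing the chapter/paragraph schema
        from the class docstring above (names are illustrative)::

            from whoosh import query

            # Paragraphs matching "day" cause their parent chapter
            # document to be returned as the hit
            parents = query.Term("type", "chap")
            q = query.NestedParent(parents, query.Term("text", "day"),
                                   per_parent_limit=10)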
""" self.parents = parents self.child = subq self.per_parent_limit = per_parent_limit self.score_fn = score_fn def normalize(self): p = self.parents if isinstance(p, qcore.Query): p = p.normalize() q = self.child.normalize() if p is qcore.NullQuery or q is qcore.NullQuery: return qcore.NullQuery return self.__class__(p, q) def requires(self): return self.child.requires() def matcher(self, searcher, context=None): bits = searcher._filter_to_comb(self.parents) if not bits: return matching.NullMatcher m = self.child.matcher(searcher, context) if not m.is_active(): return matching.NullMatcher return self.NestedParentMatcher(bits, m, self.per_parent_limit, searcher.doc_count_all()) def deletion_docs(self, searcher): bits = searcher._filter_to_comb(self.parents) if not bits: return m = self.child.matcher(searcher, searcher.boolean_context()) maxdoc = searcher.doc_count_all() while m.is_active(): docnum = m.id() parentdoc = bits.before(docnum + 1) nextparent = bits.after(docnum) or maxdoc for i in xrange(parentdoc, nextparent): yield i m.skip_to(nextparent) class NestedParentMatcher(matching.Matcher): def __init__(self, comb, child, per_parent_limit, maxdoc): self.comb = comb self.child = child self.per_parent_limit = per_parent_limit self.maxdoc = maxdoc self._nextdoc = None if self.child.is_active(): self._gather() def is_active(self): return self._nextdoc is not None def supports_block_quality(self): return False def _gather(self): # This is where the magic happens ;) child = self.child pplimit = self.per_parent_limit # The next document returned by this matcher is the parent of the # child's current document. We don't have to worry about whether # the parent is deleted, because the query that gave us the parents # wouldn't return deleted documents. self._nextdoc = self.comb.before(child.id() + 1) # The next parent after the child matcher's current document nextparent = self.comb.after(child.id()) or self.maxdoc # Sum the scores of all matching documents under the parent count = 1 score = 0 while child.is_active() and child.id() < nextparent: if pplimit and count > pplimit: child.skip_to(nextparent) break score += child.score() child.next() count += 1 self._nextscore = score def id(self): return self._nextdoc def score(self): return self._nextscore def reset(self): self.child.reset() self._gather() def next(self): if self.child.is_active(): self._gather() else: if self._nextdoc is None: raise matching.ReadTooFar else: self._nextdoc = None def skip_to(self, id): self.child.skip_to(id) self._gather() def value(self): raise NotImplementedError(self.__class__) def spans(self): return [] class NestedChildren(WrappingQuery): """This is the reverse of a :class:`NestedParent` query: instead of taking a query that matches children but returns the parent, this query matches parents but returns the children. This is useful, for example, to search for an album title and return the songs in the album:: schema = fields.Schema(type=fields.ID(stored=True), album_name=fields.TEXT(stored=True), track_num=fields.NUMERIC(stored=True), track_name=fields.TEXT(stored=True), lyrics=fields.TEXT) ix = RamStorage().create_index(schema) # Indexing with ix.writer() as w: # For each album, index a "group" of a parent "album" document and # multiple child "track" documents. with w.group(): w.add_document(type="album", artist="The Cure", album_name="Disintegration") w.add_document(type="track", track_num=1, track_name="Plainsong") w.add_document(type="track", track_num=2, track_name="Pictures of You") # ... # ... 
# Find songs where the song name has "heaven" in the title and the # album the song is on has "hell" in the title qp = QueryParser("lyrics", ix.schema) with ix.searcher() as s: # A query that matches all parents all_albums = qp.parse("type:album") # A query that matches the parents we want albums_with_hell = qp.parse("album_name:hell") # A query that matches the desired albums but returns the tracks songs_on_hell_albums = NestedChildren(all_albums, albums_with_hell) # A query that matches tracks with heaven in the title songs_with_heaven = qp.parse("track_name:heaven") # A query that finds tracks with heaven in the title on albums # with hell in the title q = query.And([songs_on_hell_albums, songs_with_heaven]) """ def __init__(self, parents, subq, boost=1.0): self.parents = parents self.child = subq self.boost = boost def matcher(self, searcher, context=None): bits = searcher._filter_to_comb(self.parents) if not bits: return matching.NullMatcher m = self.child.matcher(searcher, context) if not m.is_active(): return matching.NullMatcher return self.NestedChildMatcher(bits, m, searcher.doc_count_all(), searcher.reader().is_deleted, boost=self.boost) class NestedChildMatcher(matching.WrappingMatcher): def __init__(self, comb, m, limit, is_deleted, boost=1.0): self.comb = comb self.child = m self.limit = limit self.is_deleted = is_deleted self.boost = boost self._reset() def __repr__(self): return "%s(%r, %r)" % (self.__class__.__name__, self.comb, self.child) def reset(self): self.child.reset() self._reset() def _reset(self): self._nextchild = -1 self._nextparent = -1 self._find_next_children() def is_active(self): return self._nextchild < self._nextparent def replace(self, minscore): return self def _find_next_children(self): comb = self.comb m = self.child limit = self.limit is_deleted = self.is_deleted nextchild = self._nextchild nextparent = self._nextparent while m.is_active(): # Move the "child id" to the document after the current match nextchild = m.id() + 1 # Move the parent matcher to the next match m.next() # Find the next parent document (matching or not) after this nextparent = comb.after(nextchild) if nextparent is None: nextparent = limit # Skip any deleted child documents while is_deleted(nextchild): nextchild += 1 # If skipping deleted documents put us to or past the next # parent doc, go again if nextchild >= nextparent: continue else: # Otherwise, we're done break self._nextchild = nextchild self._nextparent = nextparent def id(self): return self._nextchild def all_ids(self): while self.is_active(): yield self.id() self.next() def next(self): is_deleted = self.is_deleted limit = self.limit nextparent = self._nextparent # Go to the next document nextchild = self._nextchild nextchild += 1 # Skip over any deleted child documents while nextchild < nextparent and is_deleted(nextchild): nextchild += 1 self._nextchild = nextchild # If we're at or past the next parent doc, go to the next set of # children if nextchild >= limit: return elif nextchild >= nextparent: self._find_next_children() def skip_to(self, docid): if docid <= self._nextchild: return m = self.child if not m.is_active() or docid < m.id(): # We've already read-ahead past the desired doc, so iterate while self.is_active() and self._nextchild < docid: self.next() elif m.is_active(): # The child is active and hasn't read-ahead to the desired doc # yet, so skip to it and re-find m.skip_to(docid) self._find_next_children() else: # Go inactive self._nextchild = self.limit def value(self): raise 
NotImplementedError(self.__class__) def score(self): return self.boost def spans(self): return [] Whoosh-2.5.7/src/whoosh/query/positional.py0000644000076500000240000002207212254366350021073 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from __future__ import division import copy from whoosh import matching from whoosh.analysis import Token from whoosh.compat import u from whoosh.query import qcore, terms, compound class Sequence(compound.CompoundQuery): """Matches documents containing a list of sub-queries in adjacent positions. This object has no sanity check to prevent you from using queries in different fields. """ JOINT = " NEAR " intersect_merge = True def __init__(self, subqueries, slop=1, ordered=True, boost=1.0): """ :param subqueries: a list of :class:`whoosh.query.Query` objects to match in sequence. :param slop: the maximum difference in position allowed between the subqueries. :param ordered: if True, the position differences between subqueries must be positive (that is, each subquery in the list must appear after the previous subquery in the document). :param boost: a boost factor to add to the score of documents matching this query. 
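        A small sketch (the field name and terms are only examples)::

            from whoosh import query

            # "bright" followed immediately by "morning" in the content field
            q = query.Sequence([query.Term("content", u"bright"),
                                query.Term("content", u"morning")],
                               slop=1)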
""" compound.CompoundQuery.__init__(self, subqueries, boost=boost) self.slop = slop self.ordered = ordered def __eq__(self, other): return (other and type(self) is type(other) and self.subqueries == other.subqueries and self.boost == other.boost) def __repr__(self): return "%s(%r, slop=%d, boost=%f)" % (self.__class__.__name__, self.subqueries, self.slop, self.boost) def __hash__(self): h = hash(self.slop) ^ hash(self.boost) for q in self.subqueries: h ^= hash(q) return h def normalize(self): # Because the subqueries are in sequence, we can't do the fancy merging # that CompoundQuery does return self.__class__([q.normalize() for q in self.subqueries], self.slop, self.ordered, self.boost) def _and_query(self): return compound.And(self.subqueries) def estimate_size(self, ixreader): return self._and_query().estimate_size(ixreader) def estimate_min_size(self, ixreader): return self._and_query().estimate_min_size(ixreader) def _matcher(self, subs, searcher, context): from whoosh.query.spans import SpanNear return self._tree_matcher(subs, SpanNear.SpanNearMatcher, searcher, context, None, slop=self.slop, ordered=self.ordered) class Ordered(Sequence): """Matches documents containing a list of sub-queries in the given order. """ JOINT = " BEFORE " def _matcher(self, subs, searcher, context): from whoosh.query.spans import SpanBefore return self._tree_matcher(subs, SpanBefore._Matcher, searcher, context, None) class Phrase(qcore.Query): """Matches documents containing a given phrase.""" def __init__(self, fieldname, words, slop=1, boost=1.0, char_ranges=None): """ :param fieldname: the field to search. :param words: a list of words (unicode strings) in the phrase. :param slop: the number of words allowed between each "word" in the phrase; the default of 1 means the phrase must match exactly. :param boost: a boost factor that to apply to the raw score of documents matched by this query. 
:param char_ranges: if a Phrase object is created by the query parser, it will set this attribute to a list of (startchar, endchar) pairs corresponding to the words in the phrase """ self.fieldname = fieldname self.words = words self.slop = slop self.boost = boost self.char_ranges = char_ranges def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.fieldname == other.fieldname and self.words == other.words and self.slop == other.slop and self.boost == other.boost) def __repr__(self): return "%s(%r, %r, slop=%s, boost=%f)" % (self.__class__.__name__, self.fieldname, self.words, self.slop, self.boost) def __unicode__(self): return u('%s:"%s"') % (self.fieldname, u(" ").join(self.words)) __str__ = __unicode__ def __hash__(self): h = hash(self.fieldname) ^ hash(self.slop) ^ hash(self.boost) for w in self.words: h ^= hash(w) return h def has_terms(self): return True def terms(self, phrases=False): if phrases and self.field(): for word in self.words: yield (self.field(), word) def tokens(self, boost=1.0): char_ranges = self.char_ranges startchar = endchar = None for i, word in enumerate(self.words): if char_ranges: startchar, endchar = char_ranges[i] yield Token(fieldname=self.fieldname, text=word, boost=boost * self.boost, startchar=startchar, endchar=endchar, chars=True) def normalize(self): if not self.words: return qcore.NullQuery if len(self.words) == 1: t = terms.Term(self.fieldname, self.words[0]) if self.char_ranges: t.startchar, t.endchar = self.char_ranges[0] return t words = [w for w in self.words if w is not None] return self.__class__(self.fieldname, words, slop=self.slop, boost=self.boost, char_ranges=self.char_ranges) def replace(self, fieldname, oldtext, newtext): q = copy.copy(self) if q.fieldname == fieldname: for i, word in enumerate(q.words): if word == oldtext: q.words[i] = newtext return q def _and_query(self): return compound.And([terms.Term(self.fieldname, word) for word in self.words]) def estimate_size(self, ixreader): return self._and_query().estimate_size(ixreader) def estimate_min_size(self, ixreader): return self._and_query().estimate_min_size(ixreader) def matcher(self, searcher, context=None): from whoosh.query import Term, SpanNear2 fieldname = self.fieldname if fieldname not in searcher.schema: return matching.NullMatcher() field = searcher.schema[fieldname] if not field.format or not field.format.supports("positions"): raise qcore.QueryError("Phrase search: %r field has no positions" % self.fieldname) terms = [] # Build a list of Term queries from the words in the phrase reader = searcher.reader() for word in self.words: try: word = field.to_bytes(word) except ValueError: return matching.NullMatcher() if (fieldname, word) not in reader: # Shortcut the query if one of the words doesn't exist. return matching.NullMatcher() terms.append(Term(fieldname, word)) # Create the equivalent SpanNear2 query from the terms q = SpanNear2(terms, slop=self.slop, ordered=True, mindist=1) # Get the matcher m = q.matcher(searcher, context) if self.boost != 1.0: m = matching.WrappingMatcher(m, boost=self.boost) return m Whoosh-2.5.7/src/whoosh/query/qcolumns.py0000644000076500000240000001014512254366350020551 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. 
Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.matching import ConstantScoreMatcher, NullMatcher, ReadTooFar from whoosh.query import Query class ColumnQuery(Query): """A query that matches per-document values stored in a column rather than terms in the inverted index. This may be useful in special circumstances, but note that this is MUCH SLOWER than searching an indexed field. """ def __init__(self, fieldname, condition): """ :param fieldname: the name of the field to look in. If the field does not have a column, this query will not match anything. :param condition: if this is a callable, it is called on each value in the column, and for documents where callable(docvalue) returns True are returned as matching documents. If this is not a callable, the document values are compared to it (using ``==``). """ self.fieldname = fieldname self.condition = condition def is_leaf(self): return True def matcher(self, searcher, context=None): fieldname = self.fieldname condition = self.condition if callable(condition): comp = condition else: def comp(v): # Made this a function instead of a lambda so I could put # debug prints here if necessary ;) return v == condition reader = searcher.reader() if not reader.has_column(fieldname): return NullMatcher() creader = reader.column_reader(fieldname) return ColumnMatcher(creader, comp) class ColumnMatcher(ConstantScoreMatcher): def __init__(self, creader, condition): self.creader = creader self.condition = condition self._i = 0 self._find_next() def _find_next(self): condition = self.condition creader = self.creader while self._i < len(creader) and not condition(creader[self._i]): self._i += 1 def is_active(self): return self._i < len(self.creader) def next(self): if not self.is_active(): raise ReadTooFar self._i += 1 self._find_next() def reset(self): self._i = 0 self._find_next() def id(self): return self._i def all_ids(self): condition = self.condition for docnum, v in enumerate(self.creader): if condition(v): yield docnum def supports(self, astype): return False def skip_to_quality(self, minquality): if self._score <= minquality: self._i = len(self.creader) return True Whoosh-2.5.7/src/whoosh/query/qcore.py0000644000076500000240000005415612254366350020033 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. 
All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from __future__ import division import copy from array import array from whoosh import matching from whoosh.compat import u from whoosh.reading import TermNotFound from whoosh.compat import methodcaller # Exceptions class QueryError(Exception): """Error encountered while running a query. """ pass # Functions def error_query(msg, q=None): """Returns the query in the second argument (or a :class:`NullQuery` if the second argument is not given) with its ``error`` attribute set to ``msg``. """ if q is None: q = _NullQuery() q.error = msg return q def token_lists(q, phrases=True): """Returns the terms in the query tree, with the query hierarchy represented as nested lists. """ if q.is_leaf(): from whoosh.query import Phrase if phrases or not isinstance(q, Phrase): return list(q.tokens()) else: ls = [] for qq in q.children(): t = token_lists(qq, phrases=phrases) if len(t) == 1: t = t[0] if t: ls.append(t) return ls # Utility classes class Lowest(object): """A value that is always compares lower than any other object except itself. """ def __cmp__(self, other): if other.__class__ is Lowest: return 0 return -1 def __eq__(self, other): return self.__class__ is type(other) def __lt__(self, other): return type(other) is not self.__class__ def __ne__(self, other): return not self.__eq__(other) def __gt__(self, other): return not (self.__lt__(other) or self.__eq__(other)) def __le__(self, other): return self.__eq__(other) or self.__lt__(other) def __ge__(self, other): return self.__eq__(other) or self.__gt__(other) class Highest(object): """A value that is always compares higher than any other object except itself. 
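    The module-level ``Lowest`` instance is useful as a sentinel, for
    example as a sort key for missing values. Roughly::

        Lowest < 0      # True for any non-Lowest object
        0 < Lowest      # False
        min(5, Lowest)  # Lowest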
""" def __cmp__(self, other): if other.__class__ is Highest: return 0 return 1 def __eq__(self, other): return self.__class__ is type(other) def __lt__(self, other): return type(other) is self.__class__ def __ne__(self, other): return not self.__eq__(other) def __gt__(self, other): return not (self.__lt__(other) or self.__eq__(other)) def __le__(self, other): return self.__eq__(other) or self.__lt__(other) def __ge__(self, other): return self.__eq__(other) or self.__gt__(other) Lowest = Lowest() Highest = Highest() # Base classes class Query(object): """Abstract base class for all queries. Note that this base class implements __or__, __and__, and __sub__ to allow slightly more convenient composition of query objects:: >>> Term("content", u"a") | Term("content", u"b") Or([Term("content", u"a"), Term("content", u"b")]) >>> Term("content", u"a") & Term("content", u"b") And([Term("content", u"a"), Term("content", u"b")]) >>> Term("content", u"a") - Term("content", u"b") And([Term("content", u"a"), Not(Term("content", u"b"))]) """ # For queries produced by the query parser, record where in the user # query this object originated startchar = endchar = None # For queries produced by the query parser, records an error that resulted # in this query error = None def __unicode__(self): raise NotImplementedError(self.__class__.__name__) def __getitem__(self, item): raise NotImplementedError def __or__(self, query): """Allows you to use | between query objects to wrap them in an Or query. """ from whoosh.query import Or return Or([self, query]).normalize() def __and__(self, query): """Allows you to use & between query objects to wrap them in an And query. """ from whoosh.query import And return And([self, query]).normalize() def __sub__(self, query): """Allows you to use - between query objects to add the right-hand query as a "NOT" query. """ from whoosh.query import And, Not return And([self, Not(query)]).normalize() def __hash__(self): raise NotImplementedError def __ne__(self, other): return not self.__eq__(other) def is_leaf(self): """Returns True if this is a leaf node in the query tree, or False if this query has sub-queries. """ return True def children(self): """Returns an iterator of the subqueries of this object. """ return iter([]) def is_range(self): """Returns True if this object searches for values within a range. """ return False def has_terms(self): """Returns True if this specific object represents a search for a specific term (as opposed to a pattern, as in Wildcard and Prefix) or terms (i.e., whether the ``replace()`` method does something meaningful on this instance). """ return False def apply(self, fn): """If this query has children, calls the given function on each child and returns a new copy of this node with the new children returned by the function. If this is a leaf node, simply returns this object. This is useful for writing functions that transform a query tree. For example, this function changes all Term objects in a query tree into Variations objects:: def term2var(q): if isinstance(q, Term): return Variations(q.fieldname, q.text) else: return q.apply(term2var) q = And([Term("f", "alfa"), Or([Term("f", "bravo"), Not(Term("f", "charlie"))])]) q = term2var(q) Note that this method does not automatically create copies of nodes. To avoid modifying the original tree, your function should call the :meth:`Query.copy` method on nodes before changing their attributes. 
""" return self def accept(self, fn): """Applies the given function to this query's subqueries (if any) and then to this query itself:: def boost_phrases(q): if isintance(q, Phrase): q.boost *= 2.0 return q myquery = myquery.accept(boost_phrases) This method automatically creates copies of the nodes in the original tree before passing them to your function, so your function can change attributes on nodes without altering the original tree. This method is less flexible than using :meth:`Query.apply` (in fact it's implemented using that method) but is often more straightforward. """ def fn_wrapper(q): q = q.apply(fn_wrapper) return fn(q) return fn_wrapper(self) def replace(self, fieldname, oldtext, newtext): """Returns a copy of this query with oldtext replaced by newtext (if oldtext was anywhere in this query). Note that this returns a *new* query with the given text replaced. It *does not* modify the original query "in place". """ # The default implementation uses the apply method to "pass down" the # replace() method call if self.is_leaf(): return copy.copy(self) else: return self.apply(methodcaller("replace", fieldname, oldtext, newtext)) def copy(self): """Deprecated, just use ``copy.deepcopy``. """ return copy.deepcopy(self) def all_terms(self, phrases=True): """Returns a set of all terms in this query tree. This method exists for backwards-compatibility. Use iter_all_terms() instead. :param phrases: Whether to add words found in Phrase queries. :rtype: set """ return set(self.iter_all_terms(phrases=phrases)) def terms(self, phrases=False): """Yields zero or more (fieldname, text) pairs queried by this object. You can check whether a query object targets specific terms before you call this method using :meth:`Query.has_terms`. To get all terms in a query tree, use :meth:`Query.iter_all_terms`. """ return iter(()) def expanded_terms(self, ixreader, phrases=True): return self.terms(phrases=phrases) def existing_terms(self, ixreader, phrases=True, expand=False, fieldname=None): """Returns a set of all byteterms in this query tree that exist in the given ixreader. :param ixreader: A :class:`whoosh.reading.IndexReader` object. :param phrases: Whether to add words found in Phrase queries. :param expand: If True, queries that match multiple terms will return all matching expansions. :rtype: set """ schema = ixreader.schema termset = set() for q in self.leaves(): if fieldname and fieldname != q.field(): continue if expand: terms = q.expanded_terms(ixreader, phrases=phrases) else: terms = q.terms(phrases=phrases) for fieldname, text in terms: if (fieldname, text) in termset: continue if fieldname in schema: field = schema[fieldname] try: btext = field.to_bytes(text) except ValueError: continue if (fieldname, btext) in ixreader: termset.add((fieldname, btext)) return termset def leaves(self): """Returns an iterator of all the leaf queries in this query tree as a flat series. """ if self.is_leaf(): yield self else: for q in self.children(): for qq in q.leaves(): yield qq def iter_all_terms(self, phrases=True): """Returns an iterator of (fieldname, text) pairs for all terms in this query tree. 
>>> qp = qparser.QueryParser("text", myindex.schema) >>> q = myparser.parse("alfa bravo title:charlie") >>> # List the terms in a query >>> list(q.iter_all_terms()) [("text", "alfa"), ("text", "bravo"), ("title", "charlie")] >>> # Get a set of all terms in the query that don't exist in the index >>> r = myindex.reader() >>> missing = set(t for t in q.iter_all_terms() if t not in r) set([("text", "alfa"), ("title", "charlie")]) >>> # All terms in the query that occur in fewer than 5 documents in >>> # the index >>> [t for t in q.iter_all_terms() if r.doc_frequency(t[0], t[1]) < 5] [("title", "charlie")] :param phrases: Whether to add words found in Phrase queries. """ for q in self.leaves(): if q.has_terms(): for t in q.terms(phrases=phrases): yield t def all_tokens(self, boost=1.0): """Returns an iterator of :class:`analysis.Token` objects corresponding to all terms in this query tree. The Token objects will have the ``fieldname``, ``text``, and ``boost`` attributes set. If the query was built by the query parser, they Token objects will also have ``startchar`` and ``endchar`` attributes indexing into the original user query. """ if self.is_leaf(): for token in self.tokens(boost): yield token else: boost *= self.boost if hasattr(self, "boost") else 1.0 for child in self.children(): for token in child.all_tokens(boost): yield token def tokens(self, boost=1.0, exreader=None): """Yields zero or more :class:`analysis.Token` objects corresponding to the terms searched for by this query object. You can check whether a query object targets specific terms before you call this method using :meth:`Query.has_terms`. The Token objects will have the ``fieldname``, ``text``, and ``boost`` attributes set. If the query was built by the query parser, they Token objects will also have ``startchar`` and ``endchar`` attributes indexing into the original user query. To get all tokens for a query tree, use :meth:`Query.all_tokens`. :param exreader: a reader to use to expand multiterm queries such as prefixes and wildcards. The default is None meaning do not expand. """ return iter(()) def requires(self): """Returns a set of queries that are *known* to be required to match for the entire query to match. Note that other queries might also turn out to be required but not be determinable by examining the static query. >>> a = Term("f", u"a") >>> b = Term("f", u"b") >>> And([a, b]).requires() set([Term("f", u"a"), Term("f", u"b")]) >>> Or([a, b]).requires() set([]) >>> AndMaybe(a, b).requires() set([Term("f", u"a")]) >>> a.requires() set([Term("f", u"a")]) """ # Subclasses should implement the _add_required_to(qset) method return set([self]) def field(self): """Returns the field this query matches in, or None if this query does not match in a single field. """ return self.fieldname def with_boost(self, boost): """Returns a COPY of this query with the boost set to the given value. If a query type does not accept a boost itself, it will try to pass the boost on to its children, if any. """ q = self.copy() q.boost = boost return q def estimate_size(self, ixreader): """Returns an estimate of how many documents this query could potentially match (for example, the estimated size of a simple term query is the document frequency of the term). It is permissible to overestimate, but not to underestimate. """ raise NotImplementedError def estimate_min_size(self, ixreader): """Returns an estimate of the minimum number of documents this query could potentially match. 
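    For example (an editorial sketch, assuming an open index object named
    ``myindex`` and a query ``myquery``)::

        reader = myindex.reader()
        try:
            # estimate_size() may overestimate; estimate_min_size() gives
            # a conservative lower figure
            at_most = myquery.estimate_size(reader)
            at_least = myquery.estimate_min_size(reader)
        finally:
            reader.close()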
""" return self.estimate_size(ixreader) def matcher(self, searcher, context=None): """Returns a :class:`~whoosh.matching.Matcher` object you can use to retrieve documents and scores matching this query. :rtype: :class:`whoosh.matching.Matcher` """ raise NotImplementedError def docs(self, searcher): """Returns an iterator of docnums matching this query. >>> with my_index.searcher() as searcher: ... list(my_query.docs(searcher)) [10, 34, 78, 103] :param searcher: A :class:`whoosh.searching.Searcher` object. """ try: context = searcher.boolean_context() return self.matcher(searcher, context).all_ids() except TermNotFound: return iter([]) def deletion_docs(self, searcher): """Returns an iterator of docnums matching this query for the purpose of deletion. The :meth:`~whoosh.writing.IndexWriter.delete_by_query` method will use this method when deciding what documents to delete, allowing special queries (e.g. nested queries) to override what documents are deleted. The default implementation just forwards to :meth:`Query.docs`. """ return self.docs(searcher) def normalize(self): """Returns a recursively "normalized" form of this query. The normalized form removes redundancy and empty queries. This is called automatically on query trees created by the query parser, but you may want to call it yourself if you're writing your own parser or building your own queries. >>> q = And([And([Term("f", u"a"), ... Term("f", u"b")]), ... Term("f", u"c"), Or([])]) >>> q.normalize() And([Term("f", u"a"), Term("f", u"b"), Term("f", u"c")]) Note that this returns a *new, normalized* query. It *does not* modify the original query "in place". """ return self def simplify(self, ixreader): """Returns a recursively simplified form of this query, where "second-order" queries (such as Prefix and Variations) are re-written into lower-level queries (such as Term and Or). """ return self # Null query class _NullQuery(Query): "Represents a query that won't match anything." boost = 1.0 def __init__(self): self.error = None def __unicode__(self): return u("<_NullQuery>") def __call__(self): return self def __repr__(self): return "<%s>" % (self.__class__.__name__) def __eq__(self, other): return isinstance(other, _NullQuery) def __ne__(self, other): return not self.__eq__(other) def __hash__(self): return id(self) def __copy__(self): return self def __deepcopy__(self, memo): return self def field(self): return None def estimate_size(self, ixreader): return 0 def normalize(self): return self def simplify(self, ixreader): return self def docs(self, searcher): return [] def matcher(self, searcher, context=None): return matching.NullMatcher() NullQuery = _NullQuery() # Every class Every(Query): """A query that matches every document containing any term in a given field. If you don't specify a field, the query matches every document. >>> # Match any documents with something in the "path" field >>> q = Every("path") >>> # Matcher every document >>> q = Every() The unfielded form (matching every document) is efficient. The fielded is more efficient than a prefix query with an empty prefix or a '*' wildcard, but it can still be very slow on large indexes. It requires the searcher to read the full posting list of every term in the given field. Instead of using this query it is much more efficient when you create the index to include a single term that appears in all documents that have the field you want to match. 
For example, instead of this:: # Match all documents that have something in the "path" field q = Every("path") Do this when indexing:: # Add an extra field that indicates whether a document has a path schema = fields.Schema(path=fields.ID, has_path=fields.ID) # When indexing, set the "has_path" field based on whether the document # has anything in the "path" field writer.add_document(text=text_value1) writer.add_document(text=text_value2, path=path_value2, has_path="t") Then to find all documents with a path:: q = Term("has_path", "t") """ def __init__(self, fieldname=None, boost=1.0): """ :param fieldname: the name of the field to match, or ``None`` or ``*`` to match all documents. """ if not fieldname or fieldname == "*": fieldname = None self.fieldname = fieldname self.boost = boost def __repr__(self): return "%s(%r, boost=%s)" % (self.__class__.__name__, self.fieldname, self.boost) def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.fieldname == other.fieldname and self.boost == other.boost) def __unicode__(self): return u("%s:*") % self.fieldname __str__ = __unicode__ def __hash__(self): return hash(self.fieldname) def estimate_size(self, ixreader): return ixreader.doc_count() def matcher(self, searcher, context=None): fieldname = self.fieldname reader = searcher.reader() if fieldname in (None, "", "*"): # This takes into account deletions doclist = array("I", reader.all_doc_ids()) else: # This is a hacky hack, but just create an in-memory set of all the # document numbers of every term in the field. This is SLOOOW for # large indexes doclist = set() for text in searcher.lexicon(fieldname): pr = searcher.postings(fieldname, text) doclist.update(pr.all_ids()) doclist = sorted(doclist) return matching.ListMatcher(doclist, all_weights=self.boost) Whoosh-2.5.7/src/whoosh/query/ranges.py0000644000076500000240000003215412254366350020173 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
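# Editorial usage sketch (not part of the original source): qcore.py above
# notes that IndexWriter.delete_by_query() decides what to delete via
# Query.deletion_docs(), which defaults to Query.docs(). Assuming an index
# object named ``ix``:
#
#     from whoosh.query import Term
#
#     # Inspect which document numbers a query currently matches
#     with ix.searcher() as s:
#         docnums = list(Term("path", u"/obsolete").docs(s))
#
#     # Delete those documents; this path ends up calling deletion_docs()
#     with ix.writer() as w:
#         w.delete_by_query(Term("path", u"/obsolete"))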
from __future__ import division from whoosh.compat import b, u from whoosh.query import qcore, terms, compound, wrappers from whoosh.util.times import datetime_to_long class RangeMixin(object): # Contains methods shared by TermRange and NumericRange def __repr__(self): return ('%s(%r, %r, %r, %s, %s, boost=%s, constantscore=%s)' % (self.__class__.__name__, self.fieldname, self.start, self.end, self.startexcl, self.endexcl, self.boost, self.constantscore)) def __unicode__(self): startchar = "{" if self.startexcl else "[" endchar = "}" if self.endexcl else "]" start = '' if self.start is None else self.start end = '' if self.end is None else self.end return u("%s:%s%s TO %s%s") % (self.fieldname, startchar, start, end, endchar) __str__ = __unicode__ def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.fieldname == other.fieldname and self.start == other.start and self.end == other.end and self.startexcl == other.startexcl and self.endexcl == other.endexcl and self.boost == other.boost and self.constantscore == other.constantscore) def __hash__(self): return (hash(self.fieldname) ^ hash(self.start) ^ hash(self.startexcl) ^ hash(self.end) ^ hash(self.endexcl) ^ hash(self.boost)) def is_range(self): return True def _comparable_start(self): if self.start is None: return (qcore.Lowest, 0) else: second = 1 if self.startexcl else 0 return (self.start, second) def _comparable_end(self): if self.end is None: return (qcore.Highest, 0) else: second = -1 if self.endexcl else 0 return (self.end, second) def overlaps(self, other): if not isinstance(other, TermRange): return False if self.fieldname != other.fieldname: return False start1 = self._comparable_start() start2 = other._comparable_start() end1 = self._comparable_end() end2 = other._comparable_end() return ((start1 >= start2 and start1 <= end2) or (end1 >= start2 and end1 <= end2) or (start2 >= start1 and start2 <= end1) or (end2 >= start1 and end2 <= end1)) def merge(self, other, intersect=True): assert self.fieldname == other.fieldname start1 = self._comparable_start() start2 = other._comparable_start() end1 = self._comparable_end() end2 = other._comparable_end() if start1 >= start2 and end1 <= end2: start = start2 end = end2 elif start2 >= start1 and end2 <= end1: start = start1 end = end1 elif intersect: start = max(start1, start2) end = min(end1, end2) else: start = min(start1, start2) end = max(end1, end2) startval = None if start[0] is qcore.Lowest else start[0] startexcl = start[1] == 1 endval = None if end[0] is qcore.Highest else end[0] endexcl = end[1] == -1 boost = max(self.boost, other.boost) constantscore = self.constantscore or other.constantscore return self.__class__(self.fieldname, startval, endval, startexcl, endexcl, boost=boost, constantscore=constantscore) class TermRange(RangeMixin, terms.MultiTerm): """Matches documents containing any terms in a given range. >>> # Match documents where the indexed "id" field is greater than or equal >>> # to 'apple' and less than or equal to 'pear'. >>> TermRange("id", u"apple", u"pear") """ def __init__(self, fieldname, start, end, startexcl=False, endexcl=False, boost=1.0, constantscore=True): """ :param fieldname: The name of the field to search. :param start: Match terms equal to or greater than this. :param end: Match terms equal to or less than this. :param startexcl: If True, the range start is exclusive. If False, the range start is inclusive. :param endexcl: If True, the range end is exclusive. If False, the range end is inclusive. 
:param boost: Boost factor that should be applied to the raw score of results matched by this query. """ self.fieldname = fieldname self.start = start self.end = end self.startexcl = startexcl self.endexcl = endexcl self.boost = boost self.constantscore = constantscore def normalize(self): if self.start in ('', None) and self.end in (u('\uffff'), None): from whoosh.query import Every return Every(self.fieldname, boost=self.boost) elif self.start == self.end: if self.startexcl or self.endexcl: return qcore.NullQuery return terms.Term(self.fieldname, self.start, boost=self.boost) else: return TermRange(self.fieldname, self.start, self.end, self.startexcl, self.endexcl, boost=self.boost) #def replace(self, fieldname, oldtext, newtext): # q = self.copy() # if q.fieldname == fieldname: # if q.start == oldtext: # q.start = newtext # if q.end == oldtext: # q.end = newtext # return q def _btexts(self, ixreader): fieldname = self.fieldname field = ixreader.schema[fieldname] startexcl = self.startexcl endexcl = self.endexcl if self.start is None: start = b("") else: try: start = field.to_bytes(self.start) except ValueError: return if self.end is None: end = b("\xFF\xFF\xFF\xFF") else: try: end = field.to_bytes(self.end) except ValueError: return for fname, t in ixreader.terms_from(fieldname, start): if fname != fieldname: break if t == start and startexcl: continue if t == end and endexcl: break if t > end: break yield t class NumericRange(RangeMixin, qcore.Query): """A range query for NUMERIC fields. Takes advantage of tiered indexing to speed up large ranges by matching at a high resolution at the edges of the range and a low resolution in the middle. >>> # Match numbers from 10 to 5925 in the "number" field. >>> nr = NumericRange("number", 10, 5925) """ def __init__(self, fieldname, start, end, startexcl=False, endexcl=False, boost=1.0, constantscore=True): """ :param fieldname: The name of the field to search. :param start: Match terms equal to or greater than this number. This should be a number type, not a string. :param end: Match terms equal to or less than this number. This should be a number type, not a string. :param startexcl: If True, the range start is exclusive. If False, the range start is inclusive. :param endexcl: If True, the range end is exclusive. If False, the range end is inclusive. :param boost: Boost factor that should be applied to the raw score of results matched by this query. :param constantscore: If True, the compiled query returns a constant score (the value of the ``boost`` keyword argument) instead of actually scoring the matched terms. This gives a nice speed boost and won't affect the results in most cases since numeric ranges will almost always be used as a filter. 
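        For example (an editorial sketch): combining a constant-score range
        with a scored term query, so that the range part only filters and
        contributes a fixed score::

            from whoosh.query import And, Term

            q = And([Term("tags", u"sale"),
                     NumericRange("price", 10, 100, constantscore=True)])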
""" self.fieldname = fieldname self.start = start self.end = end self.startexcl = startexcl self.endexcl = endexcl self.boost = boost self.constantscore = constantscore def simplify(self, ixreader): return self._compile_query(ixreader).simplify(ixreader) def estimate_size(self, ixreader): return self._compile_query(ixreader).estimate_size(ixreader) def estimate_min_size(self, ixreader): return self._compile_query(ixreader).estimate_min_size(ixreader) def docs(self, searcher): q = self._compile_query(searcher.reader()) return q.docs(searcher) def _compile_query(self, ixreader): from whoosh.fields import NUMERIC from whoosh.util.numeric import tiered_ranges field = ixreader.schema[self.fieldname] if not isinstance(field, NUMERIC): raise Exception("NumericRange: field %r is not numeric" % self.fieldname) start = self.start if start is not None: start = field.prepare_number(start) end = self.end if end is not None: end = field.prepare_number(end) subqueries = [] stb = field.sortable_to_bytes # Get the term ranges for the different resolutions ranges = tiered_ranges(field.numtype, field.bits, field.signed, start, end, field.shift_step, self.startexcl, self.endexcl) for startnum, endnum, shift in ranges: if startnum == endnum: subq = terms.Term(self.fieldname, stb(startnum, shift)) else: startbytes = stb(startnum, shift) endbytes = stb(endnum, shift) subq = TermRange(self.fieldname, startbytes, endbytes) subqueries.append(subq) if len(subqueries) == 1: q = subqueries[0] elif subqueries: q = compound.Or(subqueries, boost=self.boost) else: return qcore.NullQuery if self.constantscore: q = wrappers.ConstantScoreQuery(q, self.boost) return q def matcher(self, searcher, context=None): q = self._compile_query(searcher.reader()) return q.matcher(searcher, context) class DateRange(NumericRange): """This is a very thin subclass of :class:`NumericRange` that only overrides the initializer and ``__repr__()`` methods to work with datetime objects instead of numbers. Internally this object converts the datetime objects it's created with to numbers and otherwise acts like a ``NumericRange`` query. >>> DateRange("date", datetime(2010, 11, 3, 3, 0), ... datetime(2010, 11, 3, 17, 59)) """ def __init__(self, fieldname, start, end, startexcl=False, endexcl=False, boost=1.0, constantscore=True): self.startdate = start self.enddate = end if start: start = datetime_to_long(start) if end: end = datetime_to_long(end) super(DateRange, self).__init__(fieldname, start, end, startexcl=startexcl, endexcl=endexcl, boost=boost, constantscore=constantscore) def __repr__(self): return '%s(%r, %r, %r, %s, %s, boost=%s)' % (self.__class__.__name__, self.fieldname, self.startdate, self.enddate, self.startexcl, self.endexcl, self.boost) Whoosh-2.5.7/src/whoosh/query/spans.py0000644000076500000240000007007112254366350020040 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module contains Query objects that deal with "spans". Span queries allow for positional constraints on matching documents. For example, the :class:`whoosh.spans.SpanNear` query matches documents where one term occurs near another. Because you can nest span queries, and wrap them around almost any non-span query, you can create very complex constraints. For example, to find documents containing "whoosh" at most 5 positions before "library" in the "text" field:: from whoosh import query, spans t1 = query.Term("text", "whoosh") t2 = query.Term("text", "library") q = spans.SpanNear(t1, t2, slop=5) """ from whoosh.matching import mcore, wrappers, binary from whoosh.query import Query, And, AndMaybe, Or, Term from whoosh.util import make_binary_tree # Span class class Span(object): __slots__ = ("start", "end", "startchar", "endchar", "boost") def __init__(self, start, end=None, startchar=None, endchar=None, boost=1.0): if end is None: end = start assert start <= end self.start = start self.end = end self.startchar = startchar self.endchar = endchar self.boost = boost def __repr__(self): if self.startchar is not None or self.endchar is not None: return "<%d-%d %d:%d>" % (self.start, self.end, self.startchar, self.endchar) else: return "<%d-%d>" % (self.start, self.end) def __eq__(self, span): return (self.start == span.start and self.end == span.end and self.startchar == span.startchar and self.endchar == span.endchar) def __ne__(self, span): return self.start != span.start or self.end != span.end def __lt__(self, span): return self.start < span.start def __gt__(self, span): return self.start > span.start def __hash__(self): return hash((self.start, self.end)) @classmethod def merge(cls, spans): """Merges overlapping and touches spans in the given list of spans. Note that this modifies the original list. 
>>> spans = [Span(1,2), Span(3)] >>> Span.merge(spans) >>> spans [<1-3>] """ i = 0 while i < len(spans) - 1: here = spans[i] j = i + 1 while j < len(spans): there = spans[j] if there.start > here.end + 1: break if here.touches(there) or here.overlaps(there): here = here.to(there) spans[i] = here del spans[j] else: j += 1 i += 1 return spans def to(self, span): if self.startchar is None: minchar = span.startchar elif span.startchar is None: minchar = self.startchar else: minchar = min(self.startchar, span.startchar) if self.endchar is None: maxchar = span.endchar elif span.endchar is None: maxchar = self.endchar else: maxchar = max(self.endchar, span.endchar) minpos = min(self.start, span.start) maxpos = max(self.end, span.end) return self.__class__(minpos, maxpos, minchar, maxchar) def overlaps(self, span): return ((self.start >= span.start and self.start <= span.end) or (self.end >= span.start and self.end <= span.end) or (span.start >= self.start and span.start <= self.end) or (span.end >= self.start and span.end <= self.end)) def surrounds(self, span): return self.start < span.start and self.end > span.end def is_within(self, span): return self.start >= span.start and self.end <= span.end def is_before(self, span): return self.end < span.start def is_after(self, span): return self.start > span.end def touches(self, span): return self.start == span.end + 1 or self.end == span.start - 1 def distance_to(self, span): if self.overlaps(span): return 0 elif self.is_before(span): return span.start - self.end else: return self.start - span.end def bisect_spans(spans, start): lo = 0 hi = len(spans) while lo < hi: mid = (lo + hi) // 2 if spans[mid].start < start: lo = mid + 1 else: hi = mid return lo # Base matchers class SpanWrappingMatcher(wrappers.WrappingMatcher): """An abstract matcher class that wraps a "regular" matcher. This matcher uses the sub-matcher's matching logic, but only matches documents that have matching spans, i.e. where ``_get_spans()`` returns a non-empty list. Subclasses must implement the ``_get_spans()`` method, which returns a list of valid spans for the current document. """ def __init__(self, child): super(SpanWrappingMatcher, self).__init__(child) self._spans = None if self.is_active(): self._find_next() def copy(self): m = self.__class__(self.child.copy()) m._spans = self._spans return m def _replacement(self, newchild): return self.__class__(newchild) def _find_next(self): if not self.is_active(): return child = self.child r = False spans = self._get_spans() while child.is_active() and not spans: r = child.next() or r if not child.is_active(): return True spans = self._get_spans() self._spans = spans return r def spans(self): return self._spans def next(self): self.child.next() self._find_next() def skip_to(self, id): self.child.skip_to(id) self._find_next() def all_ids(self): while self.is_active(): if self.spans(): yield self.id() self.next() class SpanBiMatcher(SpanWrappingMatcher): def copy(self): return self.__class__(self.a.copy(), self.b.copy()) def depth(self): return 1 + max(self.a.depth(), self.b.depth()) def replace(self, minquality=0): # TODO: fix this if not self.is_active(): return mcore.NullMatcher() return self # Queries class SpanQuery(Query): """Abstract base class for span-based queries. Each span query type wraps a "regular" query that implements the basic document-matching functionality (for example, SpanNear wraps an And query, because SpanNear requires that the two sub-queries occur in the same documents. 
The wrapped query is stored in the ``q`` attribute. Subclasses usually only need to implement the initializer to set the wrapped query, and ``matcher()`` to return a span-aware matcher object. """ def _subm(self, s, context=None): return self.q.matcher(s, context) def __getattr__(self, name): return super(Query, self).__getattr__(self.q, name) def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self.q) def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.q == other.q) def __hash__(self): return hash(self.__class__.__name__) ^ hash(self.q) class SpanFirst(SpanQuery): """Matches spans that end within the first N positions. This lets you for example only match terms near the beginning of the document. """ def __init__(self, q, limit=0): """ :param q: the query to match. :param limit: the query must match within this position at the start of a document. The default is ``0``, which means the query must match at the first position. """ self.q = q self.limit = limit def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.q == other.q and self.limit == other.limit) def __hash__(self): return hash(self.q) ^ hash(self.limit) def is_leaf(self): return False def apply(self, fn): return self.__class__(fn(self.q), limit=self.limit) def matcher(self, searcher, context=None): m = self._subm(searcher, context) return SpanFirst.SpanFirstMatcher(m, limit=self.limit) class SpanFirstMatcher(SpanWrappingMatcher): def __init__(self, child, limit=0): self.limit = limit super(SpanFirst.SpanFirstMatcher, self).__init__(child) def copy(self): return self.__class__(self.child.copy(), limit=self.limit) def _replacement(self, newchild): return self.__class__(newchild, limit=self.limit) def _get_spans(self): return [span for span in self.child.spans() if span.end <= self.limit] class SpanNear(SpanQuery): """ Note: for new code, use :class:`SpanNear2` instead of this class. SpanNear2 takes a list of sub-queries instead of requiring you to create a binary tree of query objects. Matches queries that occur near each other. By default, only matches queries that occur right next to each other (slop=1) and in order (ordered=True). For example, to find documents where "whoosh" occurs next to "library" in the "text" field:: from whoosh import query, spans t1 = query.Term("text", "whoosh") t2 = query.Term("text", "library") q = spans.SpanNear(t1, t2) To find documents where "whoosh" occurs at most 5 positions before "library":: q = spans.SpanNear(t1, t2, slop=5) To find documents where "whoosh" occurs at most 5 positions before or after "library":: q = spans.SpanNear(t1, t2, slop=5, ordered=False) You can use the ``phrase()`` class method to create a tree of SpanNear queries to match a list of terms:: q = spans.SpanNear.phrase("text", ["whoosh", "search", "library"], slop=2) """ def __init__(self, a, b, slop=1, ordered=True, mindist=1): """ :param a: the first query to match. :param b: the second query that must occur within "slop" positions of the first query. :param slop: the number of positions within which the queries must occur. Default is 1, meaning the queries must occur right next to each other. :param ordered: whether a must occur before b. Default is True. :pram mindist: the minimum distance allowed between the queries. 
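        For example (an editorial sketch): adjacent terms are at distance 1
        (see ``Span.distance_to()``), so requiring at least one intervening
        position but no more than five between "whoosh" and "library" looks
        like::

            q = SpanNear(Term("text", "whoosh"), Term("text", "library"),
                         slop=5, mindist=2)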
""" self.q = And([a, b]) self.a = a self.b = b self.slop = slop self.ordered = ordered self.mindist = mindist def __repr__(self): return ("%s(%r, slop=%d, ordered=%s, mindist=%d)" % (self.__class__.__name__, self.q, self.slop, self.ordered, self.mindist)) def __eq__(self, other): return (other and self.__class__ == other.__class__ and self.q == other.q and self.slop == other.slop and self.ordered == other.ordered and self.mindist == other.mindist) def __hash__(self): return (hash(self.a) ^ hash(self.b) ^ hash(self.slop) ^ hash(self.ordered) ^ hash(self.mindist)) def is_leaf(self): return False def apply(self, fn): return self.__class__(fn(self.a), fn(self.b), slop=self.slop, ordered=self.ordered, mindist=self.mindist) def matcher(self, searcher, context=None): ma = self.a.matcher(searcher, context) mb = self.b.matcher(searcher, context) return SpanNear.SpanNearMatcher(ma, mb, slop=self.slop, ordered=self.ordered, mindist=self.mindist) @classmethod def phrase(cls, fieldname, words, slop=1, ordered=True): """Returns a tree of SpanNear queries to match a list of terms. This class method is a convenience for constructing a phrase query using a binary tree of SpanNear queries:: SpanNear.phrase("content", ["alfa", "bravo", "charlie", "delta"]) :param fieldname: the name of the field to search in. :param words: a sequence of texts to search for. :param slop: the number of positions within which the terms must occur. Default is 1, meaning the terms must occur right next to each other. :param ordered: whether the terms must occur in order. Default is True. """ terms = [Term(fieldname, word) for word in words] return make_binary_tree(cls, terms, slop=slop, ordered=ordered) class SpanNearMatcher(SpanWrappingMatcher): def __init__(self, a, b, slop=1, ordered=True, mindist=1): self.a = a self.b = b self.slop = slop self.ordered = ordered self.mindist = mindist isect = binary.IntersectionMatcher(a, b) super(SpanNear.SpanNearMatcher, self).__init__(isect) def copy(self): return self.__class__(self.a.copy(), self.b.copy(), slop=self.slop, ordered=self.ordered, mindist=self.mindist) def replace(self, minquality=0): # TODO: fix this if not self.is_active(): return mcore.NullMatcher() return self def _get_spans(self): slop = self.slop mindist = self.mindist ordered = self.ordered spans = set() bspans = self.b.spans() for aspan in self.a.spans(): for bspan in bspans: if (bspan.end < aspan.start - slop or (ordered and aspan.start > bspan.start)): # B is too far in front of A, or B is in front of A # *at all* when ordered is True continue if bspan.start > aspan.end + slop: # B is too far from A. Since spans are listed in # start position order, we know that all spans after # this one will also be too far. break # Check the distance between the spans dist = aspan.distance_to(bspan) if mindist <= dist <= slop: spans.add(aspan.to(bspan)) return sorted(spans) class SpanNear2(SpanQuery): """ Matches queries that occur near each other. By default, only matches queries that occur right next to each other (slop=1) and in order (ordered=True). New code should use this query type instead of :class:`SpanNear`. (Unlike :class:`SpanNear`, this query takes a list of subqueries instead of requiring you to build a binary tree of query objects. This query should also be slightly faster due to less overhead.) 
For example, to find documents where "whoosh" occurs next to "library" in the "text" field:: from whoosh import query, spans t1 = query.Term("text", "whoosh") t2 = query.Term("text", "library") q = spans.SpanNear2([t1, t2]) To find documents where "whoosh" occurs at most 5 positions before "library":: q = spans.SpanNear2([t1, t2], slop=5) To find documents where "whoosh" occurs at most 5 positions before or after "library":: q = spans.SpanNear2(t1, t2, slop=5, ordered=False) """ def __init__(self, qs, slop=1, ordered=True, mindist=1): """ :param qs: a sequence of sub-queries to match. :param slop: the number of positions within which the queries must occur. Default is 1, meaning the queries must occur right next to each other. :param ordered: whether a must occur before b. Default is True. :pram mindist: the minimum distance allowed between the queries. """ self.qs = qs self.slop = slop self.ordered = ordered self.mindist = mindist def __repr__(self): return ("%s(%r, slop=%d, ordered=%s, mindist=%d)" % (self.__class__.__name__, self.qs, self.slop, self.ordered, self.mindist)) def __eq__(self, other): return (other and self.__class__ == other.__class__ and self.qs == other.qs and self.slop == other.slop and self.ordered == other.ordered and self.mindist == other.mindist) def __hash__(self): h = hash(self.slop) ^ hash(self.ordered) ^ hash(self.mindist) for q in self.qs: h ^= hash(q) return h def is_leaf(self): return False def children(self): return self.qs def apply(self, fn): return self.__class__([fn(q) for q in self.qs], slop=self.slop, ordered=self.ordered, mindist=self.mindist) def matcher(self, searcher, context=None): ms = [q.matcher(searcher, context) for q in self.qs] return self.SpanNear2Matcher(ms, slop=self.slop, ordered=self.ordered, mindist=self.mindist) class SpanNear2Matcher(SpanWrappingMatcher): def __init__(self, ms, slop=1, ordered=True, mindist=1): self.ms = ms self.slop = slop self.ordered = ordered self.mindist = mindist isect = make_binary_tree(binary.IntersectionMatcher, ms) super(SpanNear2.SpanNear2Matcher, self).__init__(isect) def copy(self): return self.__class__([m.copy() for m in self.ms], slop=self.slop, ordered=self.ordered, mindist=self.mindist) def replace(self, minquality=0): # TODO: fix this if not self.is_active(): return mcore.NullMatcher() return self def _get_spans(self): slop = self.slop mindist = self.mindist ordered = self.ordered ms = self.ms aspans = ms[0].spans() i = 1 while i < len(ms) and aspans: bspans = ms[i].spans() spans = set() for aspan in aspans: # Use a binary search to find the first position we should # start looking for possible matches if ordered: start = aspan.start else: start = max(0, aspan.start - slop) j = bisect_spans(bspans, start) while j < len(bspans): bspan = bspans[j] j += 1 if (bspan.end < aspan.start - slop or (ordered and aspan.start > bspan.start)): # B is too far in front of A, or B is in front of A # *at all* when ordered is True continue if bspan.start > aspan.end + slop: # B is too far from A. Since spans are listed in # start position order, we know that all spans after # this one will also be too far. break # Check the distance between the spans dist = aspan.distance_to(bspan) if mindist <= dist <= slop: spans.add(aspan.to(bspan)) aspans = sorted(spans) i += 1 if i == len(ms): return aspans else: return [] class SpanOr(SpanQuery): """Matches documents that match any of a list of sub-queries. Unlike query.Or, this class merges together matching spans from the different sub-queries when they overlap. 
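    For example (an editorial sketch): to get merged spans for documents
    containing either "whoosh" or "library" in the "text" field::

        from whoosh import query, spans

        q = spans.SpanOr([query.Term("text", "whoosh"),
                          query.Term("text", "library")])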
""" def __init__(self, subqs): """ :param subqs: a list of queries to match. """ self.q = Or(subqs) self.subqs = subqs def is_leaf(self): return False def apply(self, fn): return self.__class__([fn(sq) for sq in self.subqs]) def matcher(self, searcher, context=None): matchers = [q.matcher(searcher, context) for q in self.subqs] return make_binary_tree(SpanOr.SpanOrMatcher, matchers) class SpanOrMatcher(SpanBiMatcher): def __init__(self, a, b): self.a = a self.b = b um = binary.UnionMatcher(a, b) super(SpanOr.SpanOrMatcher, self).__init__(um) def _get_spans(self): a_active = self.a.is_active() b_active = self.b.is_active() if a_active: a_id = self.a.id() if b_active: b_id = self.b.id() if a_id == b_id: spans = sorted(set(self.a.spans()) | set(self.b.spans())) elif a_id < b_id: spans = self.a.spans() else: spans = self.b.spans() else: spans = self.a.spans() else: spans = self.b.spans() Span.merge(spans) return spans class SpanBiQuery(SpanQuery): # Intermediate base class for methods common to "a/b" span query types def is_leaf(self): return False def apply(self, fn): return self.__class__(fn(self.a), fn(self.b)) def matcher(self, searcher, context=None): ma = self.a.matcher(searcher, context) mb = self.b.matcher(searcher, context) return self._Matcher(ma, mb) class SpanNot(SpanBiQuery): """Matches spans from the first query only if they don't overlap with spans from the second query. If there are no non-overlapping spans, the document does not match. For example, to match documents that contain "bear" at most 2 places after "apple" in the "text" field but don't have "cute" between them:: from whoosh import query, spans t1 = query.Term("text", "apple") t2 = query.Term("text", "bear") near = spans.SpanNear(t1, t2, slop=2) q = spans.SpanNot(near, query.Term("text", "cute")) """ def __init__(self, a, b): """ :param a: the query to match. :param b: do not match any spans that overlap with spans from this query. """ self.q = AndMaybe(a, b) self.a = a self.b = b class _Matcher(SpanBiMatcher): def __init__(self, a, b): self.a = a self.b = b amm = binary.AndMaybeMatcher(a, b) super(SpanNot._Matcher, self).__init__(amm) def _get_spans(self): if self.a.id() == self.b.id(): spans = [] bspans = self.b.spans() for aspan in self.a.spans(): overlapped = False for bspan in bspans: if aspan.overlaps(bspan): overlapped = True break if not overlapped: spans.append(aspan) return spans else: return self.a.spans() class SpanContains(SpanBiQuery): """Matches documents where the spans of the first query contain any spans of the second query. For example, to match documents where "apple" occurs at most 10 places before "bear" in the "text" field and "cute" is between them:: from whoosh import query, spans t1 = query.Term("text", "apple") t2 = query.Term("text", "bear") near = spans.SpanNear(t1, t2, slop=10) q = spans.SpanContains(near, query.Term("text", "cute")) """ def __init__(self, a, b): """ :param a: the query to match. :param b: the query whose spans must occur within the matching spans of the first query. 
""" self.q = And([a, b]) self.a = a self.b = b class _Matcher(SpanBiMatcher): def __init__(self, a, b): self.a = a self.b = b im = binary.IntersectionMatcher(a, b) super(SpanContains._Matcher, self).__init__(im) def _get_spans(self): spans = [] bspans = self.b.spans() for aspan in self.a.spans(): for bspan in bspans: if aspan.start > bspan.end: continue if aspan.end < bspan.start: break if bspan.is_within(aspan): spans.append(aspan) break return spans class SpanBefore(SpanBiQuery): """Matches documents where the spans of the first query occur before any spans of the second query. For example, to match documents where "apple" occurs anywhere before "bear":: from whoosh import query, spans t1 = query.Term("text", "apple") t2 = query.Term("text", "bear") q = spans.SpanBefore(t1, t2) """ def __init__(self, a, b): """ :param a: the query that must occur before the second. :param b: the query that must occur after the first. """ self.a = a self.b = b self.q = And([a, b]) class _Matcher(SpanBiMatcher): def __init__(self, a, b): self.a = a self.b = b im = binary.IntersectionMatcher(a, b) super(SpanBefore._Matcher, self).__init__(im) def _get_spans(self): bminstart = min(bspan.start for bspan in self.b.spans()) return [aspan for aspan in self.a.spans() if aspan.end < bminstart] class SpanCondition(SpanBiQuery): """Matches documents that satisfy both subqueries, but only uses the spans from the first subquery. This is useful when you want to place conditions on matches but not have those conditions affect the spans returned. For example, to get spans for the term ``alfa`` in documents that also must contain the term ``bravo``:: SpanCondition(Term("text", u"alfa"), Term("text", u"bravo")) """ def __init__(self, a, b): self.a = a self.b = b self.q = And([a, b]) class _Matcher(SpanBiMatcher): def __init__(self, a, b): self.a = a im = binary.IntersectionMatcher(a, b) super(SpanCondition._Matcher, self).__init__(im) def _get_spans(self): return self.a.spans() Whoosh-2.5.7/src/whoosh/query/terms.py0000644000076500000240000004245412254366350020052 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from __future__ import division import copy import fnmatch import re from collections import defaultdict from whoosh import matching from whoosh.analysis import Token from whoosh.compat import bytes_type, text_type, u from whoosh.lang.morph_en import variations from whoosh.query import qcore class Term(qcore.Query): """Matches documents containing the given term (fieldname+text pair). >>> Term("content", u"render") """ __inittypes__ = dict(fieldname=str, text=text_type, boost=float) def __init__(self, fieldname, text, boost=1.0, minquality=None): self.fieldname = fieldname self.text = text self.boost = boost self.minquality = minquality def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.fieldname == other.fieldname and self.text == other.text and self.boost == other.boost) def __repr__(self): r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text) if self.boost != 1.0: r += ", boost=%s" % self.boost r += ")" return r def __unicode__(self): text = self.text if isinstance(text, bytes_type): try: text = text.decode("ascii") except UnicodeDecodeError: text = repr(text) t = u("%s:%s") % (self.fieldname, text) if self.boost != 1: t += u("^") + text_type(self.boost) return t __str__ = __unicode__ def __hash__(self): return hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost) def has_terms(self): return True def tokens(self, boost=1.0): yield Token(fieldname=self.fieldname, text=self.text, boost=boost * self.boost, startchar=self.startchar, endchar=self.endchar, chars=True) def terms(self, phrases=False): if self.field(): yield (self.field(), self.text) def replace(self, fieldname, oldtext, newtext): q = copy.copy(self) if q.fieldname == fieldname and q.text == oldtext: q.text = newtext return q def estimate_size(self, ixreader): fieldname = self.fieldname if fieldname not in ixreader.schema: return 0 field = ixreader.schema[fieldname] try: text = field.to_bytes(self.text) except ValueError: return 0 return ixreader.doc_frequency(fieldname, text) def matcher(self, searcher, context=None): fieldname = self.fieldname text = self.text if fieldname not in searcher.schema: return matching.NullMatcher() field = searcher.schema[fieldname] try: text = field.to_bytes(text) except ValueError: return matching.NullMatcher() if (self.fieldname, text) in searcher.reader(): if context is None: w = searcher.weighting else: w = context.weighting m = searcher.postings(self.fieldname, text, weighting=w) if self.minquality: m.set_min_quality(self.minquality) if self.boost != 1.0: m = matching.WrappingMatcher(m, boost=self.boost) return m else: return matching.NullMatcher() class MultiTerm(qcore.Query): """Abstract base class for queries that operate on multiple terms in the same field. 
""" constantscore = False def _btexts(self, ixreader): raise NotImplementedError(self.__class__.__name__) def expanded_terms(self, ixreader, phrases=False): fieldname = self.field() if fieldname: for btext in self._btexts(ixreader): yield (fieldname, btext) def tokens(self, boost=1.0, exreader=None): fieldname = self.field() if exreader is None: btexts = [self.text] else: btexts = self._btexts(exreader) for btext in btexts: yield Token(fieldname=fieldname, text=btext, boost=boost * self.boost, startchar=self.startchar, endchar=self.endchar, chars=True) def simplify(self, ixreader): fieldname = self.field() if fieldname not in ixreader.schema: return qcore.NullQuery() field = ixreader.schema[fieldname] existing = [] for btext in sorted(set(self._btexts(ixreader))): text = field.from_bytes(btext) existing.append(Term(fieldname, text, boost=self.boost)) if len(existing) == 1: return existing[0] elif existing: from whoosh.query import Or return Or(existing) else: return qcore.NullQuery def estimate_size(self, ixreader): fieldname = self.field() return sum(ixreader.doc_frequency(fieldname, btext) for btext in self._btexts(ixreader)) def estimate_min_size(self, ixreader): fieldname = self.field() return min(ixreader.doc_frequency(fieldname, text) for text in self._btexts(ixreader)) def matcher(self, searcher, context=None): from whoosh.query import Or fieldname = self.field() constantscore = self.constantscore reader = searcher.reader() qs = [Term(fieldname, word) for word in self._btexts(reader)] if not qs: return matching.NullMatcher() if len(qs) == 1: # If there's only one term, just use it m = qs[0].matcher(searcher, context) else: if constantscore: # To tell the sub-query that score doesn't matter, set weighting # to None if context: context = context.set(weighting=None) else: from whoosh.searching import SearchContext context = SearchContext(weighting=None) # Or the terms together m = Or(qs, boost=self.boost).matcher(searcher, context) return m class PatternQuery(MultiTerm): """An intermediate base class for common methods of Prefix and Wildcard. 
""" __inittypes__ = dict(fieldname=str, text=text_type, boost=float) def __init__(self, fieldname, text, boost=1.0, constantscore=True): self.fieldname = fieldname self.text = text self.boost = boost self.constantscore = constantscore def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.fieldname == other.fieldname and self.text == other.text and self.boost == other.boost and self.constantscore == other.constantscore) def __repr__(self): r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text) if self.boost != 1: r += ", boost=%s" % self.boost r += ")" return r def __hash__(self): return (hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost) ^ hash(self.constantscore)) def _get_pattern(self): raise NotImplementedError def _find_prefix(self, text): # Subclasses/instances should set the SPECIAL_CHARS attribute to a set # of characters that mark the end of the literal prefix specialchars = self.SPECIAL_CHARS i = 0 for i, char in enumerate(text): if char in specialchars: break return text[:i] def _btexts(self, ixreader): field = ixreader.schema[self.fieldname] exp = re.compile(self._get_pattern()) prefix = self._find_prefix(self.text) if prefix: candidates = ixreader.expand_prefix(self.fieldname, prefix) else: candidates = ixreader.lexicon(self.fieldname) from_bytes = field.from_bytes for btext in candidates: text = from_bytes(btext) if exp.match(text): yield btext class Prefix(PatternQuery): """Matches documents that contain any terms that start with the given text. >>> # Match documents containing words starting with 'comp' >>> Prefix("content", u"comp") """ def __unicode__(self): return "%s:%s*" % (self.fieldname, self.text) __str__ = __unicode__ def _btexts(self, ixreader): return ixreader.expand_prefix(self.fieldname, self.text) def matcher(self, searcher, context=None): if self.text == "": from whoosh.query import Every eq = Every(self.fieldname, boost=self.boost) return eq.matcher(searcher, context) else: return PatternQuery.matcher(self, searcher, context) class Wildcard(PatternQuery): """Matches documents that contain any terms that match a "glob" pattern. See the Python ``fnmatch`` module for information about globs. >>> Wildcard("content", u"in*f?x") """ SPECIAL_CHARS = frozenset("*?[") def __unicode__(self): return "%s:%s" % (self.fieldname, self.text) __str__ = __unicode__ def _get_pattern(self): return fnmatch.translate(self.text) def normalize(self): # If there are no wildcard characters in this "wildcard", turn it into # a simple Term text = self.text if text == "*": from whoosh.query import Every return Every(self.fieldname, boost=self.boost) if "*" not in text and "?" not in text: # If no wildcard chars, convert to a normal term. return Term(self.fieldname, self.text, boost=self.boost) elif ("?" not in text and text.endswith("*") and text.find("*") == len(text) - 1): # If the only wildcard char is an asterisk at the end, convert to a # Prefix query. return Prefix(self.fieldname, self.text[:-1], boost=self.boost) else: return self def matcher(self, searcher, context=None): if self.text == "*": from whoosh.query import Every eq = Every(self.fieldname, boost=self.boost) return eq.matcher(searcher, context) else: return PatternQuery.matcher(self, searcher, context) # _btexts() implemented in PatternQuery class Regex(PatternQuery): """Matches documents that contain any terms that match a regular expression. See the Python ``re`` module for information about regular expressions. 
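    For example (an editorial sketch mirroring the ``Prefix`` and
    ``Wildcard`` examples above)::

        >>> # Match terms such as "render", "rendered" or "rendering"
        >>> Regex("content", u"render(ed|ing)?")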
""" SPECIAL_CHARS = frozenset("{}()[].?*+^$\\") def __unicode__(self): return '%s:r"%s"' % (self.fieldname, self.text) __str__ = __unicode__ def _get_pattern(self): return self.text def _find_prefix(self, text): if "|" in text: return "" if text.startswith("^"): text = text[1:] elif text.startswith("\\A"): text = text[2:] prefix = PatternQuery._find_prefix(self, text) lp = len(prefix) if lp < len(text) and text[lp] in "*?": # we stripped something starting from * or ? - they both MAY mean # "0 times". As we had stripped starting from FIRST special char, # that implies there were only ordinary chars left of it. Thus, # the very last of them is not part of the real prefix: prefix = prefix[:-1] return prefix def matcher(self, searcher, context=None): if self.text == ".*": from whoosh.query import Every eq = Every(self.fieldname, boost=self.boost) return eq.matcher(searcher, context) else: return PatternQuery.matcher(self, searcher, context) # _btexts() implemented in PatternQuery class ExpandingTerm(MultiTerm): """Intermediate base class for queries such as FuzzyTerm and Variations that expand into multiple queries, but come from a single term. """ def has_terms(self): return True def terms(self, phrases=False): if self.field(): yield (self.field(), self.text) class FuzzyTerm(ExpandingTerm): """Matches documents containing words similar to the given term. """ __inittypes__ = dict(fieldname=str, text=text_type, boost=float, maxdist=float, prefixlength=int) def __init__(self, fieldname, text, boost=1.0, maxdist=1, prefixlength=1, constantscore=True): """ :param fieldname: The name of the field to search. :param text: The text to search for. :param boost: A boost factor to apply to scores of documents matching this query. :param maxdist: The maximum edit distance from the given text. :param prefixlength: The matched terms must share this many initial characters with 'text'. For example, if text is "light" and prefixlength is 2, then only terms starting with "li" are checked for similarity. """ self.fieldname = fieldname self.text = text self.boost = boost self.maxdist = maxdist self.prefixlength = prefixlength self.constantscore = constantscore def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.fieldname == other.fieldname and self.text == other.text and self.maxdist == other.maxdist and self.prefixlength == other.prefixlength and self.boost == other.boost and self.constantscore == other.constantscore) def __repr__(self): r = "%s(%r, %r, boost=%f, maxdist=%d, prefixlength=%d)" return r % (self.__class__.__name__, self.fieldname, self.text, self.boost, self.maxdist, self.prefixlength) def __unicode__(self): r = u("%s:%s") % (self.fieldname, self.text) + u("~") if self.maxdist > 1: r += u("%d") % self.maxdist if self.boost != 1.0: r += u("^%f") % self.boost return r __str__ = __unicode__ def __hash__(self): return (hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost) ^ hash(self.maxdist) ^ hash(self.prefixlength) ^ hash(self.constantscore)) def _btexts(self, ixreader): return ixreader.terms_within(self.fieldname, self.text, self.maxdist, prefix=self.prefixlength) class Variations(ExpandingTerm): """Query that automatically searches for morphological variations of the given word in the same field. 
""" def __init__(self, fieldname, text, boost=1.0): self.fieldname = fieldname self.text = text self.boost = boost def __repr__(self): r = "%s(%r, %r" % (self.__class__.__name__, self.fieldname, self.text) if self.boost != 1: r += ", boost=%s" % self.boost r += ")" return r def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.fieldname == other.fieldname and self.text == other.text and self.boost == other.boost) def __hash__(self): return hash(self.fieldname) ^ hash(self.text) ^ hash(self.boost) def _btexts(self, ixreader): fieldname = self.fieldname to_bytes = ixreader.schema[fieldname].to_bytes for word in variations(self.text): try: btext = to_bytes(word) except ValueError: continue if (fieldname, btext) in ixreader: yield btext def __unicode__(self): return u("%s:<%s>") % (self.fieldname, self.text) __str__ = __unicode__ def replace(self, fieldname, oldtext, newtext): q = copy.copy(self) if q.fieldname == fieldname and q.text == oldtext: q.text = newtext return q Whoosh-2.5.7/src/whoosh/query/wrappers.py0000644000076500000240000001513412254366350020556 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
from __future__ import division from array import array from whoosh import matching from whoosh.compat import text_type, u, xrange from whoosh.query import qcore class WrappingQuery(qcore.Query): def __init__(self, child): self.child = child def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self.child) def __hash__(self): return hash(self.__class__.__name__) ^ hash(self.child) def _rewrap(self, child): return self.__class__(child) def is_leaf(self): return False def children(self): yield self.child def apply(self, fn): return self._rewrap(fn(self.child)) def requires(self): return self.child.requires() def field(self): return self.child.field() def with_boost(self, boost): return self._rewrap(self.child.with_boost(boost)) def estimate_size(self, ixreader): return self.child.estimate_size(ixreader) def estimate_min_size(self, ixreader): return self.child.estimate_min_size(ixreader) def matcher(self, searcher, context=None): return self.child.matcher(searcher, context) class Not(qcore.Query): """Excludes any documents that match the subquery. >>> # Match documents that contain 'render' but not 'texture' >>> And([Term("content", u"render"), ... Not(Term("content", u"texture"))]) >>> # You can also do this >>> Term("content", u"render") - Term("content", u"texture") """ __inittypes__ = dict(query=qcore.Query) def __init__(self, query, boost=1.0): """ :param query: A :class:`Query` object. The results of this query are *excluded* from the parent query. :param boost: Boost is meaningless for excluded documents but this keyword argument is accepted for the sake of a consistent interface. """ self.query = query self.boost = boost def __eq__(self, other): return other and self.__class__ is other.__class__ and\ self.query == other.query def __repr__(self): return "%s(%s)" % (self.__class__.__name__, repr(self.query)) def __unicode__(self): return u("NOT ") + text_type(self.query) __str__ = __unicode__ def __hash__(self): return (hash(self.__class__.__name__) ^ hash(self.query) ^ hash(self.boost)) def is_leaf(self): return False def children(self): yield self.query def apply(self, fn): return self.__class__(fn(self.query)) def normalize(self): q = self.query.normalize() if q is qcore.NullQuery: return q else: return self.__class__(q, boost=self.boost) def field(self): return None def estimate_size(self, ixreader): return ixreader.doc_count() def estimate_min_size(self, ixreader): return 1 if ixreader.doc_count() else 0 def matcher(self, searcher, context=None): # Usually only called if Not is the root query. Otherwise, queries such # as And and Or do special handling of Not subqueries. reader = searcher.reader() child = self.query.matcher(searcher, searcher.boolean_context()) return matching.InverseMatcher(child, reader.doc_count_all(), missing=reader.is_deleted) class ConstantScoreQuery(WrappingQuery): """Wraps a query and uses a matcher that always gives a constant score to all matching documents. This is a useful optimization when you don't care about scores from a certain branch of the query tree because it is simply acting as a filter. See also the :class:`AndMaybe` query. 
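# ---------------------------------------------------------------------------
# Editor's note: an illustrative sketch, not part of the original Whoosh
# source, showing the Not query defined above used to exclude documents from
# an And.  The schema and documents are invented for this example.
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.query import And, Not, Term

schema = Schema(content=TEXT(stored=True))
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(content=u"render the texture")
w.add_document(content=u"render the scene")
w.commit()

q = And([Term("content", u"render"), Not(Term("content", u"texture"))])
with ix.searcher() as s:
    print([hit["content"] for hit in s.search(q)])  # only the "scene" document
# ---------------------------------------------------------------------------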
""" def __init__(self, child, score=1.0): WrappingQuery.__init__(self, child) self.score = score def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.child == other.child and self.score == other.score) def __hash__(self): return hash(self.child) ^ hash(self.score) def _rewrap(self, child): return self.__class__(child, self.score) def matcher(self, searcher, context=None): from whoosh.searching import SearchContext context = context or SearchContext() m = self.child.matcher(searcher, context) if context.needs_current or isinstance(m, matching.NullMatcherClass): return m else: ids = array("I", m.all_ids()) return matching.ListMatcher(ids, all_weights=self.score, term=m.term()) class WeightingQuery(WrappingQuery): """Wraps a query and uses a specific :class:`whoosh.sorting.WeightingModel` to score documents that match the wrapped query. """ def __init__(self, child, weighting): WrappingQuery.__init__(self, child) self.weighting = weighting def matcher(self, searcher, context=None): # Replace the passed-in weighting with the one configured on this query context.set(weighting=self.weighting) return self.child.matcher(searcher, context) Whoosh-2.5.7/src/whoosh/reading.py0000644000076500000240000012357112254366764017175 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """This module contains classes that allow reading from an index. """ from math import log from bisect import bisect_left, bisect_right from heapq import heapify, heapreplace, heappop, nlargest from whoosh import columns, scoring from whoosh.automata import fst from whoosh.compat import abstractmethod from whoosh.compat import xrange, zip_, next, iteritems from whoosh.filedb.filestore import OverlayStorage from whoosh.matching import MultiMatcher from whoosh.support.levenshtein import distance from whoosh.system import emptybytes # Exceptions class ReaderClosed(Exception): """Exception raised when you try to do some operation on a closed searcher (or a Results object derived from a searcher that has since been closed). 
""" message = "Operation on a closed reader" class TermNotFound(Exception): pass class NoGraphError(Exception): pass # Term Info base class class TermInfo(object): """Represents a set of statistics about a term. This object is returned by :meth:`IndexReader.term_info`. These statistics may be useful for optimizations and scoring algorithms. """ def __init__(self, weight=0, df=0, minlength=None, maxlength=0, maxweight=0, minid=None, maxid=0): self._weight = weight self._df = df self._minlength = minlength self._maxlength = maxlength self._maxweight = maxweight self._minid = minid self._maxid = maxid def add_posting(self, docnum, weight, length=None): if self._minid is None: self._minid = docnum self._maxid = docnum self._weight += weight self._df += 1 self._maxweight = max(self._maxweight, weight) if length is not None: if self._minlength is None: self._minlength = length else: self._minlength = min(self._minlength, length) self._maxlength = max(self._maxlength, length) def weight(self): """Returns the total frequency of the term across all documents. """ return self._weight def doc_frequency(self): """Returns the number of documents the term appears in. """ return self._df def min_length(self): """Returns the length of the shortest field value the term appears in. """ return self._minlength def max_length(self): """Returns the length of the longest field value the term appears in. """ return self._maxlength def max_weight(self): """Returns the number of times the term appears in the document in which it appears the most. """ return self._maxweight def min_id(self): """Returns the lowest document ID this term appears in. """ return self._minid def max_id(self): """Returns the highest document ID this term appears in. """ return self._maxid # Reader base class class IndexReader(object): """Do not instantiate this object directly. Instead use Index.reader(). """ def __enter__(self): return self def __exit__(self, *args): self.close() @abstractmethod def __contains__(self, term): """Returns True if the given term tuple (fieldname, text) is in this reader. """ raise NotImplementedError def codec(self): """Returns the :class:`whoosh.codec.base.Codec` object used to read this reader's segment. If this reader is not atomic (``reader.is_atomic() == True``), returns None. """ return None def segment(self): """Returns the :class:`whoosh.index.Segment` object used by this reader. If this reader is not atomic (``reader.is_atomic() == True``), returns None. """ return None def storage(self): """Returns the :class:`whoosh.filedb.filestore.Storage` object used by this reader to read its files. If the reader is not atomic, (``reader.is_atomic() == True``), returns None. """ return None def is_atomic(self): return True def _text_to_bytes(self, fieldname, text): if fieldname not in self.schema: raise TermNotFound((fieldname, text)) return self.schema[fieldname].to_bytes(text) def close(self): """Closes the open files associated with this reader. """ pass def generation(self): """Returns the generation of the index being read, or -1 if the backend is not versioned. """ return None @abstractmethod def indexed_field_names(self): """Returns an iterable of strings representing the names of the indexed fields. This may include additional names not explicitly listed in the Schema if you use "glob" fields. """ raise NotImplementedError @abstractmethod def all_terms(self): """Yields (fieldname, text) tuples for every term in the index. 
""" raise NotImplementedError def terms_from(self, fieldname, prefix): """Yields (fieldname, text) tuples for every term in the index starting at the given prefix. """ # The default implementation just scans the whole list of terms for fname, text in self.all_terms(): if fname < fieldname or text < prefix: continue yield (fname, text) @abstractmethod def term_info(self, fieldname, text): """Returns a :class:`TermInfo` object allowing access to various statistics about the given term. """ raise NotImplementedError def expand_prefix(self, fieldname, prefix): """Yields terms in the given field that start with the given prefix. """ for fn, text in self.terms_from(fieldname, prefix): if fn != fieldname or not text.startswith(prefix): return yield text def lexicon(self, fieldname): """Yields all bytestrings in the given field. """ for fn, btext in self.terms_from(fieldname, emptybytes): if fn != fieldname: return yield btext def field_terms(self, fieldname): """Yields all term values (converted from on-disk bytes) in the given field. """ from_bytes = self.schema[fieldname].from_bytes for btext in self.lexicon(fieldname): yield from_bytes(btext) def __iter__(self): """Yields ((fieldname, text), terminfo) tuples for each term in the reader, in lexical order. """ term_info = self.term_info for term in self.all_terms(): yield (term, term_info(*term)) def iter_from(self, fieldname, text): """Yields ((fieldname, text), terminfo) tuples for all terms in the reader, starting at the given term. """ term_info = self.term_info text = self._text_to_bytes(fieldname, text) for term in self.terms_from(fieldname, text): yield (term, term_info(*term)) def iter_field(self, fieldname, prefix=''): """Yields (text, terminfo) tuples for all terms in the given field. """ prefix = self._text_to_bytes(fieldname, prefix) for (fn, text), terminfo in self.iter_from(fieldname, prefix): if fn != fieldname: return yield text, terminfo def iter_prefix(self, fieldname, prefix): """Yields (text, terminfo) tuples for all terms in the given field with a certain prefix. """ prefix = self._text_to_bytes(fieldname, prefix) for text, terminfo in self.iter_field(fieldname, prefix): if not text.startswith(prefix): return yield (text, terminfo) @abstractmethod def has_deletions(self): """Returns True if the underlying index/segment has deleted documents. """ raise NotImplementedError def all_doc_ids(self): """Returns an iterator of all (undeleted) document IDs in the reader. """ is_deleted = self.is_deleted return (docnum for docnum in xrange(self.doc_count_all()) if not is_deleted(docnum)) def iter_docs(self): """Yields a series of ``(docnum, stored_fields_dict)`` tuples for the undeleted documents in the reader. """ for docnum in self.all_doc_ids(): yield docnum, self.stored_fields(docnum) @abstractmethod def is_deleted(self, docnum): """Returns True if the given document number is marked deleted. """ raise NotImplementedError @abstractmethod def stored_fields(self, docnum): """Returns the stored fields for the given document number. :param numerickeys: use field numbers as the dictionary keys instead of field names. """ raise NotImplementedError def all_stored_fields(self): """Yields the stored fields for all documents (including deleted documents). """ for docnum in xrange(self.doc_count_all()): yield self.stored_fields(docnum) @abstractmethod def doc_count_all(self): """Returns the total number of documents, DELETED OR UNDELETED, in this reader. 
""" raise NotImplementedError @abstractmethod def doc_count(self): """Returns the total number of UNDELETED documents in this reader. """ return self.doc_count_all() - self.deleted_count() @abstractmethod def frequency(self, fieldname, text): """Returns the total number of instances of the given term in the collection. """ raise NotImplementedError @abstractmethod def doc_frequency(self, fieldname, text): """Returns how many documents the given term appears in. """ raise NotImplementedError @abstractmethod def field_length(self, fieldname): """Returns the total number of terms in the given field. This is used by some scoring algorithms. """ raise NotImplementedError @abstractmethod def min_field_length(self, fieldname): """Returns the minimum length of the field across all documents. This is used by some scoring algorithms. """ raise NotImplementedError @abstractmethod def max_field_length(self, fieldname): """Returns the minimum length of the field across all documents. This is used by some scoring algorithms. """ raise NotImplementedError @abstractmethod def doc_field_length(self, docnum, fieldname, default=0): """Returns the number of terms in the given field in the given document. This is used by some scoring algorithms. """ raise NotImplementedError def first_id(self, fieldname, text): """Returns the first ID in the posting list for the given term. This may be optimized in certain backends. """ text = self._text_to_bytes(fieldname, text) p = self.postings(fieldname, text) if p.is_active(): return p.id() raise TermNotFound((fieldname, text)) def iter_postings(self): """Low-level method, yields all postings in the reader as ``(fieldname, text, docnum, weight, valuestring)`` tuples. """ for fieldname, btext in self.all_terms(): m = self.postings(fieldname, btext) while m.is_active(): yield (fieldname, btext, m.id(), m.weight(), m.value()) m.next() @abstractmethod def postings(self, fieldname, text): """Returns a :class:`~whoosh.matching.Matcher` for the postings of the given term. >>> pr = reader.postings("content", "render") >>> pr.skip_to(10) >>> pr.id 12 :param fieldname: the field name or field number of the term. :param text: the text of the term. :rtype: :class:`whoosh.matching.Matcher` """ raise NotImplementedError @abstractmethod def has_vector(self, docnum, fieldname): """Returns True if the given document has a term vector for the given field. """ raise NotImplementedError @abstractmethod def vector(self, docnum, fieldname, format_=None): """Returns a :class:`~whoosh.matching.Matcher` object for the given term vector. >>> docnum = searcher.document_number(path=u'/a/b/c') >>> v = searcher.vector(docnum, "content") >>> v.all_as("frequency") [(u"apple", 3), (u"bear", 2), (u"cab", 2)] :param docnum: the document number of the document for which you want the term vector. :param fieldname: the field name or field number of the field for which you want the term vector. :rtype: :class:`whoosh.matching.Matcher` """ raise NotImplementedError def vector_as(self, astype, docnum, fieldname): """Returns an iterator of (termtext, value) pairs for the terms in the given term vector. This is a convenient shortcut to calling vector() and using the Matcher object when all you want are the terms and/or values. >>> docnum = searcher.document_number(path=u'/a/b/c') >>> searcher.vector_as("frequency", docnum, "content") [(u"apple", 3), (u"bear", 2), (u"cab", 2)] :param docnum: the document number of the document for which you want the term vector. 
:param fieldname: the field name or field number of the field for which you want the term vector. :param astype: a string containing the name of the format you want the term vector's data in, for example "weights". """ vec = self.vector(docnum, fieldname) if astype == "weight": while vec.is_active(): yield (vec.id(), vec.weight()) vec.next() else: format_ = self.schema[fieldname].format decoder = format_.decoder(astype) while vec.is_active(): yield (vec.id(), decoder(vec.value())) vec.next() def has_word_graph(self, fieldname): """Returns True if the given field has a "word graph" associated with it, allowing suggestions for correcting mis-typed words and fast fuzzy term searching. """ return False def word_graph(self, fieldname): """Returns the root :class:`whoosh.fst.Node` for the given field, if the field has a stored word graph (otherwise raises an exception). You can check whether a field has a word graph using :meth:`IndexReader.has_word_graph`. """ raise KeyError def corrector(self, fieldname): """Returns a :class:`whoosh.spelling.Corrector` object that suggests corrections based on the terms in the given field. """ from whoosh.spelling import ReaderCorrector return ReaderCorrector(self, fieldname) def terms_within(self, fieldname, text, maxdist, prefix=0): """Returns a generator of words in the given field within ``maxdist`` Damerau-Levenshtein edit distance of the given text. Important: the terms are returned in **no particular order**. The only criterion is that they are within ``maxdist`` edits of ``text``. You may want to run this method multiple times with increasing ``maxdist`` values to ensure you get the closest matches first. You may also have additional information (such as term frequency or an acoustic matching algorithm) you can use to rank terms with the same edit distance. :param maxdist: the maximum edit distance. :param prefix: require suggestions to share a prefix of this length with the given word. This is often justifiable since most misspellings do not involve the first letter of the word. Using a prefix dramatically decreases the time it takes to generate the list of words. :param seen: an optional set object. Words that appear in the set will not be yielded. """ fieldobj = self.schema[fieldname] for btext in self.expand_prefix(fieldname, text[:prefix]): word = fieldobj.from_bytes(btext) k = distance(word, text, limit=maxdist) if k <= maxdist: yield word def most_frequent_terms(self, fieldname, number=5, prefix=''): """Returns the top 'number' most frequent terms in the given field as a list of (frequency, text) tuples. """ gen = ((terminfo.weight(), text) for text, terminfo in self.iter_prefix(fieldname, prefix)) return nlargest(number, gen) def most_distinctive_terms(self, fieldname, number=5, prefix=''): """Returns the top 'number' terms with the highest `tf*idf` scores as a list of (score, text) tuples. """ N = float(self.doc_count()) gen = ((terminfo.weight() * log(N / terminfo.doc_frequency()), text) for text, terminfo in self.iter_prefix(fieldname, prefix)) return nlargest(number, gen) def leaf_readers(self): """Returns a list of (IndexReader, docbase) pairs for the child readers of this reader if it is a composite reader. If this is not a composite reader, it returns `[(self, 0)]`. """ return [(self, 0)] def supports_caches(self): return False def has_column(self, fieldname): return False def column_reader(self, fieldname, column=None, reverse=False, translate=False): """ :param fieldname: the name of the field for which to get a reader. 
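# ---------------------------------------------------------------------------
# Editor's note: an illustrative sketch, not part of the original Whoosh
# source, using the frequency/distinctiveness helpers and the spelling
# corrector described above.  The schema and documents are invented; the
# frequency helpers return terms as on-disk bytestrings.
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage

schema = Schema(content=TEXT)
ix = RamStorage().create_index(schema)
w = ix.writer()
w.add_document(content=u"render render texture")
w.add_document(content=u"render shader")
w.commit()

with ix.reader() as r:
    print(r.most_frequent_terms("content", number=3))
    print(r.most_distinctive_terms("content", number=3))
    print(list(r.terms_within("content", u"rendar", maxdist=1)))
    corrector = r.corrector("content")
    print(corrector.suggest(u"rendar", limit=2))
# ---------------------------------------------------------------------------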
:param column: if passed, use this Column object instead of the one associated with the field in the Schema. :param reverse: if passed, reverses the order of keys returned by the reader's ``sort_key()`` method. If the column type is not reversible, this will raise a ``NotImplementedError``. :param translate: if True, wrap the reader to call the field's ``from_bytes()`` method on the returned values. :return: a :class:`whoosh.columns.ColumnReader` object. """ raise NotImplementedError # Segment-based reader class SegmentReader(IndexReader): def __init__(self, storage, schema, segment, generation=None, codec=None): self.schema = schema self.is_closed = False self._segment = segment self._segid = self._segment.segment_id() self._gen = generation # self.files is a storage object from which to load the segment files. # This is different from the general storage (which will be used for # caches) if the segment is in a compound file. if segment.is_compound(): # Open the compound file as a storage object files = segment.open_compound_file(storage) # Use an overlay here instead of just the compound storage, in rare # circumstances a segment file may be added after the segment is # written self._storage = OverlayStorage(files, storage) else: self._storage = storage # Get subreaders from codec self._codec = codec if codec else segment.codec() self._terms = self._codec.terms_reader(self._storage, segment) self._perdoc = self._codec.per_document_reader(self._storage, segment) self._graph = None # Lazy open with self._get_graph() def _get_graph(self): if not self._graph: self._graph = self._codec.graph_reader(self._storage, self._segment) return self._graph def codec(self): return self._codec def segment(self): return self._segment def storage(self): return self._storage def has_deletions(self): if self.is_closed: raise ReaderClosed return self._perdoc.has_deletions() def doc_count(self): if self.is_closed: raise ReaderClosed return self._perdoc.doc_count() def doc_count_all(self): if self.is_closed: raise ReaderClosed return self._perdoc.doc_count_all() def is_deleted(self, docnum): if self.is_closed: raise ReaderClosed return self._perdoc.is_deleted(docnum) def generation(self): return self._gen def __repr__(self): return "%s(%r, %r)" % (self.__class__.__name__, self._storage, self._segment) def __contains__(self, term): if self.is_closed: raise ReaderClosed fieldname, text = term if fieldname not in self.schema: return False text = self._text_to_bytes(fieldname, text) return (fieldname, text) in self._terms def close(self): if self.is_closed: raise ReaderClosed("Reader already closed") self._terms.close() self._perdoc.close() if self._graph: self._graph.close() # It's possible some weird codec that doesn't use storage might have # passed None instead of a storage object if self._storage: self._storage.close() self.is_closed = True def stored_fields(self, docnum): if self.is_closed: raise ReaderClosed assert docnum >= 0 schema = self.schema sfs = self._perdoc.stored_fields(docnum) # Double-check with schema to filter out removed fields return dict(item for item in iteritems(sfs) if item[0] in schema) # Delegate doc methods to the per-doc reader def all_doc_ids(self): if self.is_closed: raise ReaderClosed return self._perdoc.all_doc_ids() def iter_docs(self): if self.is_closed: raise ReaderClosed return self._perdoc.iter_docs() def all_stored_fields(self): if self.is_closed: raise ReaderClosed return self._perdoc.all_stored_fields() def field_length(self, fieldname): if self.is_closed: raise 
ReaderClosed return self._perdoc.field_length(fieldname) def min_field_length(self, fieldname): if self.is_closed: raise ReaderClosed return self._perdoc.min_field_length(fieldname) def max_field_length(self, fieldname): if self.is_closed: raise ReaderClosed return self._perdoc.max_field_length(fieldname) def doc_field_length(self, docnum, fieldname, default=0): if self.is_closed: raise ReaderClosed return self._perdoc.doc_field_length(docnum, fieldname, default) def has_vector(self, docnum, fieldname): if self.is_closed: raise ReaderClosed return self._perdoc.has_vector(docnum, fieldname) # def _test_field(self, fieldname): if self.is_closed: raise ReaderClosed if fieldname not in self.schema: raise TermNotFound("No field %r" % fieldname) if self.schema[fieldname].format is None: raise TermNotFound("Field %r is not indexed" % fieldname) def indexed_field_names(self): return self._terms.indexed_field_names() def all_terms(self): if self.is_closed: raise ReaderClosed schema = self.schema return ((fieldname, text) for fieldname, text in self._terms.terms() if fieldname in schema) def terms_from(self, fieldname, prefix): self._test_field(fieldname) prefix = self._text_to_bytes(fieldname, prefix) schema = self.schema return ((fname, text) for fname, text in self._terms.terms_from(fieldname, prefix) if fname in schema) def term_info(self, fieldname, text): self._test_field(fieldname) text = self._text_to_bytes(fieldname, text) try: return self._terms.term_info(fieldname, text) except KeyError: raise TermNotFound("%s:%r" % (fieldname, text)) def expand_prefix(self, fieldname, prefix): self._test_field(fieldname) prefix = self._text_to_bytes(fieldname, prefix) return IndexReader.expand_prefix(self, fieldname, prefix) def lexicon(self, fieldname): self._test_field(fieldname) return IndexReader.lexicon(self, fieldname) def __iter__(self): if self.is_closed: raise ReaderClosed schema = self.schema return ((term, terminfo) for term, terminfo in self._terms.items() if term[0] in schema) def iter_from(self, fieldname, text): self._test_field(fieldname) schema = self.schema text = self._text_to_bytes(fieldname, text) for term, terminfo in self._terms.items_from(fieldname, text): if term[0] not in schema: continue yield (term, terminfo) def frequency(self, fieldname, text): self._test_field(fieldname) text = self._text_to_bytes(fieldname, text) try: return self._terms.frequency(fieldname, text) except KeyError: return 0 def doc_frequency(self, fieldname, text): self._test_field(fieldname) text = self._text_to_bytes(fieldname, text) try: return self._terms.doc_frequency(fieldname, text) except KeyError: return 0 def postings(self, fieldname, text, scorer=None): from whoosh.matching.wrappers import FilterMatcher if self.is_closed: raise ReaderClosed if fieldname not in self.schema: raise TermNotFound("No field %r" % fieldname) text = self._text_to_bytes(fieldname, text) format_ = self.schema[fieldname].format matcher = self._terms.matcher(fieldname, text, format_, scorer=scorer) deleted = frozenset(self._perdoc.deleted_docs()) if deleted: matcher = FilterMatcher(matcher, deleted, exclude=True) return matcher def vector(self, docnum, fieldname, format_=None): if self.is_closed: raise ReaderClosed if fieldname not in self.schema: raise TermNotFound("No field %r" % fieldname) vformat = format_ or self.schema[fieldname].vector if not vformat: raise Exception("No vectors are stored for field %r" % fieldname) return self._perdoc.vector(docnum, fieldname, vformat) # Graph methods def has_word_graph(self, 
fieldname): if self.is_closed: raise ReaderClosed if fieldname not in self.schema: return False if not self.schema[fieldname].spelling: return False try: gr = self._get_graph() except NoGraphError: return False return gr.has_root(fieldname) def word_graph(self, fieldname): if self.is_closed: raise ReaderClosed if not self.has_word_graph(fieldname): raise KeyError("No word graph for field %r" % fieldname) gr = self._get_graph() return fst.Node(gr, gr.root(fieldname)) def terms_within(self, fieldname, text, maxdist, prefix=0): if self.is_closed: raise ReaderClosed if not self.has_word_graph(fieldname): # This reader doesn't have a graph stored, use the slow method return IndexReader.terms_within(self, fieldname, text, maxdist, prefix=prefix) gr = self._get_graph() return fst.within(gr, text, k=maxdist, prefix=prefix, address=self._graph.root(fieldname)) # Column methods def has_column(self, fieldname): if self.is_closed: raise ReaderClosed coltype = self.schema[fieldname].column_type return coltype and self._perdoc.has_column(fieldname) def column_reader(self, fieldname, column=None, reverse=False, translate=True): if self.is_closed: raise ReaderClosed fieldobj = self.schema[fieldname] column = column or fieldobj.column_type if not column: raise Exception("No column for field %r in %r" % (fieldname, self)) if self._perdoc.has_column(fieldname): creader = self._perdoc.column_reader(fieldname, column) if reverse: creader.set_reverse() else: # This segment doesn't have a column file for this field, so create # a fake column reader that always returns the default value. default = column.default_value(reverse) creader = columns.EmptyColumnReader(default, self.doc_count_all()) if translate: # Wrap the column in a Translator to give the caller # nice values instead of sortable representations fcv = fieldobj.from_column_value creader = columns.TranslatingColumnReader(creader, fcv) return creader # Fake IndexReader class for empty indexes class EmptyReader(IndexReader): def __init__(self, schema): self.schema = schema def __contains__(self, term): return False def __iter__(self): return iter([]) def indexed_field_names(self): return [] def all_terms(self): return iter([]) def term_info(self, fieldname, text): raise TermNotFound((fieldname, text)) def iter_from(self, fieldname, text): return iter([]) def iter_field(self, fieldname, prefix=''): return iter([]) def iter_prefix(self, fieldname, prefix=''): return iter([]) def lexicon(self, fieldname): return iter([]) def has_deletions(self): return False def is_deleted(self, docnum): return False def stored_fields(self, docnum): raise KeyError("No document number %s" % docnum) def all_stored_fields(self): return iter([]) def doc_count_all(self): return 0 def doc_count(self): return 0 def frequency(self, fieldname, text): return 0 def doc_frequency(self, fieldname, text): return 0 def field_length(self, fieldname): return 0 def min_field_length(self, fieldname): return 0 def max_field_length(self, fieldname): return 0 def doc_field_length(self, docnum, fieldname, default=0): return default def postings(self, fieldname, text, scorer=None): raise TermNotFound("%s:%r" % (fieldname, text)) def has_vector(self, docnum, fieldname): return False def vector(self, docnum, fieldname, format_=None): raise KeyError("No document number %s" % docnum) def most_frequent_terms(self, fieldname, number=5, prefix=''): return iter([]) def most_distinctive_terms(self, fieldname, number=5, prefix=None): return iter([]) # Multisegment reader class class MultiReader(IndexReader): 
"""Do not instantiate this object directly. Instead use Index.reader(). """ def __init__(self, readers, generation=None): self.readers = readers self._gen = generation self.schema = None if readers: self.schema = readers[0].schema self.doc_offsets = [] self.base = 0 for r in self.readers: self.doc_offsets.append(self.base) self.base += r.doc_count_all() self.is_closed = False def _document_segment(self, docnum): return max(0, bisect_right(self.doc_offsets, docnum) - 1) def _segment_and_docnum(self, docnum): segmentnum = self._document_segment(docnum) offset = self.doc_offsets[segmentnum] return segmentnum, docnum - offset def is_atomic(self): return False def leaf_readers(self): return zip_(self.readers, self.doc_offsets) def add_reader(self, reader): self.readers.append(reader) self.doc_offsets.append(self.base) self.base += reader.doc_count_all() def close(self): for d in self.readers: d.close() self.is_closed = True def generation(self): return self._gen def format(self, fieldname): for r in self.readers: fmt = r.format(fieldname) if fmt is not None: return fmt def vector_format(self, fieldname): for r in self.readers: vfmt = r.vector_format(fieldname) if vfmt is not None: return vfmt # Term methods def __contains__(self, term): return any(r.__contains__(term) for r in self.readers) def _merge_terms(self, iterlist): # Merge-sorts terms coming from a list of term iterators. # Create a map so we can look up each iterator by its id() value itermap = {} for it in iterlist: itermap[id(it)] = it # Fill in the list with the head term from each iterator. current = [] for it in iterlist: try: term = next(it) except StopIteration: continue current.append((term, id(it))) # Number of active iterators active = len(current) # If only one iterator is active, just yield from it and return if active == 1: term, itid = current[0] it = itermap[itid] yield term for term in it: yield term return # Otherwise, do a streaming heap sort of the terms from the iterators heapify(current) while active: # Peek at the first term in the sorted list term = current[0][0] # Re-iterate on all items in the list that have that term while active and current[0][0] == term: it = itermap[current[0][1]] try: nextterm = next(it) heapreplace(current, (nextterm, id(it))) except StopIteration: heappop(current) active -= 1 # Yield the term yield term def indexed_field_names(self): names = set() for r in self.reader(): names.update(r.indexed_field_names()) return iter(names) def all_terms(self): return self._merge_terms([r.all_terms() for r in self.readers]) def terms_from(self, fieldname, prefix): return self._merge_terms([r.terms_from(fieldname, prefix) for r in self.readers]) def term_info(self, fieldname, text): term = (fieldname, text) # Get the term infos for the sub-readers containing the term tis = [(r.term_info(fieldname, text), offset) for r, offset in zip_(self.readers, self.doc_offsets) if term in r] # If only one reader had the term, return its terminfo with the offset # added if not tis: raise TermNotFound(term) elif len(tis) == 1: ti, offset = tis[0] ti._minid += offset ti._maxid += offset return ti # Combine the various statistics w = sum(ti.weight() for ti, _ in tis) df = sum(ti.doc_frequency() for ti, _ in tis) ml = min(ti.min_length() for ti, _ in tis) xl = max(ti.max_length() for ti, _ in tis) xw = max(ti.max_weight() for ti, _ in tis) # For min and max ID, we need to add the doc offsets mid = min(ti.min_id() + offset for ti, offset in tis) xid = max(ti.max_id() + offset for ti, offset in tis) return TermInfo(w, 
df, ml, xl, xw, mid, xid) def frequency(self, fieldname, text): return sum(r.frequency(fieldname, text) for r in self.readers) def doc_frequency(self, fieldname, text): return sum(r.doc_frequency(fieldname, text) for r in self.readers) def postings(self, fieldname, text): # This method does not add a scorer; for that, use Searcher.postings() postreaders = [] docoffsets = [] term = (fieldname, text) for i, r in enumerate(self.readers): if term in r: offset = self.doc_offsets[i] pr = r.postings(fieldname, text) postreaders.append(pr) docoffsets.append(offset) if not postreaders: raise TermNotFound(fieldname, text) return MultiMatcher(postreaders, docoffsets) def first_id(self, fieldname, text): for i, r in enumerate(self.readers): try: id = r.first_id(fieldname, text) except (KeyError, TermNotFound): pass else: if id is None: raise TermNotFound((fieldname, text)) else: return self.doc_offsets[i] + id raise TermNotFound((fieldname, text)) # Deletion methods def has_deletions(self): return any(r.has_deletions() for r in self.readers) def is_deleted(self, docnum): segmentnum, segmentdoc = self._segment_and_docnum(docnum) return self.readers[segmentnum].is_deleted(segmentdoc) def stored_fields(self, docnum): segmentnum, segmentdoc = self._segment_and_docnum(docnum) return self.readers[segmentnum].stored_fields(segmentdoc) # Per doc methods def all_stored_fields(self): for reader in self.readers: for result in reader.all_stored_fields(): yield result def doc_count_all(self): return sum(dr.doc_count_all() for dr in self.readers) def doc_count(self): return sum(dr.doc_count() for dr in self.readers) def field_length(self, fieldname): return sum(dr.field_length(fieldname) for dr in self.readers) def min_field_length(self, fieldname): return min(r.min_field_length(fieldname) for r in self.readers) def max_field_length(self, fieldname): return max(r.max_field_length(fieldname) for r in self.readers) def doc_field_length(self, docnum, fieldname, default=0): segmentnum, segmentdoc = self._segment_and_docnum(docnum) reader = self.readers[segmentnum] return reader.doc_field_length(segmentdoc, fieldname, default=default) def has_vector(self, docnum, fieldname): segmentnum, segmentdoc = self._segment_and_docnum(docnum) return self.readers[segmentnum].has_vector(segmentdoc, fieldname) def vector(self, docnum, fieldname, format_=None): segmentnum, segmentdoc = self._segment_and_docnum(docnum) return self.readers[segmentnum].vector(segmentdoc, fieldname) def vector_as(self, astype, docnum, fieldname): segmentnum, segmentdoc = self._segment_and_docnum(docnum) return self.readers[segmentnum].vector_as(astype, segmentdoc, fieldname) # Graph methods def has_word_graph(self, fieldname): return any(r.has_word_graph(fieldname) for r in self.readers) def word_graph(self, fieldname): from whoosh.automata.fst import UnionNode from whoosh.util import make_binary_tree if not self.has_word_graph(fieldname): raise Exception("No word graph for field %r" % fieldname) graphs = [r.word_graph(fieldname) for r in self.readers if r.has_word_graph(fieldname)] if len(graphs) == 0: raise KeyError("No readers have graph for %r" % fieldname) if len(graphs) == 1: return graphs[0] return make_binary_tree(UnionNode, graphs) def terms_within(self, fieldname, text, maxdist, prefix=0): tset = set() for r in self.readers: tset.update(r.terms_within(fieldname, text, maxdist, prefix=prefix)) return tset # Column methods def has_column(self, fieldname): return any(r.has_column(fieldname) for r in self.readers) def column_reader(self, fieldname, 
column=None, reverse=False, translate=True): column = column or self.schema[fieldname].column_type if not column: raise Exception("Field %r has no column type" % (fieldname,)) creaders = [] for r in self.readers: cr = r.column_reader(fieldname, column=column, reverse=reverse, translate=translate) creaders.append(cr) return columns.MultiColumnReader(creaders) Whoosh-2.5.7/src/whoosh/scoring.py0000644000076500000240000005071412254366350017215 0ustar mattstaff00000000000000# Copyright 2008 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module contains classes for scoring (and sorting) search results. """ from __future__ import division from math import log, pi from whoosh.compat import iteritems # Base classes class WeightingModel(object): """Abstract base class for scoring models. A WeightingModel object provides a method, ``scorer``, which returns an instance of :class:`whoosh.scoring.Scorer`. Basically, WeightingModel objects store the configuration information for the model (for example, the values of B and K1 in the BM25F model), and then creates a scorer instance based on additional run-time information (the searcher, the fieldname, and term text) to do the actual scoring. """ use_final = False def idf(self, searcher, fieldname, text): """Returns the inverse document frequency of the given term. """ parent = searcher.get_parent() n = parent.doc_frequency(fieldname, text) dc = parent.doc_count_all() return log(dc / (n + 1)) + 1 def scorer(self, searcher, fieldname, text, qf=1): """Returns an instance of :class:`whoosh.scoring.Scorer` configured for the given searcher, fieldname, and term text. """ raise NotImplementedError(self.__class__.__name__) def final(self, searcher, docnum, score): """Returns a final score for each document. You can use this method in subclasses to apply document-level adjustments to the score, for example using the value of stored field to influence the score (although that would be slow). WeightingModel sub-classes that use ``final()`` should have the attribute ``use_final`` set to ``True``. 
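# ---------------------------------------------------------------------------
# Editor's note: an illustrative sketch, not part of the original Whoosh
# source.  It mirrors the idf() computation defined above on WeightingModel
# (log(N / (n + 1)) + 1, using collection-wide statistics from the top-level
# searcher) as a standalone calculation so the numbers are easy to check.
from math import log

def idf_sketch(doc_count_all, doc_frequency):
    # Same shape as WeightingModel.idf: smoothed inverse document frequency
    return log(doc_count_all / (doc_frequency + 1)) + 1

# A term appearing in 9 of 1000 documents scores higher than one in 99 of 1000
print(idf_sketch(1000.0, 9))   # ~5.61
print(idf_sketch(1000.0, 99))  # ~3.30
# ---------------------------------------------------------------------------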
:param searcher: :class:`whoosh.searching.Searcher` for the index. :param docnum: the doc number of the document being scored. :param score: the document's accumulated term score. :rtype: float """ return score class BaseScorer(object): """Base class for "scorer" implementations. A scorer provides a method for scoring a document, and sometimes methods for rating the "quality" of a document and a matcher's current "block", to implement quality-based optimizations. Scorer objects are created by WeightingModel objects. Basically, WeightingModel objects store the configuration information for the model (for example, the values of B and K1 in the BM25F model), and then creates a scorer instance. """ def supports_block_quality(self): """Returns True if this class supports quality optimizations. """ return False def score(self, matcher): """Returns a score for the current document of the matcher. """ raise NotImplementedError(self.__class__.__name__) def max_quality(self): """Returns the *maximum limit* on the possible score the matcher can give. This can be an estimate and not necessarily the actual maximum score possible, but it must never be less than the actual maximum score. """ raise NotImplementedError(self.__class__.__name__) def block_quality(self, matcher): """Returns the *maximum limit* on the possible score the matcher can give **in its current "block"** (whatever concept of "block" the backend might use). This can be an estimate and not necessarily the actual maximum score possible, but it must never be less than the actual maximum score. If this score is less than the minimum score required to make the "top N" results, then we can tell the matcher to skip ahead to another block with better "quality". """ raise NotImplementedError(self.__class__.__name__) # Scorer that just returns term weight class WeightScorer(BaseScorer): """A scorer that simply returns the weight as the score. This is useful for more complex weighting models to return when they are asked for a scorer for fields that aren't scorable (don't store field lengths). """ def __init__(self, maxweight): self._maxweight = maxweight def supports_block_quality(self): return True def score(self, matcher): return matcher.weight() def max_quality(self): return self._maxweight def block_quality(self, matcher): return matcher.block_max_weight() @classmethod def for_(cls, searcher, fieldname, text): ti = searcher.term_info(fieldname, text) return cls(ti.max_weight()) # Base scorer for models that only use weight and field length class WeightLengthScorer(BaseScorer): """Base class for scorers where the only per-document variables are term weight and field length. Subclasses should override the ``_score(weight, length)`` method to return the score for a document with the given weight and length, and call the ``setup()`` method at the end of the initializer to set up common attributes. """ def setup(self, searcher, fieldname, text): """Initializes the scorer and then does the busy work of adding the ``dfl()`` function and maximum quality attribute. This method assumes the initializers of WeightLengthScorer subclasses always take ``searcher, offset, fieldname, text`` as the first three arguments. Any additional arguments given to this method are passed through to the initializer. 
Note: this method calls ``self._score()``, so you should only call it in the initializer after setting up whatever attributes ``_score()`` depends on:: class MyScorer(WeightLengthScorer): def __init__(self, searcher, fieldname, text, parm=1.0): self.parm = parm self.setup(searcher, fieldname, text) def _score(self, weight, length): return (weight / (length + 1)) * self.parm """ ti = searcher.term_info(fieldname, text) if not searcher.schema[fieldname].scorable: return WeightScorer(ti.max_weight()) self.dfl = lambda docid: searcher.doc_field_length(docid, fieldname, 1) self._maxquality = self._score(ti.max_weight(), ti.min_length()) def supports_block_quality(self): return True def score(self, matcher): return self._score(matcher.weight(), self.dfl(matcher.id())) def max_quality(self): return self._maxquality def block_quality(self, matcher): return self._score(matcher.block_max_weight(), matcher.block_min_length()) def _score(self, weight, length): # Override this method with the actual scoring function raise NotImplementedError(self.__class__.__name__) # WeightingModel implementations # Debugging model class DebugModel(WeightingModel): def __init__(self): self.log = [] def scorer(self, searcher, fieldname, text, qf=1): return DebugScorer(searcher, fieldname, text, self.log) class DebugScorer(BaseScorer): def __init__(self, searcher, fieldname, text, log): ti = searcher.term_info(fieldname, text) self._maxweight = ti.max_weight() self.searcher = searcher self.fieldname = fieldname self.text = text self.log = log def supports_block_quality(self): return True def score(self, matcher): fieldname, text = self.fieldname, self.text docid = matcher.id() w = matcher.weight() length = self.searcher.doc_field_length(docid, fieldname) self.log.append((fieldname, text, docid, w, length)) return w def max_quality(self): return self._maxweight def block_quality(self, matcher): return matcher.block_max_weight() # BM25F Model def bm25(idf, tf, fl, avgfl, B, K1): # idf - inverse document frequency # tf - term frequency in the current document # fl - field length in the current document # avgfl - average field length across documents in collection # B, K1 - free paramters return idf * ((tf * (K1 + 1)) / (tf + K1 * ((1 - B) + B * fl / avgfl))) class BM25F(WeightingModel): """Implements the BM25F scoring algorithm. """ def __init__(self, B=0.75, K1=1.2, **kwargs): """ >>> from whoosh import scoring >>> # Set a custom B value for the "content" field >>> w = scoring.BM25F(B=0.75, content_B=1.0, K1=1.5) :param B: free parameter, see the BM25 literature. Keyword arguments of the form ``fieldname_B`` (for example, ``body_B``) set field- specific values for B. :param K1: free parameter, see the BM25 literature. 
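# ---------------------------------------------------------------------------
# Editor's note: an illustrative sketch, not part of the original Whoosh
# source.  It calls the module-level bm25() function defined above with
# hand-picked numbers to show how term frequency saturates (controlled by K1)
# and how field length is normalized against the average (controlled by B).
# The idf and length values are arbitrary examples.
from whoosh.scoring import bm25

idf = 2.0
avgfl = 100.0  # average field length across the collection (example value)
B, K1 = 0.75, 1.2

# Doubling the term frequency raises the score, but less than linearly
print(bm25(idf, tf=1, fl=100.0, avgfl=avgfl, B=B, K1=K1))
print(bm25(idf, tf=2, fl=100.0, avgfl=avgfl, B=B, K1=K1))

# A longer-than-average field is penalized for the same term frequency
print(bm25(idf, tf=2, fl=300.0, avgfl=avgfl, B=B, K1=K1))
# ---------------------------------------------------------------------------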
""" self.B = B self.K1 = K1 self._field_B = {} for k, v in iteritems(kwargs): if k.endswith("_B"): fieldname = k[:-2] self._field_B[fieldname] = v def supports_block_quality(self): return True def scorer(self, searcher, fieldname, text, qf=1): if not searcher.schema[fieldname].scorable: return WeightScorer.for_(searcher, fieldname, text) if fieldname in self._field_B: B = self._field_B[fieldname] else: B = self.B return BM25FScorer(searcher, fieldname, text, B, self.K1, qf=qf) class BM25FScorer(WeightLengthScorer): def __init__(self, searcher, fieldname, text, B, K1, qf=1): # IDF and average field length are global statistics, so get them from # the top-level searcher parent = searcher.get_parent() # Returns self if no parent self.idf = parent.idf(fieldname, text) self.avgfl = parent.avg_field_length(fieldname) or 1 self.B = B self.K1 = K1 self.qf = qf self.setup(searcher, fieldname, text) def _score(self, weight, length): s = bm25(self.idf, weight, length, self.avgfl, self.B, self.K1) return s # DFree model def dfree(tf, cf, qf, dl, fl): # tf - term frequency in current document # cf - term frequency in collection # qf - term frequency in query # dl - field length in current document # fl - total field length across all documents in collection prior = tf / dl post = (tf + 1.0) / (dl + 1.0) invpriorcol = fl / cf norm = tf * log(post / prior) return qf * norm * (tf * (log(prior * invpriorcol)) + (tf + 1.0) * (log(post * invpriorcol)) + 0.5 * log(post / prior)) class DFree(WeightingModel): """Implements the DFree scoring model from Terrier. See http://terrier.org/ """ def supports_block_quality(self): return True def scorer(self, searcher, fieldname, text, qf=1): if not searcher.schema[fieldname].scorable: return WeightScorer.for_(searcher, fieldname, text) return DFreeScorer(searcher, fieldname, text, qf=qf) class DFreeScorer(WeightLengthScorer): def __init__(self, searcher, fieldname, text, qf=1): # Total term weight and total field length are global statistics, so # get them from the top-level searcher parent = searcher.get_parent() # Returns self if no parent self.cf = parent.weight(fieldname, text) self.fl = parent.field_length(fieldname) self.qf = qf self.setup(searcher, fieldname, text) def _score(self, weight, length): return dfree(weight, self.cf, self.qf, length, self.fl) # PL2 model rec_log2_of_e = 1.0 / log(2) def pl2(tf, cf, qf, dc, fl, avgfl, c): # tf - term frequency in the current document # cf - term frequency in the collection # qf - term frequency in the query # dc - doc count # fl - field length in the current document # avgfl - average field length across all documents # c -free parameter TF = tf * log(1.0 + (c * avgfl) / fl) norm = 1.0 / (TF + 1.0) f = cf / dc return norm * qf * (TF * log(1.0 / f) + f * rec_log2_of_e + 0.5 * log(2 * pi * TF) + TF * (log(TF) - rec_log2_of_e)) class PL2(WeightingModel): """Implements the PL2 scoring model from Terrier. 
See http://terrier.org/ """ def __init__(self, c=1.0): self.c = c def scorer(self, searcher, fieldname, text, qf=1): if not searcher.schema[fieldname].scorable: return WeightScorer.for_(searcher, fieldname, text) return PL2Scorer(searcher, fieldname, text, self.c, qf=qf) class PL2Scorer(WeightLengthScorer): def __init__(self, searcher, fieldname, text, c, qf=1): # Total term weight, document count, and average field length are # global statistics, so get them from the top-level searcher parent = searcher.get_parent() # Returns self if no parent self.cf = parent.frequency(fieldname, text) self.dc = parent.doc_count_all() self.avgfl = parent.avg_field_length(fieldname) or 1 self.c = c self.qf = qf self.setup(searcher, fieldname, text) def _score(self, weight, length): return pl2(weight, self.cf, self.qf, self.dc, length, self.avgfl, self.c) # Simple models class Frequency(WeightingModel): def scorer(self, searcher, fieldname, text, qf=1): maxweight = searcher.term_info(fieldname, text).max_weight() return WeightScorer(maxweight) class TF_IDF(WeightingModel): def scorer(self, searcher, fieldname, text, qf=1): # IDF is a global statistic, so get it from the top-level searcher parent = searcher.get_parent() # Returns self if no parent idf = parent.idf(fieldname, text) maxweight = searcher.term_info(fieldname, text).max_weight() return TF_IDFScorer(maxweight, idf) class TF_IDFScorer(BaseScorer): def __init__(self, maxweight, idf): self._maxquality = maxweight * idf self.idf = idf def supports_block_quality(self): return True def score(self, matcher): return matcher.weight() * self.idf def max_quality(self): return self._maxquality def block_quality(self, matcher): return matcher.block_max_weight() * self.idf # Utility models class Weighting(WeightingModel): """This class provides backwards-compatibility with the old weighting class architecture, so any existing custom scorers don't need to be rewritten. """ def scorer(self, searcher, fieldname, text, qf=1): return self.CompatibilityScorer(searcher, fieldname, text, self.score) def score(self, searcher, fieldname, text, docnum, weight): raise NotImplementedError class CompatibilityScorer(BaseScorer): def __init__(self, searcher, fieldname, text, scoremethod): self.searcher = searcher self.fieldname = fieldname self.text = text self.scoremethod = scoremethod def score(self, matcher): return self.scoremethod(self.searcher, self.fieldname, self.text, matcher.id(), matcher.weight()) class FunctionWeighting(WeightingModel): """Uses a supplied function to do the scoring. For simple scoring functions and experiments this may be simpler to use than writing a full weighting model class and scorer class. The function should accept the arguments ``searcher, fieldname, text, matcher``. For example, the following function will score documents based on the earliest position of the query term in the document:: def pos_score_fn(searcher, fieldname, text, matcher): poses = matcher.value_as("positions") return 1.0 / (poses[0] + 1) pos_weighting = scoring.FunctionWeighting(pos_score_fn) with myindex.searcher(weighting=pos_weighting) as s: results = s.search(q) Note that the searcher passed to the function may be a per-segment searcher for performance reasons. If you want to get global statistics inside the function, you should use ``searcher.get_parent()`` to get the top-level searcher. (However, if you are using global statistics, you should probably write a real model/scorer combo so you can cache them on the object.) 
""" def __init__(self, fn): self.fn = fn def scorer(self, searcher, fieldname, text, qf=1): return self.FunctionScorer(self.fn, searcher, fieldname, text, qf=qf) class FunctionScorer(BaseScorer): def __init__(self, fn, searcher, fieldname, text, qf=1): self.fn = fn self.searcher = searcher self.fieldname = fieldname self.text = text self.qf = qf def score(self, matcher): return self.fn(self.searcher, self.fieldname, self.text, matcher) class MultiWeighting(WeightingModel): """Chooses from multiple scoring algorithms based on the field. """ def __init__(self, default, **weightings): """The only non-keyword argument specifies the default :class:`Weighting` instance to use. Keyword arguments specify Weighting instances for specific fields. For example, to use ``BM25`` for most fields, but ``Frequency`` for the ``id`` field and ``TF_IDF`` for the ``keys`` field:: mw = MultiWeighting(BM25(), id=Frequency(), keys=TF_IDF()) :param default: the Weighting instance to use for fields not specified in the keyword arguments. """ self.default = default # Store weighting functions by field name self.weightings = weightings def scorer(self, searcher, fieldname, text, qf=1): w = self.weightings.get(fieldname, self.default) return w.scorer(searcher, fieldname, text, qf=qf) class ReverseWeighting(WeightingModel): """Wraps a weighting object and subtracts the wrapped model's scores from 0, essentially reversing the weighting model. """ def __init__(self, weighting): self.weighting = weighting def scorer(self, searcher, fieldname, text, qf=1): subscorer = self.weighting.scorer(searcher, fieldname, text, qf=qf) return ReverseWeighting.ReverseScorer(subscorer) class ReverseScorer(BaseScorer): def __init__(self, subscorer): self.subscorer = subscorer def supports_block_quality(self): return self.subscorer.supports_block_quality() def score(self, matcher): return 0 - self.subscorer.score(matcher) def max_quality(self): return 0 - self.subscorer.max_quality() def block_quality(self, matcher): return 0 - self.subscorer.block_quality(matcher) #class PositionWeighting(WeightingModel): # def __init__(self, reversed=False): # self.reversed = reversed # # def scorer(self, searcher, fieldname, text, qf=1): # return PositionWeighting.PositionScorer() # # class PositionScorer(BaseScorer): # def score(self, matcher): # p = min(span.pos for span in matcher.spans()) # if self.reversed: # return p # else: # return 0 - p Whoosh-2.5.7/src/whoosh/searching.py0000644000076500000240000017525612254366764017536 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """This module contains classes and functions related to searching the index. """ from __future__ import division import copy import weakref from math import ceil from whoosh import classify, highlight, query, scoring from whoosh.compat import iteritems, itervalues, iterkeys, xrange from whoosh.idsets import DocIdSet, BitSet from whoosh.reading import TermNotFound from whoosh.util.cache import lru_cache class NoTermsException(Exception): """Exception raised you try to access matched terms on a :class:`Results` object was created without them. To record which terms matched in which document, you need to call the :meth:`Searcher.search` method with ``terms=True``. """ message = "Results were created without recording terms" class TimeLimit(Exception): """Raised by :class:`TimeLimitedCollector` if the time limit is reached before the search finishes. If you have a reference to the collector, you can get partial results by calling :meth:`TimeLimitedCollector.results`. """ pass # Context class class SearchContext(object): """A container for information about the current search that may be used by the collector or the query objects to change how they operate. """ def __init__(self, needs_current=False, weighting=None, top_query=None, limit=0): """ :param needs_current: if True, the search requires that the matcher tree be "valid" and able to access information about the current match. For queries during matcher instantiation, this means they should not instantiate a matcher that doesn't allow access to the current match's value, weight, and so on. For collectors, this means they should advanced the matcher doc-by-doc rather than using shortcut methods such as all_ids(). :param weighting: the Weighting object to use for scoring documents. :param top_query: a reference to the top-level query object. :param limit: the number of results requested by the user. """ self.needs_current = needs_current self.weighting = weighting self.top_query = top_query self.limit = limit def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self.__dict__) def set(self, **kwargs): ctx = copy.copy(self) ctx.__dict__.update(kwargs) return ctx # Searcher class class Searcher(object): """Wraps an :class:`~whoosh.reading.IndexReader` object and provides methods for searching the index. """ def __init__(self, reader, weighting=scoring.BM25F, closereader=True, fromindex=None, parent=None): """ :param reader: An :class:`~whoosh.reading.IndexReader` object for the index to search. :param weighting: A :class:`whoosh.scoring.Weighting` object to use to score found documents. :param closereader: Whether the underlying reader will be closed when the searcher is closed. :param fromindex: An optional reference to the index of the underlying reader. 
This is required for :meth:`Searcher.up_to_date` and :meth:`Searcher.refresh` to work. """ self.ixreader = reader self.is_closed = False self._closereader = closereader self._ix = fromindex self._doccount = self.ixreader.doc_count_all() # Cache for PostingCategorizer objects (supports fields without columns) self._field_caches = {} if parent: self.parent = weakref.ref(parent) self.schema = parent.schema self._idf_cache = parent._idf_cache self._filter_cache = parent._filter_cache else: self.parent = None self.schema = self.ixreader.schema self._idf_cache = {} self._filter_cache = {} if type(weighting) is type: self.weighting = weighting() else: self.weighting = weighting self.leafreaders = None self.subsearchers = None if not self.ixreader.is_atomic(): self.leafreaders = self.ixreader.leaf_readers() self.subsearchers = [(self._subsearcher(r), offset) for r, offset in self.leafreaders] # Copy attributes/methods from wrapped reader for name in ("stored_fields", "all_stored_fields", "has_vector", "vector", "vector_as", "lexicon", "field_terms", "frequency", "doc_frequency", "term_info", "doc_field_length", "corrector", "iter_docs"): setattr(self, name, getattr(self.ixreader, name)) def __enter__(self): return self def __exit__(self, *exc_info): self.close() def _subsearcher(self, reader): return self.__class__(reader, fromindex=self._ix, weighting=self.weighting, parent=self) def _offset_for_subsearcher(self, subsearcher): for ss, offset in self.subsearchers: if ss is subsearcher: return offset def leaf_searchers(self): if self.is_atomic(): return [(self, 0)] else: return self.subsearchers def is_atomic(self): return self.reader().is_atomic() def has_parent(self): return self.parent is not None def get_parent(self): """Returns the parent of this searcher (if has_parent() is True), or else self. """ if self.has_parent(): # Call the weak reference to get the parent searcher return self.parent() else: return self def doc_count(self): """Returns the number of UNDELETED documents in the index. """ return self.ixreader.doc_count() def doc_count_all(self): """Returns the total number of documents, DELETED OR UNDELETED, in the index. """ return self._doccount def field_length(self, fieldname): if self.has_parent(): return self.get_parent().field_length(fieldname) else: return self.reader().field_length(fieldname) def max_field_length(self, fieldname): if self.has_parent(): return self.get_parent().max_field_length(fieldname) else: return self.reader().max_field_length(fieldname) def up_to_date(self): """Returns True if this Searcher represents the latest version of the index, for backends that support versioning. """ if not self._ix: raise Exception("No reference to index") return self._ix.latest_generation() == self.ixreader.generation() def refresh(self): """Returns a fresh searcher for the latest version of the index:: my_searcher = my_searcher.refresh() If the index has not changed since this searcher was created, this searcher is simply returned. This method may CLOSE underlying resources that are no longer needed by the refreshed searcher, so you CANNOT continue to use the original searcher after calling ``refresh()`` on it. 
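A typical pattern for a long-lived searcher (a sketch; the
        ``up_to_date()`` check is optional, since ``refresh()`` simply
        returns the same searcher when the index is unchanged)::

            if not my_searcher.up_to_date():
                my_searcher = my_searcher.refresh()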
""" if not self._ix: raise Exception("No reference to index") if self._ix.latest_generation() == self.reader().generation(): return self # Get a new reader, re-using resources from the current reader if # possible self.is_closed = True newreader = self._ix.reader(reuse=self.ixreader) return self.__class__(newreader, fromindex=self._ix, weighting=self.weighting) def close(self): if self._closereader: self.ixreader.close() self.is_closed = True def avg_field_length(self, fieldname, default=None): if not self.schema[fieldname].scorable: return default return self.field_length(fieldname) / (self._doccount or 1) def reader(self): """Returns the underlying :class:`~whoosh.reading.IndexReader`. """ return self.ixreader def context(self, **kwargs): """Generates a :class:`SearchContext` for this searcher. """ if "weighting" not in kwargs: kwargs["weighting"] = self.weighting return SearchContext(**kwargs) def boolean_context(self): """Shortcut returns a SearchContext set for unscored (boolean) searching. """ return self.context(needs_current=False, weighting=None) def postings(self, fieldname, text, weighting=None, qf=1): """Returns a :class:`whoosh.matching.Matcher` for the postings of the given term. Unlike the :func:`whoosh.reading.IndexReader.postings` method, this method automatically sets the scoring functions on the matcher from the searcher's weighting object. """ weighting = weighting or self.weighting globalscorer = weighting.scorer(self, fieldname, text, qf=qf) if self.is_atomic(): return self.ixreader.postings(fieldname, text, scorer=globalscorer) else: from whoosh.matching import MultiMatcher matchers = [] docoffsets = [] term = (fieldname, text) for subsearcher, offset in self.subsearchers: r = subsearcher.reader() if term in r: # Make a segment-specific scorer; the scorer should call # searcher.parent() to get global stats scorer = weighting.scorer(subsearcher, fieldname, text, qf=qf) m = r.postings(fieldname, text, scorer=scorer) matchers.append(m) docoffsets.append(offset) if not matchers: raise TermNotFound(fieldname, text) return MultiMatcher(matchers, docoffsets, globalscorer) def idf(self, fieldname, text): """Calculates the Inverse Document Frequency of the current term (calls idf() on the searcher's Weighting object). """ # This method just calls the Weighting object's idf() method, but # caches the result. So Weighting objects should call *this* method # which will then call *their own* idf() methods. cache = self._idf_cache term = (fieldname, text) if term in cache: return cache[term] idf = self.weighting.idf(self, fieldname, text) cache[term] = idf return idf def document(self, **kw): """Convenience method returns the stored fields of a document matching the given keyword arguments, where the keyword keys are field names and the values are terms that must appear in the field. This method is equivalent to:: searcher.stored_fields(searcher.document_number()) Where Searcher.documents() returns a generator, this function returns either a dictionary or None. Use it when you assume the given keyword arguments either match zero or one documents (i.e. at least one of the fields is a unique key). >>> stored_fields = searcher.document(path=u"/a/b") >>> if stored_fields: ... print(stored_fields['title']) ... else: ... 
print("There is no document with the path /a/b") """ for p in self.documents(**kw): return p def documents(self, **kw): """Convenience method returns the stored fields of a document matching the given keyword arguments, where the keyword keys are field names and the values are terms that must appear in the field. Returns a generator of dictionaries containing the stored fields of any documents matching the keyword arguments. If you do not specify any arguments (``Searcher.documents()``), this method will yield **all** documents. >>> for stored_fields in searcher.documents(emailto=u"matt@whoosh.ca"): ... print("Email subject:", stored_fields['subject']) """ ixreader = self.ixreader return (ixreader.stored_fields(docnum) for docnum in self.document_numbers(**kw)) def _kw_to_text(self, kw): for k, v in iteritems(kw): field = self.schema[k] kw[k] = field.to_bytes(v) def _query_for_kw(self, kw): subqueries = [] for key, value in iteritems(kw): subqueries.append(query.Term(key, value)) if subqueries: q = query.And(subqueries).normalize() else: q = query.Every() return q def document_number(self, **kw): """Returns the document number of the document matching the given keyword arguments, where the keyword keys are field names and the values are terms that must appear in the field. >>> docnum = searcher.document_number(path=u"/a/b") Where Searcher.document_numbers() returns a generator, this function returns either an int or None. Use it when you assume the given keyword arguments either match zero or one documents (i.e. at least one of the fields is a unique key). :rtype: int """ # In the common case where only one keyword was given, just use # first_id() instead of building a query. self._kw_to_text(kw) if len(kw) == 1: k, v = list(kw.items())[0] try: return self.reader().first_id(k, v) except TermNotFound: return None else: m = self._query_for_kw(kw).matcher(self, self.boolean_context()) if m.is_active(): return m.id() def document_numbers(self, **kw): """Returns a generator of the document numbers for documents matching the given keyword arguments, where the keyword keys are field names and the values are terms that must appear in the field. If you do not specify any arguments (``Searcher.document_numbers()``), this method will yield **all** document numbers. >>> docnums = list(searcher.document_numbers(emailto="matt@whoosh.ca")) """ self._kw_to_text(kw) return self.docs_for_query(self._query_for_kw(kw)) def _find_unique(self, uniques): # uniques is a list of ("unique_field_name", "field_value") tuples delset = set() for name, value in uniques: docnum = self.document_number(**{name: value}) if docnum is not None: delset.add(docnum) return delset @lru_cache(20) def _query_to_comb(self, fq): return BitSet(self.docs_for_query(fq), size=self.doc_count_all()) def _filter_to_comb(self, obj): if obj is None: return None if isinstance(obj, (set, DocIdSet)): c = obj elif isinstance(obj, Results): c = obj.docs() elif isinstance(obj, ResultsPage): c = obj.results.docs() elif isinstance(obj, query.Query): c = self._query_to_comb(obj) else: raise Exception("Don't know what to do with filter object %r" % obj) return c def suggest(self, fieldname, text, limit=5, maxdist=2, prefix=0): """Returns a sorted list of suggested corrections for the given mis-typed word ``text`` based on the contents of the given field:: >>> searcher.suggest("content", "specail") ["special"] This is a convenience method. 
If you are planning to get suggestions for multiple words in the same field, it is more efficient to get a :class:`~whoosh.spelling.Corrector` object and use it directly:: corrector = searcher.corrector("fieldname") for word in words: print(corrector.suggest(word)) :param limit: only return up to this many suggestions. If there are not enough terms in the field within ``maxdist`` of the given word, the returned list will be shorter than this number. :param maxdist: the largest edit distance from the given word to look at. Numbers higher than 2 are not very effective or efficient. :param prefix: require suggestions to share a prefix of this length with the given word. This is often justifiable since most misspellings do not involve the first letter of the word. Using a prefix dramatically decreases the time it takes to generate the list of words. """ c = self.reader().corrector(fieldname) return c.suggest(text, limit=limit, maxdist=maxdist, prefix=prefix) def key_terms(self, docnums, fieldname, numterms=5, model=classify.Bo1Model, normalize=True): """Returns the 'numterms' most important terms from the documents listed (by number) in 'docnums'. You can get document numbers for the documents your interested in with the document_number() and document_numbers() methods. "Most important" is generally defined as terms that occur frequently in the top hits but relatively infrequently in the collection as a whole. >>> docnum = searcher.document_number(path=u"/a/b") >>> keywords_and_scores = searcher.key_terms([docnum], "content") This method returns a list of ("term", score) tuples. The score may be useful if you want to know the "strength" of the key terms, however to just get the terms themselves you can just do this: >>> kws = [kw for kw, score in searcher.key_terms([docnum], "content")] :param fieldname: Look at the terms in this field. This field must store vectors. :param docnums: A sequence of document numbers specifying which documents to extract key terms from. :param numterms: Return this number of important terms. :param model: The classify.ExpansionModel to use. See the classify module. :param normalize: normalize the scores. :returns: a list of ("term", score) tuples. """ expander = classify.Expander(self.ixreader, fieldname, model=model) for docnum in docnums: expander.add_document(docnum) return expander.expanded_terms(numterms, normalize=normalize) def key_terms_from_text(self, fieldname, text, numterms=5, model=classify.Bo1Model, normalize=True): """Return the 'numterms' most important terms from the given text. :param numterms: Return this number of important terms. :param model: The classify.ExpansionModel to use. See the classify module. """ expander = classify.Expander(self.ixreader, fieldname, model=model) expander.add_text(text) return expander.expanded_terms(numterms, normalize=normalize) def more_like(self, docnum, fieldname, text=None, top=10, numterms=5, model=classify.Bo1Model, normalize=False, filter=None): """Returns a :class:`Results` object containing documents similar to the given document, based on "key terms" in the given field:: # Get the ID for the document you're interested in docnum = search.document_number(path=u"/a/b/c") r = searcher.more_like(docnum) print("Documents like", searcher.stored_fields(docnum)["title"]) for hit in r: print(hit["title"]) :param fieldname: the name of the field to use to test similarity. 
:param text: by default, the method will attempt to load the contents of the field from the stored fields for the document, or from a term vector. If the field isn't stored or vectored in the index, but you have access to the text another way (for example, loading from a file or a database), you can supply it using the ``text`` parameter. :param top: the number of results to return. :param numterms: the number of "key terms" to extract from the hit and search for. Using more terms is slower but gives potentially more and more accurate results. :param model: (expert) a :class:`whoosh.classify.ExpansionModel` to use to compute "key terms". :param normalize: whether to normalize term weights. :param filter: a query, Results object, or set of docnums. The results will only contain documents that are also in the filter object. """ if text: kts = self.key_terms_from_text(fieldname, text, numterms=numterms, model=model, normalize=normalize) else: kts = self.key_terms([docnum], fieldname, numterms=numterms, model=model, normalize=normalize) # Create an Or query from the key terms q = query.Or([query.Term(fieldname, word, boost=weight) for word, weight in kts]) return self.search(q, limit=top, filter=filter, mask=set([docnum])) def search_page(self, query, pagenum, pagelen=10, **kwargs): """This method is Like the :meth:`Searcher.search` method, but returns a :class:`ResultsPage` object. This is a convenience function for getting a certain "page" of the results for the given query, which is often useful in web search interfaces. For example:: querystring = request.get("q") query = queryparser.parse("content", querystring) pagenum = int(request.get("page", 1)) pagelen = int(request.get("perpage", 10)) results = searcher.search_page(query, pagenum, pagelen=pagelen) print("Page %d of %d" % (results.pagenum, results.pagecount)) print("Showing results %d-%d of %d" % (results.offset + 1, results.offset + results.pagelen + 1, len(results))) for hit in results: print("%d: %s" % (hit.rank + 1, hit["title"])) (Note that results.pagelen might be less than the pagelen argument if there aren't enough results to fill a page.) Any additional keyword arguments you supply are passed through to :meth:`Searcher.search`. For example, you can get paged results of a sorted search:: results = searcher.search_page(q, 2, sortedby="date", reverse=True) Currently, searching for page 100 with pagelen of 10 takes the same amount of time as using :meth:`Searcher.search` to find the first 1000 results. That is, this method does not have any special optimizations or efficiencies for getting a page from the middle of the full results list. (A future enhancement may allow using previous page results to improve the efficiency of finding the next page.) This method will raise a ``ValueError`` if you ask for a page number higher than the number of pages in the resulting query. :param query: the :class:`whoosh.query.Query` object to match. :param pagenum: the page number to retrieve, starting at ``1`` for the first page. :param pagelen: the number of results per page. 
:returns: :class:`ResultsPage` """ if pagenum < 1: raise ValueError("pagenum must be >= 1") results = self.search(query, limit=pagenum * pagelen, **kwargs) return ResultsPage(results, pagenum, pagelen) def find(self, defaultfield, querystring, **kwargs): from whoosh.qparser import QueryParser qp = QueryParser(defaultfield, schema=self.ixreader.schema) q = qp.parse(querystring) return self.search(q, **kwargs) def docs_for_query(self, q, for_deletion=False): """Returns an iterator of document numbers for documents matching the given :class:`whoosh.query.Query` object. """ # If we're getting the document numbers so we can delete them, use the # deletion_docs method instead of docs; this lets special queries # (e.g. nested queries) override what gets deleted if for_deletion: method = q.deletion_docs else: method = q.docs if self.subsearchers: for s, offset in self.subsearchers: for docnum in method(s): yield docnum + offset else: for docnum in method(self): yield docnum def collector(self, limit=10, sortedby=None, reverse=False, groupedby=None, collapse=None, collapse_limit=1, collapse_order=None, optimize=True, filter=None, mask=None, terms=False, maptype=None, scored=True): """Low-level method: returns a configured :class:`whoosh.collectors.Collector` object based on the given arguments. You can use this object with :meth:`Searcher.search_with_collector` to search. See the documentation for the :meth:`Searcher.search` method for a description of the parameters. This method may be useful to get a basic collector object and then wrap it with another collector from ``whoosh.collectors`` or with a custom collector of your own:: # Equivalent of # results = mysearcher.search(myquery, limit=10) # but with a time limt... # Create a TopCollector c = mysearcher.collector(limit=10) # Wrap it with a TimeLimitedCollector with a time limit of # 10.5 seconds from whoosh.collectors import TimeLimitedCollector c = TimeLimitCollector(c, 10.5) # Search using the custom collector results = mysearcher.search_with_collector(myquery, c) """ from whoosh import collectors if limit is not None and limit < 1: raise ValueError("limit must be >= 1") if not scored and not sortedby: c = collectors.UnsortedCollector() elif sortedby: c = collectors.SortingCollector(sortedby, limit=limit, reverse=reverse) elif groupedby or reverse or not limit or limit >= self.doc_count(): # A collector that gathers every matching document c = collectors.UnlimitedCollector(reverse=reverse) else: # A collector that uses block quality optimizations and a heap # queue to only collect the top N documents c = collectors.TopCollector(limit, usequality=optimize) if groupedby: c = collectors.FacetCollector(c, groupedby, maptype=maptype) if terms: c = collectors.TermsCollector(c) if collapse: c = collectors.CollapseCollector(c, collapse, limit=collapse_limit, order=collapse_order) # Filtering wraps last so it sees the docs first if filter or mask: c = collectors.FilterCollector(c, filter, mask) return c def search(self, q, **kwargs): """Runs a :class:`whoosh.query.Query` object on this searcher and returns a :class:`Results` object. See :doc:`/searching` for more information. This method takes many keyword arguments (documented below). See :doc:`/facets` for information on using ``sortedby`` and/or ``groupedby``. See :ref:`collapsing` for more information on using ``collapse``, ``collapse_limit``, and ``collapse_order``. :param query: a :class:`whoosh.query.Query` object to use to match documents. 
:param limit: the maximum number of documents to score. If you're only interested in the top N documents, you can set limit=N to limit the scoring for a faster search. Default is 10. :param scored: whether to score the results. Overriden by ``sortedby``. If both ``scored=False`` and ``sortedby=None``, the results will be in arbitrary order, but will usually be computed faster than scored or sorted results. :param sortedby: see :doc:`/facets`. :param reverse: Reverses the direction of the sort. Default is False. :param groupedby: see :doc:`/facets`. :param optimize: use optimizations to get faster results when possible. Default is True. :param filter: a query, Results object, or set of docnums. The results will only contain documents that are also in the filter object. :param mask: a query, Results object, or set of docnums. The results will not contain any documents that are in the mask object. :param terms: if True, record which terms were found in each matching document. See :doc:`/searching` for more information. Default is False. :param maptype: by default, the results of faceting with ``groupedby`` is a dictionary mapping group names to ordered lists of document numbers in the group. You can pass a :class:`whoosh.sorting.FacetMap` subclass to this keyword argument to specify a different (usually faster) method for grouping. For example, ``maptype=sorting.Count`` would store only the count of documents in each group, instead of the full list of document IDs. :param collapse: a :doc:`facet ` to use to collapse the results. See :ref:`collapsing` for more information. :param collapse_limit: the maximum number of documents to allow with the same collapse key. See :ref:`collapsing` for more information. :param collapse_order: an optional ordering :doc:`facet ` to control which documents are kept when collapsing. The default (``collapse_order=None``) uses the results order (e.g. the highest scoring documents in a scored search). :rtype: :class:`Results` """ # Call the collector() method to build a collector based on the # parameters passed to this method c = self.collector(**kwargs) # Call the lower-level method to run the collector self.search_with_collector(q, c) # Return the results object from the collector return c.results() def search_with_collector(self, q, collector, context=None): """Low-level method: runs a :class:`whoosh.query.Query` object on this searcher using the given :class:`whoosh.collectors.Collector` object to collect the results:: myquery = query.Term("content", "cabbage") uc = collectors.UnlimitedCollector() tc = TermsCollector(uc) mysearcher.search_with_collector(myquery, tc) print(tc.docterms) print(tc.results()) Note that this method does not return a :class:`Results` object. You need to access the collector to get a results object or other information the collector might hold after the search. :param q: a :class:`whoosh.query.Query` object to use to match documents. :param collector: a :class:`whoosh.collectors.Collector` object to feed the results into. """ # Get the search context object from the searcher context = context or self.context() # Allow collector to set up based on the top-level information collector.prepare(self, q, context) collector.run() def correct_query(self, q, qstring, correctors=None, allfields=False, terms=None, prefix=0, maxdist=2): """Returns a corrected version of the given user query using a default :class:`whoosh.spelling.ReaderCorrector`. The default: * Corrects any words that don't appear in the index. 
* Takes suggestions from the words in the index. To make certain fields use custom correctors, use the ``correctors`` argument to pass a dictionary mapping field names to :class:`whoosh.spelling.Corrector` objects. * ONLY CORRECTS FIELDS THAT HAVE THE ``spelling`` ATTRIBUTE in the schema (or for which you pass a custom corrector). To automatically check all fields, use ``allfields=True``. Spell checking fields without ``spelling`` is slower. Expert users who want more sophisticated correction behavior can create a custom :class:`whoosh.spelling.QueryCorrector` and use that instead of this method. Returns a :class:`whoosh.spelling.Correction` object with a ``query`` attribute containing the corrected :class:`whoosh.query.Query` object and a ``string`` attributes containing the corrected query string. >>> from whoosh import qparser, highlight >>> qtext = 'mary "litle lamb"' >>> q = qparser.QueryParser("text", myindex.schema) >>> mysearcher = myindex.searcher() >>> correction = mysearcher().correct_query(q, qtext) >>> correction.query >>> correction.string 'mary "little lamb"' >>> mysearcher.close() You can use the ``Correction`` object's ``format_string`` method to format the corrected query string using a :class:`whoosh.highlight.Formatter` object. For example, you can format the corrected string as HTML, emphasizing the changed words. >>> hf = highlight.HtmlFormatter(classname="change") >>> correction.format_string(hf) 'mary "little lamb"' :param q: the :class:`whoosh.query.Query` object to correct. :param qstring: the original user query from which the query object was created. You can pass None instead of a string, in which the second item in the returned tuple will also be None. :param correctors: an optional dictionary mapping fieldnames to :class:`whoosh.spelling.Corrector` objects. By default, this method uses the contents of the index to spell check the terms in the query. You can use this argument to "override" some fields with a different correct, for example a :class:`whoosh.spelling.GraphCorrector`. :param allfields: if True, automatically spell check all fields, not just fields with the ``spelling`` attribute. :param terms: a sequence of ``("fieldname", "text")`` tuples to correct in the query. By default, this method corrects terms that don't appear in the index. You can use this argument to override that behavior and explicitly specify the terms that should be corrected. :param prefix: suggested replacement words must share this number of initial characters with the original word. Increasing this even to just ``1`` can dramatically speed up suggestions, and may be justifiable since spellling mistakes rarely involve the first letter of a word. :param maxdist: the maximum number of "edits" (insertions, deletions, subsitutions, or transpositions of letters) allowed between the original word and any suggestion. Values higher than ``2`` may be slow. 
:rtype: :class:`whoosh.spelling.Correction` """ reader = self.reader() # Dictionary of custom per-field correctors if correctors is None: correctors = {} if allfields: fieldnames = self.schema.names() else: fieldnames = [name for name, field in self.schema.items() if field.spelling] # Fill in default corrector objects for fields that don't have a custom # one in the "correctors" dictionary for fieldname in fieldnames: if fieldname not in correctors: correctors[fieldname] = self.reader().corrector(fieldname) # Get any missing terms in the query in the fields we're correcting if terms is None: terms = [] for token in q.all_tokens(): fieldname = token.fieldname text = token.text if fieldname in correctors and (fieldname, text) not in reader: terms.append((token.fieldname, token.text)) # Make q query corrector from whoosh import spelling sqc = spelling.SimpleQueryCorrector(correctors, terms) return sqc.correct_query(q, qstring) class Results(object): """This object is returned by a Searcher. This object represents the results of a search query. You can mostly use it as if it was a list of dictionaries, where each dictionary is the stored fields of the document at that position in the results. Note that a Results object keeps a reference to the Searcher that created it, so keeping a reference to a Results object keeps the Searcher alive and so keeps all files used by it open. """ def __init__(self, searcher, q, top_n, docset=None, facetmaps=None, runtime=0, highlighter=None): """ :param searcher: the :class:`Searcher` object that produced these results. :param query: the original query that created these results. :param top_n: a list of (score, docnum) tuples representing the top N search results. """ self.searcher = searcher self.q = q self.top_n = top_n self.docset = docset self._facetmaps = facetmaps or {} self.runtime = runtime self.highlighter = highlighter or highlight.Highlighter() self.collector = None self._total = None self._char_cache = {} def __repr__(self): return "" % (len(self.top_n), self.q, self.runtime) def __len__(self): """Returns the total number of documents that matched the query. Note this may be more than the number of scored documents, given the value of the ``limit`` keyword argument to :meth:`Searcher.search`. If this Results object was created by searching with a ``limit`` keyword, then computing the exact length of the result set may be expensive for large indexes or large result sets. You may consider using :meth:`Results.has_exact_length`, :meth:`Results.estimated_length`, and :meth:`Results.estimated_min_length` to display an estimated size of the result set instead of an exact number. """ if self._total is None: self._total = self.collector.count() return self._total def __getitem__(self, n): if isinstance(n, slice): start, stop, step = n.indices(len(self.top_n)) return [Hit(self, self.top_n[i][1], i, self.top_n[i][0]) for i in xrange(start, stop, step)] else: if n >= len(self.top_n): raise IndexError("results[%r]: Results only has %s hits" % (n, len(self.top_n))) return Hit(self, self.top_n[n][1], n, self.top_n[n][0]) def __iter__(self): """Yields a :class:`Hit` object for each result in ranked order. """ for i in xrange(len(self.top_n)): yield Hit(self, self.top_n[i][1], i, self.top_n[i][0]) def __contains__(self, docnum): """Returns True if the given document number matched the query. 
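For example (assuming the document exists and matched the query; the
        ``path`` value is illustrative)::

            >>> docnum = searcher.document_number(path=u"/a/b")
            >>> docnum in results
            True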
""" return docnum in self.docs() def __nonzero__(self): return not self.is_empty() __bool__ = __nonzero__ def is_empty(self): """Returns True if not documents matched the query. """ return self.scored_length() == 0 def items(self): """Returns an iterator of (docnum, score) pairs for the scored documents in the results. """ return ((docnum, score) for score, docnum in self.top_n) def fields(self, n): """Returns the stored fields for the document at the ``n`` th position in the results. Use :meth:`Results.docnum` if you want the raw document number instead of the stored fields. """ return self.searcher.stored_fields(self.top_n[n][1]) def facet_names(self): """Returns the available facet names, for use with the ``groups()`` method. """ return self._facetmaps.keys() def groups(self, name=None): """If you generated facet groupings for the results using the `groupedby` keyword argument to the ``search()`` method, you can use this method to retrieve the groups. You can use the ``facet_names()`` method to get the list of available facet names. >>> results = searcher.search(my_query, groupedby=["tag", "price"]) >>> results.facet_names() ["tag", "price"] >>> results.groups("tag") {"new": [12, 1, 4], "apple": [3, 10, 5], "search": [11]} If you only used one facet, you can call the method without a facet name to get the groups for the facet. >>> results = searcher.search(my_query, groupedby="tag") >>> results.groups() {"new": [12, 1, 4], "apple": [3, 10, 5, 0], "search": [11]} By default, this returns a dictionary mapping category names to a list of document numbers, in the same relative order as they appear in the results. >>> results = mysearcher.search(myquery, groupedby="tag") >>> docnums = results.groups() >>> docnums['new'] [12, 1, 4] You can then use :meth:`Searcher.stored_fields` to get the stored fields associated with a document ID. If you specified a different ``maptype`` for the facet when you searched, the values in the dictionary depend on the :class:`whoosh.sorting.FacetMap`. >>> myfacet = sorting.FieldFacet("tag", maptype=sorting.Count) >>> results = mysearcher.search(myquery, groupedby=myfacet) >>> counts = results.groups() {"new": 3, "apple": 4, "search": 1} """ if (name is None or name == "facet") and len(self._facetmaps) == 1: # If there's only one facet, just use it; convert keys() to list # for Python 3 name = list(self._facetmaps.keys())[0] elif name not in self._facetmaps: raise KeyError("%r not in facet names %r" % (name, self.facet_names())) return self._facetmaps[name].as_dict() def has_exact_length(self): """Returns True if this results object already knows the exact number of matching documents. """ if self.collector: return self.collector.computes_count() else: return self._total is not None def estimated_length(self): """The estimated maximum number of matching documents, or the exact number of matching documents if it's known. """ if self.has_exact_length(): return len(self) else: return self.q.estimate_size(self.searcher.reader()) def estimated_min_length(self): """The estimated minimum number of matching documents, or the exact number of matching documents if it's known. """ if self.has_exact_length(): return len(self) else: return self.q.estimate_min_size(self.searcher.reader()) def scored_length(self): """Returns the number of scored documents in the results, equal to or less than the ``limit`` keyword argument to the search. 
>>> r = mysearcher.search(myquery, limit=20) >>> len(r) 1246 >>> r.scored_length() 20 This may be fewer than the total number of documents that match the query, which is what ``len(Results)`` returns. """ return len(self.top_n) def docs(self): """Returns a set-like object containing the document numbers that matched the query. """ if self.docset is None: self.docset = set(self.collector.all_ids()) return self.docset def copy(self): """Returns a deep copy of this results object. """ # Shallow copy self to get attributes r = copy.copy(self) # Deep copies of docset and top_n in case they're modified r.docset = copy.deepcopy(self.docset) r.top_n = copy.deepcopy(self.top_n) return r def score(self, n): """Returns the score for the document at the Nth position in the list of ranked documents. If the search was not scored, this may return None. """ return self.top_n[n][0] def docnum(self, n): """Returns the document number of the result at position n in the list of ranked documents. """ return self.top_n[n][1] def query_terms(self, expand=False, fieldname=None): return self.q.existing_terms(self.searcher.reader(), fieldname=fieldname, expand=expand) def has_matched_terms(self): """Returns True if the search recorded which terms matched in which documents. >>> r = searcher.search(myquery) >>> r.has_matched_terms() False >>> """ return hasattr(self, "docterms") and hasattr(self, "termdocs") def matched_terms(self): """Returns the set of ``("fieldname", "text")`` tuples representing terms from the query that matched one or more of the TOP N documents (this does not report terms for documents that match the query but did not score high enough to make the top N results). You can compare this set to the terms from the original query to find terms which didn't occur in any matching documents. This is only valid if you used ``terms=True`` in the search call to record matching terms. Otherwise it will raise an exception. >>> q = myparser.parse("alfa OR bravo OR charlie") >>> results = searcher.search(q, terms=True) >>> results.terms() set([("content", "alfa"), ("content", "charlie")]) >>> q.all_terms() - results.terms() set([("content", "bravo")]) """ if not self.has_matched_terms(): raise NoTermsException return set(self.termdocs.keys()) def _get_fragmenter(self): return self.highlighter.fragmenter def _set_fragmenter(self, f): self.highlighter.fragmenter = f fragmenter = property(_get_fragmenter, _set_fragmenter) def _get_formatter(self): return self.highlighter.formatter def _set_formatter(self, f): self.highlighter.formatter = f formatter = property(_get_formatter, _set_formatter) def _get_scorer(self): return self.highlighter.scorer def _set_scorer(self, s): self.highlighter.scorer = s scorer = property(_get_scorer, _set_scorer) def _get_order(self): return self.highlighter.order def _set_order(self, o): self.highlighter.order = o order = property(_get_order, _set_order) def key_terms(self, fieldname, docs=10, numterms=5, model=classify.Bo1Model, normalize=True): """Returns the 'numterms' most important terms from the top 'docs' documents in these results. "Most important" is generally defined as terms that occur frequently in the top hits but relatively infrequently in the collection as a whole. :param fieldname: Look at the terms in this field. This field must store vectors. :param docs: Look at this many of the top documents of the results. :param numterms: Return this number of important terms. :param model: The classify.ExpansionModel to use. See the classify module. 
:returns: list of unicode strings. """ if not len(self): return [] docs = min(docs, len(self)) reader = self.searcher.reader() expander = classify.Expander(reader, fieldname, model=model) for _, docnum in self.top_n[:docs]: expander.add_document(docnum) return expander.expanded_terms(numterms, normalize=normalize) def extend(self, results): """Appends hits from 'results' (that are not already in this results object) to the end of these results. :param results: another results object. """ docs = self.docs() for item in results.top_n: if item[1] not in docs: self.top_n.append(item) self.docset = docs | results.docs() def filter(self, results): """Removes any hits that are not also in the other results object. """ if not len(results): return otherdocs = results.docs() items = [item for item in self.top_n if item[1] in otherdocs] self.docset = self.docs() & otherdocs self.top_n = items def upgrade(self, results, reverse=False): """Re-sorts the results so any hits that are also in 'results' appear before hits not in 'results', otherwise keeping their current relative positions. This does not add the documents in the other results object to this one. :param results: another results object. :param reverse: if True, lower the position of hits in the other results object instead of raising them. """ if not len(results): return otherdocs = results.docs() arein = [item for item in self.top_n if item[1] in otherdocs] notin = [item for item in self.top_n if item[1] not in otherdocs] if reverse: items = notin + arein else: items = arein + notin self.top_n = items def upgrade_and_extend(self, results): """Combines the effects of extend() and upgrade(): hits that are also in 'results' are raised. Then any hits from the other results object that are not in this results object are appended to the end. :param results: another results object. """ if not len(results): return docs = self.docs() otherdocs = results.docs() arein = [item for item in self.top_n if item[1] in otherdocs] notin = [item for item in self.top_n if item[1] not in otherdocs] other = [item for item in results.top_n if item[1] not in docs] self.docset = docs | otherdocs self.top_n = arein + notin + other class Hit(object): """Represents a single search result ("hit") in a Results object. This object acts like a dictionary of the matching document's stored fields. If for some reason you need an actual ``dict`` object, use ``Hit.fields()`` to get one. >>> r = searcher.search(query.Term("content", "render")) >>> r[0] < Hit {title = u"Rendering the scene"} > >>> r[0].rank 0 >>> r[0].docnum == 4592 True >>> r[0].score 2.52045682 >>> r[0]["title"] "Rendering the scene" >>> r[0].keys() ["title"] """ def __init__(self, results, docnum, pos=None, score=None): """ :param results: the Results object this hit belongs to. :param pos: the position in the results list of this hit, for example pos = 0 means this is the first (highest scoring) hit. :param docnum: the document number of this hit. :param score: the score of this hit. """ self.results = results self.searcher = results.searcher self.reader = self.searcher.reader() self.pos = self.rank = pos self.docnum = docnum self.score = score self._fields = None def fields(self): """Returns a dictionary of the stored fields of the document this object represents. """ if self._fields is None: self._fields = self.searcher.stored_fields(self.docnum) return self._fields def matched_terms(self): """Returns the set of ``("fieldname", "text")`` tuples representing terms from the query that matched in this document. 
You can compare this set to the terms from the original query to find terms which didn't occur in this document. This is only valid if you used ``terms=True`` in the search call to record matching terms. Otherwise it will raise an exception. >>> q = myparser.parse("alfa OR bravo OR charlie") >>> results = searcher.search(q, terms=True) >>> for hit in results: ... print(hit["title"]) ... print("Contains:", hit.matched_terms()) ... print("Doesn't contain:", q.all_terms() - hit.matched_terms()) """ if not self.results.has_matched_terms(): raise NoTermsException return self.results.docterms.get(self.docnum, []) def highlights(self, fieldname, text=None, top=3, minscore=1): """Returns highlighted snippets from the given field:: r = searcher.search(myquery) for hit in r: print(hit["title"]) print(hit.highlights("content")) See :doc:`/highlight`. To change the fragmeter, formatter, order, or scorer used in highlighting, you can set attributes on the results object:: from whoosh import highlight results = searcher.search(myquery, terms=True) results.fragmenter = highlight.SentenceFragmenter() ...or use a custom :class:`whoosh.highlight.Highlighter` object:: hl = highlight.Highlighter(fragmenter=sf) results.highlighter = hl :param fieldname: the name of the field you want to highlight. :param text: by default, the method will attempt to load the contents of the field from the stored fields for the document. If the field you want to highlight isn't stored in the index, but you have access to the text another way (for example, loading from a file or a database), you can supply it using the ``text`` parameter. :param top: the maximum number of fragments to return. :param minscore: the minimum score for fragments to appear in the highlights. """ hliter = self.results.highlighter return hliter.highlight_hit(self, fieldname, text=text, top=top, minscore=minscore) def more_like_this(self, fieldname, text=None, top=10, numterms=5, model=classify.Bo1Model, normalize=True, filter=None): """Returns a new Results object containing documents similar to this hit, based on "key terms" in the given field:: r = searcher.search(myquery) for hit in r: print(hit["title"]) print("Top 3 similar documents:") for subhit in hit.more_like_this("content", top=3): print(" ", subhit["title"]) :param fieldname: the name of the field to use to test similarity. :param text: by default, the method will attempt to load the contents of the field from the stored fields for the document, or from a term vector. If the field isn't stored or vectored in the index, but you have access to the text another way (for example, loading from a file or a database), you can supply it using the ``text`` parameter. :param top: the number of results to return. :param numterms: the number of "key terms" to extract from the hit and search for. Using more terms is slower but gives potentially more and more accurate results. :param model: (expert) a :class:`whoosh.classify.ExpansionModel` to use to compute "key terms". :param normalize: whether to normalize term weights. 
""" return self.searcher.more_like(self.docnum, fieldname, text=text, top=top, numterms=numterms, model=model, normalize=normalize, filter=filter) def __repr__(self): return "<%s %r>" % (self.__class__.__name__, self.fields()) def __eq__(self, other): if isinstance(other, Hit): return self.fields() == other.fields() elif isinstance(other, dict): return self.fields() == other else: return False def __len__(self): return len(self.fields()) def __iter__(self): return iterkeys(self.fields()) def __getitem__(self, fieldname): if fieldname in self.fields(): return self._fields[fieldname] reader = self.reader if reader.has_column(fieldname): cr = reader.column_reader(fieldname) return cr[self.docnum] raise KeyError(fieldname) def __contains__(self, key): return (key in self.fields() or self.reader.has_column(key)) def items(self): return list(self.fields().items()) def keys(self): return list(self.fields().keys()) def values(self): return list(self.fields().values()) def iteritems(self): return iteritems(self.fields()) def iterkeys(self): return iterkeys(self.fields()) def itervalues(self): return itervalues(self.fields()) def get(self, key, default=None): return self.fields().get(key, default) def __setitem__(self, key, value): raise NotImplementedError("You cannot modify a search result") def __delitem__(self, key, value): raise NotImplementedError("You cannot modify a search result") def clear(self): raise NotImplementedError("You cannot modify a search result") def update(self, dict=None, **kwargs): raise NotImplementedError("You cannot modify a search result") class ResultsPage(object): """Represents a single page out of a longer list of results, as returned by :func:`whoosh.searching.Searcher.search_page`. Supports a subset of the interface of the :class:`~whoosh.searching.Results` object, namely getting stored fields with __getitem__ (square brackets), iterating, and the ``score()`` and ``docnum()`` methods. The ``offset`` attribute contains the results number this page starts at (numbered from 0). For example, if the page length is 10, the ``offset`` attribute on the second page will be ``10``. The ``pagecount`` attribute contains the number of pages available. The ``pagenum`` attribute contains the page number. This may be less than the page you requested if the results had too few pages. For example, if you do:: ResultsPage(results, 5) but the results object only contains 3 pages worth of hits, ``pagenum`` will be 3. The ``pagelen`` attribute contains the number of results on this page (which may be less than the page length you requested if this is the last page of the results). The ``total`` attribute contains the total number of hits in the results. >>> mysearcher = myindex.searcher() >>> pagenum = 2 >>> page = mysearcher.find_page(pagenum, myquery) >>> print("Page %s of %s, results %s to %s of %s" % ... (pagenum, page.pagecount, page.offset+1, ... page.offset+page.pagelen, page.total)) >>> for i, fields in enumerate(page): ... print("%s. %r" % (page.offset + i + 1, fields)) >>> mysearcher.close() To set highlighter attributes (for example ``formatter``), access the underlying :class:`Results` object:: page.results.formatter = highlight.UppercaseFormatter() """ def __init__(self, results, pagenum, pagelen=10): """ :param results: a :class:`~whoosh.searching.Results` object. :param pagenum: which page of the results to use, numbered from ``1``. :param pagelen: the number of hits per page. 
""" self.results = results self.total = len(results) if pagenum < 1: raise ValueError("pagenum must be >= 1") self.pagecount = int(ceil(self.total / pagelen)) self.pagenum = min(self.pagecount, pagenum) offset = (self.pagenum - 1) * pagelen if (offset + pagelen) > self.total: pagelen = self.total - offset self.offset = offset self.pagelen = pagelen def __getitem__(self, n): offset = self.offset if isinstance(n, slice): start, stop, step = n.indices(self.pagelen) return self.results.__getitem__(slice(start + offset, stop + offset, step)) else: return self.results.__getitem__(n + offset) def __iter__(self): return iter(self.results[self.offset:self.offset + self.pagelen]) def __len__(self): return self.total def scored_length(self): return self.results.scored_length() def score(self, n): """Returns the score of the hit at the nth position on this page. """ return self.results.score(n + self.offset) def docnum(self, n): """Returns the document number of the hit at the nth position on this page. """ return self.results.docnum(n + self.offset) def is_last_page(self): """Returns True if this object represents the last page of results. """ return self.pagecount == 0 or self.pagenum == self.pagecount Whoosh-2.5.7/src/whoosh/sorting.py0000644000076500000240000012171412254366764017246 0ustar mattstaff00000000000000# Copyright 2011 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from array import array from collections import defaultdict from whoosh.compat import string_type from whoosh.compat import iteritems, izip, xrange # Faceting objects class FacetType(object): """Base class for "facets", aspects that can be sorted/faceted. """ maptype = None def categorizer(self, global_searcher): """Returns a :class:`Categorizer` corresponding to this facet. :param global_searcher: A parent searcher. You can use this searcher if you need global document ID references. 
""" raise NotImplementedError def map(self, default=None): t = self.maptype if t is None: t = default if t is None: return OrderedList() elif type(t) is type: return t() else: return t def default_name(self): return "facet" class Categorizer(object): """Base class for categorizer objects which compute a key value for a document based on certain criteria, for use in sorting/faceting. Categorizers are created by FacetType objects through the :meth:`FacetType.categorizer` method. The :class:`whoosh.searching.Searcher` object passed to the ``categorizer`` method may be a composite searcher (that is, wrapping a multi-reader), but categorizers are always run **per-segment**, with segment-relative document numbers. The collector will call a categorizer's ``set_searcher`` method as it searches each segment to let the cateogorizer set up whatever segment- specific data it needs. ``Collector.allow_overlap`` should be ``True`` if the caller can use the ``keys_for`` method instead of ``key_for`` to group documents into potentially overlapping groups. The default is ``False``. If a categorizer subclass can categorize the document using only the document number, it should set ``Collector.needs_current`` to ``False`` (this is the default) and NOT USE the given matcher in the ``key_for`` or ``keys_for`` methods, since in that case ``segment_docnum`` is not guaranteed to be consistent with the given matcher. If a categorizer subclass needs to access information on the matcher, it should set ``needs_current`` to ``True``. This will prevent the caller from using optimizations that might leave the matcher in an inconsistent state. """ allow_overlap = False needs_current = False def set_searcher(self, segment_searcher, docoffset): """Called by the collector when the collector moves to a new segment. The ``segment_searcher`` will be atomic. The ``docoffset`` is the offset of the segment's document numbers relative to the entire index. You can use the offset to get absolute index docnums by adding the offset to segment-relative docnums. """ pass def key_for(self, matcher, segment_docnum): """Returns a key for the current match. :param matcher: a :class:`whoosh.matching.Matcher` object. If ``self.needs_current`` is ``False``, DO NOT use this object, since it may be inconsistent. Use the given ``segment_docnum`` instead. :param segment_docnum: the segment-relative document number of the current match. """ # Backwards compatibility if hasattr(self, "key_for_id"): return self.key_for_id(segment_docnum) elif hasattr(self, "key_for_matcher"): return self.key_for_matcher(matcher) raise NotImplementedError(self.__class__) def keys_for(self, matcher, segment_docnum): """Yields a series of keys for the current match. This method will be called instead of ``key_for`` if ``self.allow_overlap`` is ``True``. :param matcher: a :class:`whoosh.matching.Matcher` object. If ``self.needs_current`` is ``False``, DO NOT use this object, since it may be inconsistent. Use the given ``segment_docnum`` instead. :param segment_docnum: the segment-relative document number of the current match. """ # Backwards compatibility if hasattr(self, "keys_for_id"): return self.keys_for_id(segment_docnum) raise NotImplementedError(self.__class__) def key_to_name(self, key): """Returns a representation of the key to be used as a dictionary key in faceting. For example, the sorting key for date fields is a large integer; this method translates it into a ``datetime`` object to make the groupings clearer. 
""" return key # General field facet class FieldFacet(FacetType): """Sorts/facest by the contents of a field. For example, to sort by the contents of the "path" field in reverse order, and facet by the contents of the "tag" field:: paths = FieldFacet("path", reverse=True) tags = FieldFacet("tag") results = searcher.search(myquery, sortedby=paths, groupedby=tags) This facet returns different categorizers based on the field type. """ def __init__(self, fieldname, reverse=False, allow_overlap=False, maptype=None): """ :param fieldname: the name of the field to sort/facet on. :param reverse: if True, when sorting, reverse the sort order of this facet. :param allow_overlap: if True, when grouping, allow documents to appear in multiple groups when they have multiple terms in the field. """ self.fieldname = fieldname self.reverse = reverse self.allow_overlap = allow_overlap self.maptype = maptype def default_name(self): return self.fieldname def categorizer(self, global_searcher): # The searcher we're passed here may wrap a multireader, but the # actual key functions will always be called per-segment following a # Categorizer.set_searcher method call fieldname = self.fieldname fieldobj = global_searcher.schema[fieldname] # If we're grouping with allow_overlap=True, all we can use is # OverlappingCategorizer if self.allow_overlap: return OverlappingCategorizer(global_searcher, fieldname) if global_searcher.reader().has_column(fieldname): coltype = fieldobj.column_type if coltype.reversible or not self.reverse: c = ColumnCategorizer(global_searcher, fieldname, self.reverse) else: c = ReversedColumnCategorizer(global_searcher, fieldname) else: c = PostingCategorizer(global_searcher, fieldname, self.reverse) return c class ColumnCategorizer(Categorizer): def __init__(self, global_searcher, fieldname, reverse=False): self._fieldname = fieldname self._fieldobj = global_searcher.schema[self._fieldname] self._column_type = self._fieldobj.column_type self._reverse = reverse # The column reader is set in set_searcher() as we iterate over the # sub-searchers self._creader = None def __repr__(self): return "%s(%r, %r, reverse=%r)" % (self.__class__.__name__, self._fieldobj, self._fieldname, self._reverse) def set_searcher(self, segment_searcher, docoffset): r = segment_searcher.reader() self._creader = r.column_reader(self._fieldname, reverse=self._reverse, translate=False) def key_for(self, matcher, segment_docnum): return self._creader.sort_key(segment_docnum) def key_to_name(self, key): return self._fieldobj.from_column_value(key) class ReversedColumnCategorizer(ColumnCategorizer): """Categorizer that reverses column values for columns that aren't naturally reversible. 
""" def __init__(self, global_searcher, fieldname): ColumnCategorizer.__init__(self, global_searcher, fieldname) reader = global_searcher.reader() self._doccount = reader.doc_count_all() global_creader = reader.column_reader(fieldname, translate=False) self._values = sorted(set(global_creader)) def key_for(self, matcher, segment_docnum): value = self._creader[segment_docnum] order = self._values.index(value) # Subtract from 0 to reverse the order return 0 - order def key_to_name(self, key): # Re-reverse the key to get the index into _values key = self._values[0 - key] return ColumnCategorizer.key_to_name(self, key) class OverlappingCategorizer(Categorizer): allow_overlap = True def __init__(self, global_searcher, fieldname): self._fieldname = fieldname self._fieldobj = global_searcher.schema[fieldname] field = global_searcher.schema[fieldname] reader = global_searcher.reader() self._use_vectors = bool(field.vector) self._use_column = (reader.has_column(fieldname) and field.column_type.stores_lists()) # These are set in set_searcher() as we iterate over the sub-searchers self._segment_searcher = None self._creader = None self._lists = None def set_searcher(self, segment_searcher, docoffset): fieldname = self._fieldname self._segment_searcher = segment_searcher reader = segment_searcher.reader() if self._use_vectors: pass elif self._use_column: self._creader = reader.column_reader(fieldname, translate=False) else: # Otherwise, cache the values in each document in a huge list # of lists dc = segment_searcher.doc_count_all() field = segment_searcher.schema[fieldname] from_bytes = field.from_bytes self._lists = [[] for _ in xrange(dc)] for btext in field.sortable_terms(reader, fieldname): text = from_bytes(btext) postings = reader.postings(fieldname, btext) for docid in postings.all_ids(): self._lists[docid].append(text) def keys_for(self, matcher, docid): if self._use_vectors: try: v = self._segment_searcher.vector(docid, self._fieldname) return list(v.all_ids()) except KeyError: return [] elif self._use_column: return self._creader[docid] else: return self._lists[docid] or [None] def key_for(self, matcher, docid): if self._use_vectors: try: v = self._segment_searcher.vector(docid, self._fieldname) return v.id() except KeyError: return None elif self._use_column: return self._creader.sort_key(docid) else: ls = self._lists[docid] if ls: return ls[0] else: return None class PostingCategorizer(Categorizer): """ Categorizer for fields that don't store column values. This is very inefficient. Instead of relying on this categorizer you should plan for which fields you'll want to sort on and set ``sortable=True`` in their field type. This object builds an array caching the order of all documents according to the field, then uses the cached order as a numeric key. This is useful when a field cache is not available, and also for reversed fields (since field cache keys for non- numeric fields are arbitrary data, it's not possible to "negate" them to reverse the sort order). 
""" def __init__(self, global_searcher, fieldname, reverse): self.reverse = reverse if fieldname in global_searcher._field_caches: self.values, self.array = global_searcher._field_caches[fieldname] else: # Cache the relative positions of all docs with the given field # across the entire index reader = global_searcher.reader() dc = reader.doc_count_all() self._fieldobj = global_searcher.schema[fieldname] from_bytes = self._fieldobj.from_bytes self.values = [] self.array = array("i", [dc + 1] * dc) btexts = self._fieldobj.sortable_terms(reader, fieldname) for i, btext in enumerate(btexts): self.values.append(from_bytes(btext)) # Get global docids from global reader postings = reader.postings(fieldname, btext) for docid in postings.all_ids(): self.array[docid] = i global_searcher._field_caches[fieldname] = (self.values, self.array) def set_searcher(self, segment_searcher, docoffset): self._searcher = segment_searcher self.docoffset = docoffset def key_for(self, matcher, segment_docnum): global_docnum = self.docoffset + segment_docnum i = self.array[global_docnum] if self.reverse: i = len(self.values) - i return i def key_to_name(self, i): if i >= len(self.values): return None if self.reverse: i = len(self.values) - i return self.values[i] # Special facet types class QueryFacet(FacetType): """Sorts/facets based on the results of a series of queries. """ def __init__(self, querydict, other=None, allow_overlap=False, maptype=None): """ :param querydict: a dictionary mapping keys to :class:`whoosh.query.Query` objects. :param other: the key to use for documents that don't match any of the queries. """ self.querydict = querydict self.other = other self.maptype = maptype self.allow_overlap = allow_overlap def categorizer(self, global_searcher): return self.QueryCategorizer(self.querydict, self.other, self.allow_overlap) class QueryCategorizer(Categorizer): def __init__(self, querydict, other, allow_overlap=False): self.querydict = querydict self.other = other self.allow_overlap = allow_overlap def set_searcher(self, segment_searcher, offset): self.docsets = {} for qname, q in self.querydict.items(): docset = set(q.docs(segment_searcher)) if docset: self.docsets[qname] = docset self.offset = offset def key_for(self, matcher, docid): for qname in self.docsets: if docid in self.docsets[qname]: return qname return self.other def keys_for(self, matcher, docid): found = False for qname in self.docsets: if docid in self.docsets[qname]: yield qname found = True if not found: yield None class RangeFacet(QueryFacet): """Sorts/facets based on numeric ranges. For textual ranges, use :class:`QueryFacet`. For example, to facet the "price" field into $100 buckets, up to $1000:: prices = RangeFacet("price", 0, 1000, 100) results = searcher.search(myquery, groupedby=prices) The ranges/buckets are always **inclusive** at the start and **exclusive** at the end. """ def __init__(self, fieldname, start, end, gap, hardend=False, maptype=None): """ :param fieldname: the numeric field to sort/facet on. :param start: the start of the entire range. :param end: the end of the entire range. :param gap: the size of each "bucket" in the range. This can be a sequence of sizes. For example, ``gap=[1,5,10]`` will use 1 as the size of the first bucket, 5 as the size of the second bucket, and 10 as the size of all subsequent buckets. :param hardend: if True, the end of the last bucket is clamped to the value of ``end``. 
If False (the default), the last bucket is always ``gap`` sized, even if that means the end of the last bucket is after ``end``. """ self.fieldname = fieldname self.start = start self.end = end self.gap = gap self.hardend = hardend self.maptype = maptype self._queries() def default_name(self): return self.fieldname def _rangetype(self): from whoosh import query return query.NumericRange def _range_name(self, startval, endval): return (startval, endval) def _queries(self): if not self.gap: raise Exception("No gap secified (%r)" % self.gap) if isinstance(self.gap, (list, tuple)): gaps = self.gap gapindex = 0 else: gaps = [self.gap] gapindex = -1 rangetype = self._rangetype() self.querydict = {} cstart = self.start while cstart < self.end: thisgap = gaps[gapindex] if gapindex >= 0: gapindex += 1 if gapindex == len(gaps): gapindex = -1 cend = cstart + thisgap if self.hardend: cend = min(self.end, cend) rangename = self._range_name(cstart, cend) q = rangetype(self.fieldname, cstart, cend, endexcl=True) self.querydict[rangename] = q cstart = cend def categorizer(self, global_searcher): return QueryFacet(self.querydict).categorizer(global_searcher) class DateRangeFacet(RangeFacet): """Sorts/facets based on date ranges. This is the same as RangeFacet except you are expected to use ``daterange`` objects as the start and end of the range, and ``timedelta`` or ``relativedelta`` objects as the gap(s), and it generates :class:`~whoosh.query.DateRange` queries instead of :class:`~whoosh.query.TermRange` queries. For example, to facet a "birthday" range into 5 year buckets:: from datetime import datetime from whoosh.support.relativedelta import relativedelta startdate = datetime(1920, 0, 0) enddate = datetime.now() gap = relativedelta(years=5) bdays = DateRangeFacet("birthday", startdate, enddate, gap) results = searcher.search(myquery, groupedby=bdays) The ranges/buckets are always **inclusive** at the start and **exclusive** at the end. """ def _rangetype(self): from whoosh import query return query.DateRange class ScoreFacet(FacetType): """Uses a document's score as a sorting criterion. For example, to sort by the ``tag`` field, and then within that by relative score:: tag_score = MultiFacet(["tag", ScoreFacet()]) results = searcher.search(myquery, sortedby=tag_score) """ def categorizer(self, global_searcher): return self.ScoreCategorizer(global_searcher) class ScoreCategorizer(Categorizer): needs_current = True def __init__(self, global_searcher): w = global_searcher.weighting self.use_final = w.use_final if w.use_final: self.final = w.final def set_searcher(self, segment_searcher, offset): self.segment_searcher = segment_searcher def key_for(self, matcher, docid): score = matcher.score() if self.use_final: score = self.final(self.segment_searcher, docid, score) # Negate the score so higher values sort first return 0 - score class FunctionFacet(FacetType): """This facet type is low-level. In most cases you should use :class:`TranslateFacet` instead. This facet type ets you pass an arbitrary function that will compute the key. This may be easier than subclassing FacetType and Categorizer to set up the desired behavior. The function is called with the arguments ``(searcher, docid)``, where the ``searcher`` may be a composite searcher, and the ``docid`` is an absolute index document number (not segment-relative). 
For example, to use the number of words in the document's "content" field as the sorting/faceting key:: fn = lambda s, docid: s.doc_field_length(docid, "content") lengths = FunctionFacet(fn) """ def __init__(self, fn, maptype=None): self.fn = fn self.maptype = maptype def categorizer(self, global_searcher): return self.FunctionCategorizer(global_searcher, self.fn) class FunctionCategorizer(Categorizer): def __init__(self, global_searcher, fn): self.global_searcher = global_searcher self.fn = fn def set_searcher(self, segment_searcher, docoffset): self.offset = docoffset def key_for(self, matcher, docid): return self.fn(self.global_searcher, docid + self.offset) class TranslateFacet(FacetType): """Lets you specify a function to compute the key based on a key generated by a wrapped facet. This is useful if you want to use a custom ordering of a sortable field. For example, if you want to use an implementation of the Unicode Collation Algorithm (UCA) to sort a field using the rules from a particular language:: from pyuca import Collator # The Collator object has a sort_key() method which takes a unicode # string and returns a sort key c = Collator("allkeys.txt") # Make a facet object for the field you want to sort on facet = sorting.FieldFacet("name") # Wrap the facet in a TranslateFacet with the translation function # (the Collator object's sort_key method) facet = sorting.TranslateFacet(c.sort_key, facet) # Use the facet to sort the search results results = searcher.search(myquery, sortedby=facet) You can pass multiple facets to the """ def __init__(self, fn, *facets): """ :param fn: The function to apply. For each matching document, this function will be called with the values of the given facets as arguments. :param facets: One or more :class:`FacetType` objects. These facets are used to compute facet value(s) for a matching document, and then the value(s) is/are passed to the function. """ self.fn = fn self.facets = facets self.maptype = None def categorizer(self, global_searcher): catters = [facet.categorizer(global_searcher) for facet in self.facets] return self.TranslateCategorizer(self.fn, catters) class TranslateCategorizer(Categorizer): def __init__(self, fn, catters): self.fn = fn self.catters = catters def set_searcher(self, segment_searcher, docoffset): for catter in self.catters: catter.set_searcher(segment_searcher, docoffset) def key_for(self, matcher, segment_docnum): keys = [catter.key_for(matcher, segment_docnum) for catter in self.catters] return self.fn(*keys) class StoredFieldFacet(FacetType): """Lets you sort/group using the value in an unindexed, stored field (e.g. :class:`whoosh.fields.STORED`). This is usually slower than using an indexed field. For fields where the stored value is a space-separated list of keywords, (e.g. ``"tag1 tag2 tag3"``), you can use the ``allow_overlap`` keyword argument to allow overlapped faceting on the result of calling the ``split()`` method on the field value (or calling a custom split function if one is supplied). """ def __init__(self, fieldname, allow_overlap=False, split_fn=None, maptype=None): """ :param fieldname: the name of the stored field. :param allow_overlap: if True, when grouping, allow documents to appear in multiple groups when they have multiple terms in the field. The categorizer uses ``string.split()`` or the custom ``split_fn`` to convert the stored value into a list of facet values. :param split_fn: a custom function to split a stored field value into multiple facet values when ``allow_overlap`` is True. 
If not supplied, the categorizer simply calls the value's ``split()`` method. """ self.fieldname = fieldname self.allow_overlap = allow_overlap self.split_fn = None self.maptype = maptype def default_name(self): return self.fieldname def categorizer(self, global_searcher): return self.StoredFieldCategorizer(self.fieldname, self.allow_overlap, self.split_fn) class StoredFieldCategorizer(Categorizer): def __init__(self, fieldname, allow_overlap, split_fn): self.fieldname = fieldname self.allow_overlap = allow_overlap self.split_fn = split_fn def set_searcher(self, segment_searcher, docoffset): self.segment_searcher = segment_searcher def keys_for(self, matcher, docid): d = self.segment_searcher.stored_fields(docid) value = d.get(self.fieldname) if self.split_fn: return self.split_fn(value) else: return value.split() def key_for(self, matcher, docid): d = self.segment_searcher.stored_fields(docid) return d.get(self.fieldname) class MultiFacet(FacetType): """Sorts/facets by the combination of multiple "sub-facets". For example, to sort by the value of the "tag" field, and then (for documents where the tag is the same) by the value of the "path" field:: facet = MultiFacet(FieldFacet("tag"), FieldFacet("path") results = searcher.search(myquery, sortedby=facet) As a shortcut, you can use strings to refer to field names, and they will be assumed to be field names and turned into FieldFacet objects:: facet = MultiFacet("tag", "path") You can also use the ``add_*`` methods to add criteria to the multifacet:: facet = MultiFacet() facet.add_field("tag") facet.add_field("path", reverse=True) facet.add_query({"a-m": TermRange("name", "a", "m"), "n-z": TermRange("name", "n", "z")}) """ def __init__(self, items=None, maptype=None): self.facets = [] if items: for item in items: self._add(item) self.maptype = maptype def __repr__(self): return "%s(%r, %r)" % (self.__class__.__name__, self.facets, self.maptype) @classmethod def from_sortedby(cls, sortedby): multi = cls() if isinstance(sortedby, string_type): multi._add(sortedby) elif (isinstance(sortedby, (list, tuple)) or hasattr(sortedby, "__iter__")): for item in sortedby: multi._add(item) else: multi._add(sortedby) return multi def _add(self, item): if isinstance(item, FacetType): self.add_facet(item) elif isinstance(item, string_type): self.add_field(item) else: raise Exception("Don't know what to do with facet %r" % (item,)) def add_field(self, fieldname, reverse=False): self.facets.append(FieldFacet(fieldname, reverse=reverse)) return self def add_query(self, querydict, other=None, allow_overlap=False): self.facets.append(QueryFacet(querydict, other=other, allow_overlap=allow_overlap)) return self def add_score(self): self.facets.append(ScoreFacet()) return self def add_facet(self, facet): if not isinstance(facet, FacetType): raise TypeError("%r is not a facet object, perhaps you meant " "add_field()" % (facet,)) self.facets.append(facet) return self def categorizer(self, global_searcher): if not self.facets: raise Exception("No facets") elif len(self.facets) == 1: catter = self.facets[0].categorizer(global_searcher) else: catter = self.MultiCategorizer([facet.categorizer(global_searcher) for facet in self.facets]) return catter class MultiCategorizer(Categorizer): def __init__(self, catters): self.catters = catters @property def needs_current(self): return any(c.needs_current for c in self.catters) def set_searcher(self, segment_searcher, docoffset): for catter in self.catters: catter.set_searcher(segment_searcher, docoffset) def key_for(self, 
matcher, docid): return tuple(catter.key_for(matcher, docid) for catter in self.catters) def key_to_name(self, key): return tuple(catter.key_to_name(keypart) for catter, keypart in izip(self.catters, key)) class Facets(object): """Maps facet names to :class:`FacetType` objects, for creating multiple groupings of documents. For example, to group by tag, and **also** group by price range:: facets = Facets() facets.add_field("tag") facets.add_facet("price", RangeFacet("price", 0, 1000, 100)) results = searcher.search(myquery, groupedby=facets) tag_groups = results.groups("tag") price_groups = results.groups("price") (To group by the combination of multiple facets, use :class:`MultiFacet`.) """ def __init__(self, x=None): self.facets = {} if x: self.add_facets(x) @classmethod def from_groupedby(cls, groupedby): facets = cls() if isinstance(groupedby, (cls, dict)): facets.add_facets(groupedby) elif isinstance(groupedby, string_type): facets.add_field(groupedby) elif isinstance(groupedby, FacetType): facets.add_facet(groupedby.default_name(), groupedby) elif isinstance(groupedby, (list, tuple)): for item in groupedby: facets.add_facets(cls.from_groupedby(item)) else: raise Exception("Don't know what to do with groupedby=%r" % groupedby) return facets def names(self): """Returns an iterator of the facet names in this object. """ return iter(self.facets) def items(self): """Returns a list of (facetname, facetobject) tuples for the facets in this object. """ return self.facets.items() def add_field(self, fieldname, **kwargs): """Adds a :class:`FieldFacet` for the given field name (the field name is automatically used as the facet name). """ self.facets[fieldname] = FieldFacet(fieldname, **kwargs) return self def add_query(self, name, querydict, **kwargs): """Adds a :class:`QueryFacet` under the given ``name``. :param name: a name for the facet. :param querydict: a dictionary mapping keys to :class:`whoosh.query.Query` objects. """ self.facets[name] = QueryFacet(querydict, **kwargs) return self def add_facet(self, name, facet): """Adds a :class:`FacetType` object under the given ``name``. """ if not isinstance(facet, FacetType): raise Exception("%r:%r is not a facet" % (name, facet)) self.facets[name] = facet return self def add_facets(self, facets, replace=True): """Adds the contents of the given ``Facets`` or ``dict`` object to this object. """ if not isinstance(facets, (dict, Facets)): raise Exception("%r is not a Facets object or dict" % facets) for name, facet in facets.items(): if replace or name not in self.facets: self.facets[name] = facet return self # Objects for holding facet groups class FacetMap(object): """Base class for objects holding the results of grouping search results by a Facet. Use an object's ``as_dict()`` method to access the results. You can pass a subclass of this to the ``maptype`` keyword argument when creating a ``FacetType`` object to specify what information the facet should record about the group. For example:: # Record each document in each group in its sorted order myfacet = FieldFacet("size", maptype=OrderedList) # Record only the count of documents in each group myfacet = FieldFacet("size", maptype=Count) """ def add(self, groupname, docid, sortkey): """Adds a document to the facet results. :param groupname: the name of the group to add this document to. :param docid: the document number of the document to add. :param sortkey: a value representing the sort position of the document in the full results. 
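        For example, a custom map that records only the best few documents in
        each group could be implemented like this (an illustrative sketch, not
        one of the built-in maps; note that documents are not necessarily
        added in sorted order, so the map must compare sort keys itself)::

            class TopN(FacetMap):
                def __init__(self, n=3):
                    self.n = n
                    self.dict = defaultdict(list)

                def add(self, groupname, docid, sortkey):
                    # Keep only the n best (smallest) sort keys per group
                    items = self.dict[groupname]
                    items.append((sortkey, docid))
                    items.sort()
                    del items[self.n:]

                def as_dict(self):
                    return dict((name, [docid for _, docid in items])
                                for name, items in iteritems(self.dict))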
""" raise NotImplementedError def as_dict(self): """Returns a dictionary object mapping group names to implementation-specific values. For example, the value might be a list of document numbers, or a integer representing the number of documents in the group. """ raise NotImplementedError class OrderedList(FacetMap): """Stores a list of document numbers for each group, in the same order as they appear in the search results. The ``as_dict`` method returns a dictionary mapping group names to lists of document numbers. """ def __init__(self): self.dict = defaultdict(list) def __repr__(self): return "<%s %r>" % (self.__class__.__name__, self.dict) def add(self, groupname, docid, sortkey): self.dict[groupname].append((sortkey, docid)) def as_dict(self): d = {} for key, items in iteritems(self.dict): d[key] = [docnum for _, docnum in sorted(items)] return d class UnorderedList(FacetMap): """Stores a list of document numbers for each group, in arbitrary order. This is slightly faster and uses less memory than :class:`OrderedListResult` if you don't care about the ordering of the documents within groups. The ``as_dict`` method returns a dictionary mapping group names to lists of document numbers. """ def __init__(self): self.dict = defaultdict(list) def __repr__(self): return "<%s %r>" % (self.__class__.__name__, self.dict) def add(self, groupname, docid, sortkey): self.dict[groupname].append(docid) def as_dict(self): return dict(self.dict) class Count(FacetMap): """Stores the number of documents in each group. The ``as_dict`` method returns a dictionary mapping group names to integers. """ def __init__(self): self.dict = defaultdict(int) def __repr__(self): return "<%s %r>" % (self.__class__.__name__, self.dict) def add(self, groupname, docid, sortkey): self.dict[groupname] += 1 def as_dict(self): return dict(self.dict) class Best(FacetMap): """Stores the "best" document in each group (that is, the one with the highest sort key). The ``as_dict`` method returns a dictionary mapping group names to docnument numbers. """ def __init__(self): self.bestids = {} self.bestkeys = {} def __repr__(self): return "<%s %r>" % (self.__class__.__name__, self.bestids) def add(self, groupname, docid, sortkey): if groupname not in self.bestids or sortkey < self.bestkeys[groupname]: self.bestids[groupname] = docid self.bestkeys[groupname] = sortkey def as_dict(self): return self.bestids # Helper functions def add_sortable(writer, fieldname, facet, column=None): """Adds a per-document value column to an existing field which was created without the ``sortable`` keyword argument. >>> from whoosh import index, sorting >>> ix = index.open_dir("indexdir") >>> with ix.writer() as w: ... facet = sorting.FieldFacet("price") ... sorting.add_sortable(w, "price", facet) ... :param writer: a :class:`whoosh.writing.IndexWriter` object. :param fieldname: the name of the field to add the per-document sortable values to. If this field doesn't exist in the writer's schema, the function will add a :class:`whoosh.fields.COLUMN` field to the schema, and you must specify the column object to using the ``column`` keyword argument. :param facet: a :class:`FacetType` object to use to generate the per-document values. :param column: a :class:`whosh.columns.ColumnType` object to use to store the per-document values. If you don't specify a column object, the function will use the default column type for the given field. 
""" storage = writer.storage schema = writer.schema field = None if fieldname in schema: field = schema[fieldname] if field.column_type: raise Exception("%r field is already sortable" % fieldname) if column: if fieldname not in schema: from whoosh.fields import COLUMN field = COLUMN(column) schema.add(fieldname, field) else: if fieldname in schema: column = field.default_column() else: raise Exception("Field %r does not exist" % fieldname) searcher = writer.searcher() catter = facet.categorizer(searcher) for subsearcher, docoffset in searcher.leaf_searchers(): catter.set_searcher(subsearcher, docoffset) reader = subsearcher.reader() if reader.has_column(fieldname): raise Exception("%r field already has a column" % fieldname) codec = reader.codec() segment = reader.segment() colname = codec.column_filename(segment, fieldname) colfile = storage.create_file(colname) try: colwriter = column.writer(colfile) for docnum in reader.all_doc_ids(): v = catter.key_to_name(catter.key_for(None, docnum)) cv = field.to_column_value(v) colwriter.add(docnum, cv) colwriter.finish(reader.doc_count_all()) finally: colfile.close() field.column_type = column Whoosh-2.5.7/src/whoosh/spelling.py0000644000076500000240000003071612254366350017366 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """This module contains helper functions for correcting typos in user queries. """ from collections import defaultdict from heapq import heappush, heapreplace from whoosh import analysis, fields, highlight, query, scoring from whoosh.automata import fst from whoosh.compat import xrange, string_type from whoosh.support.levenshtein import distance from whoosh.util.text import utf8encode # Corrector objects class Corrector(object): """Base class for spelling correction objects. Concrete sub-classes should implement the ``_suggestions`` method. """ def suggest(self, text, limit=5, maxdist=2, prefix=0): """ :param text: the text to check. This word will **not** be added to the suggestions, even if it appears in the word graph. 
:param limit: only return up to this many suggestions. If there are not enough terms in the field within ``maxdist`` of the given word, the returned list will be shorter than this number. :param maxdist: the largest edit distance from the given word to look at. Values higher than 2 are not very effective or efficient. :param prefix: require suggestions to share a prefix of this length with the given word. This is often justifiable since most misspellings do not involve the first letter of the word. Using a prefix dramatically decreases the time it takes to generate the list of words. """ _suggestions = self._suggestions heap = [] seen = set([text]) for k in xrange(1, maxdist + 1): for item in _suggestions(text, k, prefix): if item[1] in seen: continue seen.add(item[1]) # Note that the *higher* scores (item[0]) are better! if len(heap) < limit: heappush(heap, item) elif item > heap[0]: heapreplace(heap, item) # If the heap is already at the required length, don't bother going # to a higher edit distance if len(heap) >= limit: break sugs = sorted(heap, key=lambda item: (0 - item[0], item[1])) return [sug for _, sug in sugs] def _suggestions(self, text, maxdist, prefix): """Low-level method that yields a series of (score, "suggestion") tuples. :param text: the text to check. :param maxdist: the maximum edit distance. :param prefix: require suggestions to share a prefix of this length with the given word. """ raise NotImplementedError class ReaderCorrector(Corrector): """Suggests corrections based on the content of a field in a reader. Ranks suggestions by the edit distance, then by highest to lowest frequency. """ def __init__(self, reader, fieldname): self.reader = reader self.fieldname = fieldname def _suggestions(self, text, maxdist, prefix): fieldname = self.fieldname freq = self.reader.frequency for sug in self.reader.terms_within(fieldname, text, maxdist, prefix=prefix): # Higher scores are better, so negate the distance and frequency # TODO: store spelling frequencies in the graph f = freq(fieldname, sug) or 1 score = 0 - (maxdist + (1.0 / f * 0.5)) yield (score, sug) class GraphCorrector(Corrector): """Suggests corrections based on the content of a raw :class:`whoosh.automata.fst.GraphReader` object. By default ranks suggestions based on the edit distance. """ def __init__(self, graph): self.graph = graph def _suggestions(self, text, maxdist, prefix): for sug in fst.within(self.graph, text, k=maxdist, prefix=prefix): # Higher scores are better, so negate the edit distance yield (0 - maxdist, sug) class MultiCorrector(Corrector): """Merges suggestions from a list of sub-correctors. """ def __init__(self, correctors): self.correctors = correctors def _suggestions(self, text, maxdist, prefix): for corr in self.correctors: for item in corr._suggestions(text, maxdist, prefix): yield item def wordlist_to_graph_file(wordlist, dbfile, fieldname="_", strip=True): """Writes a word graph file from a list of words. >>> # Open a word list file with one word on each line, and write the >>> # word graph to a graph file >>> wordlist_to_graph_file("mywords.txt", "mywords.dawg") :param wordlist: an iterable containing the words for the graph. The words must be in sorted order. :param dbfile: a filename string or file-like object to write the word graph to. This function will close the file. 
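    For example, to build a graph from a plain text file with one word per
    line (a sketch; the function requires the words in sorted order, so we
    sort them here)::

        words = sorted(line.strip() for line in open("mywords.txt"))
        wordlist_to_graph_file(words, "mywords.dawg")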
""" from whoosh.filedb.structfile import StructFile if isinstance(dbfile, string_type): dbfile = open(dbfile, "wb") if not isinstance(dbfile, StructFile): dbfile = StructFile(dbfile) gw = fst.GraphWriter(dbfile) gw.start_field(fieldname) for word in wordlist: if strip: word = word.strip() gw.insert(word) gw.finish_field() gw.close() # Query correction class Correction(object): """Represents the corrected version of a user query string. Has the following attributes: ``query`` The corrected :class:`whoosh.query.Query` object. ``string`` The corrected user query string. ``original_query`` The original :class:`whoosh.query.Query` object that was corrected. ``original_string`` The original user query string. ``tokens`` A list of token objects representing the corrected words. You can also use the :meth:`Correction.format_string` method to reformat the corrected query string using a :class:`whoosh.highlight.Formatter` class. For example, to display the corrected query string as HTML with the changed words emphasized:: from whoosh import highlight correction = mysearcher.correct_query(q, qstring) hf = highlight.HtmlFormatter(classname="change") html = correction.format_string(hf) """ def __init__(self, q, qstring, corr_q, tokens): self.original_query = q self.query = corr_q self.original_string = qstring self.tokens = tokens if self.original_string: self.string = self.format_string(highlight.NullFormatter()) else: self.string = '' def __repr__(self): return "%s(%r, %r)" % (self.__class__.__name__, self.query, self.string) def format_string(self, formatter): """ Highlights the corrected words in the original query string using the given :class:`~whoosh.highlight.Formatter`. :param formatter: A :class:`whoosh.highlight.Formatter` instance. :return: the output of the formatter (usually a string). """ if not self.original_string: return '' if isinstance(formatter, type): formatter = formatter() fragment = highlight.Fragment(self.original_string, self.tokens) return formatter.format_fragment(fragment, replace=True) # QueryCorrector objects class QueryCorrector(object): """Base class for objects that correct words in a user query. """ def correct_query(self, q, qstring): """Returns a :class:`Correction` object representing the corrected form of the given query. :param q: the original :class:`whoosh.query.Query` tree to be corrected. :param qstring: the original user query. This may be None if the original query string is not available, in which case the ``Correction.string`` attribute will also be None. :rtype: :class:`Correction` """ raise NotImplementedError class SimpleQueryCorrector(QueryCorrector): """A simple query corrector based on a mapping of field names to :class:`Corrector` objects, and a list of ``("fieldname", "text")`` tuples to correct. And terms in the query that appear in list of term tuples are corrected using the appropriate corrector. """ def __init__(self, correctors, terms, prefix=0, maxdist=2): """ :param correctors: a dictionary mapping field names to :class:`Corrector` objects. :param terms: a sequence of ``("fieldname", "text")`` tuples representing terms to be corrected. :param prefix: suggested replacement words must share this number of initial characters with the original word. Increasing this even to just ``1`` can dramatically speed up suggestions, and may be justifiable since spellling mistakes rarely involve the first letter of a word. 
:param maxdist: the maximum number of "edits" (insertions, deletions, subsitutions, or transpositions of letters) allowed between the original word and any suggestion. Values higher than ``2`` may be slow. """ self.correctors = correctors self.termset = frozenset(terms) self.prefix = prefix self.maxdist = maxdist def correct_query(self, q, qstring): correctors = self.correctors termset = self.termset prefix = self.prefix maxdist = self.maxdist # A list of tokens that were changed by a corrector corrected_tokens = [] # The corrected query tree. We don't need to deepcopy the original # because we use Query.replace() to find-and-replace the corrected # words and it returns a copy of the query tree. corrected_q = q # For every word in the original query... # Note we can't put these in a set, because we must preserve WHERE # in the query each token occured so we can format them later for token in q.all_tokens(): fname = token.fieldname # If this is one of the words we're supposed to correct... if (fname, token.text) in termset: sugs = correctors[fname].suggest(token.text, prefix=prefix, maxdist=maxdist) if sugs: # This is a "simple" corrector, so we just pick the first # suggestion :/ sug = sugs[0] # Return a new copy of the original query with this word # replaced by the correction corrected_q = corrected_q.replace(token.fieldname, token.text, sug) # Add the token to the list of corrected tokens (for the # formatter to use later) token.original = token.text token.text = sug corrected_tokens.append(token) return Correction(q, qstring, corrected_q, corrected_tokens) Whoosh-2.5.7/src/whoosh/support/0000755000076500000240000000000012277504634016710 5ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/support/__init__.py0000644000076500000240000000000012254366350021003 0ustar mattstaff00000000000000Whoosh-2.5.7/src/whoosh/support/base85.py0000644000076500000240000000465112254366350020353 0ustar mattstaff00000000000000""" This module contains generic base85 encoding and decoding functions. The whoosh.util.numeric module contains faster variants for encoding and decoding integers. Modified from: http://paste.lisp.org/display/72815 """ import struct from whoosh.compat import xrange # Instead of using the character set from the ascii85 algorithm, I put the # characters in order so that the encoded text sorts properly (my life would be # a lot easier if they had just done that from the start) b85chars = ("!$%&*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" "^_abcdefghijklmnopqrstuvwxyz{|}~") b85dec = {} for i in range(len(b85chars)): b85dec[b85chars[i]] = i # Integer encoding and decoding functions def to_base85(x, islong=False): "Encodes the given integer using base 85." size = 10 if islong else 5 rems = "" for i in xrange(size): rems = b85chars[x % 85] + rems x //= 85 return rems def from_base85(text): "Decodes the given base 85 text into an integer." 
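    # Horner's-rule accumulation: each character is one base-85 digit, so
    # decoding inverts to_base85(), e.g. (illustrative)
    # from_base85(to_base85(1234567)) == 1234567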
acc = 0 for c in text: acc = acc * 85 + b85dec[c] return acc # Bytes encoding and decoding functions def b85encode(text, pad=False): l = len(text) r = l % 4 if r: text += '\0' * (4 - r) longs = len(text) >> 2 out = [] words = struct.unpack('>' + 'L' * longs, text[0:longs * 4]) for word in words: rems = [0, 0, 0, 0, 0] for i in range(4, -1, -1): rems[i] = b85chars[word % 85] word /= 85 out.extend(rems) out = ''.join(out) if pad: return out # Trim padding olen = l % 4 if olen: olen += 1 olen += l / 4 * 5 return out[0:olen] def b85decode(text): l = len(text) out = [] for i in range(0, len(text), 5): chunk = text[i:i + 5] acc = 0 for j in range(len(chunk)): try: acc = acc * 85 + b85dec[chunk[j]] except KeyError: raise TypeError('Bad base85 character at byte %d' % (i + j)) if acc > 4294967295: raise OverflowError('Base85 overflow in hunk starting at byte %d' % i) out.append(acc) # Pad final chunk if necessary cl = l % 5 if cl: acc *= 85 ** (5 - cl) if cl > 1: acc += 0xffffff >> (cl - 2) * 8 out[-1] = acc out = struct.pack('>' + 'L' * ((l + 4) / 5), *out) if cl: out = out[:-(5 - cl)] return out Whoosh-2.5.7/src/whoosh/support/bench.py0000644000076500000240000005137012254366350020343 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
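# This module is a small benchmarking harness used by the scripts in the
# ``benchmark`` directory. Roughly (an illustrative sketch based on how the
# classes below are used, not an official API): subclass Spec, give it a
# name, a whoosh_schema() method and a documents() generator, then hand the
# class to Bench().run()::
#
#     from whoosh import fields
#     from whoosh.support.bench import Bench, Spec
#
#     class MySpec(Spec):
#         name = "mybench"
#
#         def whoosh_schema(self):
#             return fields.Schema(title=fields.TEXT(stored=True),
#                                  body=fields.TEXT)
#
#         def documents(self):
#             yield {"title": u"Hello", "body": u"Hello world"}
#
#     if __name__ == "__main__":
#         Bench().run(MySpec)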
from __future__ import division import os.path from optparse import OptionParser from shutil import rmtree from whoosh import index, qparser, query, scoring from whoosh.util import now, find_object try: import xappy except ImportError: pass try: import xapian except ImportError: pass try: import pysolr except ImportError: pass try: from persistent import Persistent class ZDoc(Persistent): def __init__(self, d): self.__dict__.update(d) except ImportError: pass class Module(object): def __init__(self, bench, options, args): self.bench = bench self.options = options self.args = args def __repr__(self): return self.__class__.__name__ def indexer(self, **kwargs): pass def index_document(self, d): raise NotImplementedError def finish(self, **kwargs): pass def _process_result(self, d): attrname = "process_result_%s" % self.options.lib if hasattr(self.bench.spec, attrname): method = getattr(self.bench.spec, attrname) self._process_result = method return method(d) else: self._process_result = lambda x: x return d def searcher(self): pass def query(self): raise NotImplementedError def find(self, q): raise NotImplementedError def findterms(self, terms): raise NotImplementedError def results(self, r): for hit in r: yield self._process_result(hit) class Spec(object): headline_field = "title" main_field = "body" def __init__(self, options, args): self.options = options self.args = args def documents(self): raise NotImplementedError def setup(self): pass def print_results(self, ls): showbody = self.options.showbody snippets = self.options.snippets limit = self.options.limit for i, hit in enumerate(ls): if i >= limit: break print("%d. %s" % (i + 1, hit.get(self.headline_field))) if snippets: print(self.show_snippet(hit)) if showbody: print(hit.get(self.main_field)) class WhooshModule(Module): def indexer(self, create=True): schema = self.bench.spec.whoosh_schema() path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname) if not os.path.exists(path): os.mkdir(path) if create: ix = index.create_in(path, schema) else: ix = index.open_dir(path) poolclass = None if self.options.pool: poolclass = find_object(self.options.pool) self.writer = ix.writer(limitmb=int(self.options.limitmb), poolclass=poolclass, dir=self.options.tempdir, procs=int(self.options.procs), batchsize=int(self.options.batch), multisegment=self.options.xms) self._procdoc = None if hasattr(self.bench.spec, "process_document_whoosh"): self._procdoc = self.bench.spec.process_document_whoosh def index_document(self, d): _procdoc = self._procdoc if _procdoc: _procdoc(d) self.writer.add_document(**d) def finish(self, merge=True, optimize=False): self.writer.commit(merge=merge, optimize=optimize) def searcher(self): path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname) ix = index.open_dir(path) self.srch = ix.searcher(weighting=scoring.PL2()) self.parser = qparser.QueryParser(self.bench.spec.main_field, schema=ix.schema) def query(self): qstring = " ".join(self.args).decode("utf-8") return self.parser.parse(qstring) def find(self, q): return self.srch.search(q, limit=int(self.options.limit), optimize=self.options.optimize) def findterms(self, terms): limit = int(self.options.limit) s = self.srch q = query.Term(self.bench.spec.main_field, None) for term in terms: q.text = term yield s.search(q, limit=limit) class XappyModule(Module): def indexer(self, **kwargs): path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname) conn = self.bench.spec.xappy_connection(path) return conn def 
index_document(self, conn, d): if hasattr(self.bench, "process_document_xappy"): self.bench.process_document_xappy(d) doc = xappy.UnprocessedDocument() for key, values in d: if not isinstance(values, list): values = [values] for value in values: doc.fields.append(xappy.Field(key, value)) conn.add(doc) def finish(self, conn): conn.flush() def searcher(self): path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname) return xappy.SearchConnection(path) def query(self, conn): return conn.query_parse(" ".join(self.args)) def find(self, conn, q): return conn.search(q, 0, int(self.options.limit)) def findterms(self, conn, terms): limit = int(self.options.limit) for term in terms: q = conn.query_field(self.bench.spec.main_field, term) yield conn.search(q, 0, limit) def results(self, r): hf = self.bench.spec.headline_field mf = self.bench.spec.main_field for hit in r: yield self._process_result({hf: hit.data[hf], mf: hit.data[mf]}) class XapianModule(Module): def indexer(self, **kwargs): path = os.path.join(self.options.dir, "%s_xapian" % self.options.indexname) self.database = xapian.WritableDatabase(path, xapian.DB_CREATE_OR_OPEN) self.ixer = xapian.TermGenerator() def index_document(self, d): if hasattr(self.bench, "process_document_xapian"): self.bench.process_document_xapian(d) doc = xapian.Document() doc.add_value(0, d.get(self.bench.spec.headline_field, "-")) doc.set_data(d[self.bench.spec.main_field]) self.ixer.set_document(doc) self.ixer.index_text(d[self.bench.spec.main_field]) self.database.add_document(doc) def finish(self, **kwargs): self.database.flush() def searcher(self): path = os.path.join(self.options.dir, "%s_xappy" % self.options.indexname) self.db = xapian.Database(path) self.enq = xapian.Enquire(self.db) self.qp = xapian.QueryParser() self.qp.set_database(self.db) def query(self): return self.qp.parse_query(" ".join(self.args)) def find(self, q): self.enq.set_query(q) return self.enq.get_mset(0, int(self.options.limit)) def findterms(self, terms): limit = int(self.options.limit) for term in terms: q = self.qp.parse_query(term) self.enq.set_query(q) yield self.enq.get_mset(0, limit) def results(self, matches): hf = self.bench.spec.headline_field mf = self.bench.spec.main_field for m in matches: yield self._process_result({hf: m.document.get_value(0), mf: m.document.get_data()}) class SolrModule(Module): def indexer(self, **kwargs): self.solr_doclist = [] self.conn = pysolr.Solr(self.options.url) self.conn.delete("*:*") self.conn.commit() def index_document(self, d): self.solr_doclist.append(d) if len(self.solr_doclist) >= int(self.options.batch): self.conn.add(self.solr_doclist, commit=False) self.solr_doclist = [] def finish(self, **kwargs): if self.solr_doclist: self.conn.add(self.solr_doclist) del self.solr_doclist self.conn.optimize(block=True) def searcher(self): self.solr = pysolr.Solr(self.options.url) def query(self): return " ".join(self.args) def find(self, q): return self.solr.search(q, limit=int(self.options.limit)) def findterms(self, terms): limit = int(self.options.limit) for term in terms: yield self.solr.search("body:" + term, limit=limit) class ZcatalogModule(Module): def indexer(self, **kwargs): from ZODB.FileStorage import FileStorage # @UnresolvedImport from ZODB.DB import DB # @UnresolvedImport from zcatalog import catalog # @UnresolvedImport from zcatalog import indexes # @UnresolvedImport import transaction # @UnresolvedImport dir = os.path.join(self.options.dir, "%s_zcatalog" % self.options.indexname) if os.path.exists(dir): 
rmtree(dir) os.mkdir(dir) storage = FileStorage(os.path.join(dir, "index")) db = DB(storage) conn = db.open() self.cat = catalog.Catalog() self.bench.spec.zcatalog_setup(self.cat) conn.root()["cat"] = self.cat transaction.commit() self.zcatalog_count = 0 def index_document(self, d): if hasattr(self.bench, "process_document_zcatalog"): self.bench.process_document_zcatalog(d) doc = ZDoc(d) self.cat.index_doc(doc) self.zcatalog_count += 1 if self.zcatalog_count >= 100: import transaction # @UnresolvedImport transaction.commit() self.zcatalog_count = 0 def finish(self, **kwargs): import transaction # @UnresolvedImport transaction.commit() del self.zcatalog_count def searcher(self): from ZODB.FileStorage import FileStorage # @UnresolvedImport from ZODB.DB import DB # @UnresolvedImport from zcatalog import catalog # @UnresolvedImport from zcatalog import indexes # @UnresolvedImport import transaction # @UnresolvedImport path = os.path.join(self.options.dir, "%s_zcatalog" % self.options.indexname, "index") storage = FileStorage(path) db = DB(storage) conn = db.open() self.cat = conn.root()["cat"] def query(self): return " ".join(self.args) def find(self, q): return self.cat.searchResults(body=q) def findterms(self, terms): for term in terms: yield self.cat.searchResults(body=term) def results(self, r): hf = self.bench.spec.headline_field mf = self.bench.spec.main_field for hit in r: # Have to access the attributes for them to be retrieved yield self._process_result({hf: getattr(hit, hf), mf: getattr(hit, mf)}) class NucularModule(Module): def indexer(self, create=True): import shutil from nucular import Nucular dir = os.path.join(self.options.dir, "%s_nucular" % self.options.indexname) if create: if os.path.exists(dir): shutil.rmtree(dir) os.mkdir(dir) self.archive = Nucular.Nucular(dir) if create: self.archive.create() self.count = 0 def index_document(self, d): try: self.archive.indexDictionary(str(self.count), d) except ValueError: print("d=", d) raise self.count += 1 if not self.count % int(self.options.batch): t = now() self.archive.store(lazy=True) self.indexer(create=False) def finish(self, **kwargs): self.archive.store(lazy=False) self.archive.aggregateRecent(fast=False, verbose=True) self.archive.moveTransientToBase(verbose=True) self.archive.cleanUp() def searcher(self): from nucular import Nucular dir = os.path.join(self.options.dir, "%s_nucular" % self.options.indexname) self.archive = Nucular.Nucular(dir) def query(self): return " ".join(self.args) def find(self, q): return self.archive.dictionaries(q) def findterms(self, terms): for term in terms: q = self.archive.Query() q.anyWord(term) yield q.resultDictionaries() class Bench(object): libs = {"whoosh": WhooshModule, "xappy": XappyModule, "xapian": XapianModule, "solr": SolrModule, "zcatalog": ZcatalogModule, "nucular": NucularModule} def index(self, lib): print("Indexing with %s..." 
% lib) options = self.options every = None if options.every is None else int(options.every) merge = options.merge chunk = int(options.chunk) skip = int(options.skip) upto = int(options.upto) count = 0 skipc = skip starttime = chunkstarttime = now() lib.indexer() for d in self.spec.documents(): skipc -= 1 if not skipc: lib.index_document(d) count += 1 skipc = skip if chunk and not count % chunk: t = now() sofar = t - starttime print("Done %d docs, %0.3f secs for %d, %0.3f total, %0.3f docs/s" % (count, t - chunkstarttime, chunk, sofar, count / sofar)) chunkstarttime = t if count > upto: break if every and not count % every: print("----Commit") lib.finish(merge=merge) lib.indexer(create=False) spooltime = now() print("Spool time:", spooltime - starttime) lib.finish(merge=merge) committime = now() print("Commit time:", committime - spooltime) totaltime = committime - starttime print("Total time to index %d documents: %0.3f secs (%0.3f minutes)" % (count, totaltime, totaltime / 60.0)) print("Indexed %0.3f docs/s" % (count / totaltime)) def search(self, lib): lib.searcher() t = now() q = lib.query() print("Query:", q) r = lib.find(q) print("Search time:", now() - t) t = now() self.spec.print_results(lib.results(r)) print("Print time:", now() - t) def search_file(self, lib): f = open(self.options.termfile, "rb") terms = [line.strip() for line in f] f.close() print("Searching %d terms with %s" % (len(terms), lib)) lib.searcher() starttime = now() for r in lib.findterms(terms): pass searchtime = now() - starttime print("Search time:", searchtime, "searches/s:", float(len(terms)) / searchtime) def _parser(self, name): p = OptionParser() p.add_option("-x", "--lib", dest="lib", help="Name of the library to use to index/search.", default="whoosh") p.add_option("-d", "--dir", dest="dir", metavar="DIRNAME", help="Directory in which to store index.", default=".") p.add_option("-s", "--setup", dest="setup", action="store_true", help="Set up any support files or caches.", default=False) p.add_option("-i", "--index", dest="index", action="store_true", help="Index the documents.", default=False) p.add_option("-n", "--name", dest="indexname", metavar="PREFIX", help="Index name prefix.", default="%s_index" % name) p.add_option("-U", "--url", dest="url", metavar="URL", help="Solr URL", default="http://localhost:8983/solr") p.add_option("-m", "--mb", dest="limitmb", help="Max. 
memory usage, in MB", default="128") p.add_option("-c", "--chunk", dest="chunk", help="Number of documents to index between progress messages.", default=1000) p.add_option("-B", "--batch", dest="batch", help="Batch size for batch adding documents.", default=1000) p.add_option("-k", "--skip", dest="skip", metavar="N", help="Index every Nth document.", default=1) p.add_option("-e", "--commit-every", dest="every", metavar="NUM", help="Commit every NUM documents", default=None) p.add_option("-M", "--no-merge", dest="merge", action="store_false", help="Don't merge segments when doing multiple commits", default=True) p.add_option("-u", "--upto", dest="upto", metavar="N", help="Index up to this document number.", default=600000) p.add_option("-p", "--procs", dest="procs", metavar="NUMBER", help="Number of processors to use.", default=0) p.add_option("-l", "--limit", dest="limit", metavar="N", help="Maximum number of search results to retrieve.", default=10) p.add_option("-b", "--body", dest="showbody", action="store_true", help="Show the body text in search results.", default=False) p.add_option("-g", "--gen", dest="generate", metavar="N", help="Generate a list at most N terms present in all libraries.", default=None) p.add_option("-f", "--file", dest="termfile", metavar="FILENAME", help="Search using the list of terms in this file.", default=None) p.add_option("-t", "--tempdir", dest="tempdir", metavar="DIRNAME", help="Whoosh temp dir", default=None) p.add_option("-P", "--pool", dest="pool", metavar="CLASSNAME", help="Whoosh pool class", default=None) p.add_option("-X", "--xms", dest="xms", action="store_true", help="Experimental Whoosh feature", default=False) p.add_option("-Z", "--storebody", dest="storebody", action="store_true", help="Store the body text in index", default=False) p.add_option("-q", "--snippets", dest="snippets", action="store_true", help="Show highlighted snippets", default=False) p.add_option("-O", "--no-optimize", dest="optimize", action="store_false", help="Turn off searcher optimization", default=True) return p def run(self, specclass): parser = self._parser(specclass.name) options, args = parser.parse_args() self.options = options self.args = args if options.lib not in self.libs: raise Exception("Unknown library: %r" % options.lib) lib = self.libs[options.lib](self, options, args) self.spec = specclass(options, args) if options.setup: self.spec.setup() action = self.search if options.index: action = self.index if options.termfile: action = self.search_file if options.generate: action = self.generate_search_file action(lib) Whoosh-2.5.7/src/whoosh/support/charset.py0000644000076500000240000023530012254366350020712 0ustar mattstaff00000000000000# coding=utf-8 """This module contains tools for working with Sphinx charset table files. These files are useful for doing case and accent folding. See :class:`whoosh.analysis.CharsetTokenizer` and :class:`whoosh.analysis.CharsetFilter`. """ from collections import defaultdict import re from whoosh.compat import izip, u, iteritems, unichr, xrange # This is a straightforward accent-folding charset taken from Carlos Bueno's # article "Accent Folding for Auto-Complete", for use with CharsetFilter. # # http://www.alistapart.com/articles/accent-folding-for-auto-complete/ # # See the article for information and caveats. 
The code is lifted directly # from here: # # http://github.com/aristus/accent-folding/blob/master/accent_fold.py accent_map = { u('H'): u('h'), # H -> h u('I'): u('i'), # I -> i u('J'): u('j'), # J -> j u('N'): u('n'), # N -> n u('P'): u('p'), # P -> p u('S'): u('s'), # S -> s u('T'): u('t'), # T -> t u('W'): u('w'), # W -> w u('Y'): u('y'), # Y -> y u('i'): u('i'), # i -> i u('n'): u('n'), # n -> n u('p'): u('p'), # p -> p u('s'): u('s'), # s -> s u('\xc0'): u('a'), # À -> a u('\xc1'): u('a'), # Á -> a u('\xc2'): u('a'), # Â -> a u('\xc3'): u('a'), # Ã -> a u('\xc4'): u('a'), # Ä -> a u('\xc5'): u('a'), # Å -> a u('\xc7'): u('c'), # Ç -> c u('\xc8'): u('e'), # È -> e u('\xc9'): u('e'), # É -> e u('\xca'): u('e'), # Ê -> e u('\xcb'): u('e'), # Ë -> e u('\xcc'): u('i'), # Ì -> i u('\xcd'): u('i'), # Í -> i u('\xce'): u('i'), # Î -> i u('\xcf'): u('i'), # Ï -> i u('\xd1'): u('n'), # Ñ -> n u('\xd2'): u('o'), # Ò -> o u('\xd3'): u('o'), # Ó -> o u('\xd4'): u('o'), # Ô -> o u('\xd5'): u('o'), # Õ -> o u('\xd6'): u('o'), # Ö -> o u('\xd8'): u('o'), # Ø -> o u('\xd9'): u('u'), # Ù -> u u('\xda'): u('u'), # Ú -> u u('\xdb'): u('u'), # Û -> u u('\xdc'): u('u'), # Ü -> u u('\xdd'): u('y'), # Ý -> y u('\xde'): u('t'), # Þ -> t u('\xdf'): u('s'), # ß -> s u('\xe0'): u('a'), # à -> a u('\xe1'): u('a'), # á -> a u('\xe2'): u('a'), # â -> a u('\xe3'): u('a'), # ã -> a u('\xe4'): u('a'), # ä -> a u('\xe5'): u('a'), # å -> a u('\xe7'): u('c'), # ç -> c u('\xe8'): u('e'), # è -> e u('\xe9'): u('e'), # é -> e u('\xea'): u('e'), # ê -> e u('\xeb'): u('e'), # ë -> e u('\xec'): u('i'), # ì -> i u('\xed'): u('i'), # í -> i u('\xee'): u('i'), # î -> i u('\xef'): u('i'), # ï -> i u('\xf0'): u('d'), # ð -> d u('\xf1'): u('n'), # ñ -> n u('\xf2'): u('o'), # ò -> o u('\xf3'): u('o'), # ó -> o u('\xf4'): u('o'), # ô -> o u('\xf5'): u('o'), # õ -> o u('\xf6'): u('o'), # ö -> o u('\xf8'): u('o'), # ø -> o u('\xf9'): u('u'), # ù -> u u('\xfa'): u('u'), # ú -> u u('\xfb'): u('u'), # û -> u u('\xfc'): u('u'), # ü -> u u('\xfd'): u('y'), # ý -> y u('\xfe'): u('t'), # þ -> t u('\xff'): u('y'), # ÿ -> y u('\u0100'): u('a'), # Ā -> a u('\u0101'): u('a'), # ā -> a u('\u0102'): u('a'), # Ă -> a u('\u0103'): u('a'), # ă -> a u('\u0104'): u('a'), # Ą -> a u('\u0105'): u('a'), # ą -> a u('\u0106'): u('c'), # Ć -> c u('\u0107'): u('c'), # ć -> c u('\u0108'): u('c'), # Ĉ -> c u('\u0109'): u('c'), # ĉ -> c u('\u010a'): u('c'), # Ċ -> c u('\u010b'): u('c'), # ċ -> c u('\u010c'): u('c'), # Č -> c u('\u010d'): u('c'), # č -> c u('\u010e'): u('d'), # Ď -> d u('\u010f'): u('d'), # ď -> d u('\u0110'): u('d'), # Đ -> d u('\u0111'): u('d'), # đ -> d u('\u0112'): u('e'), # Ē -> e u('\u0113'): u('e'), # ē -> e u('\u0114'): u('e'), # Ĕ -> e u('\u0115'): u('e'), # ĕ -> e u('\u0116'): u('e'), # Ė -> e u('\u0117'): u('e'), # ė -> e u('\u0118'): u('e'), # Ę -> e u('\u0119'): u('e'), # ę -> e u('\u011a'): u('e'), # Ě -> e u('\u011b'): u('e'), # ě -> e u('\u011c'): u('g'), # Ĝ -> g u('\u011d'): u('g'), # ĝ -> g u('\u011e'): u('g'), # Ğ -> g u('\u011f'): u('g'), # ğ -> g u('\u0120'): u('g'), # Ġ -> g u('\u0121'): u('g'), # ġ -> g u('\u0122'): u('g'), # Ģ -> g u('\u0123'): u('g'), # ģ -> g u('\u0124'): u('h'), # Ĥ -> h u('\u0125'): u('h'), # ĥ -> h u('\u0126'): u('h'), # Ħ -> h u('\u0127'): u('h'), # ħ -> h u('\u0128'): u('i'), # Ĩ -> i u('\u0129'): u('i'), # ĩ -> i u('\u012a'): u('i'), # Ī -> i u('\u012b'): u('i'), # ī -> i u('\u012c'): u('i'), # Ĭ -> i u('\u012d'): u('i'), # ĭ -> i u('\u012e'): u('i'), # Į -> i u('\u012f'): u('i'), # į -> i 
u('\u0130'): u('i'), # İ -> i u('\u0131'): u('i'), # ı -> i u('\u0134'): u('j'), # Ĵ -> j u('\u0135'): u('j'), # ĵ -> j u('\u0136'): u('k'), # Ķ -> k u('\u0137'): u('k'), # ķ -> k u('\u0139'): u('a'), # Ĺ -> a u('\u013a'): u('l'), # ĺ -> l u('\u013b'): u('l'), # Ļ -> l u('\u013c'): u('l'), # ļ -> l u('\u013d'): u('l'), # Ľ -> l u('\u013e'): u('l'), # ľ -> l u('\u013f'): u('l'), # Ŀ -> l u('\u0140'): u('l'), # ŀ -> l u('\u0141'): u('l'), # Ł -> l u('\u0142'): u('l'), # ł -> l u('\u0143'): u('n'), # Ń -> n u('\u0144'): u('n'), # ń -> n u('\u0145'): u('n'), # Ņ -> n u('\u0146'): u('n'), # ņ -> n u('\u0147'): u('n'), # Ň -> n u('\u0148'): u('n'), # ň -> n u('\u014c'): u('o'), # Ō -> o u('\u014d'): u('o'), # ō -> o u('\u014e'): u('o'), # Ŏ -> o u('\u014f'): u('o'), # ŏ -> o u('\u0150'): u('o'), # Ő -> o u('\u0151'): u('o'), # ő -> o u('\u0154'): u('r'), # Ŕ -> r u('\u0155'): u('r'), # ŕ -> r u('\u0156'): u('r'), # Ŗ -> r u('\u0157'): u('r'), # ŗ -> r u('\u0158'): u('r'), # Ř -> r u('\u0159'): u('r'), # ř -> r u('\u015a'): u('s'), # Ś -> s u('\u015b'): u('s'), # ś -> s u('\u015c'): u('s'), # Ŝ -> s u('\u015d'): u('s'), # ŝ -> s u('\u015e'): u('s'), # Ş -> s u('\u015f'): u('s'), # ş -> s u('\u0160'): u('s'), # Š -> s u('\u0161'): u('s'), # š -> s u('\u0162'): u('t'), # Ţ -> t u('\u0163'): u('t'), # ţ -> t u('\u0164'): u('t'), # Ť -> t u('\u0165'): u('t'), # ť -> t u('\u0166'): u('t'), # Ŧ -> t u('\u0167'): u('t'), # ŧ -> t u('\u0168'): u('u'), # Ũ -> u u('\u0169'): u('u'), # ũ -> u u('\u016a'): u('u'), # Ū -> u u('\u016b'): u('u'), # ū -> u u('\u016c'): u('u'), # Ŭ -> u u('\u016d'): u('u'), # ŭ -> u u('\u016e'): u('u'), # Ů -> u u('\u016f'): u('u'), # ů -> u u('\u0170'): u('u'), # Ű -> u u('\u0171'): u('u'), # ű -> u u('\u0172'): u('u'), # Ų -> u u('\u0173'): u('u'), # ų -> u u('\u0174'): u('w'), # Ŵ -> w u('\u0175'): u('w'), # ŵ -> w u('\u0176'): u('y'), # Ŷ -> y u('\u0177'): u('y'), # ŷ -> y u('\u0178'): u('y'), # Ÿ -> y u('\u0179'): u('z'), # Ź -> z u('\u017a'): u('z'), # ź -> z u('\u017b'): u('z'), # Ż -> z u('\u017c'): u('z'), # ż -> z u('\u017d'): u('z'), # Ž -> z u('\u017e'): u('z'), # ž -> z u('\u0180'): u('b'), # ƀ -> b u('\u0181'): u('b'), # Ɓ -> b u('\u0182'): u('b'), # Ƃ -> b u('\u0183'): u('b'), # ƃ -> b u('\u0187'): u('c'), # Ƈ -> c u('\u0188'): u('c'), # ƈ -> c u('\u0189'): u('d'), # Ɖ -> d u('\u018a'): u('d'), # Ɗ -> d u('\u018b'): u('d'), # Ƌ -> d u('\u018c'): u('d'), # ƌ -> d u('\u018e'): u('e'), # Ǝ -> e u('\u018f'): u('e'), # Ə -> e u('\u0191'): u('f'), # Ƒ -> f u('\u0192'): u('f'), # ƒ -> f u('\u0193'): u('g'), # Ɠ -> g u('\u0197'): u('i'), # Ɨ -> i u('\u0198'): u('k'), # Ƙ -> k u('\u0199'): u('k'), # ƙ -> k u('\u019a'): u('l'), # ƚ -> l u('\u019d'): u('n'), # Ɲ -> n u('\u019e'): u('n'), # ƞ -> n u('\u019f'): u('o'), # Ɵ -> o u('\u01a0'): u('o'), # Ơ -> o u('\u01a1'): u('o'), # ơ -> o u('\u01a4'): u('p'), # Ƥ -> p u('\u01a5'): u('p'), # ƥ -> p u('\u01ab'): u('t'), # ƫ -> t u('\u01ac'): u('t'), # Ƭ -> t u('\u01ad'): u('t'), # ƭ -> t u('\u01ae'): u('t'), # Ʈ -> t u('\u01af'): u('u'), # Ư -> u u('\u01b0'): u('u'), # ư -> u u('\u01b2'): u('v'), # Ʋ -> v u('\u01b3'): u('y'), # Ƴ -> y u('\u01b4'): u('y'), # ƴ -> y u('\u01b5'): u('z'), # Ƶ -> z u('\u01b6'): u('z'), # ƶ -> z u('\u01ba'): u('z'), # ƺ -> z u('\u01cd'): u('a'), # Ǎ -> a u('\u01ce'): u('a'), # ǎ -> a u('\u01cf'): u('i'), # Ǐ -> i u('\u01d0'): u('i'), # ǐ -> i u('\u01d1'): u('o'), # Ǒ -> o u('\u01d2'): u('o'), # ǒ -> o u('\u01d3'): u('u'), # Ǔ -> u u('\u01d4'): u('u'), # ǔ -> u u('\u01d5'): u('u'), # Ǖ -> u u('\u01d6'): 
u('u'), # ǖ -> u u('\u01d7'): u('u'), # Ǘ -> u u('\u01d8'): u('u'), # ǘ -> u u('\u01d9'): u('u'), # Ǚ -> u u('\u01da'): u('u'), # ǚ -> u u('\u01db'): u('u'), # Ǜ -> u u('\u01dc'): u('u'), # ǜ -> u u('\u01dd'): u('e'), # ǝ -> e u('\u01de'): u('a'), # Ǟ -> a u('\u01df'): u('a'), # ǟ -> a u('\u01e0'): u('a'), # Ǡ -> a u('\u01e1'): u('a'), # ǡ -> a u('\u01e2'): u('a'), # Ǣ -> a u('\u01e3'): u('a'), # ǣ -> a u('\u01e4'): u('g'), # Ǥ -> g u('\u01e5'): u('g'), # ǥ -> g u('\u01e6'): u('g'), # Ǧ -> g u('\u01e7'): u('g'), # ǧ -> g u('\u01e8'): u('k'), # Ǩ -> k u('\u01e9'): u('k'), # ǩ -> k u('\u01ea'): u('o'), # Ǫ -> o u('\u01eb'): u('o'), # ǫ -> o u('\u01ec'): u('o'), # Ǭ -> o u('\u01ed'): u('o'), # ǭ -> o u('\u01ee'): u('z'), # Ǯ -> z u('\u01ef'): u('z'), # ǯ -> z u('\u01f0'): u('j'), # ǰ -> j u('\u01f4'): u('g'), # Ǵ -> g u('\u01f5'): u('g'), # ǵ -> g u('\u01f8'): u('n'), # Ǹ -> n u('\u01f9'): u('n'), # ǹ -> n u('\u01fa'): u('a'), # Ǻ -> a u('\u01fb'): u('a'), # ǻ -> a u('\u01fc'): u('a'), # Ǽ -> a u('\u01fd'): u('a'), # ǽ -> a u('\u01fe'): u('o'), # Ǿ -> o u('\u01ff'): u('o'), # ǿ -> o u('\u0200'): u('a'), # Ȁ -> a u('\u0201'): u('a'), # ȁ -> a u('\u0202'): u('a'), # Ȃ -> a u('\u0203'): u('a'), # ȃ -> a u('\u0204'): u('e'), # Ȅ -> e u('\u0205'): u('e'), # ȅ -> e u('\u0206'): u('e'), # Ȇ -> e u('\u0207'): u('e'), # ȇ -> e u('\u0208'): u('i'), # Ȉ -> i u('\u0209'): u('i'), # ȉ -> i u('\u020a'): u('i'), # Ȋ -> i u('\u020b'): u('i'), # ȋ -> i u('\u020c'): u('o'), # Ȍ -> o u('\u020d'): u('o'), # ȍ -> o u('\u020e'): u('o'), # Ȏ -> o u('\u020f'): u('o'), # ȏ -> o u('\u0210'): u('r'), # Ȑ -> r u('\u0211'): u('r'), # ȑ -> r u('\u0212'): u('r'), # Ȓ -> r u('\u0213'): u('r'), # ȓ -> r u('\u0214'): u('u'), # Ȕ -> u u('\u0215'): u('u'), # ȕ -> u u('\u0216'): u('u'), # Ȗ -> u u('\u0217'): u('u'), # ȗ -> u u('\u0218'): u('s'), # Ș -> s u('\u0219'): u('s'), # ș -> s u('\u021a'): u('t'), # Ț -> t u('\u021b'): u('t'), # ț -> t u('\u021e'): u('h'), # Ȟ -> h u('\u021f'): u('h'), # ȟ -> h u('\u0220'): u('n'), # Ƞ -> n u('\u0221'): u('d'), # ȡ -> d u('\u0224'): u('z'), # Ȥ -> z u('\u0225'): u('z'), # ȥ -> z u('\u0226'): u('a'), # Ȧ -> a u('\u0227'): u('a'), # ȧ -> a u('\u0228'): u('e'), # Ȩ -> e u('\u0229'): u('e'), # ȩ -> e u('\u022a'): u('o'), # Ȫ -> o u('\u022b'): u('o'), # ȫ -> o u('\u022c'): u('o'), # Ȭ -> o u('\u022d'): u('o'), # ȭ -> o u('\u022e'): u('o'), # Ȯ -> o u('\u022f'): u('o'), # ȯ -> o u('\u0230'): u('o'), # Ȱ -> o u('\u0231'): u('o'), # ȱ -> o u('\u0232'): u('y'), # Ȳ -> y u('\u0233'): u('y'), # ȳ -> y u('\u0234'): u('l'), # ȴ -> l u('\u0235'): u('n'), # ȵ -> n u('\u0236'): u('t'), # ȶ -> t u('\u0237'): u('j'), # ȷ -> j u('\u023a'): u('a'), # Ⱥ -> a u('\u023b'): u('c'), # Ȼ -> c u('\u023c'): u('c'), # ȼ -> c u('\u023d'): u('l'), # Ƚ -> l u('\u023e'): u('t'), # Ⱦ -> t u('\u0243'): u('b'), # Ƀ -> b u('\u0244'): u('u'), # Ʉ -> u u('\u0246'): u('e'), # Ɇ -> e u('\u0247'): u('e'), # ɇ -> e u('\u0248'): u('j'), # Ɉ -> j u('\u0249'): u('j'), # ɉ -> j u('\u024a'): u('q'), # Ɋ -> q u('\u024b'): u('q'), # ɋ -> q u('\u024c'): u('r'), # Ɍ -> r u('\u024d'): u('r'), # ɍ -> r u('\u024e'): u('y'), # Ɏ -> y u('\u024f'): u('y'), # ɏ -> y u('\u0253'): u('b'), # ɓ -> b u('\u0255'): u('c'), # ɕ -> c u('\u0256'): u('d'), # ɖ -> d u('\u0257'): u('d'), # ɗ -> d u('\u025a'): u('e'), # ɚ -> e u('\u025d'): u('e'), # ɝ -> e u('\u025f'): u('j'), # ɟ -> j u('\u0260'): u('g'), # ɠ -> g u('\u0268'): u('i'), # ɨ -> i u('\u026b'): u('l'), # ɫ -> l u('\u026c'): u('l'), # ɬ -> l u('\u026d'): u('l'), # ɭ -> l u('\u0271'): u('m'), # ɱ -> 
m u('\u0272'): u('n'), # ɲ -> n u('\u0273'): u('n'), # ɳ -> n u('\u0275'): u('o'), # ɵ -> o u('\u027c'): u('r'), # ɼ -> r u('\u027d'): u('r'), # ɽ -> r u('\u027e'): u('r'), # ɾ -> r u('\u0282'): u('s'), # ʂ -> s u('\u0284'): u('j'), # ʄ -> j u('\u0288'): u('t'), # ʈ -> t u('\u0289'): u('u'), # ʉ -> u u('\u028b'): u('v'), # ʋ -> v u('\u028f'): u('y'), # ʏ -> y u('\u0290'): u('z'), # ʐ -> z u('\u0291'): u('z'), # ʑ -> z u('\u029d'): u('j'), # ʝ -> j u('\u02a0'): u('q'), # ʠ -> q u('\u0303'): u('p'), # ̃ -> p u('\u0308'): u('t'), # ̈ -> t u('\u030a'): u('y'), # ̊ -> y u('\u030c'): u('j'), # ̌ -> j u('\u0323'): u('l'), # ̣ -> l u('\u0329'): u('s'), # ̩ -> s u('\u0331'): u('h'), # ̱ -> h u('\u1d6c'): u('b'), # ᵬ -> b u('\u1d6d'): u('d'), # ᵭ -> d u('\u1d6e'): u('f'), # ᵮ -> f u('\u1d72'): u('r'), # ᵲ -> r u('\u1d73'): u('r'), # ᵳ -> r u('\u1d75'): u('t'), # ᵵ -> t u('\u1e00'): u('a'), # Ḁ -> a u('\u1e01'): u('a'), # ḁ -> a u('\u1e02'): u('b'), # Ḃ -> b u('\u1e03'): u('b'), # ḃ -> b u('\u1e04'): u('b'), # Ḅ -> b u('\u1e05'): u('b'), # ḅ -> b u('\u1e06'): u('b'), # Ḇ -> b u('\u1e07'): u('b'), # ḇ -> b u('\u1e08'): u('c'), # Ḉ -> c u('\u1e09'): u('c'), # ḉ -> c u('\u1e0a'): u('d'), # Ḋ -> d u('\u1e0b'): u('d'), # ḋ -> d u('\u1e0c'): u('d'), # Ḍ -> d u('\u1e0d'): u('d'), # ḍ -> d u('\u1e0e'): u('d'), # Ḏ -> d u('\u1e0f'): u('d'), # ḏ -> d u('\u1e10'): u('d'), # Ḑ -> d u('\u1e11'): u('d'), # ḑ -> d u('\u1e12'): u('d'), # Ḓ -> d u('\u1e13'): u('d'), # ḓ -> d u('\u1e14'): u('e'), # Ḕ -> e u('\u1e15'): u('e'), # ḕ -> e u('\u1e16'): u('e'), # Ḗ -> e u('\u1e17'): u('e'), # ḗ -> e u('\u1e18'): u('e'), # Ḙ -> e u('\u1e19'): u('e'), # ḙ -> e u('\u1e1a'): u('e'), # Ḛ -> e u('\u1e1b'): u('e'), # ḛ -> e u('\u1e1c'): u('e'), # Ḝ -> e u('\u1e1d'): u('e'), # ḝ -> e u('\u1e1e'): u('f'), # Ḟ -> f u('\u1e1f'): u('f'), # ḟ -> f u('\u1e20'): u('g'), # Ḡ -> g u('\u1e21'): u('g'), # ḡ -> g u('\u1e22'): u('h'), # Ḣ -> h u('\u1e23'): u('h'), # ḣ -> h u('\u1e24'): u('h'), # Ḥ -> h u('\u1e25'): u('h'), # ḥ -> h u('\u1e26'): u('h'), # Ḧ -> h u('\u1e27'): u('h'), # ḧ -> h u('\u1e28'): u('h'), # Ḩ -> h u('\u1e29'): u('h'), # ḩ -> h u('\u1e2a'): u('h'), # Ḫ -> h u('\u1e2b'): u('h'), # ḫ -> h u('\u1e2c'): u('i'), # Ḭ -> i u('\u1e2d'): u('i'), # ḭ -> i u('\u1e2e'): u('i'), # Ḯ -> i u('\u1e2f'): u('i'), # ḯ -> i u('\u1e30'): u('k'), # Ḱ -> k u('\u1e31'): u('k'), # ḱ -> k u('\u1e32'): u('k'), # Ḳ -> k u('\u1e33'): u('k'), # ḳ -> k u('\u1e34'): u('k'), # Ḵ -> k u('\u1e35'): u('k'), # ḵ -> k u('\u1e36'): u('l'), # Ḷ -> l u('\u1e37'): u('l'), # ḷ -> l u('\u1e38'): u('l'), # Ḹ -> l u('\u1e39'): u('l'), # ḹ -> l u('\u1e3a'): u('l'), # Ḻ -> l u('\u1e3b'): u('l'), # ḻ -> l u('\u1e3c'): u('l'), # Ḽ -> l u('\u1e3d'): u('l'), # ḽ -> l u('\u1e3e'): u('m'), # Ḿ -> m u('\u1e3f'): u('m'), # ḿ -> m u('\u1e40'): u('m'), # Ṁ -> m u('\u1e41'): u('m'), # ṁ -> m u('\u1e42'): u('m'), # Ṃ -> m u('\u1e43'): u('m'), # ṃ -> m u('\u1e44'): u('n'), # Ṅ -> n u('\u1e45'): u('n'), # ṅ -> n u('\u1e46'): u('n'), # Ṇ -> n u('\u1e47'): u('n'), # ṇ -> n u('\u1e48'): u('n'), # Ṉ -> n u('\u1e49'): u('n'), # ṉ -> n u('\u1e4a'): u('n'), # Ṋ -> n u('\u1e4b'): u('n'), # ṋ -> n u('\u1e4c'): u('o'), # Ṍ -> o u('\u1e4d'): u('o'), # ṍ -> o u('\u1e4e'): u('o'), # Ṏ -> o u('\u1e4f'): u('o'), # ṏ -> o u('\u1e50'): u('o'), # Ṑ -> o u('\u1e51'): u('o'), # ṑ -> o u('\u1e52'): u('o'), # Ṓ -> o u('\u1e53'): u('o'), # ṓ -> o u('\u1e54'): u('p'), # Ṕ -> p u('\u1e55'): u('p'), # ṕ -> p u('\u1e56'): u('p'), # Ṗ -> p u('\u1e57'): u('p'), # ṗ -> p u('\u1e58'): u('r'), # Ṙ -> r u('\u1e59'): 
u('r'), # ṙ -> r u('\u1e5a'): u('r'), # Ṛ -> r u('\u1e5b'): u('r'), # ṛ -> r u('\u1e5c'): u('r'), # Ṝ -> r u('\u1e5d'): u('r'), # ṝ -> r u('\u1e5e'): u('r'), # Ṟ -> r u('\u1e5f'): u('r'), # ṟ -> r u('\u1e60'): u('s'), # Ṡ -> s u('\u1e61'): u('s'), # ṡ -> s u('\u1e62'): u('s'), # Ṣ -> s u('\u1e63'): u('s'), # ṣ -> s u('\u1e64'): u('s'), # Ṥ -> s u('\u1e65'): u('s'), # ṥ -> s u('\u1e66'): u('s'), # Ṧ -> s u('\u1e67'): u('s'), # ṧ -> s u('\u1e68'): u('s'), # Ṩ -> s u('\u1e69'): u('s'), # ṩ -> s u('\u1e6a'): u('t'), # Ṫ -> t u('\u1e6b'): u('t'), # ṫ -> t u('\u1e6c'): u('t'), # Ṭ -> t u('\u1e6d'): u('t'), # ṭ -> t u('\u1e6e'): u('t'), # Ṯ -> t u('\u1e6f'): u('t'), # ṯ -> t u('\u1e70'): u('t'), # Ṱ -> t u('\u1e71'): u('t'), # ṱ -> t u('\u1e72'): u('u'), # Ṳ -> u u('\u1e73'): u('u'), # ṳ -> u u('\u1e74'): u('u'), # Ṵ -> u u('\u1e75'): u('u'), # ṵ -> u u('\u1e76'): u('u'), # Ṷ -> u u('\u1e77'): u('u'), # ṷ -> u u('\u1e78'): u('u'), # Ṹ -> u u('\u1e79'): u('u'), # ṹ -> u u('\u1e7a'): u('u'), # Ṻ -> u u('\u1e7b'): u('u'), # ṻ -> u u('\u1e7c'): u('v'), # Ṽ -> v u('\u1e7d'): u('v'), # ṽ -> v u('\u1e7e'): u('v'), # Ṿ -> v u('\u1e7f'): u('v'), # ṿ -> v u('\u1e80'): u('w'), # Ẁ -> w u('\u1e81'): u('w'), # ẁ -> w u('\u1e82'): u('w'), # Ẃ -> w u('\u1e83'): u('w'), # ẃ -> w u('\u1e84'): u('w'), # Ẅ -> w u('\u1e85'): u('w'), # ẅ -> w u('\u1e86'): u('w'), # Ẇ -> w u('\u1e87'): u('w'), # ẇ -> w u('\u1e88'): u('w'), # Ẉ -> w u('\u1e89'): u('w'), # ẉ -> w u('\u1e8a'): u('x'), # Ẋ -> x u('\u1e8b'): u('x'), # ẋ -> x u('\u1e8c'): u('x'), # Ẍ -> x u('\u1e8d'): u('x'), # ẍ -> x u('\u1e8e'): u('y'), # Ẏ -> y u('\u1e8f'): u('y'), # ẏ -> y u('\u1e90'): u('z'), # Ẑ -> z u('\u1e91'): u('z'), # ẑ -> z u('\u1e92'): u('z'), # Ẓ -> z u('\u1e93'): u('z'), # ẓ -> z u('\u1e94'): u('z'), # Ẕ -> z u('\u1e95'): u('z'), # ẕ -> z u('\u1e96'): u('h'), # ẖ -> h u('\u1e97'): u('t'), # ẗ -> t u('\u1e98'): u('w'), # ẘ -> w u('\u1e99'): u('y'), # ẙ -> y u('\u1e9a'): u('a'), # ẚ -> a u('\u1e9b'): u('s'), # ẛ -> s u('\u1ea0'): u('a'), # Ạ -> a u('\u1ea1'): u('a'), # ạ -> a u('\u1ea2'): u('a'), # Ả -> a u('\u1ea3'): u('a'), # ả -> a u('\u1ea4'): u('a'), # Ấ -> a u('\u1ea5'): u('a'), # ấ -> a u('\u1ea6'): u('a'), # Ầ -> a u('\u1ea7'): u('a'), # ầ -> a u('\u1ea8'): u('a'), # Ẩ -> a u('\u1ea9'): u('a'), # ẩ -> a u('\u1eaa'): u('a'), # Ẫ -> a u('\u1eab'): u('a'), # ẫ -> a u('\u1eac'): u('a'), # Ậ -> a u('\u1ead'): u('a'), # ậ -> a u('\u1eae'): u('a'), # Ắ -> a u('\u1eaf'): u('a'), # ắ -> a u('\u1eb0'): u('a'), # Ằ -> a u('\u1eb1'): u('a'), # ằ -> a u('\u1eb2'): u('a'), # Ẳ -> a u('\u1eb3'): u('a'), # ẳ -> a u('\u1eb4'): u('a'), # Ẵ -> a u('\u1eb5'): u('a'), # ẵ -> a u('\u1eb6'): u('a'), # Ặ -> a u('\u1eb7'): u('a'), # ặ -> a u('\u1eb8'): u('e'), # Ẹ -> e u('\u1eb9'): u('e'), # ẹ -> e u('\u1eba'): u('e'), # Ẻ -> e u('\u1ebb'): u('e'), # ẻ -> e u('\u1ebc'): u('e'), # Ẽ -> e u('\u1ebd'): u('e'), # ẽ -> e u('\u1ebe'): u('e'), # Ế -> e u('\u1ebf'): u('e'), # ế -> e u('\u1ec0'): u('e'), # Ề -> e u('\u1ec1'): u('e'), # ề -> e u('\u1ec2'): u('e'), # Ể -> e u('\u1ec3'): u('e'), # ể -> e u('\u1ec4'): u('e'), # Ễ -> e u('\u1ec5'): u('e'), # ễ -> e u('\u1ec6'): u('e'), # Ệ -> e u('\u1ec7'): u('e'), # ệ -> e u('\u1ec8'): u('i'), # Ỉ -> i u('\u1ec9'): u('i'), # ỉ -> i u('\u1eca'): u('i'), # Ị -> i u('\u1ecb'): u('i'), # ị -> i u('\u1ecc'): u('o'), # Ọ -> o u('\u1ecd'): u('o'), # ọ -> o u('\u1ece'): u('o'), # Ỏ -> o u('\u1ecf'): u('o'), # ỏ -> o u('\u1ed0'): u('o'), # Ố -> o u('\u1ed1'): u('o'), # ố -> o u('\u1ed2'): u('o'), # Ồ -> o u('\u1ed3'): u('o'), # ồ -> 
o u('\u1ed4'): u('o'), # Ổ -> o u('\u1ed5'): u('o'), # ổ -> o u('\u1ed6'): u('o'), # Ỗ -> o u('\u1ed7'): u('o'), # ỗ -> o u('\u1ed8'): u('o'), # Ộ -> o u('\u1ed9'): u('o'), # ộ -> o u('\u1eda'): u('o'), # Ớ -> o u('\u1edb'): u('o'), # ớ -> o u('\u1edc'): u('o'), # Ờ -> o u('\u1edd'): u('o'), # ờ -> o u('\u1ede'): u('o'), # Ở -> o u('\u1edf'): u('o'), # ở -> o u('\u1ee0'): u('o'), # Ỡ -> o u('\u1ee1'): u('o'), # ỡ -> o u('\u1ee2'): u('o'), # Ợ -> o u('\u1ee3'): u('o'), # ợ -> o u('\u1ee4'): u('u'), # Ụ -> u u('\u1ee5'): u('u'), # ụ -> u u('\u1ee6'): u('u'), # Ủ -> u u('\u1ee7'): u('u'), # ủ -> u u('\u1ee8'): u('u'), # Ứ -> u u('\u1ee9'): u('u'), # ứ -> u u('\u1eea'): u('u'), # Ừ -> u u('\u1eeb'): u('u'), # ừ -> u u('\u1eec'): u('u'), # Ử -> u u('\u1eed'): u('u'), # ử -> u u('\u1eee'): u('u'), # Ữ -> u u('\u1eef'): u('u'), # ữ -> u u('\u1ef0'): u('u'), # Ự -> u u('\u1ef1'): u('u'), # ự -> u u('\u1ef2'): u('y'), # Ỳ -> y u('\u1ef3'): u('y'), # ỳ -> y u('\u1ef4'): u('y'), # Ỵ -> y u('\u1ef5'): u('y'), # ỵ -> y u('\u1ef6'): u('y'), # Ỷ -> y u('\u1ef7'): u('y'), # ỷ -> y u('\u1ef8'): u('y'), # Ỹ -> y u('\u1ef9'): u('y'), # ỹ -> y u('\u2c60'): u('l'), # Ⱡ -> l u('\u2c61'): u('l'), # ⱡ -> l u('\u2c62'): u('l'), # Ɫ -> l u('\u2c63'): u('p'), # Ᵽ -> p u('\u2c64'): u('r'), # Ɽ -> r u('\u2c65'): u('a'), # ⱥ -> a u('\u2c66'): u('t'), # ⱦ -> t u('\u2c67'): u('h'), # Ⱨ -> h u('\u2c68'): u('h'), # ⱨ -> h u('\u2c69'): u('k'), # Ⱪ -> k u('\u2c6a'): u('k'), # ⱪ -> k u('\u2c6b'): u('z'), # Ⱬ -> z u('\u2c6c'): u('z'), # ⱬ -> z u('\uff10'): u('0'), # 0 -> 0 u('\uff11'): u('1'), # 1 -> 1 u('\uff12'): u('2'), # 2 -> 2 u('\uff13'): u('3'), # 3 -> 3 u('\uff14'): u('4'), # 4 -> 4 u('\uff15'): u('5'), # 5 -> 5 u('\uff16'): u('6'), # 6 -> 6 u('\uff17'): u('7'), # 7 -> 7 u('\uff18'): u('8'), # 8 -> 8 u('\uff19'): u('9'), # 9 -> 9 u('\uff21'): u('A'), # A -> A u('\uff22'): u('B'), # B -> B u('\uff23'): u('C'), # C -> C u('\uff24'): u('D'), # D -> D u('\uff25'): u('E'), # E -> E u('\uff26'): u('F'), # F -> F u('\uff27'): u('G'), # G -> G u('\uff28'): u('H'), # H -> H u('\uff29'): u('I'), # I -> I u('\uff2a'): u('J'), # J -> J u('\uff2b'): u('K'), # K -> K u('\uff2c'): u('L'), # L -> L u('\uff2d'): u('M'), # M -> M u('\uff2e'): u('N'), # N -> N u('\uff2f'): u('O'), # O -> O u('\uff30'): u('P'), # P -> P u('\uff31'): u('Q'), # Q -> Q u('\uff32'): u('R'), # R -> R u('\uff33'): u('S'), # S -> S u('\uff34'): u('T'), # T -> T u('\uff35'): u('U'), # U -> U u('\uff36'): u('V'), # V -> V u('\uff37'): u('W'), # W -> W u('\uff38'): u('X'), # X -> X u('\uff39'): u('Y'), # Y -> Y u('\uff3a'): u('Z'), # Z -> Z u('\uff41'): u('a'), # a -> a u('\uff42'): u('b'), # b -> b u('\uff43'): u('c'), # c -> c u('\uff44'): u('d'), # d -> d u('\uff45'): u('e'), # e -> e u('\uff46'): u('f'), # f -> f u('\uff47'): u('g'), # g -> g u('\uff48'): u('h'), # h -> h u('\uff49'): u('i'), # i -> i u('\uff4a'): u('j'), # j -> j u('\uff4b'): u('k'), # k -> k u('\uff4c'): u('l'), # l -> l u('\uff4d'): u('m'), # m -> m u('\uff4e'): u('n'), # n -> n u('\uff4f'): u('o'), # o -> o u('\uff50'): u('p'), # p -> p u('\uff51'): u('q'), # q -> q u('\uff52'): u('r'), # r -> r u('\uff53'): u('s'), # s -> s u('\uff54'): u('t'), # t -> t u('\uff55'): u('u'), # u -> u u('\uff56'): u('v'), # v -> v u('\uff57'): u('w'), # w -> w u('\uff58'): u('x'), # x -> x u('\uff59'): u('y'), # y -> y u('\uff5a'): u('z'), # z -> z } # The unicode.translate() method actually requires a dictionary mapping # character *numbers* to characters, for some reason. 
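# A minimal usage sketch (illustration only; the variable names below are
# hypothetical): the ord()-keyed mapping built just below works both with
# unicode.translate() and as the charmap argument of
# whoosh.analysis.CharsetFilter, e.g.:
#
#     from whoosh.analysis import RegexTokenizer, LowercaseFilter, CharsetFilter
#     from whoosh.support.charset import accent_map, default_charset, \
#         charset_table_to_dict
#
#     # Fold accents at index and query time with the compact accent_map above...
#     folding_ana = RegexTokenizer() | LowercaseFilter() | CharsetFilter(accent_map)
#
#     # ...or build a much larger mapping from the Sphinx charset table that
#     # follows and use it the same way.
#     charmap = charset_table_to_dict(default_charset)
#     sphinx_ana = RegexTokenizer() | LowercaseFilter() | CharsetFilter(charmap)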
accent_map = dict((ord(k), v) for k, v in iteritems(accent_map)) # This Sphinx charset table taken from http://speeple.com/unicode-maps.txt default_charset = """ ################################################## # Latin # A U+00C0->a, U+00C1->a, U+00C2->a, U+00C3->a, U+00C4->a, U+00C5->a, U+00E0->a, U+00E1->a, U+00E2->a, U+00E3->a, U+00E4->a, U+00E5->a, U+0100->a, U+0101->a, U+0102->a, U+0103->a, U+010300->a, U+0104->a, U+0105->a, U+01CD->a, U+01CE->a, U+01DE->a, U+01DF->a, U+01E0->a, U+01E1->a, U+01FA->a, U+01FB->a, U+0200->a, U+0201->a, U+0202->a, U+0203->a, U+0226->a, U+0227->a, U+023A->a, U+0250->a, U+04D0->a, U+04D1->a, U+1D2C->a, U+1D43->a, U+1D44->a, U+1D8F->a, U+1E00->a, U+1E01->a, U+1E9A->a, U+1EA0->a, U+1EA1->a, U+1EA2->a, U+1EA3->a, U+1EA4->a, U+1EA5->a, U+1EA6->a, U+1EA7->a, U+1EA8->a, U+1EA9->a, U+1EAA->a, U+1EAB->a, U+1EAC->a, U+1EAD->a, U+1EAE->a, U+1EAF->a, U+1EB0->a, U+1EB1->a, U+1EB2->a, U+1EB3->a, U+1EB4->a, U+1EB5->a, U+1EB6->a, U+1EB7->a, U+2090->a, U+2C65->a # B U+0180->b, U+0181->b, U+0182->b, U+0183->b, U+0243->b, U+0253->b, U+0299->b, U+16D2->b, U+1D03->b, U+1D2E->b, U+1D2F->b, U+1D47->b, U+1D6C->b, U+1D80->b, U+1E02->b, U+1E03->b, U+1E04->b, U+1E05->b, U+1E06->b, U+1E07->b # C U+00C7->c, U+00E7->c, U+0106->c, U+0107->c, U+0108->c, U+0109->c, U+010A->c, U+010B->c, U+010C->c, U+010D->c, U+0187->c, U+0188->c, U+023B->c, U+023C->c, U+0255->c, U+0297->c, U+1D9C->c, U+1D9D->c, U+1E08->c, U+1E09->c, U+212D->c, U+2184->c # D U+010E->d, U+010F->d, U+0110->d, U+0111->d, U+0189->d, U+018A->d, U+018B->d, U+018C->d, U+01C5->d, U+01F2->d, U+0221->d, U+0256->d, U+0257->d, U+1D05->d, U+1D30->d, U+1D48->d, U+1D6D->d, U+1D81->d, U+1D91->d, U+1E0A->d, U+1E0B->d, U+1E0C->d, U+1E0D->d, U+1E0E->d, U+1E0F->d, U+1E10->d, U+1E11->d, U+1E12->d, U+1E13->d # E U+00C8->e, U+00C9->e, U+00CA->e, U+00CB->e, U+00E8->e, U+00E9->e, U+00EA->e, U+00EB->e, U+0112->e, U+0113->e, U+0114->e, U+0115->e, U+0116->e, U+0117->e, U+0118->e, U+0119->e, U+011A->e, U+011B->e, U+018E->e, U+0190->e, U+01DD->e, U+0204->e, U+0205->e, U+0206->e, U+0207->e, U+0228->e, U+0229->e, U+0246->e, U+0247->e, U+0258->e, U+025B->e, U+025C->e, U+025D->e, U+025E->e, U+029A->e, U+1D07->e, U+1D08->e, U+1D31->e, U+1D32->e, U+1D49->e, U+1D4B->e, U+1D4C->e, U+1D92->e, U+1D93->e, U+1D94->e, U+1D9F->e, U+1E14->e, U+1E15->e, U+1E16->e, U+1E17->e, U+1E18->e, U+1E19->e, U+1E1A->e, U+1E1B->e, U+1E1C->e, U+1E1D->e, U+1EB8->e, U+1EB9->e, U+1EBA->e, U+1EBB->e, U+1EBC->e, U+1EBD->e, U+1EBE->e, U+1EBF->e, U+1EC0->e, U+1EC1->e, U+1EC2->e, U+1EC3->e, U+1EC4->e, U+1EC5->e, U+1EC6->e, U+1EC7->e, U+2091->e # F U+0191->f, U+0192->f, U+1D6E->f, U+1D82->f, U+1DA0->f, U+1E1E->f, U+1E1F->f # G U+011C->g, U+011D->g, U+011E->g, U+011F->g, U+0120->g, U+0121->g, U+0122->g, U+0123->g, U+0193->g, U+01E4->g, U+01E5->g, U+01E6->g, U+01E7->g, U+01F4->g, U+01F5->g, U+0260->g, U+0261->g, U+0262->g, U+029B->g, U+1D33->g, U+1D4D->g, U+1D77->g, U+1D79->g, U+1D83->g, U+1DA2->g, U+1E20->g, U+1E21->g # H U+0124->h, U+0125->h, U+0126->h, U+0127->h, U+021E->h, U+021F->h, U+0265->h, U+0266->h, U+029C->h, U+02AE->h, U+02AF->h, U+02B0->h, U+02B1->h, U+1D34->h, U+1DA3->h, U+1E22->h, U+1E23->h, U+1E24->h, U+1E25->h, U+1E26->h, U+1E27->h, U+1E28->h, U+1E29->h, U+1E2A->h, U+1E2B->h, U+1E96->h, U+210C->h, U+2C67->h, U+2C68->h, U+2C75->h, U+2C76->h # I U+00CC->i, U+00CD->i, U+00CE->i, U+00CF->i, U+00EC->i, U+00ED->i, U+00EE->i, U+00EF->i, U+010309->i, U+0128->i, U+0129->i, U+012A->i, U+012B->i, U+012C->i, U+012D->i, U+012E->i, U+012F->i, U+0130->i, U+0131->i, U+0197->i, U+01CF->i, 
U+01D0->i, U+0208->i, U+0209->i, U+020A->i, U+020B->i, U+0268->i, U+026A->i, U+040D->i, U+0418->i, U+0419->i, U+0438->i, U+0439->i, U+0456->i, U+1D09->i, U+1D35->i, U+1D4E->i, U+1D62->i, U+1D7B->i, U+1D96->i, U+1DA4->i, U+1DA6->i, U+1DA7->i, U+1E2C->i, U+1E2D->i, U+1E2E->i, U+1E2F->i, U+1EC8->i, U+1EC9->i, U+1ECA->i, U+1ECB->i, U+2071->i, U+2111->i # J U+0134->j, U+0135->j, U+01C8->j, U+01CB->j, U+01F0->j, U+0237->j, U+0248->j, U+0249->j, U+025F->j, U+0284->j, U+029D->j, U+02B2->j, U+1D0A->j, U+1D36->j, U+1DA1->j, U+1DA8->j # K U+0136->k, U+0137->k, U+0198->k, U+0199->k, U+01E8->k, U+01E9->k, U+029E->k, U+1D0B->k, U+1D37->k, U+1D4F->k, U+1D84->k, U+1E30->k, U+1E31->k, U+1E32->k, U+1E33->k, U+1E34->k, U+1E35->k, U+2C69->k, U+2C6A->k # L U+0139->l, U+013A->l, U+013B->l, U+013C->l, U+013D->l, U+013E->l, U+013F->l, U+0140->l, U+0141->l, U+0142->l, U+019A->l, U+01C8->l, U+0234->l, U+023D->l, U+026B->l, U+026C->l, U+026D->l, U+029F->l, U+02E1->l, U+1D0C->l, U+1D38->l, U+1D85->l, U+1DA9->l, U+1DAA->l, U+1DAB->l, U+1E36->l, U+1E37->l, U+1E38->l, U+1E39->l, U+1E3A->l, U+1E3B->l, U+1E3C->l, U+1E3D->l, U+2C60->l, U+2C61->l, U+2C62->l # M U+019C->m, U+026F->m, U+0270->m, U+0271->m, U+1D0D->m, U+1D1F->m, U+1D39->m, U+1D50->m, U+1D5A->m, U+1D6F->m, U+1D86->m, U+1DAC->m, U+1DAD->m, U+1E3E->m, U+1E3F->m, U+1E40->m, U+1E41->m, U+1E42->m, U+1E43->m # N U+00D1->n, U+00F1->n, U+0143->n, U+0144->n, U+0145->n, U+0146->n, U+0147->n, U+0148->n, U+0149->n, U+019D->n, U+019E->n, U+01CB->n, U+01F8->n, U+01F9->n, U+0220->n, U+0235->n, U+0272->n, U+0273->n, U+0274->n, U+1D0E->n, U+1D3A->n, U+1D3B->n, U+1D70->n, U+1D87->n, U+1DAE->n, U+1DAF->n, U+1DB0->n, U+1E44->n, U+1E45->n, U+1E46->n, U+1E47->n, U+1E48->n, U+1E49->n, U+1E4A->n, U+1E4B->n, U+207F->n # O U+00D2->o, U+00D3->o, U+00D4->o, U+00D5->o, U+00D6->o, U+00D8->o, U+00F2->o, U+00F3->o, U+00F4->o, U+00F5->o, U+00F6->o, U+00F8->o, U+01030F->o, U+014C->o, U+014D->o, U+014E->o, U+014F->o, U+0150->o, U+0151->o, U+0186->o, U+019F->o, U+01A0->o, U+01A1->o, U+01D1->o, U+01D2->o, U+01EA->o, U+01EB->o, U+01EC->o, U+01ED->o, U+01FE->o, U+01FF->o, U+020C->o, U+020D->o, U+020E->o, U+020F->o, U+022A->o, U+022B->o, U+022C->o, U+022D->o, U+022E->o, U+022F->o, U+0230->o, U+0231->o, U+0254->o, U+0275->o, U+043E->o, U+04E6->o, U+04E7->o, U+04E8->o, U+04E9->o, U+04EA->o, U+04EB->o, U+1D0F->o, U+1D10->o, U+1D11->o, U+1D12->o, U+1D13->o, U+1D16->o, U+1D17->o, U+1D3C->o, U+1D52->o, U+1D53->o, U+1D54->o, U+1D55->o, U+1D97->o, U+1DB1->o, U+1E4C->o, U+1E4D->o, U+1E4E->o, U+1E4F->o, U+1E50->o, U+1E51->o, U+1E52->o, U+1E53->o, U+1ECC->o, U+1ECD->o, U+1ECE->o, U+1ECF->o, U+1ED0->o, U+1ED1->o, U+1ED2->o, U+1ED3->o, U+1ED4->o, U+1ED5->o, U+1ED6->o, U+1ED7->o, U+1ED8->o, U+1ED9->o, U+1EDA->o, U+1EDB->o, U+1EDC->o, U+1EDD->o, U+1EDE->o, U+1EDF->o, U+1EE0->o, U+1EE1->o, U+1EE2->o, U+1EE3->o, U+2092->o, U+2C9E->o, U+2C9F->o # P U+01A4->p, U+01A5->p, U+1D18->p, U+1D3E->p, U+1D56->p, U+1D71->p, U+1D7D->p, U+1D88->p, U+1E54->p, U+1E55->p, U+1E56->p, U+1E57->p, U+2C63->p # Q U+024A->q, U+024B->q, U+02A0->q # R U+0154->r, U+0155->r, U+0156->r, U+0157->r, U+0158->r, U+0159->r, U+0210->r, U+0211->r, U+0212->r, U+0213->r, U+024C->r, U+024D->r, U+0279->r, U+027A->r, U+027B->r, U+027C->r, U+027D->r, U+027E->r, U+027F->r, U+0280->r, U+0281->r, U+02B3->r, U+02B4->r, U+02B5->r, U+02B6->r, U+1D19->r, U+1D1A->r, U+1D3F->r, U+1D63->r, U+1D72->r, U+1D73->r, U+1D89->r, U+1DCA->r, U+1E58->r, U+1E59->r, U+1E5A->r, U+1E5B->r, U+1E5C->r, U+1E5D->r, U+1E5E->r, U+1E5F->r, U+211C->r, U+2C64->r # S U+00DF->s, U+015A->s, 
U+015B->s, U+015C->s, U+015D->s, U+015E->s, U+015F->s, U+0160->s, U+0161->s, U+017F->s, U+0218->s, U+0219->s, U+023F->s, U+0282->s, U+02E2->s, U+1D74->s, U+1D8A->s, U+1DB3->s, U+1E60->s, U+1E61->s, U+1E62->s, U+1E63->s, U+1E64->s, U+1E65->s, U+1E66->s, U+1E67->s, U+1E68->s, U+1E69->s, U+1E9B->s # T U+0162->t, U+0163->t, U+0164->t, U+0165->t, U+0166->t, U+0167->t, U+01AB->t, U+01AC->t, U+01AD->t, U+01AE->t, U+021A->t, U+021B->t, U+0236->t, U+023E->t, U+0287->t, U+0288->t, U+1D1B->t, U+1D40->t, U+1D57->t, U+1D75->t, U+1DB5->t, U+1E6A->t, U+1E6B->t, U+1E6C->t, U+1E6D->t, U+1E6E->t, U+1E6F->t, U+1E70->t, U+1E71->t, U+1E97->t, U+2C66->t # U U+00D9->u, U+00DA->u, U+00DB->u, U+00DC->u, U+00F9->u, U+00FA->u, U+00FB->u, U+00FC->u, U+010316->u, U+0168->u, U+0169->u, U+016A->u, U+016B->u, U+016C->u, U+016D->u, U+016E->u, U+016F->u, U+0170->u, U+0171->u, U+0172->u, U+0173->u, U+01AF->u, U+01B0->u, U+01D3->u, U+01D4->u, U+01D5->u, U+01D6->u, U+01D7->u, U+01D8->u, U+01D9->u, U+01DA->u, U+01DB->u, U+01DC->u, U+0214->u, U+0215->u, U+0216->u, U+0217->u, U+0244->u, U+0289->u, U+1D1C->u, U+1D1D->u, U+1D1E->u, U+1D41->u, U+1D58->u, U+1D59->u, U+1D64->u, U+1D7E->u, U+1D99->u, U+1DB6->u, U+1DB8->u, U+1E72->u, U+1E73->u, U+1E74->u, U+1E75->u, U+1E76->u, U+1E77->u, U+1E78->u, U+1E79->u, U+1E7A->u, U+1E7B->u, U+1EE4->u, U+1EE5->u, U+1EE6->u, U+1EE7->u, U+1EE8->u, U+1EE9->u, U+1EEA->u, U+1EEB->u, U+1EEC->u, U+1EED->u, U+1EEE->u, U+1EEF->u, U+1EF0->u, U+1EF1->u # V U+01B2->v, U+0245->v, U+028B->v, U+028C->v, U+1D20->v, U+1D5B->v, U+1D65->v, U+1D8C->v, U+1DB9->v, U+1DBA->v, U+1E7C->v, U+1E7D->v, U+1E7E->v, U+1E7F->v, U+2C74->v # W U+0174->w, U+0175->w, U+028D->w, U+02B7->w, U+1D21->w, U+1D42->w, U+1E80->w, U+1E81->w, U+1E82->w, U+1E83->w, U+1E84->w, U+1E85->w, U+1E86->w, U+1E87->w, U+1E88->w, U+1E89->w, U+1E98->w # X U+02E3->x, U+1D8D->x, U+1E8A->x, U+1E8B->x, U+1E8C->x, U+1E8D->x, U+2093->x # Y U+00DD->y, U+00FD->y, U+00FF->y, U+0176->y, U+0177->y, U+0178->y, U+01B3->y, U+01B4->y, U+0232->y, U+0233->y, U+024E->y, U+024F->y, U+028E->y, U+028F->y, U+02B8->y, U+1E8E->y, U+1E8F->y, U+1E99->y, U+1EF2->y, U+1EF3->y, U+1EF4->y, U+1EF5->y, U+1EF6->y, U+1EF7->y, U+1EF8->y, U+1EF9->y # Z U+0179->z, U+017A->z, U+017B->z, U+017C->z, U+017D->z, U+017E->z, U+01B5->z, U+01B6->z, U+0224->z, U+0225->z, U+0240->z, U+0290->z, U+0291->z, U+1D22->z, U+1D76->z, U+1D8E->z, U+1DBB->z, U+1DBC->z, U+1DBD->z, U+1E90->z, U+1E91->z, U+1E92->z, U+1E93->z, U+1E94->z, U+1E95->z, U+2128->z, U+2C6B->z, U+2C6C->z # Latin Extras: U+00C6->U+00E6, U+01E2->U+00E6, U+01E3->U+00E6, U+01FC->U+00E6, U+01FD->U+00E6, U+1D01->U+00E6, U+1D02->U+00E6, U+1D2D->U+00E6, U+1D46->U+00E6, U+00E6 ################################################## # Arabic U+0622->U+0627, U+0623->U+0627, U+0624->U+0648, U+0625->U+0627, U+0626->U+064A, U+06C0->U+06D5, U+06C2->U+06C1, U+06D3->U+06D2, U+FB50->U+0671, U+FB51->U+0671, U+FB52->U+067B, U+FB53->U+067B, U+FB54->U+067B, U+FB56->U+067E, U+FB57->U+067E, U+FB58->U+067E, U+FB5A->U+0680, U+FB5B->U+0680, U+FB5C->U+0680, U+FB5E->U+067A, U+FB5F->U+067A, U+FB60->U+067A, U+FB62->U+067F, U+FB63->U+067F, U+FB64->U+067F, U+FB66->U+0679, U+FB67->U+0679, U+FB68->U+0679, U+FB6A->U+06A4, U+FB6B->U+06A4, U+FB6C->U+06A4, U+FB6E->U+06A6, U+FB6F->U+06A6, U+FB70->U+06A6, U+FB72->U+0684, U+FB73->U+0684, U+FB74->U+0684, U+FB76->U+0683, U+FB77->U+0683, U+FB78->U+0683, U+FB7A->U+0686, U+FB7B->U+0686, U+FB7C->U+0686, U+FB7E->U+0687, U+FB7F->U+0687, U+FB80->U+0687, U+FB82->U+068D, U+FB83->U+068D, U+FB84->U+068C, U+FB85->U+068C, U+FB86->U+068E, 
U+FB87->U+068E, U+FB88->U+0688, U+FB89->U+0688, U+FB8A->U+0698, U+FB8B->U+0698, U+FB8C->U+0691, U+FB8D->U+0691, U+FB8E->U+06A9, U+FB8F->U+06A9, U+FB90->U+06A9, U+FB92->U+06AF, U+FB93->U+06AF, U+FB94->U+06AF, U+FB96->U+06B3, U+FB97->U+06B3, U+FB98->U+06B3, U+FB9A->U+06B1, U+FB9B->U+06B1, U+FB9C->U+06B1, U+FB9E->U+06BA, U+FB9F->U+06BA, U+FBA0->U+06BB, U+FBA1->U+06BB, U+FBA2->U+06BB, U+FBA4->U+06C0, U+FBA5->U+06C0, U+FBA6->U+06C1, U+FBA7->U+06C1, U+FBA8->U+06C1, U+FBAA->U+06BE, U+FBAB->U+06BE, U+FBAC->U+06BE, U+FBAE->U+06D2, U+FBAF->U+06D2, U+FBB0->U+06D3, U+FBB1->U+06D3, U+FBD3->U+06AD, U+FBD4->U+06AD, U+FBD5->U+06AD, U+FBD7->U+06C7, U+FBD8->U+06C7, U+FBD9->U+06C6, U+FBDA->U+06C6, U+FBDB->U+06C8, U+FBDC->U+06C8, U+FBDD->U+0677, U+FBDE->U+06CB, U+FBDF->U+06CB, U+FBE0->U+06C5, U+FBE1->U+06C5, U+FBE2->U+06C9, U+FBE3->U+06C9, U+FBE4->U+06D0, U+FBE5->U+06D0, U+FBE6->U+06D0, U+FBE8->U+0649, U+FBFC->U+06CC, U+FBFD->U+06CC, U+FBFE->U+06CC, U+0621, U+0627..U+063A, U+0641..U+064A, U+0660..U+0669, U+066E, U+066F, U+0671..U+06BF, U+06C1, U+06C3..U+06D2, U+06D5, U+06EE..U+06FC, U+06FF, U+0750..U+076D, U+FB55, U+FB59, U+FB5D, U+FB61, U+FB65, U+FB69, U+FB6D, U+FB71, U+FB75, U+FB79, U+FB7D, U+FB81, U+FB91, U+FB95, U+FB99, U+FB9D, U+FBA3, U+FBA9, U+FBAD, U+FBD6, U+FBE7, U+FBE9, U+FBFF ################################################## # Armenian U+0531..U+0556->U+0561..U+0586, U+0561..U+0586, U+0587 ################################################# # Bengali U+09DC->U+09A1, U+09DD->U+09A2, U+09DF->U+09AF, U+09F0->U+09AC, U+09F1->U+09AC, U+0985..U+0990, U+0993..U+09B0, U+09B2, U+09B6..U+09B9, U+09CE, U+09E0, U+09E1, U+09E6..U+09EF ################################################# # CJK* U+F900->U+8C48, U+F901->U+66F4, U+F902->U+8ECA, U+F903->U+8CC8, U+F904->U+6ED1, U+F905->U+4E32, U+F906->U+53E5, U+F907->U+9F9C, U+F908->U+9F9C, U+F909->U+5951, U+F90A->U+91D1, U+F90B->U+5587, U+F90C->U+5948, U+F90D->U+61F6, U+F90E->U+7669, U+F90F->U+7F85, U+F910->U+863F, U+F911->U+87BA, U+F912->U+88F8, U+F913->U+908F, U+F914->U+6A02, U+F915->U+6D1B, U+F916->U+70D9, U+F917->U+73DE, U+F918->U+843D, U+F919->U+916A, U+F91A->U+99F1, U+F91B->U+4E82, U+F91C->U+5375, U+F91D->U+6B04, U+F91E->U+721B, U+F91F->U+862D, U+F920->U+9E1E, U+F921->U+5D50, U+F922->U+6FEB, U+F923->U+85CD, U+F924->U+8964, U+F925->U+62C9, U+F926->U+81D8, U+F927->U+881F, U+F928->U+5ECA, U+F929->U+6717, U+F92A->U+6D6A, U+F92B->U+72FC, U+F92C->U+90CE, U+F92D->U+4F86, U+F92E->U+51B7, U+F92F->U+52DE, U+F930->U+64C4, U+F931->U+6AD3, U+F932->U+7210, U+F933->U+76E7, U+F934->U+8001, U+F935->U+8606, U+F936->U+865C, U+F937->U+8DEF, U+F938->U+9732, U+F939->U+9B6F, U+F93A->U+9DFA, U+F93B->U+788C, U+F93C->U+797F, U+F93D->U+7DA0, U+F93E->U+83C9, U+F93F->U+9304, U+F940->U+9E7F, U+F941->U+8AD6, U+F942->U+58DF, U+F943->U+5F04, U+F944->U+7C60, U+F945->U+807E, U+F946->U+7262, U+F947->U+78CA, U+F948->U+8CC2, U+F949->U+96F7, U+F94A->U+58D8, U+F94B->U+5C62, U+F94C->U+6A13, U+F94D->U+6DDA, U+F94E->U+6F0F, U+F94F->U+7D2F, U+F950->U+7E37, U+F951->U+964B, U+F952->U+52D2, U+F953->U+808B, U+F954->U+51DC, U+F955->U+51CC, U+F956->U+7A1C, U+F957->U+7DBE, U+F958->U+83F1, U+F959->U+9675, U+F95A->U+8B80, U+F95B->U+62CF, U+F95C->U+6A02, U+F95D->U+8AFE, U+F95E->U+4E39, U+F95F->U+5BE7, U+F960->U+6012, U+F961->U+7387, U+F962->U+7570, U+F963->U+5317, U+F964->U+78FB, U+F965->U+4FBF, U+F966->U+5FA9, U+F967->U+4E0D, U+F968->U+6CCC, U+F969->U+6578, U+F96A->U+7D22, U+F96B->U+53C3, U+F96C->U+585E, U+F96D->U+7701, U+F96E->U+8449, U+F96F->U+8AAA, U+F970->U+6BBA, U+F971->U+8FB0, U+F972->U+6C88, U+F973->U+62FE, 
U+F974->U+82E5, U+F975->U+63A0, U+F976->U+7565, U+F977->U+4EAE, U+F978->U+5169, U+F979->U+51C9, U+F97A->U+6881, U+F97B->U+7CE7, U+F97C->U+826F, U+F97D->U+8AD2, U+F97E->U+91CF, U+F97F->U+52F5, U+F980->U+5442, U+F981->U+5973, U+F982->U+5EEC, U+F983->U+65C5, U+F984->U+6FFE, U+F985->U+792A, U+F986->U+95AD, U+F987->U+9A6A, U+F988->U+9E97, U+F989->U+9ECE, U+F98A->U+529B, U+F98B->U+66C6, U+F98C->U+6B77, U+F98D->U+8F62, U+F98E->U+5E74, U+F98F->U+6190, U+F990->U+6200, U+F991->U+649A, U+F992->U+6F23, U+F993->U+7149, U+F994->U+7489, U+F995->U+79CA, U+F996->U+7DF4, U+F997->U+806F, U+F998->U+8F26, U+F999->U+84EE, U+F99A->U+9023, U+F99B->U+934A, U+F99C->U+5217, U+F99D->U+52A3, U+F99E->U+54BD, U+F99F->U+70C8, U+F9A0->U+88C2, U+F9A1->U+8AAA, U+F9A2->U+5EC9, U+F9A3->U+5FF5, U+F9A4->U+637B, U+F9A5->U+6BAE, U+F9A6->U+7C3E, U+F9A7->U+7375, U+F9A8->U+4EE4, U+F9A9->U+56F9, U+F9AA->U+5BE7, U+F9AB->U+5DBA, U+F9AC->U+601C, U+F9AD->U+73B2, U+F9AE->U+7469, U+F9AF->U+7F9A, U+F9B0->U+8046, U+F9B1->U+9234, U+F9B2->U+96F6, U+F9B3->U+9748, U+F9B4->U+9818, U+F9B5->U+4F8B, U+F9B6->U+79AE, U+F9B7->U+91B4, U+F9B8->U+96B8, U+F9B9->U+60E1, U+F9BA->U+4E86, U+F9BB->U+50DA, U+F9BC->U+5BEE, U+F9BD->U+5C3F, U+F9BE->U+6599, U+F9BF->U+6A02, U+F9C0->U+71CE, U+F9C1->U+7642, U+F9C2->U+84FC, U+F9C3->U+907C, U+F9C4->U+9F8D, U+F9C5->U+6688, U+F9C6->U+962E, U+F9C7->U+5289, U+F9C8->U+677B, U+F9C9->U+67F3, U+F9CA->U+6D41, U+F9CB->U+6E9C, U+F9CC->U+7409, U+F9CD->U+7559, U+F9CE->U+786B, U+F9CF->U+7D10, U+F9D0->U+985E, U+F9D1->U+516D, U+F9D2->U+622E, U+F9D3->U+9678, U+F9D4->U+502B, U+F9D5->U+5D19, U+F9D6->U+6DEA, U+F9D7->U+8F2A, U+F9D8->U+5F8B, U+F9D9->U+6144, U+F9DA->U+6817, U+F9DB->U+7387, U+F9DC->U+9686, U+F9DD->U+5229, U+F9DE->U+540F, U+F9DF->U+5C65, U+F9E0->U+6613, U+F9E1->U+674E, U+F9E2->U+68A8, U+F9E3->U+6CE5, U+F9E4->U+7406, U+F9E5->U+75E2, U+F9E6->U+7F79, U+F9E7->U+88CF, U+F9E8->U+88E1, U+F9E9->U+91CC, U+F9EA->U+96E2, U+F9EB->U+533F, U+F9EC->U+6EBA, U+F9ED->U+541D, U+F9EE->U+71D0, U+F9EF->U+7498, U+F9F0->U+85FA, U+F9F1->U+96A3, U+F9F2->U+9C57, U+F9F3->U+9E9F, U+F9F4->U+6797, U+F9F5->U+6DCB, U+F9F6->U+81E8, U+F9F7->U+7ACB, U+F9F8->U+7B20, U+F9F9->U+7C92, U+F9FA->U+72C0, U+F9FB->U+7099, U+F9FC->U+8B58, U+F9FD->U+4EC0, U+F9FE->U+8336, U+F9FF->U+523A, U+FA00->U+5207, U+FA01->U+5EA6, U+FA02->U+62D3, U+FA03->U+7CD6, U+FA04->U+5B85, U+FA05->U+6D1E, U+FA06->U+66B4, U+FA07->U+8F3B, U+FA08->U+884C, U+FA09->U+964D, U+FA0A->U+898B, U+FA0B->U+5ED3, U+FA0C->U+5140, U+FA0D->U+55C0, U+FA10->U+585A, U+FA12->U+6674, U+FA15->U+51DE, U+FA16->U+732A, U+FA17->U+76CA, U+FA18->U+793C, U+FA19->U+795E, U+FA1A->U+7965, U+FA1B->U+798F, U+FA1C->U+9756, U+FA1D->U+7CBE, U+FA1E->U+7FBD, U+FA20->U+8612, U+FA22->U+8AF8, U+FA25->U+9038, U+FA26->U+90FD, U+FA2A->U+98EF, U+FA2B->U+98FC, U+FA2C->U+9928, U+FA2D->U+9DB4, U+FA30->U+4FAE, U+FA31->U+50E7, U+FA32->U+514D, U+FA33->U+52C9, U+FA34->U+52E4, U+FA35->U+5351, U+FA36->U+559D, U+FA37->U+5606, U+FA38->U+5668, U+FA39->U+5840, U+FA3A->U+58A8, U+FA3B->U+5C64, U+FA3C->U+5C6E, U+FA3D->U+6094, U+FA3E->U+6168, U+FA3F->U+618E, U+FA40->U+61F2, U+FA41->U+654F, U+FA42->U+65E2, U+FA43->U+6691, U+FA44->U+6885, U+FA45->U+6D77, U+FA46->U+6E1A, U+FA47->U+6F22, U+FA48->U+716E, U+FA49->U+722B, U+FA4A->U+7422, U+FA4B->U+7891, U+FA4C->U+793E, U+FA4D->U+7949, U+FA4E->U+7948, U+FA4F->U+7950, U+FA50->U+7956, U+FA51->U+795D, U+FA52->U+798D, U+FA53->U+798E, U+FA54->U+7A40, U+FA55->U+7A81, U+FA56->U+7BC0, U+FA57->U+7DF4, U+FA58->U+7E09, U+FA59->U+7E41, U+FA5A->U+7F72, U+FA5B->U+8005, U+FA5C->U+81ED, U+FA5D->U+8279, U+FA5E->U+8279, U+FA5F->U+8457, 
U+FA60->U+8910, U+FA61->U+8996, U+FA62->U+8B01, U+FA63->U+8B39, U+FA64->U+8CD3, U+FA65->U+8D08, U+FA66->U+8FB6, U+FA67->U+9038, U+FA68->U+96E3, U+FA69->U+97FF, U+FA6A->U+983B, U+FA70->U+4E26, U+FA71->U+51B5, U+FA72->U+5168, U+FA73->U+4F80, U+FA74->U+5145, U+FA75->U+5180, U+FA76->U+52C7, U+FA77->U+52FA, U+FA78->U+559D, U+FA79->U+5555, U+FA7A->U+5599, U+FA7B->U+55E2, U+FA7C->U+585A, U+FA7D->U+58B3, U+FA7E->U+5944, U+FA7F->U+5954, U+FA80->U+5A62, U+FA81->U+5B28, U+FA82->U+5ED2, U+FA83->U+5ED9, U+FA84->U+5F69, U+FA85->U+5FAD, U+FA86->U+60D8, U+FA87->U+614E, U+FA88->U+6108, U+FA89->U+618E, U+FA8A->U+6160, U+FA8B->U+61F2, U+FA8C->U+6234, U+FA8D->U+63C4, U+FA8E->U+641C, U+FA8F->U+6452, U+FA90->U+6556, U+FA91->U+6674, U+FA92->U+6717, U+FA93->U+671B, U+FA94->U+6756, U+FA95->U+6B79, U+FA96->U+6BBA, U+FA97->U+6D41, U+FA98->U+6EDB, U+FA99->U+6ECB, U+FA9A->U+6F22, U+FA9B->U+701E, U+FA9C->U+716E, U+FA9D->U+77A7, U+FA9E->U+7235, U+FA9F->U+72AF, U+FAA0->U+732A, U+FAA1->U+7471, U+FAA2->U+7506, U+FAA3->U+753B, U+FAA4->U+761D, U+FAA5->U+761F, U+FAA6->U+76CA, U+FAA7->U+76DB, U+FAA8->U+76F4, U+FAA9->U+774A, U+FAAA->U+7740, U+FAAB->U+78CC, U+FAAC->U+7AB1, U+FAAD->U+7BC0, U+FAAE->U+7C7B, U+FAAF->U+7D5B, U+FAB0->U+7DF4, U+FAB1->U+7F3E, U+FAB2->U+8005, U+FAB3->U+8352, U+FAB4->U+83EF, U+FAB5->U+8779, U+FAB6->U+8941, U+FAB7->U+8986, U+FAB8->U+8996, U+FAB9->U+8ABF, U+FABA->U+8AF8, U+FABB->U+8ACB, U+FABC->U+8B01, U+FABD->U+8AFE, U+FABE->U+8AED, U+FABF->U+8B39, U+FAC0->U+8B8A, U+FAC1->U+8D08, U+FAC2->U+8F38, U+FAC3->U+9072, U+FAC4->U+9199, U+FAC5->U+9276, U+FAC6->U+967C, U+FAC7->U+96E3, U+FAC8->U+9756, U+FAC9->U+97DB, U+FACA->U+97FF, U+FACB->U+980B, U+FACC->U+983B, U+FACD->U+9B12, U+FACE->U+9F9C, U+FACF->U+2284A, U+FAD0->U+22844, U+FAD1->U+233D5, U+FAD2->U+3B9D, U+FAD3->U+4018, U+FAD4->U+4039, U+FAD5->U+25249, U+FAD6->U+25CD0, U+FAD7->U+27ED3, U+FAD8->U+9F43, U+FAD9->U+9F8E, U+2F800->U+4E3D, U+2F801->U+4E38, U+2F802->U+4E41, U+2F803->U+20122, U+2F804->U+4F60, U+2F805->U+4FAE, U+2F806->U+4FBB, U+2F807->U+5002, U+2F808->U+507A, U+2F809->U+5099, U+2F80A->U+50E7, U+2F80B->U+50CF, U+2F80C->U+349E, U+2F80D->U+2063A, U+2F80E->U+514D, U+2F80F->U+5154, U+2F810->U+5164, U+2F811->U+5177, U+2F812->U+2051C, U+2F813->U+34B9, U+2F814->U+5167, U+2F815->U+518D, U+2F816->U+2054B, U+2F817->U+5197, U+2F818->U+51A4, U+2F819->U+4ECC, U+2F81A->U+51AC, U+2F81B->U+51B5, U+2F81C->U+291DF, U+2F81D->U+51F5, U+2F81E->U+5203, U+2F81F->U+34DF, U+2F820->U+523B, U+2F821->U+5246, U+2F822->U+5272, U+2F823->U+5277, U+2F824->U+3515, U+2F825->U+52C7, U+2F826->U+52C9, U+2F827->U+52E4, U+2F828->U+52FA, U+2F829->U+5305, U+2F82A->U+5306, U+2F82B->U+5317, U+2F82C->U+5349, U+2F82D->U+5351, U+2F82E->U+535A, U+2F82F->U+5373, U+2F830->U+537D, U+2F831->U+537F, U+2F832->U+537F, U+2F833->U+537F, U+2F834->U+20A2C, U+2F835->U+7070, U+2F836->U+53CA, U+2F837->U+53DF, U+2F838->U+20B63, U+2F839->U+53EB, U+2F83A->U+53F1, U+2F83B->U+5406, U+2F83C->U+549E, U+2F83D->U+5438, U+2F83E->U+5448, U+2F83F->U+5468, U+2F840->U+54A2, U+2F841->U+54F6, U+2F842->U+5510, U+2F843->U+5553, U+2F844->U+5563, U+2F845->U+5584, U+2F846->U+5584, U+2F847->U+5599, U+2F848->U+55AB, U+2F849->U+55B3, U+2F84A->U+55C2, U+2F84B->U+5716, U+2F84C->U+5606, U+2F84D->U+5717, U+2F84E->U+5651, U+2F84F->U+5674, U+2F850->U+5207, U+2F851->U+58EE, U+2F852->U+57CE, U+2F853->U+57F4, U+2F854->U+580D, U+2F855->U+578B, U+2F856->U+5832, U+2F857->U+5831, U+2F858->U+58AC, U+2F859->U+214E4, U+2F85A->U+58F2, U+2F85B->U+58F7, U+2F85C->U+5906, U+2F85D->U+591A, U+2F85E->U+5922, U+2F85F->U+5962, U+2F860->U+216A8, U+2F861->U+216EA, 
U+2F862->U+59EC, U+2F863->U+5A1B, U+2F864->U+5A27, U+2F865->U+59D8, U+2F866->U+5A66, U+2F867->U+36EE, U+2F868->U+36FC, U+2F869->U+5B08, U+2F86A->U+5B3E, U+2F86B->U+5B3E, U+2F86C->U+219C8, U+2F86D->U+5BC3, U+2F86E->U+5BD8, U+2F86F->U+5BE7, U+2F870->U+5BF3, U+2F871->U+21B18, U+2F872->U+5BFF, U+2F873->U+5C06, U+2F874->U+5F53, U+2F875->U+5C22, U+2F876->U+3781, U+2F877->U+5C60, U+2F878->U+5C6E, U+2F879->U+5CC0, U+2F87A->U+5C8D, U+2F87B->U+21DE4, U+2F87C->U+5D43, U+2F87D->U+21DE6, U+2F87E->U+5D6E, U+2F87F->U+5D6B, U+2F880->U+5D7C, U+2F881->U+5DE1, U+2F882->U+5DE2, U+2F883->U+382F, U+2F884->U+5DFD, U+2F885->U+5E28, U+2F886->U+5E3D, U+2F887->U+5E69, U+2F888->U+3862, U+2F889->U+22183, U+2F88A->U+387C, U+2F88B->U+5EB0, U+2F88C->U+5EB3, U+2F88D->U+5EB6, U+2F88E->U+5ECA, U+2F88F->U+2A392, U+2F890->U+5EFE, U+2F891->U+22331, U+2F892->U+22331, U+2F893->U+8201, U+2F894->U+5F22, U+2F895->U+5F22, U+2F896->U+38C7, U+2F897->U+232B8, U+2F898->U+261DA, U+2F899->U+5F62, U+2F89A->U+5F6B, U+2F89B->U+38E3, U+2F89C->U+5F9A, U+2F89D->U+5FCD, U+2F89E->U+5FD7, U+2F89F->U+5FF9, U+2F8A0->U+6081, U+2F8A1->U+393A, U+2F8A2->U+391C, U+2F8A3->U+6094, U+2F8A4->U+226D4, U+2F8A5->U+60C7, U+2F8A6->U+6148, U+2F8A7->U+614C, U+2F8A8->U+614E, U+2F8A9->U+614C, U+2F8AA->U+617A, U+2F8AB->U+618E, U+2F8AC->U+61B2, U+2F8AD->U+61A4, U+2F8AE->U+61AF, U+2F8AF->U+61DE, U+2F8B0->U+61F2, U+2F8B1->U+61F6, U+2F8B2->U+6210, U+2F8B3->U+621B, U+2F8B4->U+625D, U+2F8B5->U+62B1, U+2F8B6->U+62D4, U+2F8B7->U+6350, U+2F8B8->U+22B0C, U+2F8B9->U+633D, U+2F8BA->U+62FC, U+2F8BB->U+6368, U+2F8BC->U+6383, U+2F8BD->U+63E4, U+2F8BE->U+22BF1, U+2F8BF->U+6422, U+2F8C0->U+63C5, U+2F8C1->U+63A9, U+2F8C2->U+3A2E, U+2F8C3->U+6469, U+2F8C4->U+647E, U+2F8C5->U+649D, U+2F8C6->U+6477, U+2F8C7->U+3A6C, U+2F8C8->U+654F, U+2F8C9->U+656C, U+2F8CA->U+2300A, U+2F8CB->U+65E3, U+2F8CC->U+66F8, U+2F8CD->U+6649, U+2F8CE->U+3B19, U+2F8CF->U+6691, U+2F8D0->U+3B08, U+2F8D1->U+3AE4, U+2F8D2->U+5192, U+2F8D3->U+5195, U+2F8D4->U+6700, U+2F8D5->U+669C, U+2F8D6->U+80AD, U+2F8D7->U+43D9, U+2F8D8->U+6717, U+2F8D9->U+671B, U+2F8DA->U+6721, U+2F8DB->U+675E, U+2F8DC->U+6753, U+2F8DD->U+233C3, U+2F8DE->U+3B49, U+2F8DF->U+67FA, U+2F8E0->U+6785, U+2F8E1->U+6852, U+2F8E2->U+6885, U+2F8E3->U+2346D, U+2F8E4->U+688E, U+2F8E5->U+681F, U+2F8E6->U+6914, U+2F8E7->U+3B9D, U+2F8E8->U+6942, U+2F8E9->U+69A3, U+2F8EA->U+69EA, U+2F8EB->U+6AA8, U+2F8EC->U+236A3, U+2F8ED->U+6ADB, U+2F8EE->U+3C18, U+2F8EF->U+6B21, U+2F8F0->U+238A7, U+2F8F1->U+6B54, U+2F8F2->U+3C4E, U+2F8F3->U+6B72, U+2F8F4->U+6B9F, U+2F8F5->U+6BBA, U+2F8F6->U+6BBB, U+2F8F7->U+23A8D, U+2F8F8->U+21D0B, U+2F8F9->U+23AFA, U+2F8FA->U+6C4E, U+2F8FB->U+23CBC, U+2F8FC->U+6CBF, U+2F8FD->U+6CCD, U+2F8FE->U+6C67, U+2F8FF->U+6D16, U+2F900->U+6D3E, U+2F901->U+6D77, U+2F902->U+6D41, U+2F903->U+6D69, U+2F904->U+6D78, U+2F905->U+6D85, U+2F906->U+23D1E, U+2F907->U+6D34, U+2F908->U+6E2F, U+2F909->U+6E6E, U+2F90A->U+3D33, U+2F90B->U+6ECB, U+2F90C->U+6EC7, U+2F90D->U+23ED1, U+2F90E->U+6DF9, U+2F90F->U+6F6E, U+2F910->U+23F5E, U+2F911->U+23F8E, U+2F912->U+6FC6, U+2F913->U+7039, U+2F914->U+701E, U+2F915->U+701B, U+2F916->U+3D96, U+2F917->U+704A, U+2F918->U+707D, U+2F919->U+7077, U+2F91A->U+70AD, U+2F91B->U+20525, U+2F91C->U+7145, U+2F91D->U+24263, U+2F91E->U+719C, U+2F91F->U+243AB, U+2F920->U+7228, U+2F921->U+7235, U+2F922->U+7250, U+2F923->U+24608, U+2F924->U+7280, U+2F925->U+7295, U+2F926->U+24735, U+2F927->U+24814, U+2F928->U+737A, U+2F929->U+738B, U+2F92A->U+3EAC, U+2F92B->U+73A5, U+2F92C->U+3EB8, U+2F92D->U+3EB8, U+2F92E->U+7447, U+2F92F->U+745C, U+2F930->U+7471, 
U+2F931->U+7485, U+2F932->U+74CA, U+2F933->U+3F1B, U+2F934->U+7524, U+2F935->U+24C36, U+2F936->U+753E, U+2F937->U+24C92, U+2F938->U+7570, U+2F939->U+2219F, U+2F93A->U+7610, U+2F93B->U+24FA1, U+2F93C->U+24FB8, U+2F93D->U+25044, U+2F93E->U+3FFC, U+2F93F->U+4008, U+2F940->U+76F4, U+2F941->U+250F3, U+2F942->U+250F2, U+2F943->U+25119, U+2F944->U+25133, U+2F945->U+771E, U+2F946->U+771F, U+2F947->U+771F, U+2F948->U+774A, U+2F949->U+4039, U+2F94A->U+778B, U+2F94B->U+4046, U+2F94C->U+4096, U+2F94D->U+2541D, U+2F94E->U+784E, U+2F94F->U+788C, U+2F950->U+78CC, U+2F951->U+40E3, U+2F952->U+25626, U+2F953->U+7956, U+2F954->U+2569A, U+2F955->U+256C5, U+2F956->U+798F, U+2F957->U+79EB, U+2F958->U+412F, U+2F959->U+7A40, U+2F95A->U+7A4A, U+2F95B->U+7A4F, U+2F95C->U+2597C, U+2F95D->U+25AA7, U+2F95E->U+25AA7, U+2F95F->U+7AEE, U+2F960->U+4202, U+2F961->U+25BAB, U+2F962->U+7BC6, U+2F963->U+7BC9, U+2F964->U+4227, U+2F965->U+25C80, U+2F966->U+7CD2, U+2F967->U+42A0, U+2F968->U+7CE8, U+2F969->U+7CE3, U+2F96A->U+7D00, U+2F96B->U+25F86, U+2F96C->U+7D63, U+2F96D->U+4301, U+2F96E->U+7DC7, U+2F96F->U+7E02, U+2F970->U+7E45, U+2F971->U+4334, U+2F972->U+26228, U+2F973->U+26247, U+2F974->U+4359, U+2F975->U+262D9, U+2F976->U+7F7A, U+2F977->U+2633E, U+2F978->U+7F95, U+2F979->U+7FFA, U+2F97A->U+8005, U+2F97B->U+264DA, U+2F97C->U+26523, U+2F97D->U+8060, U+2F97E->U+265A8, U+2F97F->U+8070, U+2F980->U+2335F, U+2F981->U+43D5, U+2F982->U+80B2, U+2F983->U+8103, U+2F984->U+440B, U+2F985->U+813E, U+2F986->U+5AB5, U+2F987->U+267A7, U+2F988->U+267B5, U+2F989->U+23393, U+2F98A->U+2339C, U+2F98B->U+8201, U+2F98C->U+8204, U+2F98D->U+8F9E, U+2F98E->U+446B, U+2F98F->U+8291, U+2F990->U+828B, U+2F991->U+829D, U+2F992->U+52B3, U+2F993->U+82B1, U+2F994->U+82B3, U+2F995->U+82BD, U+2F996->U+82E6, U+2F997->U+26B3C, U+2F998->U+82E5, U+2F999->U+831D, U+2F99A->U+8363, U+2F99B->U+83AD, U+2F99C->U+8323, U+2F99D->U+83BD, U+2F99E->U+83E7, U+2F99F->U+8457, U+2F9A0->U+8353, U+2F9A1->U+83CA, U+2F9A2->U+83CC, U+2F9A3->U+83DC, U+2F9A4->U+26C36, U+2F9A5->U+26D6B, U+2F9A6->U+26CD5, U+2F9A7->U+452B, U+2F9A8->U+84F1, U+2F9A9->U+84F3, U+2F9AA->U+8516, U+2F9AB->U+273CA, U+2F9AC->U+8564, U+2F9AD->U+26F2C, U+2F9AE->U+455D, U+2F9AF->U+4561, U+2F9B0->U+26FB1, U+2F9B1->U+270D2, U+2F9B2->U+456B, U+2F9B3->U+8650, U+2F9B4->U+865C, U+2F9B5->U+8667, U+2F9B6->U+8669, U+2F9B7->U+86A9, U+2F9B8->U+8688, U+2F9B9->U+870E, U+2F9BA->U+86E2, U+2F9BB->U+8779, U+2F9BC->U+8728, U+2F9BD->U+876B, U+2F9BE->U+8786, U+2F9BF->U+45D7, U+2F9C0->U+87E1, U+2F9C1->U+8801, U+2F9C2->U+45F9, U+2F9C3->U+8860, U+2F9C4->U+8863, U+2F9C5->U+27667, U+2F9C6->U+88D7, U+2F9C7->U+88DE, U+2F9C8->U+4635, U+2F9C9->U+88FA, U+2F9CA->U+34BB, U+2F9CB->U+278AE, U+2F9CC->U+27966, U+2F9CD->U+46BE, U+2F9CE->U+46C7, U+2F9CF->U+8AA0, U+2F9D0->U+8AED, U+2F9D1->U+8B8A, U+2F9D2->U+8C55, U+2F9D3->U+27CA8, U+2F9D4->U+8CAB, U+2F9D5->U+8CC1, U+2F9D6->U+8D1B, U+2F9D7->U+8D77, U+2F9D8->U+27F2F, U+2F9D9->U+20804, U+2F9DA->U+8DCB, U+2F9DB->U+8DBC, U+2F9DC->U+8DF0, U+2F9DD->U+208DE, U+2F9DE->U+8ED4, U+2F9DF->U+8F38, U+2F9E0->U+285D2, U+2F9E1->U+285ED, U+2F9E2->U+9094, U+2F9E3->U+90F1, U+2F9E4->U+9111, U+2F9E5->U+2872E, U+2F9E6->U+911B, U+2F9E7->U+9238, U+2F9E8->U+92D7, U+2F9E9->U+92D8, U+2F9EA->U+927C, U+2F9EB->U+93F9, U+2F9EC->U+9415, U+2F9ED->U+28BFA, U+2F9EE->U+958B, U+2F9EF->U+4995, U+2F9F0->U+95B7, U+2F9F1->U+28D77, U+2F9F2->U+49E6, U+2F9F3->U+96C3, U+2F9F4->U+5DB2, U+2F9F5->U+9723, U+2F9F6->U+29145, U+2F9F7->U+2921A, U+2F9F8->U+4A6E, U+2F9F9->U+4A76, U+2F9FA->U+97E0, U+2F9FB->U+2940A, U+2F9FC->U+4AB2, U+2F9FD->U+29496, 
U+2F9FE->U+980B, U+2F9FF->U+980B, U+2FA00->U+9829, U+2FA01->U+295B6, U+2FA02->U+98E2, U+2FA03->U+4B33, U+2FA04->U+9929, U+2FA05->U+99A7, U+2FA06->U+99C2, U+2FA07->U+99FE, U+2FA08->U+4BCE, U+2FA09->U+29B30, U+2FA0A->U+9B12, U+2FA0B->U+9C40, U+2FA0C->U+9CFD, U+2FA0D->U+4CCE, U+2FA0E->U+4CED, U+2FA0F->U+9D67, U+2FA10->U+2A0CE, U+2FA11->U+4CF8, U+2FA12->U+2A105, U+2FA13->U+2A20E, U+2FA14->U+2A291, U+2FA15->U+9EBB, U+2FA16->U+4D56, U+2FA17->U+9EF9, U+2FA18->U+9EFE, U+2FA19->U+9F05, U+2FA1A->U+9F0F, U+2FA1B->U+9F16, U+2FA1C->U+9F3B, U+2FA1D->U+2A600, U+2F00->U+4E00, U+2F01->U+4E28, U+2F02->U+4E36, U+2F03->U+4E3F, U+2F04->U+4E59, U+2F05->U+4E85, U+2F06->U+4E8C, U+2F07->U+4EA0, U+2F08->U+4EBA, U+2F09->U+513F, U+2F0A->U+5165, U+2F0B->U+516B, U+2F0C->U+5182, U+2F0D->U+5196, U+2F0E->U+51AB, U+2F0F->U+51E0, U+2F10->U+51F5, U+2F11->U+5200, U+2F12->U+529B, U+2F13->U+52F9, U+2F14->U+5315, U+2F15->U+531A, U+2F16->U+5338, U+2F17->U+5341, U+2F18->U+535C, U+2F19->U+5369, U+2F1A->U+5382, U+2F1B->U+53B6, U+2F1C->U+53C8, U+2F1D->U+53E3, U+2F1E->U+56D7, U+2F1F->U+571F, U+2F20->U+58EB, U+2F21->U+5902, U+2F22->U+590A, U+2F23->U+5915, U+2F24->U+5927, U+2F25->U+5973, U+2F26->U+5B50, U+2F27->U+5B80, U+2F28->U+5BF8, U+2F29->U+5C0F, U+2F2A->U+5C22, U+2F2B->U+5C38, U+2F2C->U+5C6E, U+2F2D->U+5C71, U+2F2E->U+5DDB, U+2F2F->U+5DE5, U+2F30->U+5DF1, U+2F31->U+5DFE, U+2F32->U+5E72, U+2F33->U+5E7A, U+2F34->U+5E7F, U+2F35->U+5EF4, U+2F36->U+5EFE, U+2F37->U+5F0B, U+2F38->U+5F13, U+2F39->U+5F50, U+2F3A->U+5F61, U+2F3B->U+5F73, U+2F3C->U+5FC3, U+2F3D->U+6208, U+2F3E->U+6236, U+2F3F->U+624B, U+2F40->U+652F, U+2F41->U+6534, U+2F42->U+6587, U+2F43->U+6597, U+2F44->U+65A4, U+2F45->U+65B9, U+2F46->U+65E0, U+2F47->U+65E5, U+2F48->U+66F0, U+2F49->U+6708, U+2F4A->U+6728, U+2F4B->U+6B20, U+2F4C->U+6B62, U+2F4D->U+6B79, U+2F4E->U+6BB3, U+2F4F->U+6BCB, U+2F50->U+6BD4, U+2F51->U+6BDB, U+2F52->U+6C0F, U+2F53->U+6C14, U+2F54->U+6C34, U+2F55->U+706B, U+2F56->U+722A, U+2F57->U+7236, U+2F58->U+723B, U+2F59->U+723F, U+2F5A->U+7247, U+2F5B->U+7259, U+2F5C->U+725B, U+2F5D->U+72AC, U+2F5E->U+7384, U+2F5F->U+7389, U+2F60->U+74DC, U+2F61->U+74E6, U+2F62->U+7518, U+2F63->U+751F, U+2F64->U+7528, U+2F65->U+7530, U+2F66->U+758B, U+2F67->U+7592, U+2F68->U+7676, U+2F69->U+767D, U+2F6A->U+76AE, U+2F6B->U+76BF, U+2F6C->U+76EE, U+2F6D->U+77DB, U+2F6E->U+77E2, U+2F6F->U+77F3, U+2F70->U+793A, U+2F71->U+79B8, U+2F72->U+79BE, U+2F73->U+7A74, U+2F74->U+7ACB, U+2F75->U+7AF9, U+2F76->U+7C73, U+2F77->U+7CF8, U+2F78->U+7F36, U+2F79->U+7F51, U+2F7A->U+7F8A, U+2F7B->U+7FBD, U+2F7C->U+8001, U+2F7D->U+800C, U+2F7E->U+8012, U+2F7F->U+8033, U+2F80->U+807F, U+2F81->U+8089, U+2F82->U+81E3, U+2F83->U+81EA, U+2F84->U+81F3, U+2F85->U+81FC, U+2F86->U+820C, U+2F87->U+821B, U+2F88->U+821F, U+2F89->U+826E, U+2F8A->U+8272, U+2F8B->U+8278, U+2F8C->U+864D, U+2F8D->U+866B, U+2F8E->U+8840, U+2F8F->U+884C, U+2F90->U+8863, U+2F91->U+897E, U+2F92->U+898B, U+2F93->U+89D2, U+2F94->U+8A00, U+2F95->U+8C37, U+2F96->U+8C46, U+2F97->U+8C55, U+2F98->U+8C78, U+2F99->U+8C9D, U+2F9A->U+8D64, U+2F9B->U+8D70, U+2F9C->U+8DB3, U+2F9D->U+8EAB, U+2F9E->U+8ECA, U+2F9F->U+8F9B, U+2FA0->U+8FB0, U+2FA1->U+8FB5, U+2FA2->U+9091, U+2FA3->U+9149, U+2FA4->U+91C6, U+2FA5->U+91CC, U+2FA6->U+91D1, U+2FA7->U+9577, U+2FA8->U+9580, U+2FA9->U+961C, U+2FAA->U+96B6, U+2FAB->U+96B9, U+2FAC->U+96E8, U+2FAD->U+9751, U+2FAE->U+975E, U+2FAF->U+9762, U+2FB0->U+9769, U+2FB1->U+97CB, U+2FB2->U+97ED, U+2FB3->U+97F3, U+2FB4->U+9801, U+2FB5->U+98A8, U+2FB6->U+98DB, U+2FB7->U+98DF, U+2FB8->U+9996, U+2FB9->U+9999, U+2FBA->U+99AC, 
U+2FBB->U+9AA8, U+2FBC->U+9AD8, U+2FBD->U+9ADF, U+2FBE->U+9B25, U+2FBF->U+9B2F, U+2FC0->U+9B32, U+2FC1->U+9B3C, U+2FC2->U+9B5A, U+2FC3->U+9CE5, U+2FC4->U+9E75, U+2FC5->U+9E7F, U+2FC6->U+9EA5, U+2FC7->U+9EBB, U+2FC8->U+9EC3, U+2FC9->U+9ECD, U+2FCA->U+9ED1, U+2FCB->U+9EF9, U+2FCC->U+9EFD, U+2FCD->U+9F0E, U+2FCE->U+9F13, U+2FCF->U+9F20, U+2FD0->U+9F3B, U+2FD1->U+9F4A, U+2FD2->U+9F52, U+2FD3->U+9F8D, U+2FD4->U+9F9C, U+2FD5->U+9FA0, U+3042->U+3041, U+3044->U+3043, U+3046->U+3045, U+3048->U+3047, U+304A->U+3049, U+304C->U+304B, U+304E->U+304D, U+3050->U+304F, U+3052->U+3051, U+3054->U+3053, U+3056->U+3055, U+3058->U+3057, U+305A->U+3059, U+305C->U+305B, U+305E->U+305D, U+3060->U+305F, U+3062->U+3061, U+3064->U+3063, U+3065->U+3063, U+3067->U+3066, U+3069->U+3068, U+3070->U+306F, U+3071->U+306F, U+3073->U+3072, U+3074->U+3072, U+3076->U+3075, U+3077->U+3075, U+3079->U+3078, U+307A->U+3078, U+307C->U+307B, U+307D->U+307B, U+3084->U+3083, U+3086->U+3085, U+3088->U+3087, U+308F->U+308E, U+3094->U+3046, U+3095->U+304B, U+3096->U+3051, U+30A2->U+30A1, U+30A4->U+30A3, U+30A6->U+30A5, U+30A8->U+30A7, U+30AA->U+30A9, U+30AC->U+30AB, U+30AE->U+30AD, U+30B0->U+30AF, U+30B2->U+30B1, U+30B4->U+30B3, U+30B6->U+30B5, U+30B8->U+30B7, U+30BA->U+30B9, U+30BC->U+30BB, U+30BE->U+30BD, U+30C0->U+30BF, U+30C2->U+30C1, U+30C5->U+30C4, U+30C7->U+30C6, U+30C9->U+30C8, U+30D0->U+30CF, U+30D1->U+30CF, U+30D3->U+30D2, U+30D4->U+30D2, U+30D6->U+30D5, U+30D7->U+30D5, U+30D9->U+30D8, U+30DA->U+30D8, U+30DC->U+30DB, U+30DD->U+30DB, U+30E4->U+30E3, U+30E6->U+30E5, U+30E8->U+30E7, U+30EF->U+30EE, U+30F4->U+30A6, U+30AB->U+30F5, U+30B1->U+30F6, U+30F7->U+30EF, U+30F8->U+30F0, U+30F9->U+30F1, U+30FA->U+30F2, U+30AF->U+31F0, U+30B7->U+31F1, U+30B9->U+31F2, U+30C8->U+31F3, U+30CC->U+31F4, U+30CF->U+31F5, U+30D2->U+31F6, U+30D5->U+31F7, U+30D8->U+31F8, U+30DB->U+31F9, U+30E0->U+31FA, U+30E9->U+31FB, U+30EA->U+31FC, U+30EB->U+31FD, U+30EC->U+31FE, U+30ED->U+31FF, U+FF66->U+30F2, U+FF67->U+30A1, U+FF68->U+30A3, U+FF69->U+30A5, U+FF6A->U+30A7, U+FF6B->U+30A9, U+FF6C->U+30E3, U+FF6D->U+30E5, U+FF6E->U+30E7, U+FF6F->U+30C3, U+FF71->U+30A1, U+FF72->U+30A3, U+FF73->U+30A5, U+FF74->U+30A7, U+FF75->U+30A9, U+FF76->U+30AB, U+FF77->U+30AD, U+FF78->U+30AF, U+FF79->U+30B1, U+FF7A->U+30B3, U+FF7B->U+30B5, U+FF7C->U+30B7, U+FF7D->U+30B9, U+FF7E->U+30BB, U+FF7F->U+30BD, U+FF80->U+30BF, U+FF81->U+30C1, U+FF82->U+30C3, U+FF83->U+30C6, U+FF84->U+30C8, U+FF85->U+30CA, U+FF86->U+30CB, U+FF87->U+30CC, U+FF88->U+30CD, U+FF89->U+30CE, U+FF8A->U+30CF, U+FF8B->U+30D2, U+FF8C->U+30D5, U+FF8D->U+30D8, U+FF8E->U+30DB, U+FF8F->U+30DE, U+FF90->U+30DF, U+FF91->U+30E0, U+FF92->U+30E1, U+FF93->U+30E2, U+FF94->U+30E3, U+FF95->U+30E5, U+FF96->U+30E7, U+FF97->U+30E9, U+FF98->U+30EA, U+FF99->U+30EB, U+FF9A->U+30EC, U+FF9B->U+30ED, U+FF9C->U+30EF, U+FF9D->U+30F3, U+FFA0->U+3164, U+FFA1->U+3131, U+FFA2->U+3132, U+FFA3->U+3133, U+FFA4->U+3134, U+FFA5->U+3135, U+FFA6->U+3136, U+FFA7->U+3137, U+FFA8->U+3138, U+FFA9->U+3139, U+FFAA->U+313A, U+FFAB->U+313B, U+FFAC->U+313C, U+FFAD->U+313D, U+FFAE->U+313E, U+FFAF->U+313F, U+FFB0->U+3140, U+FFB1->U+3141, U+FFB2->U+3142, U+FFB3->U+3143, U+FFB4->U+3144, U+FFB5->U+3145, U+FFB6->U+3146, U+FFB7->U+3147, U+FFB8->U+3148, U+FFB9->U+3149, U+FFBA->U+314A, U+FFBB->U+314B, U+FFBC->U+314C, U+FFBD->U+314D, U+FFBE->U+314E, U+FFC2->U+314F, U+FFC3->U+3150, U+FFC4->U+3151, U+FFC5->U+3152, U+FFC6->U+3153, U+FFC7->U+3154, U+FFCA->U+3155, U+FFCB->U+3156, U+FFCC->U+3157, U+FFCD->U+3158, U+FFCE->U+3159, U+FFCF->U+315A, U+FFD2->U+315B, U+FFD3->U+315C, 
U+FFD4->U+315D, U+FFD5->U+315E, U+FFD6->U+315F, U+FFD7->U+3160, U+FFDA->U+3161, U+FFDB->U+3162, U+FFDC->U+3163, U+3131->U+1100, U+3132->U+1101, U+3133->U+11AA, U+3134->U+1102, U+3135->U+11AC, U+3136->U+11AD, U+3137->U+1103, U+3138->U+1104, U+3139->U+1105, U+313A->U+11B0, U+313B->U+11B1, U+313C->U+11B2, U+313D->U+11B3, U+313E->U+11B4, U+313F->U+11B5, U+3140->U+111A, U+3141->U+1106, U+3142->U+1107, U+3143->U+1108, U+3144->U+1121, U+3145->U+1109, U+3146->U+110A, U+3147->U+110B, U+3148->U+110C, U+3149->U+110D, U+314A->U+110E, U+314B->U+110F, U+314C->U+1110, U+314D->U+1111, U+314E->U+1112, U+314F->U+1161, U+3150->U+1162, U+3151->U+1163, U+3152->U+1164, U+3153->U+1165, U+3154->U+1166, U+3155->U+1167, U+3156->U+1168, U+3157->U+1169, U+3158->U+116A, U+3159->U+116B, U+315A->U+116C, U+315B->U+116D, U+315C->U+116E, U+315D->U+116F, U+315E->U+1170, U+315F->U+1171, U+3160->U+1172, U+3161->U+1173, U+3162->U+1174, U+3163->U+1175, U+3165->U+1114, U+3166->U+1115, U+3167->U+11C7, U+3168->U+11C8, U+3169->U+11CC, U+316A->U+11CE, U+316B->U+11D3, U+316C->U+11D7, U+316D->U+11D9, U+316E->U+111C, U+316F->U+11DD, U+3170->U+11DF, U+3171->U+111D, U+3172->U+111E, U+3173->U+1120, U+3174->U+1122, U+3175->U+1123, U+3176->U+1127, U+3177->U+1129, U+3178->U+112B, U+3179->U+112C, U+317A->U+112D, U+317B->U+112E, U+317C->U+112F, U+317D->U+1132, U+317E->U+1136, U+317F->U+1140, U+3180->U+1147, U+3181->U+114C, U+3182->U+11F1, U+3183->U+11F2, U+3184->U+1157, U+3185->U+1158, U+3186->U+1159, U+3187->U+1184, U+3188->U+1185, U+3189->U+1188, U+318A->U+1191, U+318B->U+1192, U+318C->U+1194, U+318D->U+119E, U+318E->U+11A1, U+A490->U+A408, U+A491->U+A1B9, U+4E00..U+9FBB, U+3400..U+4DB5, U+20000..U+2A6D6, U+FA0E, U+FA0F, U+FA11, U+FA13, U+FA14, U+FA1F, U+FA21, U+FA23, U+FA24, U+FA27, U+FA28, U+FA29, U+3105..U+312C, U+31A0..U+31B7, U+3041, U+3043, U+3045, U+3047, U+3049, U+304B, U+304D, U+304F, U+3051, U+3053, U+3055, U+3057, U+3059, U+305B, U+305D, U+305F, U+3061, U+3063, U+3066, U+3068, U+306A..U+306F, U+3072, U+3075, U+3078, U+307B, U+307E..U+3083, U+3085, U+3087, U+3089..U+308E, U+3090..U+3093, U+30A1, U+30A3, U+30A5, U+30A7, U+30A9, U+30AD, U+30AF, U+30B3, U+30B5, U+30BB, U+30BD, U+30BF, U+30C1, U+30C3, U+30C4, U+30C6, U+30CA, U+30CB, U+30CD, U+30CE, U+30DE, U+30DF, U+30E1, U+30E2, U+30E3, U+30E5, U+30E7, U+30EE, U+30F0..U+30F3, U+30F5, U+30F6, U+31F0, U+31F1, U+31F2, U+31F3, U+31F4, U+31F5, U+31F6, U+31F7, U+31F8, U+31F9, U+31FA, U+31FB, U+31FC, U+31FD, U+31FE, U+31FF, U+AC00..U+D7A3, U+1100..U+1159, U+1161..U+11A2, U+11A8..U+11F9, U+A000..U+A48C, U+A492..U+A4C6 ################################################## # Coptic # Notes: Some shared Greek characters, may require amendments. 
U+2C80->U+2C81, U+2C81, U+2C82->U+2C83, U+2C83, U+2C84->U+2C85, U+2C85, U+2C86->U+2C87, U+2C87, U+2C88->U+2C89, U+2C89, U+2C8A->U+2C8B, U+2C8B, U+2C8C->U+2C8D, U+2C8D, U+2C8E->U+2C8F, U+2C8F, U+2C90->U+2C91, U+2C91, U+2C92->U+2C93, U+2C93, U+2C94->U+2C95, U+2C95, U+2C96->U+2C97, U+2C97, U+2C98->U+2C99, U+2C99, U+2C9A->U+2C9B, U+2C9B, U+2C9C->U+2C9D, U+2C9D, U+2C9E->U+2C9F, U+2C9F, U+2CA0->U+2CA1, U+2CA1, U+2CA2->U+2CA3, U+2CA3, U+2CA4->U+2CA5, U+2CA5, U+2CA6->U+2CA7, U+2CA7, U+2CA8->U+2CA9, U+2CA9, U+2CAA->U+2CAB, U+2CAB, U+2CAC->U+2CAD, U+2CAD, U+2CAE->U+2CAF, U+2CAF, U+2CB0->U+2CB1, U+2CB1, U+2CB2->U+2CB3, U+2CB3, U+2CB4->U+2CB5, U+2CB5, U+2CB6->U+2CB7, U+2CB7, U+2CB8->U+2CB9, U+2CB9, U+2CBA->U+2CBB, U+2CBB, U+2CBC->U+2CBD, U+2CBD, U+2CBE->U+2CBF, U+2CBF, U+2CC0->U+2CC1, U+2CC1, U+2CC2->U+2CC3, U+2CC3, U+2CC4->U+2CC5, U+2CC5, U+2CC6->U+2CC7, U+2CC7, U+2CC8->U+2CC9, U+2CC9, U+2CCA->U+2CCB, U+2CCB, U+2CCC->U+2CCD, U+2CCD, U+2CCE->U+2CCF, U+2CCF, U+2CD0->U+2CD1, U+2CD1, U+2CD2->U+2CD3, U+2CD3, U+2CD4->U+2CD5, U+2CD5, U+2CD6->U+2CD7, U+2CD7, U+2CD8->U+2CD9, U+2CD9, U+2CDA->U+2CDB, U+2CDB, U+2CDC->U+2CDD, U+2CDD, U+2CDE->U+2CDF, U+2CDF, U+2CE0->U+2CE1, U+2CE1, U+2CE2->U+2CE3, U+2CE3 ################################################## # Cryllic* U+0400->U+0435, U+0401->U+0435, U+0402->U+0452, U+0452, U+0403->U+0433, U+0404->U+0454, U+0454, U+0405->U+0455, U+0455, U+0406->U+0456, U+0407->U+0456, U+0457->U+0456, U+0456, U+0408..U+040B->U+0458..U+045B, U+0458..U+045B, U+040C->U+043A, U+040D->U+0438, U+040E->U+0443, U+040F->U+045F, U+045F, U+0450->U+0435, U+0451->U+0435, U+0453->U+0433, U+045C->U+043A, U+045D->U+0438, U+045E->U+0443, U+0460->U+0461, U+0461, U+0462->U+0463, U+0463, U+0464->U+0465, U+0465, U+0466->U+0467, U+0467, U+0468->U+0469, U+0469, U+046A->U+046B, U+046B, U+046C->U+046D, U+046D, U+046E->U+046F, U+046F, U+0470->U+0471, U+0471, U+0472->U+0473, U+0473, U+0474->U+0475, U+0476->U+0475, U+0477->U+0475, U+0475, U+0478->U+0479, U+0479, U+047A->U+047B, U+047B, U+047C->U+047D, U+047D, U+047E->U+047F, U+047F, U+0480->U+0481, U+0481, U+048A->U+0438, U+048B->U+0438, U+048C->U+044C, U+048D->U+044C, U+048E->U+0440, U+048F->U+0440, U+0490->U+0433, U+0491->U+0433, U+0490->U+0433, U+0491->U+0433, U+0492->U+0433, U+0493->U+0433, U+0494->U+0433, U+0495->U+0433, U+0496->U+0436, U+0497->U+0436, U+0498->U+0437, U+0499->U+0437, U+049A->U+043A, U+049B->U+043A, U+049C->U+043A, U+049D->U+043A, U+049E->U+043A, U+049F->U+043A, U+04A0->U+043A, U+04A1->U+043A, U+04A2->U+043D, U+04A3->U+043D, U+04A4->U+043D, U+04A5->U+043D, U+04A6->U+043F, U+04A7->U+043F, U+04A8->U+04A9, U+04A9, U+04AA->U+0441, U+04AB->U+0441, U+04AC->U+0442, U+04AD->U+0442, U+04AE->U+0443, U+04AF->U+0443, U+04B0->U+0443, U+04B1->U+0443, U+04B2->U+0445, U+04B3->U+0445, U+04B4->U+04B5, U+04B5, U+04B6->U+0447, U+04B7->U+0447, U+04B8->U+0447, U+04B9->U+0447, U+04BA->U+04BB, U+04BB, U+04BC->U+04BD, U+04BE->U+04BD, U+04BF->U+04BD, U+04BD, U+04C0->U+04CF, U+04CF, U+04C1->U+0436, U+04C2->U+0436, U+04C3->U+043A, U+04C4->U+043A, U+04C5->U+043B, U+04C6->U+043B, U+04C7->U+043D, U+04C8->U+043D, U+04C9->U+043D, U+04CA->U+043D, U+04CB->U+0447, U+04CC->U+0447, U+04CD->U+043C, U+04CE->U+043C, U+04D0->U+0430, U+04D1->U+0430, U+04D2->U+0430, U+04D3->U+0430, U+04D4->U+00E6, U+04D5->U+00E6, U+04D6->U+0435, U+04D7->U+0435, U+04D8->U+04D9, U+04DA->U+04D9, U+04DB->U+04D9, U+04D9, U+04DC->U+0436, U+04DD->U+0436, U+04DE->U+0437, U+04DF->U+0437, U+04E0->U+04E1, U+04E1, U+04E2->U+0438, U+04E3->U+0438, U+04E4->U+0438, U+04E5->U+0438, U+04E6->U+043E, U+04E7->U+043E, 
U+04E8->U+043E, U+04E9->U+043E, U+04EA->U+043E, U+04EB->U+043E, U+04EC->U+044D, U+04ED->U+044D, U+04EE->U+0443, U+04EF->U+0443, U+04F0->U+0443, U+04F1->U+0443, U+04F2->U+0443, U+04F3->U+0443, U+04F4->U+0447, U+04F5->U+0447, U+04F6->U+0433, U+04F7->U+0433, U+04F8->U+044B, U+04F9->U+044B, U+04FA->U+0433, U+04FB->U+0433, U+04FC->U+0445, U+04FD->U+0445, U+04FE->U+0445, U+04FF->U+0445, U+0410..U+0418->U+0430..U+0438, U+0419->U+0438, U+0430..U+0438, U+041A..U+042F->U+043A..U+044F, U+043A..U+044F ################################################## # Devanagari U+0929->U+0928, U+0931->U+0930, U+0934->U+0933, U+0958->U+0915, U+0959->U+0916, U+095A->U+0917, U+095B->U+091C, U+095C->U+0921, U+095D->U+0922, U+095E->U+092B, U+095F->U+092F, U+0904..U+0928, U+092A..U+0930, U+0932, U+0933, U+0935..U+0939, U+0960, U+0961, U+0966..U+096F, U+097B..U+097F ################################################## # Georgian U+10FC->U+10DC, U+10D0..U+10FA, U+10A0..U+10C5->U+2D00..U+2D25, U+2D00..U+2D25 ################################################## # Greek U+0386->U+03B1, U+0388->U+03B5, U+0389->U+03B7, U+038A->U+03B9, U+038C->U+03BF, U+038E->U+03C5, U+038F->U+03C9, U+0390->U+03B9, U+03AA->U+03B9, U+03AB->U+03C5, U+03AC->U+03B1, U+03AD->U+03B5, U+03AE->U+03B7, U+03AF->U+03B9, U+03B0->U+03C5, U+03CA->U+03B9, U+03CB->U+03C5, U+03CC->U+03BF, U+03CD->U+03C5, U+03CE->U+03C9, U+03D0->U+03B2, U+03D1->U+03B8, U+03D2->U+03C5, U+03D3->U+03C5, U+03D4->U+03C5, U+03D5->U+03C6, U+03D6->U+03C0, U+03D8->U+03D9, U+03DA->U+03DB, U+03DC->U+03DD, U+03DE->U+03DF, U+03E0->U+03E1, U+03E2->U+03E3, U+03E4->U+03E5, U+03E6->U+03E7, U+03E8->U+03E9, U+03EA->U+03EB, U+03EC->U+03ED, U+03EE->U+03EF, U+03F0->U+03BA, U+03F1->U+03C1, U+03F2->U+03C3, U+03F4->U+03B8, U+03F5->U+03B5, U+03F6->U+03B5, U+03F7->U+03F8, U+03F9->U+03C3, U+03FA->U+03FB, U+1F00->U+03B1, U+1F01->U+03B1, U+1F02->U+03B1, U+1F03->U+03B1, U+1F04->U+03B1, U+1F05->U+03B1, U+1F06->U+03B1, U+1F07->U+03B1, U+1F08->U+03B1, U+1F09->U+03B1, U+1F0A->U+03B1, U+1F0B->U+03B1, U+1F0C->U+03B1, U+1F0D->U+03B1, U+1F0E->U+03B1, U+1F0F->U+03B1, U+1F10->U+03B5, U+1F11->U+03B5, U+1F12->U+03B5, U+1F13->U+03B5, U+1F14->U+03B5, U+1F15->U+03B5, U+1F18->U+03B5, U+1F19->U+03B5, U+1F1A->U+03B5, U+1F1B->U+03B5, U+1F1C->U+03B5, U+1F1D->U+03B5, U+1F20->U+03B7, U+1F21->U+03B7, U+1F22->U+03B7, U+1F23->U+03B7, U+1F24->U+03B7, U+1F25->U+03B7, U+1F26->U+03B7, U+1F27->U+03B7, U+1F28->U+03B7, U+1F29->U+03B7, U+1F2A->U+03B7, U+1F2B->U+03B7, U+1F2C->U+03B7, U+1F2D->U+03B7, U+1F2E->U+03B7, U+1F2F->U+03B7, U+1F30->U+03B9, U+1F31->U+03B9, U+1F32->U+03B9, U+1F33->U+03B9, U+1F34->U+03B9, U+1F35->U+03B9, U+1F36->U+03B9, U+1F37->U+03B9, U+1F38->U+03B9, U+1F39->U+03B9, U+1F3A->U+03B9, U+1F3B->U+03B9, U+1F3C->U+03B9, U+1F3D->U+03B9, U+1F3E->U+03B9, U+1F3F->U+03B9, U+1F40->U+03BF, U+1F41->U+03BF, U+1F42->U+03BF, U+1F43->U+03BF, U+1F44->U+03BF, U+1F45->U+03BF, U+1F48->U+03BF, U+1F49->U+03BF, U+1F4A->U+03BF, U+1F4B->U+03BF, U+1F4C->U+03BF, U+1F4D->U+03BF, U+1F50->U+03C5, U+1F51->U+03C5, U+1F52->U+03C5, U+1F53->U+03C5, U+1F54->U+03C5, U+1F55->U+03C5, U+1F56->U+03C5, U+1F57->U+03C5, U+1F59->U+03C5, U+1F5B->U+03C5, U+1F5D->U+03C5, U+1F5F->U+03C5, U+1F60->U+03C9, U+1F61->U+03C9, U+1F62->U+03C9, U+1F63->U+03C9, U+1F64->U+03C9, U+1F65->U+03C9, U+1F66->U+03C9, U+1F67->U+03C9, U+1F68->U+03C9, U+1F69->U+03C9, U+1F6A->U+03C9, U+1F6B->U+03C9, U+1F6C->U+03C9, U+1F6D->U+03C9, U+1F6E->U+03C9, U+1F6F->U+03C9, U+1F70->U+03B1, U+1F71->U+03B1, U+1F72->U+03B5, U+1F73->U+03B5, U+1F74->U+03B7, U+1F75->U+03B7, U+1F76->U+03B9, U+1F77->U+03B9, 
U+1F78->U+03BF, U+1F79->U+03BF, U+1F7A->U+03C5, U+1F7B->U+03C5, U+1F7C->U+03C9, U+1F7D->U+03C9, U+1F80->U+03B1, U+1F81->U+03B1, U+1F82->U+03B1, U+1F83->U+03B1, U+1F84->U+03B1, U+1F85->U+03B1, U+1F86->U+03B1, U+1F87->U+03B1, U+1F88->U+03B1, U+1F89->U+03B1, U+1F8A->U+03B1, U+1F8B->U+03B1, U+1F8C->U+03B1, U+1F8D->U+03B1, U+1F8E->U+03B1, U+1F8F->U+03B1, U+1F90->U+03B7, U+1F91->U+03B7, U+1F92->U+03B7, U+1F93->U+03B7, U+1F94->U+03B7, U+1F95->U+03B7, U+1F96->U+03B7, U+1F97->U+03B7, U+1F98->U+03B7, U+1F99->U+03B7, U+1F9A->U+03B7, U+1F9B->U+03B7, U+1F9C->U+03B7, U+1F9D->U+03B7, U+1F9E->U+03B7, U+1F9F->U+03B7, U+1FA0->U+03C9, U+1FA1->U+03C9, U+1FA2->U+03C9, U+1FA3->U+03C9, U+1FA4->U+03C9, U+1FA5->U+03C9, U+1FA6->U+03C9, U+1FA7->U+03C9, U+1FA8->U+03C9, U+1FA9->U+03C9, U+1FAA->U+03C9, U+1FAB->U+03C9, U+1FAC->U+03C9, U+1FAD->U+03C9, U+1FAE->U+03C9, U+1FAF->U+03C9, U+1FB0->U+03B1, U+1FB1->U+03B1, U+1FB2->U+03B1, U+1FB3->U+03B1, U+1FB4->U+03B1, U+1FB6->U+03B1, U+1FB7->U+03B1, U+1FB8->U+03B1, U+1FB9->U+03B1, U+1FBA->U+03B1, U+1FBB->U+03B1, U+1FBC->U+03B1, U+1FC2->U+03B7, U+1FC3->U+03B7, U+1FC4->U+03B7, U+1FC6->U+03B7, U+1FC7->U+03B7, U+1FC8->U+03B5, U+1FC9->U+03B5, U+1FCA->U+03B7, U+1FCB->U+03B7, U+1FCC->U+03B7, U+1FD0->U+03B9, U+1FD1->U+03B9, U+1FD2->U+03B9, U+1FD3->U+03B9, U+1FD6->U+03B9, U+1FD7->U+03B9, U+1FD8->U+03B9, U+1FD9->U+03B9, U+1FDA->U+03B9, U+1FDB->U+03B9, U+1FE0->U+03C5, U+1FE1->U+03C5, U+1FE2->U+03C5, U+1FE3->U+03C5, U+1FE4->U+03C1, U+1FE5->U+03C1, U+1FE6->U+03C5, U+1FE7->U+03C5, U+1FE8->U+03C5, U+1FE9->U+03C5, U+1FEA->U+03C5, U+1FEB->U+03C5, U+1FEC->U+03C1, U+1FF2->U+03C9, U+1FF3->U+03C9, U+1FF4->U+03C9, U+1FF6->U+03C9, U+1FF7->U+03C9, U+1FF8->U+03BF, U+1FF9->U+03BF, U+1FFA->U+03C9, U+1FFB->U+03C9, U+1FFC->U+03C9, U+0391..U+03A1->U+03B1..U+03C1, U+03B1..U+03C1, U+03A3..U+03A9->U+03C3..U+03C9, U+03C3..U+03C9, U+03C2, U+03D9, U+03DB, U+03DD, U+03DF, U+03E1, U+03E3, U+03E5, U+03E7, U+03E9, U+03EB, U+03ED, U+03EF, U+03F3, U+03F8, U+03FB ################################################## # Gujarati U+0A85..U+0A8C, U+0A8F, U+0A90, U+0A93..U+0AB0, U+0AB2, U+0AB3, U+0AB5..U+0AB9, U+0AE0, U+0AE1, U+0AE6..U+0AEF ################################################## # Gurmukhi U+0A33->U+0A32, U+0A36->U+0A38, U+0A59->U+0A16, U+0A5A->U+0A17, U+0A5B->U+0A1C, U+0A5E->U+0A2B, U+0A05..U+0A0A, U+0A0F, U+0A10, U+0A13..U+0A28, U+0A2A..U+0A30, U+0A32, U+0A35, U+0A38, U+0A39, U+0A5C, U+0A66..U+0A6F ################################################# # Hebrew* U+FB1D->U+05D9, U+FB1F->U+05F2, U+FB20->U+05E2, U+FB21->U+05D0, U+FB22->U+05D3, U+FB23->U+05D4, U+FB24->U+05DB, U+FB25->U+05DC, U+FB26->U+05DD, U+FB27->U+05E8, U+FB28->U+05EA, U+FB2A->U+05E9, U+FB2B->U+05E9, U+FB2C->U+05E9, U+FB2D->U+05E9, U+FB2E->U+05D0, U+FB2F->U+05D0, U+FB30->U+05D0, U+FB31->U+05D1, U+FB32->U+05D2, U+FB33->U+05D3, U+FB34->U+05D4, U+FB35->U+05D5, U+FB36->U+05D6, U+FB38->U+05D8, U+FB39->U+05D9, U+FB3A->U+05DA, U+FB3B->U+05DB, U+FB3C->U+05DC, U+FB3E->U+05DE, U+FB40->U+05E0, U+FB41->U+05E1, U+FB43->U+05E3, U+FB44->U+05E4, U+FB46->U+05E6, U+FB47->U+05E7, U+FB48->U+05E8, U+FB49->U+05E9, U+FB4A->U+05EA, U+FB4B->U+05D5, U+FB4C->U+05D1, U+FB4D->U+05DB, U+FB4E->U+05E4, U+FB4F->U+05D0, U+05D0..U+05F2 ################################################# # Kannada U+0C85..U+0C8C, U+0C8E..U+0C90, U+0C92..U+0CA8, U+0CAA..U+0CB3, U+0CB5..U+0CB9, U+0CE0, U+0CE1, U+0CE6..U+0CEF ################################################# # Limbu U+1900..U+191C, U+1930..U+1938, U+1946..U+194F ################################################# # Malayalam U+0D05..U+0D0C, 
U+0D0E..U+0D10, U+0D12..U+0D28, U+0D2A..U+0D39, U+0D60, U+0D61, U+0D66..U+0D6F ################################################# # Tamil U+0B94->U+0B92, U+0B85..U+0B8A, U+0B8E..U+0B90, U+0B92, U+0B93, U+0B95, U+0B99, U+0B9A, U+0B9C, U+0B9E, U+0B9F, U+0BA3, U+0BA4, U+0BA8..U+0BAA, U+0BAE..U+0BB9, U+0BE6..U+0BEF ################################################# # Thai U+0E01..U+0E30, U+0E32, U+0E33, U+0E40..U+0E46, U+0E50..U+0E5B ################################################## # Common U+FF10..U+FF19->0..9, U+FF21..U+FF3A->a..z, U+FF41..U+FF5A->a..z, 0..9, A..Z->a..z, a..z """ # The expected value format is a commas-separated list of mappings. # Two simplest mappings simply declare a character as valid, and map a single character # to another single character, respectively. But specifying the whole table in such # form would result in bloated and barely manageable specifications. So there are # several syntax shortcuts that let you map ranges of characters at once. The complete # list is as follows: # # A->a # Single char mapping, declares source char 'A' as allowed to occur within keywords # and maps it to destination char 'a' (but does not declare 'a' as allowed). # A..Z->a..z # Range mapping, declares all chars in source range as allowed and maps them to # the destination range. Does not declare destination range as allowed. Also checks # ranges' lengths (the lengths must be equal). # a # Stray char mapping, declares a character as allowed and maps it to itself. # Equivalent to a->a single char mapping. # a..z # Stray range mapping, declares all characters in range as allowed and maps them to # themselves. Equivalent to a..z->a..z range mapping. # A..Z/2 # Checkerboard range map. Maps every pair of chars to the second char. # More formally, declares odd characters in range as allowed and maps them to the # even ones; also declares even characters as allowed and maps them to themselves. # For instance, A..Z/2 is equivalent to A->B, B->B, C->D, D->D, ..., Y->Z, Z->Z. # This mapping shortcut is helpful for a number of Unicode blocks where uppercase # and lowercase letters go in such interleaved order instead of contiguous chunks. _dewhite = re.compile(r"\s") _char = r"((?:U\+[0-9A-Fa-f]{4,6})|.)" _char_map = re.compile("^" + _char + "->" + _char + "$") _range_map = re.compile("^" + _char + r"\.\." + _char + "->" + _char + ".." + _char + "$") _stray_char = re.compile("^" + _char + "$") _stray_range = re.compile("^" + _char + r"\.\." + _char + "$") _checker_range = re.compile("^" + _char + r"\.\." + _char + "/2$") def charspec_to_int(string): # Converts a character specification of the form 'A' or 'U+23BC' # to an integer if string.startswith("U+"): return int(string[2:], 16) elif len(string) == 1: return ord(string) else: raise Exception("Can't convert charspec: %r" % string) def charset_table_to_dict(tablestring): """Takes a string with the contents of a Sphinx charset table file and returns a mapping object (a defaultdict, actually) of the kind expected by the unicode.translate() method: that is, it maps a character number to a unicode character or None if the character is not a valid word character. The Sphinx charset table format is described at http://www.sphinxsearch.com/docs/current.html#conf-charset-table. 
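    A sketch of typical usage, assuming the module-level ``default_charset``
    table defined above and the ``CharsetFilter`` from ``whoosh.analysis``
    (wire the resulting analyzer into your own schema as needed):

        from whoosh.analysis import StemmingAnalyzer, CharsetFilter
        from whoosh.support.charset import default_charset, charset_table_to_dict

        # Parsing the big table string is relatively slow, so build the
        # translation map once and reuse it.
        charmap = charset_table_to_dict(default_charset)

        # Fold accented and full-width characters down to their base forms
        # at both index and query time.
        my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap)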
""" #map = {} map = defaultdict(lambda: None) for line in tablestring.split("\n"): if not line or line.startswith("#"): continue line = _dewhite.sub("", line) for item in line.split(","): if not item: continue match = _range_map.match(item) if match: start1 = charspec_to_int(match.group(1)) end1 = charspec_to_int(match.group(2)) start2 = charspec_to_int(match.group(3)) end2 = charspec_to_int(match.group(4)) assert (end1 - start1) == (end2 - start2) try: for fromord, tooord in izip(xrange(start1, end1 + 1), xrange(start2, end2 + 1)): map[fromord] = unichr(tooord) except ValueError: pass continue match = _char_map.match(item) if match: fromord = charspec_to_int(match.group(1)) toord = charspec_to_int(match.group(2)) try: map[fromord] = unichr(toord) except ValueError: pass continue match = _stray_char.match(item) if match: ord = charspec_to_int(match.group(0)) try: map[ord] = unichr(ord) except ValueError: pass continue match = _stray_range.match(item) if match: start = charspec_to_int(match.group(1)) end = charspec_to_int(match.group(2)) try: for ord in xrange(start, end + 1): map[ord] = unichr(ord) except ValueError: pass continue match = _checker_range.match(item) if match: fromord = charspec_to_int(match.group(1)) toord = charspec_to_int(match.group(2)) assert toord - fromord % 2 == 0 for ord in xrange(fromord, toord + 1, 2): try: map[ord] = unichr(ord + 1) map[ord + 1] = unichr(ord + 1) except ValueError: pass continue raise Exception("Don't know what to do with %r" % item) return dict(map) Whoosh-2.5.7/src/whoosh/support/levenshtein.py0000644000076500000240000000451312254366350021605 0ustar mattstaff00000000000000""" Contains functions implementing edit distance algorithms. """ from whoosh.compat import xrange def levenshtein(seq1, seq2, limit=None): """Returns the Levenshtein edit distance between two strings. """ oneago = None thisrow = range(1, len(seq2) + 1) + [0] for x in xrange(len(seq1)): # Python lists wrap around for negative indices, so put the # leftmost column at the *end* of the list. This matches with # the zero-indexed strings and saves extra calculation. oneago, thisrow = thisrow, [0] * len(seq2) + [x + 1] for y in xrange(len(seq2)): delcost = oneago[y] + 1 addcost = thisrow[y - 1] + 1 subcost = oneago[y - 1] + (seq1[x] != seq2[y]) thisrow[y] = min(delcost, addcost, subcost) if limit and x > limit and min(thisrow) > limit: return limit + 1 return thisrow[len(seq2) - 1] def damerau_levenshtein(seq1, seq2, limit=None): """Returns the Damerau-Levenshtein edit distance between two strings. """ oneago = None thisrow = list(range(1, len(seq2) + 1)) + [0] for x in xrange(len(seq1)): # Python lists wrap around for negative indices, so put the # leftmost column at the *end* of the list. This matches with # the zero-indexed strings and saves extra calculation. twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] for y in xrange(len(seq2)): delcost = oneago[y] + 1 addcost = thisrow[y - 1] + 1 subcost = oneago[y - 1] + (seq1[x] != seq2[y]) thisrow[y] = min(delcost, addcost, subcost) # This block deals with transpositions if (x > 0 and y > 0 and seq1[x] == seq2[y - 1] and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]): thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) if limit and x > limit and min(thisrow) > limit: return limit + 1 return thisrow[len(seq2) - 1] def relative(a, b): """Returns the relative distance between two strings, in the range [0-1] where 1 means total equality. 
""" d = distance(a, b) longer = float(max((len(a), len(b)))) shorter = float(min((len(a), len(b)))) r = ((longer - d) / longer) * (shorter / longer) return r distance = damerau_levenshtein Whoosh-2.5.7/src/whoosh/support/relativedelta.py0000644000076500000240000004170312254366350022110 0ustar mattstaff00000000000000""" Copyright (c) 2003-2010 Gustavo Niemeyer This module offers extensions to the standard python 2.3+ datetime module. """ __author__ = "Gustavo Niemeyer " __license__ = "PSF License" import datetime import calendar __all__ = ["relativedelta", "MO", "TU", "WE", "TH", "FR", "SA", "SU"] class weekday(object): __slots__ = ["weekday", "n"] def __init__(self, weekday, n=None): self.weekday = weekday self.n = n def __call__(self, n): if n == self.n: return self else: return self.__class__(self.weekday, n) def __eq__(self, other): try: if self.weekday != other.weekday or self.n != other.n: return False except AttributeError: return False return True def __repr__(self): s = ("MO", "TU", "WE", "TH", "FR", "SA", "SU")[self.weekday] if not self.n: return s else: return "%s(%+d)" % (s, self.n) MO, TU, WE, TH, FR, SA, SU = weekdays = tuple([weekday(x) for x in range(7)]) class relativedelta: """ The relativedelta type is based on the specification of the excellent work done by M.-A. Lemburg in his mx.DateTime extension. However, notice that this type does *NOT* implement the same algorithm as his work. Do *NOT* expect it to behave like mx.DateTime's counterpart. There's two different ways to build a relativedelta instance. The first one is passing it two date/datetime classes: relativedelta(datetime1, datetime2) And the other way is to use the following keyword arguments: year, month, day, hour, minute, second, microsecond: Absolute information. years, months, weeks, days, hours, minutes, seconds, microseconds: Relative information, may be negative. weekday: One of the weekday instances (MO, TU, etc). These instances may receive a parameter N, specifying the Nth weekday, which could be positive or negative (like MO(+1) or MO(-2). Not specifying it is the same as specifying +1. You can also use an integer, where 0=MO. leapdays: Will add given days to the date found, if year is a leap year, and the date found is post 28 of february. yearday, nlyearday: Set the yearday or the non-leap year day (jump leap days). These are converted to day/month/leapdays information. Here is the behavior of operations with relativedelta: 1) Calculate the absolute year, using the 'year' argument, or the original datetime year, if the argument is not present. 2) Add the relative 'years' argument to the absolute year. 3) Do steps 1 and 2 for month/months. 4) Calculate the absolute day, using the 'day' argument, or the original datetime day, if the argument is not present. Then, subtract from the day until it fits in the year and month found after their operations. 5) Add the relative 'days' argument to the absolute day. Notice that the 'weeks' argument is multiplied by 7 and added to 'days'. 6) Do steps 1 and 2 for hour/hours, minute/minutes, second/seconds, microsecond/microseconds. 7) If the 'weekday' argument is present, calculate the weekday, with the given (wday, nth) tuple. wday is the index of the weekday (0-6, 0=Mon), and nth is the number of weeks to add forward or backward, depending on its signal. Notice that if the calculated date is already Monday, for example, using (0, 1) or (0, -1) won't change the day. 
""" def __init__(self, dt1=None, dt2=None, years=0, months=0, days=0, leapdays=0, weeks=0, hours=0, minutes=0, seconds=0, microseconds=0, year=None, month=None, day=None, weekday=None, yearday=None, nlyearday=None, hour=None, minute=None, second=None, microsecond=None): if dt1 and dt2: if not isinstance(dt1, datetime.date) or \ not isinstance(dt2, datetime.date): raise TypeError("relativedelta only diffs datetime/date") if type(dt1) is not type(dt2): if not isinstance(dt1, datetime.datetime): dt1 = datetime.datetime.fromordinal(dt1.toordinal()) elif not isinstance(dt2, datetime.datetime): dt2 = datetime.datetime.fromordinal(dt2.toordinal()) self.years = 0 self.months = 0 self.days = 0 self.leapdays = 0 self.hours = 0 self.minutes = 0 self.seconds = 0 self.microseconds = 0 self.year = None self.month = None self.day = None self.weekday = None self.hour = None self.minute = None self.second = None self.microsecond = None self._has_time = 0 months = (dt1.year * 12 + dt1.month) - (dt2.year * 12 + dt2.month) self._set_months(months) dtm = self.__radd__(dt2) if dt1 < dt2: while dt1 > dtm: months += 1 self._set_months(months) dtm = self.__radd__(dt2) else: while dt1 < dtm: months -= 1 self._set_months(months) dtm = self.__radd__(dt2) delta = dt1 - dtm self.seconds = delta.seconds + delta.days * 86400 self.microseconds = delta.microseconds else: self.years = years self.months = months self.days = days + weeks * 7 self.leapdays = leapdays self.hours = hours self.minutes = minutes self.seconds = seconds self.microseconds = microseconds self.year = year self.month = month self.day = day self.hour = hour self.minute = minute self.second = second self.microsecond = microsecond if type(weekday) is int: self.weekday = weekdays[weekday] else: self.weekday = weekday yday = 0 if nlyearday: yday = nlyearday elif yearday: yday = yearday if yearday > 59: self.leapdays = -1 if yday: ydayidx = [31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 366] for idx, ydays in enumerate(ydayidx): if yday <= ydays: self.month = idx + 1 if idx == 0: self.day = yday else: self.day = yday - ydayidx[idx - 1] break else: raise ValueError("invalid year day (%d)" % yday) self._fix() def _fix(self): if abs(self.microseconds) > 999999: s = self.microseconds // abs(self.microseconds) div, mod = divmod(self.microseconds * s, 1000000) self.microseconds = mod * s self.seconds += div * s if abs(self.seconds) > 59: s = self.seconds // abs(self.seconds) div, mod = divmod(self.seconds * s, 60) self.seconds = mod * s self.minutes += div * s if abs(self.minutes) > 59: s = self.minutes // abs(self.minutes) div, mod = divmod(self.minutes * s, 60) self.minutes = mod * s self.hours += div * s if abs(self.hours) > 23: s = self.hours // abs(self.hours) div, mod = divmod(self.hours * s, 24) self.hours = mod * s self.days += div * s if abs(self.months) > 11: s = self.months // abs(self.months) div, mod = divmod(self.months * s, 12) self.months = mod * s self.years += div * s if (self.hours or self.minutes or self.seconds or self.microseconds or self.hour is not None or self.minute is not None or self.second is not None or self.microsecond is not None): self._has_time = 1 else: self._has_time = 0 def _set_months(self, months): self.months = months if abs(self.months) > 11: s = self.months // abs(self.months) div, mod = divmod(self.months * s, 12) self.months = mod * s self.years = div * s else: self.years = 0 def __radd__(self, other): if not isinstance(other, datetime.date): raise TypeError("unsupported type for add operation") elif self._has_time 
and not isinstance(other, datetime.datetime): other = datetime.datetime.fromordinal(other.toordinal()) year = (self.year or other.year) + self.years month = self.month or other.month if self.months: assert 1 <= abs(self.months) <= 12 month += self.months if month > 12: year += 1 month -= 12 elif month < 1: year -= 1 month += 12 day = min(calendar.monthrange(year, month)[1], self.day or other.day) repl = {"year": year, "month": month, "day": day} for attr in ["hour", "minute", "second", "microsecond"]: value = getattr(self, attr) if value is not None: repl[attr] = value days = self.days if self.leapdays and month > 2 and calendar.isleap(year): days += self.leapdays ret = (other.replace(**repl) + datetime.timedelta(days=days, hours=self.hours, minutes=self.minutes, seconds=self.seconds, microseconds=self.microseconds)) if self.weekday: weekday, nth = self.weekday.weekday, self.weekday.n or 1 jumpdays = (abs(nth) - 1) * 7 if nth > 0: jumpdays += (7 - ret.weekday() + weekday) % 7 else: jumpdays += (ret.weekday() - weekday) % 7 jumpdays *= -1 ret += datetime.timedelta(days=jumpdays) return ret def __rsub__(self, other): return self.__neg__().__radd__(other) def __add__(self, other): if not isinstance(other, relativedelta): raise TypeError("unsupported type for add operation") return relativedelta(years=other.years + self.years, months=other.months + self.months, days=other.days + self.days, hours=other.hours + self.hours, minutes=other.minutes + self.minutes, seconds=other.seconds + self.seconds, microseconds=other.microseconds + self.microseconds, leapdays=other.leapdays or self.leapdays, year=other.year or self.year, month=other.month or self.month, day=other.day or self.day, weekday=other.weekday or self.weekday, hour=other.hour or self.hour, minute=other.minute or self.minute, second=other.second or self.second, microsecond=other.second or self.microsecond) def __sub__(self, other): if not isinstance(other, relativedelta): raise TypeError("unsupported type for sub operation") return relativedelta(years=other.years - self.years, months=other.months - self.months, days=other.days - self.days, hours=other.hours - self.hours, minutes=other.minutes - self.minutes, seconds=other.seconds - self.seconds, microseconds=other.microseconds - self.microseconds, leapdays=other.leapdays or self.leapdays, year=other.year or self.year, month=other.month or self.month, day=other.day or self.day, weekday=other.weekday or self.weekday, hour=other.hour or self.hour, minute=other.minute or self.minute, second=other.second or self.second, microsecond=other.second or self.microsecond) def __neg__(self): return relativedelta(years= -self.years, months= -self.months, days= -self.days, hours= -self.hours, minutes= -self.minutes, seconds= -self.seconds, microseconds= -self.microseconds, leapdays=self.leapdays, year=self.year, month=self.month, day=self.day, weekday=self.weekday, hour=self.hour, minute=self.minute, second=self.second, microsecond=self.microsecond) def __nonzero__(self): return not (not self.years and not self.months and not self.days and not self.hours and not self.minutes and not self.seconds and not self.microseconds and not self.leapdays and self.year is None and self.month is None and self.day is None and self.weekday is None and self.hour is None and self.minute is None and self.second is None and self.microsecond is None) __bool__ = __nonzero__ def __mul__(self, other): f = float(other) return relativedelta(years=self.years * f, months=self.months * f, days=self.days * f, hours=self.hours * f, 
minutes=self.minutes * f, seconds=self.seconds * f, microseconds=self.microseconds * f, leapdays=self.leapdays, year=self.year, month=self.month, day=self.day, weekday=self.weekday, hour=self.hour, minute=self.minute, second=self.second, microsecond=self.microsecond) def __eq__(self, other): if not isinstance(other, relativedelta): return False if self.weekday or other.weekday: if not self.weekday or not other.weekday: return False if self.weekday.weekday != other.weekday.weekday: return False n1, n2 = self.weekday.n, other.weekday.n if n1 != n2 and not ((not n1 or n1 == 1) and (not n2 or n2 == 1)): return False return (self.years == other.years and self.months == other.months and self.days == other.days and self.hours == other.hours and self.minutes == other.minutes and self.seconds == other.seconds and self.leapdays == other.leapdays and self.year == other.year and self.month == other.month and self.day == other.day and self.hour == other.hour and self.minute == other.minute and self.second == other.second and self.microsecond == other.microsecond) def __ne__(self, other): return not self.__eq__(other) def __div__(self, other): return self.__mul__(1 / float(other)) def __repr__(self): l = [] for attr in ["years", "months", "days", "leapdays", "hours", "minutes", "seconds", "microseconds"]: value = getattr(self, attr) if value: l.append("%s=%+d" % (attr, value)) for attr in ["year", "month", "day", "weekday", "hour", "minute", "second", "microsecond"]: value = getattr(self, attr) if value is not None: l.append("%s=%s" % (attr, repr(value))) return "%s(%s)" % (self.__class__.__name__, ", ".join(l)) # vim:ts=4:sw=4:et Whoosh-2.5.7/src/whoosh/support/unicode.py0000644000076500000240000006375412254366350020723 0ustar mattstaff00000000000000import re from bisect import bisect_right from whoosh.compat import text_type, u # http://unicode.org/Public/UNIDATA/Blocks.txt _blockdata = ''' # Blocks-5.1.0.txt # Date: 2008-03-20, 17:41:00 PDT [KW] # # Unicode Character Database # Copyright (c) 1991-2008 Unicode, Inc. # For terms of use, see http://www.unicode.org/terms_of_use.html # For documentation, see UCD.html # # Note: The casing of block names is not normative. # For example, "Basic Latin" and "BASIC LATIN" are equivalent. # # Format: # Start Code..End Code; Block Name # ================================================ # Note: When comparing block names, casing, whitespace, hyphens, # and underbars are ignored. # For example, "Latin Extended-A" and "latin extended a" are equivalent # For more information on the comparison of property values, # see UCD.html. # # All code points not explicitly listed for Block # have the value No_Block. 
# Property: Block # # @missing: 0000..10FFFF; No_Block 0000..007F; Basic Latin 0080..00FF; Latin-1 Supplement 0100..017F; Latin Extended-A 0180..024F; Latin Extended-B 0250..02AF; IPA Extensions 02B0..02FF; Spacing Modifier Letters 0300..036F; Combining Diacritical Marks 0370..03FF; Greek and Coptic 0400..04FF; Cyrillic 0500..052F; Cyrillic Supplement 0530..058F; Armenian 0590..05FF; Hebrew 0600..06FF; Arabic 0700..074F; Syriac 0750..077F; Arabic Supplement 0780..07BF; Thaana 07C0..07FF; NKo 0900..097F; Devanagari 0980..09FF; Bengali 0A00..0A7F; Gurmukhi 0A80..0AFF; Gujarati 0B00..0B7F; Oriya 0B80..0BFF; Tamil 0C00..0C7F; Telugu 0C80..0CFF; Kannada 0D00..0D7F; Malayalam 0D80..0DFF; Sinhala 0E00..0E7F; Thai 0E80..0EFF; Lao 0F00..0FFF; Tibetan 1000..109F; Myanmar 10A0..10FF; Georgian 1100..11FF; Hangul Jamo 1200..137F; Ethiopic 1380..139F; Ethiopic Supplement 13A0..13FF; Cherokee 1400..167F; Unified Canadian Aboriginal Syllabics 1680..169F; Ogham 16A0..16FF; Runic 1700..171F; Tagalog 1720..173F; Hanunoo 1740..175F; Buhid 1760..177F; Tagbanwa 1780..17FF; Khmer 1800..18AF; Mongolian 1900..194F; Limbu 1950..197F; Tai Le 1980..19DF; New Tai Lue 19E0..19FF; Khmer Symbols 1A00..1A1F; Buginese 1B00..1B7F; Balinese 1B80..1BBF; Sundanese 1C00..1C4F; Lepcha 1C50..1C7F; Ol Chiki 1D00..1D7F; Phonetic Extensions 1D80..1DBF; Phonetic Extensions Supplement 1DC0..1DFF; Combining Diacritical Marks Supplement 1E00..1EFF; Latin Extended Additional 1F00..1FFF; Greek Extended 2000..206F; General Punctuation 2070..209F; Superscripts and Subscripts 20A0..20CF; Currency Symbols 20D0..20FF; Combining Diacritical Marks for Symbols 2100..214F; Letterlike Symbols 2150..218F; Number Forms 2190..21FF; Arrows 2200..22FF; Mathematical Operators 2300..23FF; Miscellaneous Technical 2400..243F; Control Pictures 2440..245F; Optical Character Recognition 2460..24FF; Enclosed Alphanumerics 2500..257F; Box Drawing 2580..259F; Block Elements 25A0..25FF; Geometric Shapes 2600..26FF; Miscellaneous Symbols 2700..27BF; Dingbats 27C0..27EF; Miscellaneous Mathematical Symbols-A 27F0..27FF; Supplemental Arrows-A 2800..28FF; Braille Patterns 2900..297F; Supplemental Arrows-B 2980..29FF; Miscellaneous Mathematical Symbols-B 2A00..2AFF; Supplemental Mathematical Operators 2B00..2BFF; Miscellaneous Symbols and Arrows 2C00..2C5F; Glagolitic 2C60..2C7F; Latin Extended-C 2C80..2CFF; Coptic 2D00..2D2F; Georgian Supplement 2D30..2D7F; Tifinagh 2D80..2DDF; Ethiopic Extended 2DE0..2DFF; Cyrillic Extended-A 2E00..2E7F; Supplemental Punctuation 2E80..2EFF; CJK Radicals Supplement 2F00..2FDF; Kangxi Radicals 2FF0..2FFF; Ideographic Description Characters 3000..303F; CJK Symbols and Punctuation 3040..309F; Hiragana 30A0..30FF; Katakana 3100..312F; Bopomofo 3130..318F; Hangul Compatibility Jamo 3190..319F; Kanbun 31A0..31BF; Bopomofo Extended 31C0..31EF; CJK Strokes 31F0..31FF; Katakana Phonetic Extensions 3200..32FF; Enclosed CJK Letters and Months 3300..33FF; CJK Compatibility 3400..4DBF; CJK Unified Ideographs Extension A 4DC0..4DFF; Yijing Hexagram Symbols 4E00..9FFF; CJK Unified Ideographs A000..A48F; Yi Syllables A490..A4CF; Yi Radicals A500..A63F; Vai A640..A69F; Cyrillic Extended-B A700..A71F; Modifier Tone Letters A720..A7FF; Latin Extended-D A800..A82F; Syloti Nagri A840..A87F; Phags-pa A880..A8DF; Saurashtra A900..A92F; Kayah Li A930..A95F; Rejang AA00..AA5F; Cham AC00..D7AF; Hangul Syllables D800..DB7F; High Surrogates DB80..DBFF; High Private Use Surrogates DC00..DFFF; Low Surrogates E000..F8FF; Private Use Area F900..FAFF; CJK 
Compatibility Ideographs FB00..FB4F; Alphabetic Presentation Forms FB50..FDFF; Arabic Presentation Forms-A FE00..FE0F; Variation Selectors FE10..FE1F; Vertical Forms FE20..FE2F; Combining Half Marks FE30..FE4F; CJK Compatibility Forms FE50..FE6F; Small Form Variants FE70..FEFF; Arabic Presentation Forms-B FF00..FFEF; Halfwidth and Fullwidth Forms FFF0..FFFF; Specials 10000..1007F; Linear B Syllabary 10080..100FF; Linear B Ideograms 10100..1013F; Aegean Numbers 10140..1018F; Ancient Greek Numbers 10190..101CF; Ancient Symbols 101D0..101FF; Phaistos Disc 10280..1029F; Lycian 102A0..102DF; Carian 10300..1032F; Old Italic 10330..1034F; Gothic 10380..1039F; Ugaritic 103A0..103DF; Old Persian 10400..1044F; Deseret 10450..1047F; Shavian 10480..104AF; Osmanya 10800..1083F; Cypriot Syllabary 10900..1091F; Phoenician 10920..1093F; Lydian 10A00..10A5F; Kharoshthi 12000..123FF; Cuneiform 12400..1247F; Cuneiform Numbers and Punctuation 1D000..1D0FF; Byzantine Musical Symbols 1D100..1D1FF; Musical Symbols 1D200..1D24F; Ancient Greek Musical Notation 1D300..1D35F; Tai Xuan Jing Symbols 1D360..1D37F; Counting Rod Numerals 1D400..1D7FF; Mathematical Alphanumeric Symbols 1F000..1F02F; Mahjong Tiles 1F030..1F09F; Domino Tiles 20000..2A6DF; CJK Unified Ideographs Extension B 2F800..2FA1F; CJK Compatibility Ideographs Supplement E0000..E007F; Tags E0100..E01EF; Variation Selectors Supplement F0000..FFFFF; Supplementary Private Use Area-A 100000..10FFFF; Supplementary Private Use Area-B # EOF ''' pattern = re.compile(r'([0-9A-F]+)\.\.([0-9A-F]+);\ (\S.*\S)') _starts = [] _ends = [] _names = [] class blocks(object): pass def _init(): count = 0 for line in _blockdata.splitlines(): m = pattern.match(line) if m: start, end, name = m.groups() _starts.append(int(start, 16)) _ends.append(int(end, 16)) _names.append(name) setattr(blocks, name.replace(" ", "_"), count) count += 1 _init() def blockname(ch): """Return the Unicode block name for ch, or None if ch has no block. >>> blockname(u'a') 'Basic Latin' >>> blockname(unichr(0x0b80)) 'Tamil' >>> block(unichr(2048)) None """ assert isinstance(ch, text_type) and len(ch) == 1, repr(ch) cp = ord(ch) i = bisect_right(_starts, cp) - 1 end = _ends[i] if cp > end: return None return _names[i] def blocknum(ch): """Returns the unicode block number for ch, or None if ch has no block. 
>>> blocknum(u'a') 0 >>> blocknum(unichr(0x0b80)) 22 >>> blocknum(unichr(2048)) None """ cp = ord(ch) i = bisect_right(_starts, cp) - 1 end = _ends[i] if cp > end: return None return i digits = u('0123456789\xb2\xb3\xb9\u0660\u0661\u0662\u0663\u0664\u0665\u0666' '\u0667\u0668\u0669\u06f0\u06f1\u06f2\u06f3\u06f4\u06f5\u06f6\u06f7' '\u06f8\u06f9\u07c0\u07c1\u07c2\u07c3\u07c4\u07c5\u07c6\u07c7\u07c8' '\u07c9\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f' '\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef\u0a66' '\u0a67\u0a68\u0a69\u0a6a\u0a6b\u0a6c\u0a6d\u0a6e\u0a6f\u0ae6\u0ae7' '\u0ae8\u0ae9\u0aea\u0aeb\u0aec\u0aed\u0aee\u0aef\u0b66\u0b67\u0b68' '\u0b69\u0b6a\u0b6b\u0b6c\u0b6d\u0b6e\u0b6f\u0be6\u0be7\u0be8\u0be9' '\u0bea\u0beb\u0bec\u0bed\u0bee\u0bef\u0c66\u0c67\u0c68\u0c69\u0c6a' '\u0c6b\u0c6c\u0c6d\u0c6e\u0c6f\u0ce6\u0ce7\u0ce8\u0ce9\u0cea\u0ceb' '\u0cec\u0ced\u0cee\u0cef\u0d66\u0d67\u0d68\u0d69\u0d6a\u0d6b\u0d6c' '\u0d6d\u0d6e\u0d6f\u0e50\u0e51\u0e52\u0e53\u0e54\u0e55\u0e56\u0e57' '\u0e58\u0e59\u0ed0\u0ed1\u0ed2\u0ed3\u0ed4\u0ed5\u0ed6\u0ed7\u0ed8' '\u0ed9\u0f20\u0f21\u0f22\u0f23\u0f24\u0f25\u0f26\u0f27\u0f28\u0f29' '\u1040\u1041\u1042\u1043\u1044\u1045\u1046\u1047\u1048\u1049\u1090' '\u1091\u1092\u1093\u1094\u1095\u1096\u1097\u1098\u1099\u1369\u136a' '\u136b\u136c\u136d\u136e\u136f\u1370\u1371\u17e0\u17e1\u17e2\u17e3' '\u17e4\u17e5\u17e6\u17e7\u17e8\u17e9\u1810\u1811\u1812\u1813\u1814' '\u1815\u1816\u1817\u1818\u1819\u1946\u1947\u1948\u1949\u194a\u194b' '\u194c\u194d\u194e\u194f\u19d0\u19d1\u19d2\u19d3\u19d4\u19d5\u19d6' '\u19d7\u19d8\u19d9\u19da\u1a80\u1a81\u1a82\u1a83\u1a84\u1a85\u1a86' '\u1a87\u1a88\u1a89\u1a90\u1a91\u1a92\u1a93\u1a94\u1a95\u1a96\u1a97' '\u1a98\u1a99\u1b50\u1b51\u1b52\u1b53\u1b54\u1b55\u1b56\u1b57\u1b58' '\u1b59\u1bb0\u1bb1\u1bb2\u1bb3\u1bb4\u1bb5\u1bb6\u1bb7\u1bb8\u1bb9' '\u1c40\u1c41\u1c42\u1c43\u1c44\u1c45\u1c46\u1c47\u1c48\u1c49\u1c50' '\u1c51\u1c52\u1c53\u1c54\u1c55\u1c56\u1c57\u1c58\u1c59\u2070\u2074' '\u2075\u2076\u2077\u2078\u2079\u2080\u2081\u2082\u2083\u2084\u2085' '\u2086\u2087\u2088\u2089\u2460\u2461\u2462\u2463\u2464\u2465\u2466' '\u2467\u2468\u2474\u2475\u2476\u2477\u2478\u2479\u247a\u247b\u247c' '\u2488\u2489\u248a\u248b\u248c\u248d\u248e\u248f\u2490\u24ea\u24f5' '\u24f6\u24f7\u24f8\u24f9\u24fa\u24fb\u24fc\u24fd\u24ff\u2776\u2777' '\u2778\u2779\u277a\u277b\u277c\u277d\u277e\u2780\u2781\u2782\u2783' '\u2784\u2785\u2786\u2787\u2788\u278a\u278b\u278c\u278d\u278e\u278f' '\u2790\u2791\u2792\ua620\ua621\ua622\ua623\ua624\ua625\ua626\ua627' '\ua628\ua629\ua8d0\ua8d1\ua8d2\ua8d3\ua8d4\ua8d5\ua8d6\ua8d7\ua8d8' '\ua8d9\ua900\ua901\ua902\ua903\ua904\ua905\ua906\ua907\ua908\ua909' '\ua9d0\ua9d1\ua9d2\ua9d3\ua9d4\ua9d5\ua9d6\ua9d7\ua9d8\ua9d9\uaa50' '\uaa51\uaa52\uaa53\uaa54\uaa55\uaa56\uaa57\uaa58\uaa59\uabf0\uabf1' '\uabf2\uabf3\uabf4\uabf5\uabf6\uabf7\uabf8\uabf9\uff10\uff11\uff12' '\uff13\uff14\uff15\uff16\uff17\uff18\uff19') lowercase = u('abcdefghijklmnopqrstuvwxyz\xaa\xb5\xba\xdf\xe0\xe1\xe2\xe3\xe4' '\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3' '\xf4\xf5\xf6\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\u0101\u0103\u0105' '\u0107\u0109\u010b\u010d\u010f\u0111\u0113\u0115\u0117\u0119' '\u011b\u011d\u011f\u0121\u0123\u0125\u0127\u0129\u012b\u012d' '\u012f\u0131\u0133\u0135\u0137\u0138\u013a\u013c\u013e\u0140' '\u0142\u0144\u0146\u0148\u0149\u014b\u014d\u014f\u0151\u0153' '\u0155\u0157\u0159\u015b\u015d\u015f\u0161\u0163\u0165\u0167' '\u0169\u016b\u016d\u016f\u0171\u0173\u0175\u0177\u017a\u017c' 
'\u017e\u017f\u0180\u0183\u0185\u0188\u018c\u018d\u0192\u0195' '\u0199\u019a\u019b\u019e\u01a1\u01a3\u01a5\u01a8\u01aa\u01ab' '\u01ad\u01b0\u01b4\u01b6\u01b9\u01ba\u01bd\u01be\u01bf\u01c6' '\u01c9\u01cc\u01ce\u01d0\u01d2\u01d4\u01d6\u01d8\u01da\u01dc' '\u01dd\u01df\u01e1\u01e3\u01e5\u01e7\u01e9\u01eb\u01ed\u01ef' '\u01f0\u01f3\u01f5\u01f9\u01fb\u01fd\u01ff\u0201\u0203\u0205' '\u0207\u0209\u020b\u020d\u020f\u0211\u0213\u0215\u0217\u0219' '\u021b\u021d\u021f\u0221\u0223\u0225\u0227\u0229\u022b\u022d' '\u022f\u0231\u0233\u0234\u0235\u0236\u0237\u0238\u0239\u023c' '\u023f\u0240\u0242\u0247\u0249\u024b\u024d\u024f\u0250\u0251' '\u0252\u0253\u0254\u0255\u0256\u0257\u0258\u0259\u025a\u025b' '\u025c\u025d\u025e\u025f\u0260\u0261\u0262\u0263\u0264\u0265' '\u0266\u0267\u0268\u0269\u026a\u026b\u026c\u026d\u026e\u026f' '\u0270\u0271\u0272\u0273\u0274\u0275\u0276\u0277\u0278\u0279' '\u027a\u027b\u027c\u027d\u027e\u027f\u0280\u0281\u0282\u0283' '\u0284\u0285\u0286\u0287\u0288\u0289\u028a\u028b\u028c\u028d' '\u028e\u028f\u0290\u0291\u0292\u0293\u0295\u0296\u0297\u0298' '\u0299\u029a\u029b\u029c\u029d\u029e\u029f\u02a0\u02a1\u02a2' '\u02a3\u02a4\u02a5\u02a6\u02a7\u02a8\u02a9\u02aa\u02ab\u02ac' '\u02ad\u02ae\u02af\u0371\u0373\u0377\u037b\u037c\u037d\u0390' '\u03ac\u03ad\u03ae\u03af\u03b0\u03b1\u03b2\u03b3\u03b4\u03b5' '\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf' '\u03c0\u03c1\u03c2\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9' '\u03ca\u03cb\u03cc\u03cd\u03ce\u03d0\u03d1\u03d5\u03d6\u03d7' '\u03d9\u03db\u03dd\u03df\u03e1\u03e3\u03e5\u03e7\u03e9\u03eb' '\u03ed\u03ef\u03f0\u03f1\u03f2\u03f3\u03f5\u03f8\u03fb\u03fc' '\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439' '\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443' '\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d' '\u044e\u044f\u0450\u0451\u0452\u0453\u0454\u0455\u0456\u0457' '\u0458\u0459\u045a\u045b\u045c\u045d\u045e\u045f\u0461\u0463' '\u0465\u0467\u0469\u046b\u046d\u046f\u0471\u0473\u0475\u0477' '\u0479\u047b\u047d\u047f\u0481\u048b\u048d\u048f\u0491\u0493' '\u0495\u0497\u0499\u049b\u049d\u049f\u04a1\u04a3\u04a5\u04a7' '\u04a9\u04ab\u04ad\u04af\u04b1\u04b3\u04b5\u04b7\u04b9\u04bb' '\u04bd\u04bf\u04c2\u04c4\u04c6\u04c8\u04ca\u04cc\u04ce\u04cf' '\u04d1\u04d3\u04d5\u04d7\u04d9\u04db\u04dd\u04df\u04e1\u04e3' '\u04e5\u04e7\u04e9\u04eb\u04ed\u04ef\u04f1\u04f3\u04f5\u04f7' '\u04f9\u04fb\u04fd\u04ff\u0501\u0503\u0505\u0507\u0509\u050b' '\u050d\u050f\u0511\u0513\u0515\u0517\u0519\u051b\u051d\u051f' '\u0521\u0523\u0525\u0561\u0562\u0563\u0564\u0565\u0566\u0567' '\u0568\u0569\u056a\u056b\u056c\u056d\u056e\u056f\u0570\u0571' '\u0572\u0573\u0574\u0575\u0576\u0577\u0578\u0579\u057a\u057b' '\u057c\u057d\u057e\u057f\u0580\u0581\u0582\u0583\u0584\u0585' '\u0586\u0587\u1d00\u1d01\u1d02\u1d03\u1d04\u1d05\u1d06\u1d07' '\u1d08\u1d09\u1d0a\u1d0b\u1d0c\u1d0d\u1d0e\u1d0f\u1d10\u1d11' '\u1d12\u1d13\u1d14\u1d15\u1d16\u1d17\u1d18\u1d19\u1d1a\u1d1b' '\u1d1c\u1d1d\u1d1e\u1d1f\u1d20\u1d21\u1d22\u1d23\u1d24\u1d25' '\u1d26\u1d27\u1d28\u1d29\u1d2a\u1d2b\u1d62\u1d63\u1d64\u1d65' '\u1d66\u1d67\u1d68\u1d69\u1d6a\u1d6b\u1d6c\u1d6d\u1d6e\u1d6f' '\u1d70\u1d71\u1d72\u1d73\u1d74\u1d75\u1d76\u1d77\u1d79\u1d7a' '\u1d7b\u1d7c\u1d7d\u1d7e\u1d7f\u1d80\u1d81\u1d82\u1d83\u1d84' '\u1d85\u1d86\u1d87\u1d88\u1d89\u1d8a\u1d8b\u1d8c\u1d8d\u1d8e' '\u1d8f\u1d90\u1d91\u1d92\u1d93\u1d94\u1d95\u1d96\u1d97\u1d98' '\u1d99\u1d9a\u1e01\u1e03\u1e05\u1e07\u1e09\u1e0b\u1e0d\u1e0f' '\u1e11\u1e13\u1e15\u1e17\u1e19\u1e1b\u1e1d\u1e1f\u1e21\u1e23' 
'\u1e25\u1e27\u1e29\u1e2b\u1e2d\u1e2f\u1e31\u1e33\u1e35\u1e37' '\u1e39\u1e3b\u1e3d\u1e3f\u1e41\u1e43\u1e45\u1e47\u1e49\u1e4b' '\u1e4d\u1e4f\u1e51\u1e53\u1e55\u1e57\u1e59\u1e5b\u1e5d\u1e5f' '\u1e61\u1e63\u1e65\u1e67\u1e69\u1e6b\u1e6d\u1e6f\u1e71\u1e73' '\u1e75\u1e77\u1e79\u1e7b\u1e7d\u1e7f\u1e81\u1e83\u1e85\u1e87' '\u1e89\u1e8b\u1e8d\u1e8f\u1e91\u1e93\u1e95\u1e96\u1e97\u1e98' '\u1e99\u1e9a\u1e9b\u1e9c\u1e9d\u1e9f\u1ea1\u1ea3\u1ea5\u1ea7' '\u1ea9\u1eab\u1ead\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u1eb9\u1ebb' '\u1ebd\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u1ec9\u1ecb\u1ecd\u1ecf' '\u1ed1\u1ed3\u1ed5\u1ed7\u1ed9\u1edb\u1edd\u1edf\u1ee1\u1ee3' '\u1ee5\u1ee7\u1ee9\u1eeb\u1eed\u1eef\u1ef1\u1ef3\u1ef5\u1ef7' '\u1ef9\u1efb\u1efd\u1eff\u1f00\u1f01\u1f02\u1f03\u1f04\u1f05' '\u1f06\u1f07\u1f10\u1f11\u1f12\u1f13\u1f14\u1f15\u1f20\u1f21' '\u1f22\u1f23\u1f24\u1f25\u1f26\u1f27\u1f30\u1f31\u1f32\u1f33' '\u1f34\u1f35\u1f36\u1f37\u1f40\u1f41\u1f42\u1f43\u1f44\u1f45' '\u1f50\u1f51\u1f52\u1f53\u1f54\u1f55\u1f56\u1f57\u1f60\u1f61' '\u1f62\u1f63\u1f64\u1f65\u1f66\u1f67\u1f70\u1f71\u1f72\u1f73' '\u1f74\u1f75\u1f76\u1f77\u1f78\u1f79\u1f7a\u1f7b\u1f7c\u1f7d' '\u1f80\u1f81\u1f82\u1f83\u1f84\u1f85\u1f86\u1f87\u1f90\u1f91' '\u1f92\u1f93\u1f94\u1f95\u1f96\u1f97\u1fa0\u1fa1\u1fa2\u1fa3' '\u1fa4\u1fa5\u1fa6\u1fa7\u1fb0\u1fb1\u1fb2\u1fb3\u1fb4\u1fb6' '\u1fb7\u1fbe\u1fc2\u1fc3\u1fc4\u1fc6\u1fc7\u1fd0\u1fd1\u1fd2' '\u1fd3\u1fd6\u1fd7\u1fe0\u1fe1\u1fe2\u1fe3\u1fe4\u1fe5\u1fe6' '\u1fe7\u1ff2\u1ff3\u1ff4\u1ff6\u1ff7\u210a\u210e\u210f\u2113' '\u212f\u2134\u2139\u213c\u213d\u2146\u2147\u2148\u2149\u214e' '\u2184\u2c30\u2c31\u2c32\u2c33\u2c34\u2c35\u2c36\u2c37\u2c38' '\u2c39\u2c3a\u2c3b\u2c3c\u2c3d\u2c3e\u2c3f\u2c40\u2c41\u2c42' '\u2c43\u2c44\u2c45\u2c46\u2c47\u2c48\u2c49\u2c4a\u2c4b\u2c4c' '\u2c4d\u2c4e\u2c4f\u2c50\u2c51\u2c52\u2c53\u2c54\u2c55\u2c56' '\u2c57\u2c58\u2c59\u2c5a\u2c5b\u2c5c\u2c5d\u2c5e\u2c61\u2c65' '\u2c66\u2c68\u2c6a\u2c6c\u2c71\u2c73\u2c74\u2c76\u2c77\u2c78' '\u2c79\u2c7a\u2c7b\u2c7c\u2c81\u2c83\u2c85\u2c87\u2c89\u2c8b' '\u2c8d\u2c8f\u2c91\u2c93\u2c95\u2c97\u2c99\u2c9b\u2c9d\u2c9f' '\u2ca1\u2ca3\u2ca5\u2ca7\u2ca9\u2cab\u2cad\u2caf\u2cb1\u2cb3' '\u2cb5\u2cb7\u2cb9\u2cbb\u2cbd\u2cbf\u2cc1\u2cc3\u2cc5\u2cc7' '\u2cc9\u2ccb\u2ccd\u2ccf\u2cd1\u2cd3\u2cd5\u2cd7\u2cd9\u2cdb' '\u2cdd\u2cdf\u2ce1\u2ce3\u2ce4\u2cec\u2cee\u2d00\u2d01\u2d02' '\u2d03\u2d04\u2d05\u2d06\u2d07\u2d08\u2d09\u2d0a\u2d0b\u2d0c' '\u2d0d\u2d0e\u2d0f\u2d10\u2d11\u2d12\u2d13\u2d14\u2d15\u2d16' '\u2d17\u2d18\u2d19\u2d1a\u2d1b\u2d1c\u2d1d\u2d1e\u2d1f\u2d20' '\u2d21\u2d22\u2d23\u2d24\u2d25\ua641\ua643\ua645\ua647\ua649' '\ua64b\ua64d\ua64f\ua651\ua653\ua655\ua657\ua659\ua65b\ua65d' '\ua65f\ua663\ua665\ua667\ua669\ua66b\ua66d\ua681\ua683\ua685' '\ua687\ua689\ua68b\ua68d\ua68f\ua691\ua693\ua695\ua697\ua723' '\ua725\ua727\ua729\ua72b\ua72d\ua72f\ua730\ua731\ua733\ua735' '\ua737\ua739\ua73b\ua73d\ua73f\ua741\ua743\ua745\ua747\ua749' '\ua74b\ua74d\ua74f\ua751\ua753\ua755\ua757\ua759\ua75b\ua75d' '\ua75f\ua761\ua763\ua765\ua767\ua769\ua76b\ua76d\ua76f\ua771' '\ua772\ua773\ua774\ua775\ua776\ua777\ua778\ua77a\ua77c\ua77f' '\ua781\ua783\ua785\ua787\ua78c\ufb00\ufb01\ufb02\ufb03\ufb04' '\ufb05\ufb06\ufb13\ufb14\ufb15\ufb16\ufb17\uff41\uff42\uff43' '\uff44\uff45\uff46\uff47\uff48\uff49\uff4a\uff4b\uff4c\uff4d' '\uff4e\uff4f\uff50\uff51\uff52\uff53\uff54\uff55\uff56\uff57' '\uff58\uff59\uff5a') uppercase = u('ABCDEFGHIJKLMNOPQRSTUVWXYZ\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8' '\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd8' 
'\xd9\xda\xdb\xdc\xdd\xde\u0100\u0102\u0104\u0106\u0108\u010a' '\u010c\u010e\u0110\u0112\u0114\u0116\u0118\u011a\u011c\u011e' '\u0120\u0122\u0124\u0126\u0128\u012a\u012c\u012e\u0130\u0132' '\u0134\u0136\u0139\u013b\u013d\u013f\u0141\u0143\u0145\u0147' '\u014a\u014c\u014e\u0150\u0152\u0154\u0156\u0158\u015a\u015c' '\u015e\u0160\u0162\u0164\u0166\u0168\u016a\u016c\u016e\u0170' '\u0172\u0174\u0176\u0178\u0179\u017b\u017d\u0181\u0182\u0184' '\u0186\u0187\u0189\u018a\u018b\u018e\u018f\u0190\u0191\u0193' '\u0194\u0196\u0197\u0198\u019c\u019d\u019f\u01a0\u01a2\u01a4' '\u01a6\u01a7\u01a9\u01ac\u01ae\u01af\u01b1\u01b2\u01b3\u01b5' '\u01b7\u01b8\u01bc\u01c4\u01c7\u01ca\u01cd\u01cf\u01d1\u01d3' '\u01d5\u01d7\u01d9\u01db\u01de\u01e0\u01e2\u01e4\u01e6\u01e8' '\u01ea\u01ec\u01ee\u01f1\u01f4\u01f6\u01f7\u01f8\u01fa\u01fc' '\u01fe\u0200\u0202\u0204\u0206\u0208\u020a\u020c\u020e\u0210' '\u0212\u0214\u0216\u0218\u021a\u021c\u021e\u0220\u0222\u0224' '\u0226\u0228\u022a\u022c\u022e\u0230\u0232\u023a\u023b\u023d' '\u023e\u0241\u0243\u0244\u0245\u0246\u0248\u024a\u024c\u024e' '\u0370\u0372\u0376\u0386\u0388\u0389\u038a\u038c\u038e\u038f' '\u0391\u0392\u0393\u0394\u0395\u0396\u0397\u0398\u0399\u039a' '\u039b\u039c\u039d\u039e\u039f\u03a0\u03a1\u03a3\u03a4\u03a5' '\u03a6\u03a7\u03a8\u03a9\u03aa\u03ab\u03cf\u03d2\u03d3\u03d4' '\u03d8\u03da\u03dc\u03de\u03e0\u03e2\u03e4\u03e6\u03e8\u03ea' '\u03ec\u03ee\u03f4\u03f7\u03f9\u03fa\u03fd\u03fe\u03ff\u0400' '\u0401\u0402\u0403\u0404\u0405\u0406\u0407\u0408\u0409\u040a' '\u040b\u040c\u040d\u040e\u040f\u0410\u0411\u0412\u0413\u0414' '\u0415\u0416\u0417\u0418\u0419\u041a\u041b\u041c\u041d\u041e' '\u041f\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427\u0428' '\u0429\u042a\u042b\u042c\u042d\u042e\u042f\u0460\u0462\u0464' '\u0466\u0468\u046a\u046c\u046e\u0470\u0472\u0474\u0476\u0478' '\u047a\u047c\u047e\u0480\u048a\u048c\u048e\u0490\u0492\u0494' '\u0496\u0498\u049a\u049c\u049e\u04a0\u04a2\u04a4\u04a6\u04a8' '\u04aa\u04ac\u04ae\u04b0\u04b2\u04b4\u04b6\u04b8\u04ba\u04bc' '\u04be\u04c0\u04c1\u04c3\u04c5\u04c7\u04c9\u04cb\u04cd\u04d0' '\u04d2\u04d4\u04d6\u04d8\u04da\u04dc\u04de\u04e0\u04e2\u04e4' '\u04e6\u04e8\u04ea\u04ec\u04ee\u04f0\u04f2\u04f4\u04f6\u04f8' '\u04fa\u04fc\u04fe\u0500\u0502\u0504\u0506\u0508\u050a\u050c' '\u050e\u0510\u0512\u0514\u0516\u0518\u051a\u051c\u051e\u0520' '\u0522\u0524\u0531\u0532\u0533\u0534\u0535\u0536\u0537\u0538' '\u0539\u053a\u053b\u053c\u053d\u053e\u053f\u0540\u0541\u0542' '\u0543\u0544\u0545\u0546\u0547\u0548\u0549\u054a\u054b\u054c' '\u054d\u054e\u054f\u0550\u0551\u0552\u0553\u0554\u0555\u0556' '\u10a0\u10a1\u10a2\u10a3\u10a4\u10a5\u10a6\u10a7\u10a8\u10a9' '\u10aa\u10ab\u10ac\u10ad\u10ae\u10af\u10b0\u10b1\u10b2\u10b3' '\u10b4\u10b5\u10b6\u10b7\u10b8\u10b9\u10ba\u10bb\u10bc\u10bd' '\u10be\u10bf\u10c0\u10c1\u10c2\u10c3\u10c4\u10c5\u1e00\u1e02' '\u1e04\u1e06\u1e08\u1e0a\u1e0c\u1e0e\u1e10\u1e12\u1e14\u1e16' '\u1e18\u1e1a\u1e1c\u1e1e\u1e20\u1e22\u1e24\u1e26\u1e28\u1e2a' '\u1e2c\u1e2e\u1e30\u1e32\u1e34\u1e36\u1e38\u1e3a\u1e3c\u1e3e' '\u1e40\u1e42\u1e44\u1e46\u1e48\u1e4a\u1e4c\u1e4e\u1e50\u1e52' '\u1e54\u1e56\u1e58\u1e5a\u1e5c\u1e5e\u1e60\u1e62\u1e64\u1e66' '\u1e68\u1e6a\u1e6c\u1e6e\u1e70\u1e72\u1e74\u1e76\u1e78\u1e7a' '\u1e7c\u1e7e\u1e80\u1e82\u1e84\u1e86\u1e88\u1e8a\u1e8c\u1e8e' '\u1e90\u1e92\u1e94\u1e9e\u1ea0\u1ea2\u1ea4\u1ea6\u1ea8\u1eaa' '\u1eac\u1eae\u1eb0\u1eb2\u1eb4\u1eb6\u1eb8\u1eba\u1ebc\u1ebe' '\u1ec0\u1ec2\u1ec4\u1ec6\u1ec8\u1eca\u1ecc\u1ece\u1ed0\u1ed2' '\u1ed4\u1ed6\u1ed8\u1eda\u1edc\u1ede\u1ee0\u1ee2\u1ee4\u1ee6' 
'\u1ee8\u1eea\u1eec\u1eee\u1ef0\u1ef2\u1ef4\u1ef6\u1ef8\u1efa' '\u1efc\u1efe\u1f08\u1f09\u1f0a\u1f0b\u1f0c\u1f0d\u1f0e\u1f0f' '\u1f18\u1f19\u1f1a\u1f1b\u1f1c\u1f1d\u1f28\u1f29\u1f2a\u1f2b' '\u1f2c\u1f2d\u1f2e\u1f2f\u1f38\u1f39\u1f3a\u1f3b\u1f3c\u1f3d' '\u1f3e\u1f3f\u1f48\u1f49\u1f4a\u1f4b\u1f4c\u1f4d\u1f59\u1f5b' '\u1f5d\u1f5f\u1f68\u1f69\u1f6a\u1f6b\u1f6c\u1f6d\u1f6e\u1f6f' '\u1fb8\u1fb9\u1fba\u1fbb\u1fc8\u1fc9\u1fca\u1fcb\u1fd8\u1fd9' '\u1fda\u1fdb\u1fe8\u1fe9\u1fea\u1feb\u1fec\u1ff8\u1ff9\u1ffa' '\u1ffb\u2102\u2107\u210b\u210c\u210d\u2110\u2111\u2112\u2115' '\u2119\u211a\u211b\u211c\u211d\u2124\u2126\u2128\u212a\u212b' '\u212c\u212d\u2130\u2131\u2132\u2133\u213e\u213f\u2145\u2183' '\u2c00\u2c01\u2c02\u2c03\u2c04\u2c05\u2c06\u2c07\u2c08\u2c09' '\u2c0a\u2c0b\u2c0c\u2c0d\u2c0e\u2c0f\u2c10\u2c11\u2c12\u2c13' '\u2c14\u2c15\u2c16\u2c17\u2c18\u2c19\u2c1a\u2c1b\u2c1c\u2c1d' '\u2c1e\u2c1f\u2c20\u2c21\u2c22\u2c23\u2c24\u2c25\u2c26\u2c27' '\u2c28\u2c29\u2c2a\u2c2b\u2c2c\u2c2d\u2c2e\u2c60\u2c62\u2c63' '\u2c64\u2c67\u2c69\u2c6b\u2c6d\u2c6e\u2c6f\u2c70\u2c72\u2c75' '\u2c7e\u2c7f\u2c80\u2c82\u2c84\u2c86\u2c88\u2c8a\u2c8c\u2c8e' '\u2c90\u2c92\u2c94\u2c96\u2c98\u2c9a\u2c9c\u2c9e\u2ca0\u2ca2' '\u2ca4\u2ca6\u2ca8\u2caa\u2cac\u2cae\u2cb0\u2cb2\u2cb4\u2cb6' '\u2cb8\u2cba\u2cbc\u2cbe\u2cc0\u2cc2\u2cc4\u2cc6\u2cc8\u2cca' '\u2ccc\u2cce\u2cd0\u2cd2\u2cd4\u2cd6\u2cd8\u2cda\u2cdc\u2cde' '\u2ce0\u2ce2\u2ceb\u2ced\ua640\ua642\ua644\ua646\ua648\ua64a' '\ua64c\ua64e\ua650\ua652\ua654\ua656\ua658\ua65a\ua65c\ua65e' '\ua662\ua664\ua666\ua668\ua66a\ua66c\ua680\ua682\ua684\ua686' '\ua688\ua68a\ua68c\ua68e\ua690\ua692\ua694\ua696\ua722\ua724' '\ua726\ua728\ua72a\ua72c\ua72e\ua732\ua734\ua736\ua738\ua73a' '\ua73c\ua73e\ua740\ua742\ua744\ua746\ua748\ua74a\ua74c\ua74e' '\ua750\ua752\ua754\ua756\ua758\ua75a\ua75c\ua75e\ua760\ua762' '\ua764\ua766\ua768\ua76a\ua76c\ua76e\ua779\ua77b\ua77d\ua77e' '\ua780\ua782\ua784\ua786\ua78b\uff21\uff22\uff23\uff24\uff25' '\uff26\uff27\uff28\uff29\uff2a\uff2b\uff2c\uff2d\uff2e\uff2f' '\uff30\uff31\uff32\uff33\uff34\uff35\uff36\uff37\uff38\uff39' '\uff3a') Whoosh-2.5.7/src/whoosh/system.py0000644000076500000240000000562412254366350017075 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. import sys from struct import Struct, calcsize IS_LITTLE = sys.byteorder == "little" _INT_SIZE = calcsize("!i") _SHORT_SIZE = calcsize("!H") _LONG_SIZE = calcsize("!Q") _FLOAT_SIZE = calcsize("!f") _DOUBLE_SIZE = calcsize("!d") _byte_struct = Struct("!B") _sbyte_struct = Struct("!b") _ushort_struct = Struct("!H") _int_struct = Struct("!i") _uint_struct = Struct("!I") _long_struct = Struct("!q") _ulong_struct = Struct("!Q") _float_struct = Struct("!f") _double_struct = Struct("!d") _ushort_le_struct = Struct(">> make_binary_tree(UnionMatcher, [matcher1, matcher2, matcher3]) UnionMatcher(matcher1, UnionMatcher(matcher2, matcher3)) Any keyword arguments given to this function are passed to the class initializer. """ count = len(args) if not count: raise ValueError("Called make_binary_tree with empty list") elif count == 1: return args[0] half = count // 2 return fn(make_binary_tree(fn, args[:half], **kwargs), make_binary_tree(fn, args[half:], **kwargs), **kwargs) def make_weighted_tree(fn, ls, **kwargs): """Takes a function/class that takes two positional arguments and a list of (weight, argument) tuples and returns a huffman-like weighted tree of results/instances. """ if not ls: raise ValueError("Called make_weighted_tree with empty list") ls.sort() while len(ls) > 1: a = ls.pop(0) b = ls.pop(0) insort(ls, (a[0] + b[0], fn(a[1], b[1]))) return ls[0][1] # Fibonacci function _fib_cache = {} def fib(n): """Returns the nth value in the Fibonacci sequence. """ if n <= 2: return n if n in _fib_cache: return _fib_cache[n] result = fib(n - 1) + fib(n - 2) _fib_cache[n] = result return result # Decorators def synchronized(func): """Decorator for storage-access methods, which synchronizes on a threading lock. The parent object must have 'is_closed' and '_sync_lock' attributes. """ @wraps(func) def synchronized_wrapper(self, *args, **kwargs): with self._sync_lock: return func(self, *args, **kwargs) return synchronized_wrapper Whoosh-2.5.7/src/whoosh/util/cache.py0000644000076500000240000003210612254366350017564 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from __future__ import with_statement import functools, random from array import array from heapq import nsmallest from operator import itemgetter from threading import Lock from time import time from whoosh.compat import iteritems, xrange try: from collections import Counter except ImportError: class Counter(dict): def __missing__(self, key): return 0 def unbound_cache(func): """Caching decorator with an unbounded cache size. """ cache = {} @functools.wraps(func) def caching_wrapper(*args): try: return cache[args] except KeyError: result = func(*args) cache[args] = result return result return caching_wrapper def lru_cache(maxsize=100): """A simple cache that, when the cache is full, deletes the least recently used 10% of the cached values. This function duplicates (more-or-less) the protocol of the ``functools.lru_cache`` decorator in the Python 3.2 standard library. Arguments to the cached function must be hashable. View the cache statistics tuple ``(hits, misses, maxsize, currsize)`` with f.cache_info(). Clear the cache and statistics with f.cache_clear(). Access the underlying function with f.__wrapped__. """ def decorating_function(user_function): stats = [0, 0] # Hits, misses data = {} lastused = {} @functools.wraps(user_function) def wrapper(*args): try: result = data[args] stats[0] += 1 # Hit except KeyError: stats[1] += 1 # Miss if len(data) == maxsize: for k, _ in nsmallest(maxsize // 10 or 1, iteritems(lastused), key=itemgetter(1)): del data[k] del lastused[k] data[args] = user_function(*args) result = data[args] finally: lastused[args] = time() return result def cache_info(): return stats[0], stats[1], maxsize, len(data) def cache_clear(): data.clear() lastused.clear() stats[0] = stats[1] = 0 wrapper.cache_info = cache_info wrapper.cache_clear = cache_clear return wrapper return decorating_function def lfu_cache(maxsize=100): """A simple cache that, when the cache is full, deletes the least frequently used 10% of the cached values. This function duplicates (more-or-less) the protocol of the ``functools.lru_cache`` decorator in the Python 3.2 standard library. Arguments to the cached function must be hashable. View the cache statistics tuple ``(hits, misses, maxsize, currsize)`` with f.cache_info(). Clear the cache and statistics with f.cache_clear(). Access the underlying function with f.__wrapped__. 
""" def decorating_function(user_function): stats = [0, 0] # Hits, misses data = {} usecount = Counter() @functools.wraps(user_function) def wrapper(*args): try: result = data[args] stats[0] += 1 # Hit except KeyError: stats[1] += 1 # Miss if len(data) == maxsize: for k, _ in nsmallest(maxsize // 10 or 1, iteritems(usecount), key=itemgetter(1)): del data[k] del usecount[k] data[args] = user_function(*args) result = data[args] finally: usecount[args] += 1 return result def cache_info(): return stats[0], stats[1], maxsize, len(data) def cache_clear(): data.clear() usecount.clear() wrapper.cache_info = cache_info wrapper.cache_clear = cache_clear return wrapper return decorating_function def random_cache(maxsize=100): """A very simple cache that, when the cache is filled, deletes 10% of the cached values AT RANDOM. This function duplicates (more-or-less) the protocol of the ``functools.lru_cache`` decorator in the Python 3.2 standard library. Arguments to the cached function must be hashable. View the cache statistics tuple ``(hits, misses, maxsize, currsize)`` with f.cache_info(). Clear the cache and statistics with f.cache_clear(). Access the underlying function with f.__wrapped__. """ def decorating_function(user_function): stats = [0, 0] # hits, misses data = {} @functools.wraps(user_function) def wrapper(*args): try: result = data[args] stats[0] += 1 # Hit except KeyError: stats[1] += 1 # Miss if len(data) == maxsize: keys = data.keys() for i in xrange(maxsize // 10 or 1): n = random.randint(0, len(keys) - 1) k = keys.pop(n) del data[k] data[args] = user_function(*args) result = data[args] return result def cache_info(): return stats[0], stats[1], maxsize, len(data) def cache_clear(): data.clear() wrapper.cache_info = cache_info wrapper.cache_clear = cache_clear return wrapper return decorating_function def db_lru_cache(maxsize=100): """Double-barrel least-recently-used cache decorator. This is a simple LRU algorithm that keeps a primary and secondary dict. Keys are checked in the primary dict, and then the secondary. Once the primary dict fills up, the secondary dict is cleared and the two dicts are swapped. This function duplicates (more-or-less) the protocol of the ``functools.lru_cache`` decorator in the Python 3.2 standard library. Arguments to the cached function must be hashable. View the cache statistics tuple ``(hits, misses, maxsize, currsize)`` with f.cache_info(). Clear the cache and statistics with f.cache_clear(). Access the underlying function with f.__wrapped__. """ def decorating_function(user_function): # Cache1, Cache2, Pointer, Hits, Misses stats = [{}, {}, 0, 0, 0] @functools.wraps(user_function) def wrapper(*args): ptr = stats[2] a = stats[ptr] b = stats[not ptr] key = args if key in a: stats[3] += 1 # Hit return a[key] elif key in b: stats[3] += 1 # Hit return b[key] else: stats[4] += 1 # Miss result = user_function(*args) a[key] = result if len(a) >= maxsize: stats[2] = not ptr b.clear() return result def cache_info(): return stats[3], stats[4], maxsize, len(stats[0]) + len(stats[1]) def cache_clear(): """Clear the cache and cache statistics""" stats[0].clear() stats[1].clear() stats[3] = stats[4] = 0 wrapper.cache_info = cache_info wrapper.cache_clear = cache_clear return wrapper return decorating_function def clockface_lru_cache(maxsize=100): """Least-recently-used cache decorator. 
This function duplicates (more-or-less) the protocol of the ``functools.lru_cache`` decorator in the Python 3.2 standard library, but uses the clock face LRU algorithm instead of an ordered dictionary. If *maxsize* is set to None, the LRU features are disabled and the cache can grow without bound. Arguments to the cached function must be hashable. View the cache statistics named tuple (hits, misses, maxsize, currsize) with f.cache_info(). Clear the cache and statistics with f.cache_clear(). Access the underlying function with f.__wrapped__. """ def decorating_function(user_function): stats = [0, 0, 0] # hits, misses, hand data = {} if maxsize: # The keys at each point on the clock face clock_keys = [None] * maxsize # The "referenced" bits at each point on the clock face clock_refs = array("B", (0 for _ in xrange(maxsize))) lock = Lock() @functools.wraps(user_function) def wrapper(*args): key = args try: with lock: pos, result = data[key] # The key is in the cache. Set the key's reference bit clock_refs[pos] = 1 # Record a cache hit stats[0] += 1 except KeyError: # Compute the value result = user_function(*args) with lock: # Current position of the clock hand hand = stats[2] # Remember to stop here after a full revolution end = hand # Sweep around the clock looking for a position with # the reference bit off while True: hand = (hand + 1) % maxsize current_ref = clock_refs[hand] if current_ref: # This position's "referenced" bit is set. Turn # the bit off and move on. clock_refs[hand] = 0 elif not current_ref or hand == end: # We've either found a position with the # "reference" bit off or reached the end of the # circular cache. So we'll replace this # position with the new key current_key = clock_keys[hand] if current_key in data: del data[current_key] clock_keys[hand] = key clock_refs[hand] = 1 break # Put the key and result in the cache data[key] = (hand, result) # Save the new hand position stats[2] = hand # Record a cache miss stats[1] += 1 return result else: @functools.wraps(user_function) def wrapper(*args): key = args try: result = data[key] stats[0] += 1 except KeyError: result = user_function(*args) data[key] = result stats[1] += 1 return result def cache_info(): return stats[0], stats[1], maxsize, len(data) def cache_clear(): """Clear the cache and cache statistics""" data.clear() stats[0] = stats[1] = stats[2] = 0 for i in xrange(maxsize): clock_keys[i] = None clock_refs[i] = 0 wrapper.cache_info = cache_info wrapper.cache_clear = cache_clear return wrapper return decorating_function Whoosh-2.5.7/src/whoosh/util/filelock.py0000644000076500000240000001225312254366350020312 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. """ This module contains classes implementing exclusive locks for platforms with fcntl (UNIX and Mac OS X) and Windows. Whoosh originally used directory creation as a locking method, but it had the problem that if the program crashed the lock directory was left behind and would keep the index locked until it was cleaned up. Using OS-level file locks fixes this. """ import errno import os import sys import time def try_for(fn, timeout=5.0, delay=0.1): """Calls ``fn`` every ``delay`` seconds until it returns True or ``timeout`` seconds elapse. Returns True if the lock was acquired, or False if the timeout was reached. :param timeout: Length of time (in seconds) to keep retrying to acquire the lock. 0 means return immediately. Only used when blocking is False. :param delay: How often (in seconds) to retry acquiring the lock during the timeout period. Only used when blocking is False and timeout > 0. """ until = time.time() + timeout v = fn() while not v and time.time() < until: time.sleep(delay) v = fn() return v class LockBase(object): """Base class for file locks. """ def __init__(self, filename): self.fd = None self.filename = filename self.locked = False def __del__(self): if hasattr(self, "fd") and self.fd: try: self.release() except: pass def acquire(self, blocking=False): """Acquire the lock. Returns True if the lock was acquired. :param blocking: if True, call blocks until the lock is acquired. This may not be available on all platforms. On Windows, this is actually just a delay of 10 seconds, rechecking every second. """ pass def release(self): pass class FcntlLock(LockBase): """File lock based on UNIX-only fcntl module. """ def acquire(self, blocking=False): import fcntl # @UnresolvedImport flags = os.O_CREAT | os.O_WRONLY self.fd = os.open(self.filename, flags) mode = fcntl.LOCK_EX if not blocking: mode |= fcntl.LOCK_NB try: fcntl.flock(self.fd, mode) self.locked = True return True except IOError: e = sys.exc_info()[1] if e.errno not in (errno.EAGAIN, errno.EACCES): raise os.close(self.fd) self.fd = None return False def release(self): if self.fd is None: raise Exception("Lock was not acquired") import fcntl # @UnresolvedImport fcntl.flock(self.fd, fcntl.LOCK_UN) os.close(self.fd) self.fd = None class MsvcrtLock(LockBase): """File lock based on Windows-only msvcrt module. 
""" def acquire(self, blocking=False): import msvcrt # @UnresolvedImport flags = os.O_CREAT | os.O_WRONLY mode = msvcrt.LK_NBLCK if blocking: mode = msvcrt.LK_LOCK self.fd = os.open(self.filename, flags) try: msvcrt.locking(self.fd, mode, 1) return True except IOError: e = sys.exc_info()[1] if e.errno not in (errno.EAGAIN, errno.EACCES, errno.EDEADLK): raise os.close(self.fd) self.fd = None return False def release(self): import msvcrt # @UnresolvedImport if self.fd is None: raise Exception("Lock was not acquired") msvcrt.locking(self.fd, msvcrt.LK_UNLCK, 1) os.close(self.fd) self.fd = None if os.name == "nt": FileLock = MsvcrtLock else: FileLock = FcntlLock Whoosh-2.5.7/src/whoosh/util/loading.py0000644000076500000240000000622512254366350020141 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. import pickle class RenamingUnpickler(pickle.Unpickler): """Subclasses ``pickle.Unpickler`` to allow remapping of class names before loading them. """ def __init__(self, f, objmap, shortcuts=None): pickle.Unpickler.__init__(self, f) if shortcuts: objmap = dict((k % shortcuts, v % shortcuts) for k, v in objmap.items()) self._objmap = objmap def find_class(self, modulename, objname): fqname = "%s.%s" % (modulename, objname) if fqname in self._objmap: fqname = self._objmap[fqname] try: obj = find_object(fqname) except ImportError: raise ImportError("Couldn't find %r" % fqname) return obj def find_object(name, blacklist=None, whitelist=None): """Imports and returns an object given a fully qualified name. 
>>> find_object("whoosh.analysis.StopFilter") """ if blacklist: for pre in blacklist: if name.startswith(pre): raise TypeError("%r: can't instantiate names starting with %r" % (name, pre)) if whitelist: passes = False for pre in whitelist: if name.startswith(pre): passes = True break if not passes: raise TypeError("Can't instantiate %r" % name) lastdot = name.rfind(".") assert lastdot > -1, "Name %r must be fully qualified" % name modname = name[:lastdot] clsname = name[lastdot + 1:] mod = __import__(modname, fromlist=[clsname]) cls = getattr(mod, clsname) return cls Whoosh-2.5.7/src/whoosh/util/numeric.py0000644000076500000240000002602512254366350020166 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
import math, struct from array import array from bisect import bisect_left from struct import pack, unpack from whoosh.compat import b, long_type from whoosh.system import pack_byte, unpack_byte, pack_ushort, unpack_ushort from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong from whoosh.system import pack_float, unpack_float, pack_double, unpack_double NaN = struct.unpack("i") _qstruct = struct.Struct(">q") _dstruct = struct.Struct(">d") _ipack, _iunpack = _istruct.pack, _istruct.unpack _qpack, _qunpack = _qstruct.pack, _qstruct.unpack _dpack, _dunpack = _dstruct.pack, _dstruct.unpack def to_sortable(numtype, intsize, signed, x): if numtype is int or numtype is long_type: if signed: x += (1 << intsize - 1) return x else: return float_to_sortable_long(x, signed) def from_sortable(numtype, intsize, signed, x): if numtype is int or numtype is long_type: if signed: x -= (1 << intsize - 1) return x else: return sortable_long_to_float(x, signed) def float_to_sortable_long(x, signed): x = _qunpack(_dpack(x))[0] if x < 0: x ^= 0x7fffffffffffffff if signed: x += 1 << 63 assert x >= 0 return x def sortable_long_to_float(x, signed): if signed: x -= 1 << 63 if x < 0: x ^= 0x7fffffffffffffff x = _dunpack(_qpack(x))[0] return x # Functions for generating tiered ranges def split_ranges(intsize, step, start, end): """Splits a range of numbers (from ``start`` to ``end``, inclusive) into a sequence of trie ranges of the form ``(start, end, shift)``. The consumer of these tuples is expected to shift the ``start`` and ``end`` right by ``shift``. This is used for generating term ranges for a numeric field. The queries for the edges of the range are generated at high precision and large blocks in the middle are generated at low precision. """ shift = 0 while True: diff = 1 << (shift + step) mask = ((1 << step) - 1) << shift setbits = lambda x: x | ((1 << shift) - 1) haslower = (start & mask) != 0 hasupper = (end & mask) != mask not_mask = ~mask & ((1 << intsize + 1) - 1) nextstart = (start + diff if haslower else start) & not_mask nextend = (end - diff if hasupper else end) & not_mask if shift + step >= intsize or nextstart > nextend: yield (start, setbits(end), shift) break if haslower: yield (start, setbits(start | mask), shift) if hasupper: yield (end & not_mask, setbits(end), shift) start = nextstart end = nextend shift += step def tiered_ranges(numtype, intsize, signed, start, end, shift_step, startexcl, endexcl): assert numtype in (int, float) assert intsize in (8, 16, 32, 64) # Convert start and end values to sortable ints if start is None: start = 0 else: start = to_sortable(numtype, intsize, signed, start) if startexcl: start += 1 if end is None: end = 2 ** intsize - 1 else: end = to_sortable(numtype, intsize, signed, end) if endexcl: end -= 1 if not shift_step: return ((start, end, 0),) # Yield (rstart, rend, shift) ranges for the different resolutions return split_ranges(intsize, shift_step, start, end) # Float-to-byte encoding/decoding def float_to_byte(value, mantissabits=5, zeroexp=2): """Encodes a floating point number in a single byte. 
""" # Assume int size == float size fzero = (63 - zeroexp) << mantissabits bits = unpack("i", pack("f", value))[0] smallfloat = bits >> (24 - mantissabits) if smallfloat < fzero: # Map negative numbers and 0 to 0 # Map underflow to next smallest non-zero number if bits <= 0: result = chr(0) else: result = chr(1) elif smallfloat >= fzero + 0x100: # Map overflow to largest number result = chr(255) else: result = chr(smallfloat - fzero) return b(result) def byte_to_float(b, mantissabits=5, zeroexp=2): """Decodes a floating point number stored in a single byte. """ if type(b) is not int: b = ord(b) if b == 0: return 0.0 bits = (b & 0xff) << (24 - mantissabits) bits += (63 - zeroexp) << 24 return unpack("f", pack("i", bits))[0] # Length-to-byte approximation functions # Old implementation: #def length_to_byte(length): # """Returns a logarithmic approximation of the given number, in the range # 0-255. The approximation has high precision at the low end (e.g. # 1 -> 0, 2 -> 1, 3 -> 2 ...) and low precision at the high end. Numbers # equal to or greater than 108116 all approximate to 255. # # This is useful for storing field lengths, where the general case is small # documents and very large documents are more rare. # """ # # # This encoding formula works up to 108116 -> 255, so if the length is # # equal to or greater than that limit, just return 255. # if length >= 108116: # return 255 # # # The parameters of this formula where chosen heuristically so that low # # numbers would approximate closely, and the byte range 0-255 would cover # # a decent range of document lengths (i.e. 1 to ~100000). # return int(round(log((length / 27.0) + 1, 1.033))) #def _byte_to_length(n): # return int(round((pow(1.033, n) - 1) * 27)) #_b2l_cache = array("i", (_byte_to_length(i) for i in xrange(256))) #byte_to_length = _b2l_cache.__getitem__ # New implementation # Instead of computing the actual formula to get the byte for any given length, # precompute the length associated with each byte, and use bisect to find the # nearest value. This gives quite a large speed-up. # # Note that this does not give all the same answers as the old, "real" # implementation since this implementation always "rounds down" (thanks to the # bisect_left) while the old implementation would "round up" or "round down" # depending on the input. Since this is a fairly gross approximation anyway, # I don't think it matters much. 
# Values generated using the formula from the "old" implementation above _length_byte_cache = array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 23, 25, 26, 28, 30, 32, 34, 36, 38, 40, 42, 45, 47, 49, 52, 54, 57, 60, 63, 66, 69, 72, 75, 79, 82, 86, 89, 93, 97, 101, 106, 110, 114, 119, 124, 129, 134, 139, 145, 150, 156, 162, 169, 175, 182, 189, 196, 203, 211, 219, 227, 235, 244, 253, 262, 271, 281, 291, 302, 313, 324, 336, 348, 360, 373, 386, 399, 414, 428, 443, 459, 475, 491, 508, 526, 544, 563, 583, 603, 623, 645, 667, 690, 714, 738, 763, 789, 816, 844, 873, 903, 933, 965, 998, 1032, 1066, 1103, 1140, 1178, 1218, 1259, 1302, 1345, 1391, 1438, 1486, 1536, 1587, 1641, 1696, 1753, 1811, 1872, 1935, 1999, 2066, 2135, 2207, 2280, 2356, 2435, 2516, 2600, 2687, 2777, 2869, 2965, 3063, 3165, 3271, 3380, 3492, 3608, 3728, 3852, 3980, 4112, 4249, 4390, 4536, 4686, 4842, 5002, 5168, 5340, 5517, 5700, 5889, 6084, 6286, 6494, 6709, 6932, 7161, 7398, 7643, 7897, 8158, 8428, 8707, 8995, 9293, 9601, 9918, 10247, 10586, 10936, 11298, 11671, 12057, 12456, 12868, 13294, 13733, 14187, 14656, 15141, 15641, 16159, 16693, 17244, 17814, 18403, 19011, 19640, 20289, 20959, 21652, 22367, 23106, 23869, 24658, 25472, 26314, 27183, 28081, 29009, 29967, 30957, 31979, 33035, 34126, 35254, 36418, 37620, 38863, 40146, 41472, 42841, 44256, 45717, 47227, 48786, 50397, 52061, 53780, 55556, 57390, 59285, 61242, 63264, 65352, 67510, 69739, 72041, 74419, 76876, 79414, 82035, 84743, 87541, 90430, 93416, 96499, 99684, 102975, 106374]) def length_to_byte(length): if length is None: return 0 if length >= 106374: return 255 else: return bisect_left(_length_byte_cache, length) byte_to_length = _length_byte_cache.__getitem__ Whoosh-2.5.7/src/whoosh/util/numlists.py0000644000076500000240000002427112254366350020403 0ustar mattstaff00000000000000from array import array from whoosh.compat import xrange from whoosh.system import emptybytes from whoosh.system import pack_byte, unpack_byte from whoosh.system import pack_ushort_le, unpack_ushort_le from whoosh.system import pack_uint_le, unpack_uint_le def delta_encode(nums): base = 0 for n in nums: yield n - base base = n def delta_decode(nums): base = 0 for n in nums: base += n yield base class GrowableArray(object): def __init__(self, inittype="B", allow_longs=True): self.array = array(inittype) self._allow_longs = allow_longs def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self.array) def __len__(self): return len(self.array) def __iter__(self): return iter(self.array) def _retype(self, maxnum): if maxnum < 2 ** 16: newtype = "H" elif maxnum < 2 ** 31: newtype = "i" elif maxnum < 2 ** 32: newtype = "I" elif self._allow_longs: newtype = "q" else: raise OverflowError("%r is too big to fit in an array" % maxnum) try: self.array = array(newtype, iter(self.array)) except ValueError: self.array = list(self.array) def append(self, n): try: self.array.append(n) except OverflowError: self._retype(n) self.array.append(n) def extend(self, ns): append = self.append for n in ns: append(n) @property def typecode(self): if isinstance(self.array, array): return self.array.typecode else: return "q" def to_file(self, dbfile): if isinstance(self.array, array): dbfile.write_array(self.array) else: write_long = dbfile.write_long for n in self.array: write_long(n) # Number list encoding base class class NumberEncoding(object): maxint = None def write_nums(self, f, numbers): raise NotImplementedError def read_nums(self, f, n): raise NotImplementedError def 
write_deltas(self, f, numbers): return self.write_nums(f, list(delta_encode(numbers))) def read_deltas(self, f, n): return delta_decode(self.read_nums(f, n)) def get(self, f, pos, i): f.seek(pos) n = None for n in self.read_nums(f, i + 1): pass return n # Fixed width encodings class FixedEncoding(NumberEncoding): _encode = None _decode = None size = None def write_nums(self, f, numbers): _encode = self._encode for n in numbers: f.write(_encode(n)) def read_nums(self, f, n): _decode = self._decode for _ in xrange(n): yield _decode(f.read(self.size)) def get(self, f, pos, i): f.seek(pos + i * self.size) return self._decode(f.read(self.size)) class ByteEncoding(FixedEncoding): size = 1 maxint = 255 _encode = pack_byte _decode = unpack_byte class UShortEncoding(FixedEncoding): size = 2 maxint = 2 ** 16 - 1 _encode = pack_ushort_le _decode = unpack_ushort_le class UIntEncoding(FixedEncoding): size = 4 maxint = 2 ** 32 - 1 _encode = pack_uint_le _decode = unpack_uint_le # High-bit encoded variable-length integer class Varints(NumberEncoding): maxint = None def write_nums(self, f, numbers): for n in numbers: f.write_varint(n) def read_nums(self, f, n): for _ in xrange(n): yield f.read_varint() # Simple16 algorithm for storing arrays of positive integers (usually delta # encoded lists of sorted integers) # # 1. http://www2008.org/papers/pdf/p387-zhangA.pdf # 2. http://www2009.org/proceedings/pdf/p401.pdf class Simple16(NumberEncoding): # The maximum possible integer value Simple16 can encode is < 2^28. # Therefore, in order to use Simple16, the application must have its own # code to encode numbers in the range of [2^28, 2^32). A simple way is just # write those numbers as 32-bit integers (that is, no compression for very # big numbers). _numsize = 16 _bitsize = 28 maxint = 2 ** _bitsize - 1 # Number of stored numbers per code _num = [28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1] # Number of bits for each number per code _bits = [ (1,) * 28, (2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), (1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1), (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2), (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), (4, 3, 3, 3, 3, 3, 3, 3, 3), (3, 4, 4, 4, 4, 3, 3, 3), (4, 4, 4, 4, 4, 4, 4), (5, 5, 5, 5, 4, 4), (4, 4, 5, 5, 5, 5), (6, 6, 6, 5, 5), (5, 5, 6, 6, 6), (7, 7, 7, 7), (10, 9, 9), (14, 14), (28,), ] def write_nums(self, f, numbers): _compress = self._compress i = 0 while i < len(numbers): value, taken = _compress(numbers, i, len(numbers) - i) f.write_uint_le(value) i += taken def _compress(self, inarray, inoffset, n): _numsize = self._numsize _bitsize = self._bitsize _num = self._num _bits = self._bits for key in xrange(_numsize): value = key << _bitsize num = _num[key] if _num[key] < n else n bits = 0 j = 0 while j < num and inarray[inoffset + j] < (1 << _bits[key][j]): x = inarray[inoffset + j] value |= x << bits bits += _bits[key][j] j += 1 if j == num: return value, num raise Exception def read_nums(self, f, n): _decompress = self._decompress i = 0 while i < n: value = unpack_uint_le(f.read(4))[0] for v in _decompress(value, n - i): yield v i += 1 def _decompress(self, value, n): _numsize = self._numsize _bitsize = self._bitsize _num = self._num _bits = self._bits key = value >> _bitsize num = _num[key] if _num[key] < n else n bits = 0 for j in xrange(num): v = value >> bits yield v & (0xffffffff >> (32 - _bits[key][j])) bits += _bits[key][j] def get(self, f, pos, i): f.seek(pos) base = 0 value = 
unpack_uint_le(f.read(4)) key = value >> self._bitsize num = self._num[key] while i > base + num: base += num value = unpack_uint_le(f.read(4)) key = value >> self._bitsize num = self._num[key] offset = i - base if offset: value = value >> sum(self._bits[key][:offset]) return value & (2 ** self._bits[key][offset] - 1) # Google Packed Ints algorithm: a set of four numbers is preceded by a "key" # byte, which encodes how many bytes each of the next four integers use # (stored in the byte as four 2-bit numbers) class GInts(NumberEncoding): maxint = 2 ** 32 - 1 # Number of future bytes to expect after a "key" byte value of N -- used to # skip ahead from a key byte _lens = array("B", [4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16]) def key_to_sizes(self, key): """Returns a list of the sizes of the next four numbers given a key byte. """ return [(key >> (i * 2) & 3) + 1 for i in xrange(4)] def write_nums(self, f, numbers): buf = emptybytes count = 0 key = 0 for v in numbers: shift = count * 2 if v < 256: buf += pack_byte(v) elif v < 65536: key |= 1 << shift buf += pack_ushort_le(v) elif v < 16777216: key |= 2 << shift buf += pack_uint_le(v)[:3] else: key |= 3 << shift buf += pack_uint_le(v) count += 1 if count == 4: f.write_byte(key) f.write(buf) count = 0 key = 0 buf = emptybytes # Clear the buffer # Write out leftovers in the buffer if count: f.write_byte(key) f.write(buf) def read_nums(self, f, n): """Read N integers from the bytes stream dbfile. Expects that the file is positioned at a key byte. """ count = 0 key = None for _ in xrange(n): if count == 0: key = f.read_byte() code = key >> (count * 2) & 3 if code == 0: yield f.read_byte() elif code == 1: yield f.read_ushort_le() elif code == 2: yield unpack_uint_le(f.read(3) + "\x00")[0] else: yield f.read_uint_le() count = (count + 1) % 4 # def get(self, f, pos, i): # f.seek(pos) # base = 0 # key = f.read_byte() # while i > base + 4: # base += 4 # f.seek(self._lens[key], 1) # key = f.read_byte() # # for n in self.read_nums(f, (i + 1) - base): # pass # return n Whoosh-2.5.7/src/whoosh/util/testing.py0000644000076500000240000001064512254366350020202 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. 
Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. import os.path import random import shutil import sys import tempfile from contextlib import contextmanager from whoosh.filedb.filestore import FileStorage from whoosh.util import now, random_name class TempDir(object): def __init__(self, basename="", parentdir=None, ext=".whoosh", suppress=frozenset(), keepdir=False): self.basename = basename or random_name(8) self.parentdir = parentdir dirname = parentdir or tempfile.mkdtemp(ext, self.basename) self.dir = os.path.abspath(dirname) self.suppress = suppress self.keepdir = keepdir def __enter__(self): if not os.path.exists(self.dir): os.makedirs(self.dir) return self.dir def cleanup(self): pass def __exit__(self, exc_type, exc_val, exc_tb): self.cleanup() if not self.keepdir: try: shutil.rmtree(self.dir) except OSError: e = sys.exc_info()[1] #sys.stderr.write("Can't remove temp dir: " + str(e) + "\n") #if exc_type is None: # raise if exc_type is not None: if self.keepdir: sys.stderr.write("Temp dir=" + self.dir + "\n") if exc_type not in self.suppress: return False class TempStorage(TempDir): def __init__(self, debug=False, **kwargs): TempDir.__init__(self, **kwargs) self._debug = debug def cleanup(self): self.store.close() def __enter__(self): dirpath = TempDir.__enter__(self) self.store = FileStorage(dirpath, debug=self._debug) return self.store class TempIndex(TempStorage): def __init__(self, schema, ixname='', storage_debug=False, **kwargs): TempStorage.__init__(self, basename=ixname, debug=storage_debug, **kwargs) self.schema = schema def __enter__(self): fstore = TempStorage.__enter__(self) return fstore.create_index(self.schema, indexname=self.basename) def is_abstract_method(attr): """Returns True if the given object has __isabstractmethod__ == True. """ return (hasattr(attr, "__isabstractmethod__") and getattr(attr, "__isabstractmethod__")) def check_abstract_methods(base, subclass): """Raises AssertionError if ``subclass`` does not override a method on ``base`` that is marked as an abstract method. 
""" for attrname in dir(base): if attrname.startswith("_"): continue attr = getattr(base, attrname) if is_abstract_method(attr): oattr = getattr(subclass, attrname) if is_abstract_method(oattr): raise Exception("%s.%s not overridden" % (subclass.__name__, attrname)) @contextmanager def timing(name=None): t = now() yield t = now() - t print("%s: %0.06f s" % (name or '', t)) Whoosh-2.5.7/src/whoosh/util/text.py0000644000076500000240000001042412254366350017504 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. import codecs, re from whoosh.compat import string_type, u, byte # Note: these functions return a tuple of (text, length), so when you call # them, you have to add [0] on the end, e.g. str = utf8encode(unicode)[0] utf8encode = codecs.getencoder("utf-8") utf8decode = codecs.getdecoder("utf-8") # Prefix encoding functions def first_diff(a, b): """ Returns the position of the first differing character in the sequences a and b. For example, first_diff('render', 'rending') == 4. This function limits the return value to 255 so the difference can be encoded in a single byte. """ i = 0 while i <= 255 and i < len(a) and i < len(b) and a[i] == b[i]: i += 1 return i def prefix_encode(a, b): """ Compresses bytestring b as a byte representing the prefix it shares with a, followed by the suffix bytes. """ i = first_diff(a, b) return byte(i) + b[i:] def prefix_encode_all(ls): """Compresses the given list of (unicode) strings by storing each string (except the first one) as an integer (encoded in a byte) representing the prefix it shares with its predecessor, followed by the suffix encoded as UTF-8. """ last = u('') for w in ls: i = first_diff(last, w) yield chr(i) + w[i:].encode("utf-8") last = w def prefix_decode_all(ls): """Decompresses a list of strings compressed by prefix_encode(). 
""" last = u('') for w in ls: i = ord(w[0]) decoded = last[:i] + w[1:].decode("utf-8") yield decoded last = decoded # Natural key sorting function _nkre = re.compile(r"\D+|\d+", re.UNICODE) def _nkconv(i): try: return int(i) except ValueError: return i.lower() def natural_key(s): """Converts string ``s`` into a tuple that will sort "naturally" (i.e., ``name5`` will come before ``name10`` and ``1`` will come before ``A``). This function is designed to be used as the ``key`` argument to sorting functions. :param s: the str/unicode string to convert. :rtype: tuple """ # Use _nkre to split the input string into a sequence of # digit runs and non-digit runs. Then use _nkconv() to convert # the digit runs into ints and the non-digit runs to lowercase. return tuple(_nkconv(m) for m in _nkre.findall(s)) # Regular expression functions def rcompile(pattern, flags=0, verbose=False): """A wrapper for re.compile that checks whether "pattern" is a regex object or a string to be compiled, and automatically adds the re.UNICODE flag. """ if not isinstance(pattern, string_type): # If it's not a string, assume it's already a compiled pattern return pattern if verbose: flags |= re.VERBOSE return re.compile(pattern, re.UNICODE | flags) Whoosh-2.5.7/src/whoosh/util/times.py0000644000076500000240000004105012254366350017640 0ustar mattstaff00000000000000# Copyright 2010 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. import calendar import copy from datetime import date, datetime, timedelta from whoosh.compat import iteritems class TimeError(Exception): pass def relative_days(current_wday, wday, dir): """Returns the number of days (positive or negative) to the "next" or "last" of a certain weekday. ``current_wday`` and ``wday`` are numbers, i.e. 0 = monday, 1 = tuesday, 2 = wednesday, etc. >>> # Get the number of days to the next tuesday, if today is Sunday >>> relative_days(6, 1, 1) 2 :param current_wday: the number of the current weekday. :param wday: the target weekday. :param dir: -1 for the "last" (past) weekday, 1 for the "next" (future) weekday. 
""" if current_wday == wday: return 7 * dir if dir == 1: return (wday + 7 - current_wday) % 7 else: return (current_wday + 7 - wday) % 7 * -1 def timedelta_to_usecs(td): total = td.days * 86400000000 # Microseconds in a day total += td.seconds * 1000000 # Microseconds in a second total += td.microseconds return total def datetime_to_long(dt): """Converts a datetime object to a long integer representing the number of microseconds since ``datetime.min``. """ return timedelta_to_usecs(dt.replace(tzinfo=None) - dt.min) def long_to_datetime(x): """Converts a long integer representing the number of microseconds since ``datetime.min`` to a datetime object. """ days = x // 86400000000 # Microseconds in a day x -= days * 86400000000 seconds = x // 1000000 # Microseconds in a second x -= seconds * 1000000 return datetime.min + timedelta(days=days, seconds=seconds, microseconds=x) # Ambiguous datetime object class adatetime(object): """An "ambiguous" datetime object. This object acts like a ``datetime.datetime`` object but can have any of its attributes set to None, meaning unspecified. """ units = frozenset(("year", "month", "day", "hour", "minute", "second", "microsecond")) def __init__(self, year=None, month=None, day=None, hour=None, minute=None, second=None, microsecond=None): if isinstance(year, datetime): dt = year self.year, self.month, self.day = dt.year, dt.month, dt.day self.hour, self.minute, self.second = dt.hour, dt.minute, dt.second self.microsecond = dt.microsecond else: if month is not None and (month < 1 or month > 12): raise TimeError("month must be in 1..12") if day is not None and day < 1: raise TimeError("day must be greater than 1") if (year is not None and month is not None and day is not None and day > calendar.monthrange(year, month)[1]): raise TimeError("day is out of range for month") if hour is not None and (hour < 0 or hour > 23): raise TimeError("hour must be in 0..23") if minute is not None and (minute < 0 or minute > 59): raise TimeError("minute must be in 0..59") if second is not None and (second < 0 or second > 59): raise TimeError("second must be in 0..59") if microsecond is not None and (microsecond < 0 or microsecond > 999999): raise TimeError("microsecond must be in 0..999999") self.year, self.month, self.day = year, month, day self.hour, self.minute, self.second = hour, minute, second self.microsecond = microsecond def __eq__(self, other): if not other.__class__ is self.__class__: if not is_ambiguous(self) and isinstance(other, datetime): return fix(self) == other else: return False return all(getattr(self, unit) == getattr(other, unit) for unit in self.units) def __repr__(self): return "%s%r" % (self.__class__.__name__, self.tuple()) def tuple(self): """Returns the attributes of the ``adatetime`` object as a tuple of ``(year, month, day, hour, minute, second, microsecond)``. """ return (self.year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond) def date(self): return date(self.year, self.month, self.day) def copy(self): return adatetime(year=self.year, month=self.month, day=self.day, hour=self.hour, minute=self.minute, second=self.second, microsecond=self.microsecond) def replace(self, **kwargs): """Returns a copy of this object with the attributes given as keyword arguments replaced. 
>>> adt = adatetime(year=2009, month=10, day=31) >>> adt.replace(year=2010) (2010, 10, 31, None, None, None, None) """ newadatetime = self.copy() for key, value in iteritems(kwargs): if key in self.units: setattr(newadatetime, key, value) else: raise KeyError("Unknown argument %r" % key) return newadatetime def floor(self): """Returns a ``datetime`` version of this object with all unspecified (None) attributes replaced by their lowest values. This method raises an error if the ``adatetime`` object has no year. >>> adt = adatetime(year=2009, month=5) >>> adt.floor() datetime.datetime(2009, 5, 1, 0, 0, 0, 0) """ y, m, d, h, mn, s, ms = (self.year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond) if y is None: raise ValueError("Date has no year") if m is None: m = 1 if d is None: d = 1 if h is None: h = 0 if mn is None: mn = 0 if s is None: s = 0 if ms is None: ms = 0 return datetime(y, m, d, h, mn, s, ms) def ceil(self): """Returns a ``datetime`` version of this object with all unspecified (None) attributes replaced by their highest values. This method raises an error if the ``adatetime`` object has no year. >>> adt = adatetime(year=2009, month=5) >>> adt.floor() datetime.datetime(2009, 5, 30, 23, 59, 59, 999999) """ y, m, d, h, mn, s, ms = (self.year, self.month, self.day, self.hour, self.minute, self.second, self.microsecond) if y is None: raise ValueError("Date has no year") if m is None: m = 12 if d is None: d = calendar.monthrange(y, m)[1] if h is None: h = 23 if mn is None: mn = 59 if s is None: s = 59 if ms is None: ms = 999999 return datetime(y, m, d, h, mn, s, ms) def disambiguated(self, basedate): """Returns either a ``datetime`` or unambiguous ``timespan`` version of this object. Unless this ``adatetime`` object is full specified down to the microsecond, this method will return a timespan built from the "floor" and "ceil" of this object. This method raises an error if the ``adatetime`` object has no year. >>> adt = adatetime(year=2009, month=10, day=31) >>> adt.disambiguated() timespan(datetime(2009, 10, 31, 0, 0, 0, 0), datetime(2009, 10, 31, 23, 59 ,59, 999999) """ dt = self if not is_ambiguous(dt): return fix(dt) return timespan(dt, dt).disambiguated(basedate) # Time span class class timespan(object): """A span of time between two ``datetime`` or ``adatetime`` objects. """ def __init__(self, start, end): """ :param start: a ``datetime`` or ``adatetime`` object representing the start of the time span. :param end: a ``datetime`` or ``adatetime`` object representing the end of the time span. """ if not isinstance(start, (datetime, adatetime)): raise TimeError("%r is not a datetime object" % start) if not isinstance(end, (datetime, adatetime)): raise TimeError("%r is not a datetime object" % end) self.start = copy.copy(start) self.end = copy.copy(end) def __eq__(self, other): if not other.__class__ is self.__class__: return False return self.start == other.start and self.end == other.end def __repr__(self): return "%s(%r, %r)" % (self.__class__.__name__, self.start, self.end) def disambiguated(self, basedate, debug=0): """Returns an unambiguous version of this object. 
>>> start = adatetime(year=2009, month=2) >>> end = adatetime(year=2009, month=10) >>> ts = timespan(start, end) >>> ts timespan(adatetime(2009, 2, None, None, None, None, None), adatetime(2009, 10, None, None, None, None, None)) >>> td.disambiguated(datetime.now()) timespan(datetime(2009, 2, 28, 0, 0, 0, 0), datetime(2009, 10, 31, 23, 59 ,59, 999999) """ #- If year is in start but not end, use basedate.year for end #-- If year is in start but not end, but startdate is > basedate, # use "next " to get end month/year #- If year is in end but not start, copy year from end to start #- Support "next february", "last april", etc. start, end = copy.copy(self.start), copy.copy(self.end) start_year_was_amb = start.year is None end_year_was_amb = end.year is None if has_no_date(start) and has_no_date(end): # The start and end points are just times, so use the basedate # for the date information. by, bm, bd = basedate.year, basedate.month, basedate.day start = start.replace(year=by, month=bm, day=bd) end = end.replace(year=by, month=bm, day=bd) else: # If one side has a year and the other doesn't, the decision # of what year to assign to the ambiguous side is kind of # arbitrary. I've used a heuristic here based on how the range # "reads", but it may only be reasonable in English. And maybe # even just to me. if start.year is None and end.year is None: # No year on either side, use the basedate start.year = end.year = basedate.year elif start.year is None: # No year in the start, use the year from the end start.year = end.year elif end.year is None: end.year = max(start.year, basedate.year) if start.year == end.year: # Once again, if one side has a month and day but the other side # doesn't, the disambiguation is arbitrary. Does "3 am to 5 am # tomorrow" mean 3 AM today to 5 AM tomorrow, or 3am tomorrow to # 5 am tomorrow? What I picked is similar to the year: if the # end has a month+day and the start doesn't, copy the month+day # from the end to the start UNLESS that would make the end come # before the start on that day, in which case use the basedate # instead. If the start has a month+day and the end doesn't, use # the basedate. 
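# (Editor's note, a hedged illustration of the rule above rather than
# original source: with some basedate, a span like
# timespan(adatetime(hour=3), adatetime(month=12, day=25, hour=5))
# copies the month/day from the end to the start, so both sides land on
# Dec 25; but timespan(adatetime(hour=23), adatetime(month=12, day=25, hour=5))
# keeps the start on the basedate, because 11 PM would otherwise come
# after the 5 AM end time on that same day.)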
start_dm = not (start.month is None and start.day is None) end_dm = not (end.month is None and end.day is None) if end_dm and not start_dm: if start.floor().time() > end.ceil().time(): start.month = basedate.month start.day = basedate.day else: start.month = end.month start.day = end.day elif start_dm and not end_dm: end.month = basedate.month end.day = basedate.day if floor(start).date() > ceil(end).date(): # If the disambiguated dates are out of order: # - If no start year was given, reduce the start year to put the # start before the end # - If no end year was given, increase the end year to put the end # after the start # - If a year was specified for both, just swap the start and end if start_year_was_amb: start.year = end.year - 1 elif end_year_was_amb: end.year = start.year + 1 else: start, end = end, start start = floor(start) end = ceil(end) if start.date() == end.date() and start.time() > end.time(): # If the start and end are on the same day, but the start time # is after the end time, move the end time to the next day end += timedelta(days=1) return timespan(start, end) # Functions for working with datetime/adatetime objects def floor(at): if isinstance(at, datetime): return at return at.floor() def ceil(at): if isinstance(at, datetime): return at return at.ceil() def fill_in(at, basedate, units=adatetime.units): """Returns a copy of ``at`` with any unspecified (None) units filled in with values from ``basedate``. """ if isinstance(at, datetime): return at args = {} for unit in units: v = getattr(at, unit) if v is None: v = getattr(basedate, unit) args[unit] = v return fix(adatetime(**args)) def has_no_date(at): """Returns True if the given object is an ``adatetime`` where ``year``, ``month``, and ``day`` are all None. """ if isinstance(at, datetime): return False return at.year is None and at.month is None and at.day is None def has_no_time(at): """Returns True if the given object is an ``adatetime`` where ``hour``, ``minute``, ``second`` and ``microsecond`` are all None. """ if isinstance(at, datetime): return False return (at.hour is None and at.minute is None and at.second is None and at.microsecond is None) def is_ambiguous(at): """Returns True if the given object is an ``adatetime`` with any of its attributes equal to None. """ if isinstance(at, datetime): return False return any((getattr(at, attr) is None) for attr in adatetime.units) def is_void(at): """Returns True if the given object is an ``adatetime`` with all of its attributes equal to None. """ if isinstance(at, datetime): return False return all((getattr(at, attr) is None) for attr in adatetime.units) def fix(at): """If the given object is an ``adatetime`` that is unambiguous (because all its attributes are specified, that is, not equal to None), returns a ``datetime`` version of it. Otherwise returns the ``adatetime`` object unchanged. """ if is_ambiguous(at) or isinstance(at, datetime): return at return datetime(year=at.year, month=at.month, day=at.day, hour=at.hour, minute=at.minute, second=at.second, microsecond=at.microsecond) Whoosh-2.5.7/src/whoosh/util/varints.py0000644000076500000240000000617612254366350020217 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. 
Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from array import array from whoosh.compat import array_tobytes, xrange # Varint cache # Build a cache of the varint byte sequences for the first N integers, so we # don't have to constantly recalculate them on the fly. This makes a small but # noticeable difference. def _varint(i): a = array("B") while (i & ~0x7F) != 0: a.append((i & 0x7F) | 0x80) i = i >> 7 a.append(i) return array_tobytes(a) _varint_cache_size = 512 _varint_cache = [] for i in xrange(0, _varint_cache_size): _varint_cache.append(_varint(i)) _varint_cache = tuple(_varint_cache) def varint(i): """Encodes the given integer into a string of the minimum number of bytes. """ if i < len(_varint_cache): return _varint_cache[i] return _varint(i) def varint_to_int(vi): b = ord(vi[0]) p = 1 i = b & 0x7f shift = 7 while b & 0x80 != 0: b = ord(vi[p]) p += 1 i |= (b & 0x7F) << shift shift += 7 return i def signed_varint(i): """Zig-zag encodes a signed integer into a varint. """ if i >= 0: return varint(i << 1) return varint((i << 1) ^ (~0)) def decode_signed_varint(i): """Zig-zag decodes an integer value. """ if not i & 1: return i >> 1 return (i >> 1) ^ (~0) def read_varint(readfn): """ Reads a variable-length encoded integer. :param readfn: a callable that reads a given number of bytes, like file.read(). """ b = ord(readfn(1)) i = b & 0x7F shift = 7 while b & 0x80 != 0: b = ord(readfn(1)) i |= (b & 0x7F) << shift shift += 7 return i Whoosh-2.5.7/src/whoosh/util/versions.py0000644000076500000240000001223312254366350020370 0ustar mattstaff00000000000000# Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from whoosh.util.text import rcompile class BaseVersion(object): @classmethod def parse(cls, text): obj = cls() match = cls._version_exp.match(text) if match: groupdict = match.groupdict() for groupname, typ in cls._parts: v = groupdict.get(groupname) if v is not None: setattr(obj, groupname, typ(v)) return obj def __repr__(self): vs = ", ".join(repr(getattr(self, slot)) for slot in self.__slots__) return "%s(%s)" % (self.__class__.__name__, vs) def tuple(self): return tuple(getattr(self, slot) for slot in self.__slots__) def __eq__(self, other): if not hasattr(other, "tuple"): raise ValueError("Can't compare %r with %r" % (self, other)) return self.tuple() == other.tuple() def __lt__(self, other): if not hasattr(other, "tuple"): raise ValueError("Can't compare %r with %r" % (self, other)) return self.tuple() < other.tuple() # It's dumb that you have to define these def __gt__(self, other): if not hasattr(other, "tuple"): raise ValueError("Can't compare %r with %r" % (self, other)) return self.tuple() > other.tuple() def __ge__(self, other): if not hasattr(other, "tuple"): raise ValueError("Can't compare %r with %r" % (self, other)) return self.tuple() >= other.tuple() def __le__(self, other): if not hasattr(other, "tuple"): raise ValueError("Can't compare %r with %r" % (self, other)) return self.tuple() <= other.tuple() def __ne__(self, other): if not hasattr(other, "tuple"): raise ValueError("Can't compare %r with %r" % (self, other)) return self.tuple() != other.tuple() class SimpleVersion(BaseVersion): """An object that parses version numbers such as:: 12.2.5b The filter supports a limited subset of PEP 386 versions including:: 1 1.2 1.2c 1.2c3 1.2.3 1.2.3a 1.2.3b4 10.7.5rc1 999.999.999c999 """ _version_exp = rcompile(r""" ^ (?P\d{1,4}) ( [.](?P\d{1,4}) ( [.](?P\d{1,4}) )? ( (?P[abc]|rc) (?P\d{1,4})? )? )? 
$ """, verbose=True) # (groupid, method, skippable, default) _parts = [("major", int), ("minor", int), ("release", int), ("ex", str), ("exnum", int), ] _ex_bits = {"a": 0, "b": 1, "c": 2, "rc": 10, "z": 15} _bits_ex = dict((v, k) for k, v in _ex_bits.items()) __slots__ = ("major", "minor", "release", "ex", "exnum") def __init__(self, major=1, minor=0, release=0, ex="z", exnum=0): self.major = major self.minor = minor self.release = release self.ex = ex self.exnum = exnum def to_int(self): assert self.major < 1024 n = self.major << 34 assert self.minor < 1024 n |= self.minor << 24 assert self.release < 1024 n |= self.release << 14 exbits = self._ex_bits.get(self.ex, 15) n |= exbits << 10 assert self.exnum < 1024 n |= self.exnum return n @classmethod def from_int(cls, n): major = (n & (1023 << 34)) >> 34 minor = (n & (1023 << 24)) >> 24 release = (n & (1023 << 14)) >> 14 exbits = (n & (7 << 10)) >> 10 ex = cls._bits_ex.get(exbits, "z") exnum = n & 1023 return cls(major, minor, release, ex, exnum) Whoosh-2.5.7/src/whoosh/writing.py0000644000076500000240000013212712254366350017233 0ustar mattstaff00000000000000# Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
from __future__ import with_statement import threading, time from bisect import bisect_right from contextlib import contextmanager from whoosh import columns from whoosh.compat import abstractmethod, bytes_type from whoosh.externalsort import SortingPool from whoosh.fields import UnknownFieldError from whoosh.index import LockError from whoosh.system import emptybytes from whoosh.util import fib, random_name from whoosh.util.filelock import try_for from whoosh.util.text import utf8encode # Exceptions class IndexingError(Exception): pass # Document grouping context manager @contextmanager def groupmanager(writer): writer.start_group() yield writer.end_group() # Merge policies # A merge policy is a callable that takes the Index object, the SegmentWriter # object, and the current segment list (not including the segment being # written), and returns an updated segment list (not including the segment # being written). def NO_MERGE(writer, segments): """This policy does not merge any existing segments. """ return segments def MERGE_SMALL(writer, segments): """This policy merges small segments, where "small" is defined using a heuristic based on the fibonacci sequence. """ from whoosh.reading import SegmentReader newsegments = [] sorted_segment_list = sorted(segments, key=lambda s: s.doc_count_all()) total_docs = 0 for i, seg in enumerate(sorted_segment_list): count = seg.doc_count_all() if count > 0: total_docs += count if total_docs < fib(i + 5): reader = SegmentReader(writer.storage, writer.schema, seg) writer.add_reader(reader) reader.close() else: newsegments.append(seg) return newsegments def OPTIMIZE(writer, segments): """This policy merges all existing segments. """ from whoosh.reading import SegmentReader for seg in segments: reader = SegmentReader(writer.storage, writer.schema, seg) writer.add_reader(reader) reader.close() return [] def CLEAR(writer, segments): """This policy DELETES all existing segments and only writes the new segment. 
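In other words, after committing with this policy the index contains only
    the documents added through this writer; everything previously committed
    is dropped. A sketch of the intended usage (``myindex`` stands for any
    open index; the attribute form relies on the mergetype handling described
    in ``SegmentWriter._merge_segments`` below)::

        from whoosh import writing

        with myindex.writer() as w:
            # Replace the whole index with this batch of documents
            w.mergetype = writing.CLEAR
            w.add_document(title=u"Fresh start")

    Equivalently, you can pass ``mergetype=CLEAR`` to ``commit()`` directly.
    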
""" return [] # Customized sorting pool for postings class PostingPool(SortingPool): # Subclass whoosh.externalsort.SortingPool to use knowledge of # postings to set run size in bytes instead of items namechars = "abcdefghijklmnopqrstuvwxyz0123456789" def __init__(self, tempstore, segment, limitmb=128, **kwargs): SortingPool.__init__(self, **kwargs) self.tempstore = tempstore self.segment = segment self.limit = limitmb * 1024 * 1024 self.currentsize = 0 self.fieldnames = set() def _new_run(self): path = "%s.run" % random_name() f = self.tempstore.create_file(path).raw_file() return path, f def _open_run(self, path): return self.tempstore.open_file(path).raw_file() def _remove_run(self, path): return self.tempstore.delete_file(path) def add(self, item): # item = (fieldname, tbytes, docnum, weight, vbytes) assert isinstance(item[1], bytes_type), "tbytes=%r" % item[1] if item[4] is not None: assert isinstance(item[4], bytes_type), "vbytes=%r" % item[4] self.fieldnames.add(item[0]) size = (28 + 4 * 5 # tuple = 28 + 4 * length + 21 + len(item[0]) # fieldname = str = 21 + length + 26 + len(item[1]) * 2 # text = unicode = 26 + 2 * length + 18 # docnum = long = 18 + 16 # weight = float = 16 + 21 + len(item[4] or '')) # valuestring self.currentsize += size if self.currentsize > self.limit: self.save() self.current.append(item) def iter_postings(self): # This is just an alias for items() to be consistent with the # iter_postings()/add_postings() interface of a lot of other classes return self.items() def save(self): SortingPool.save(self) self.currentsize = 0 # Writer base class class IndexWriter(object): """High-level object for writing to an index. To get a writer for a particular index, call :meth:`~whoosh.index.Index.writer` on the Index object. >>> writer = myindex.writer() You can use this object as a context manager. If an exception is thrown from within the context it calls :meth:`~IndexWriter.cancel` to clean up temporary files, otherwise it calls :meth:`~IndexWriter.commit` when the context exits. >>> with myindex.writer() as w: ... w.add_document(title="First document", content="Hello there.") ... w.add_document(title="Second document", content="This is easy!") """ def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): if exc_type: self.cancel() else: self.commit() def group(self): """Returns a context manager that calls :meth:`~IndexWriter.start_group` and :meth:`~IndexWriter.end_group` for you, allowing you to use a ``with`` statement to group hierarchical documents:: with myindex.writer() as w: with w.group(): w.add_document(kind="class", name="Accumulator") w.add_document(kind="method", name="add") w.add_document(kind="method", name="get_result") w.add_document(kind="method", name="close") with w.group(): w.add_document(kind="class", name="Calculator") w.add_document(kind="method", name="add") w.add_document(kind="method", name="multiply") w.add_document(kind="method", name="get_result") w.add_document(kind="method", name="close") """ return groupmanager(self) def start_group(self): """Start indexing a group of hierarchical documents. 
The backend should ensure that these documents are all added to the same segment:: with myindex.writer() as w: w.start_group() w.add_document(kind="class", name="Accumulator") w.add_document(kind="method", name="add") w.add_document(kind="method", name="get_result") w.add_document(kind="method", name="close") w.end_group() w.start_group() w.add_document(kind="class", name="Calculator") w.add_document(kind="method", name="add") w.add_document(kind="method", name="multiply") w.add_document(kind="method", name="get_result") w.add_document(kind="method", name="close") w.end_group() A more convenient way to group documents is to use the :meth:`~IndexWriter.group` method and the ``with`` statement. """ pass def end_group(self): """Finish indexing a group of hierarchical documents. See :meth:`~IndexWriter.start_group`. """ pass def add_field(self, fieldname, fieldtype, **kwargs): """Adds a field to the index's schema. :param fieldname: the name of the field to add. :param fieldtype: an instantiated :class:`whoosh.fields.FieldType` object. """ self.schema.add(fieldname, fieldtype, **kwargs) def remove_field(self, fieldname, **kwargs): """Removes the named field from the index's schema. Depending on the backend implementation, this may or may not actually remove existing data for the field from the index. Optimizing the index should always clear out existing data for a removed field. """ self.schema.remove(fieldname, **kwargs) @abstractmethod def reader(self, **kwargs): """Returns a reader for the existing index. """ raise NotImplementedError def searcher(self, **kwargs): from whoosh.searching import Searcher return Searcher(self.reader(), **kwargs) def delete_by_term(self, fieldname, text, searcher=None): """Deletes any documents containing "term" in the "fieldname" field. This is useful when you have an indexed field containing a unique ID (such as "pathname") for each document. :returns: the number of documents deleted. """ from whoosh.query import Term q = Term(fieldname, text) return self.delete_by_query(q, searcher=searcher) def delete_by_query(self, q, searcher=None): """Deletes any documents matching a query object. :returns: the number of documents deleted. """ if searcher: s = searcher else: s = self.searcher() try: count = 0 for docnum in s.docs_for_query(q, for_deletion=True): self.delete_document(docnum) count += 1 finally: if not searcher: s.close() return count @abstractmethod def delete_document(self, docnum, delete=True): """Deletes a document by number. """ raise NotImplementedError @abstractmethod def add_document(self, **fields): """The keyword arguments map field names to the values to index/store:: w = myindex.writer() w.add_document(path=u"/a", title=u"First doc", text=u"Hello") w.commit() Depending on the field type, some fields may take objects other than unicode strings. For example, NUMERIC fields take numbers, and DATETIME fields take ``datetime.datetime`` objects:: from datetime import datetime, timedelta from whoosh import index from whoosh.fields import * schema = Schema(date=DATETIME, size=NUMERIC(float), content=TEXT) myindex = index.create_in("indexdir", schema) w = myindex.writer() w.add_document(date=datetime.now(), size=5.5, content=u"Hello") w.commit() Instead of a single object (i.e., unicode string, number, or datetime), you can supply a list or tuple of objects. For unicode strings, this bypasses the field's analyzer. 
For numbers and dates, this lets you add multiple values for the given field:: date1 = datetime.now() date2 = datetime(2005, 12, 25) date3 = datetime(1999, 1, 1) w.add_document(date=[date1, date2, date3], size=[9.5, 10], content=[u"alfa", u"bravo", u"charlie"]) For fields that are both indexed and stored, you can specify an alternate value to store using a keyword argument in the form "_stored_". For example, if you have a field named "title" and you want to index the text "a b c" but store the text "e f g", use keyword arguments like this:: writer.add_document(title=u"a b c", _stored_title=u"e f g") You can boost the weight of all terms in a certain field by specifying a ``__boost`` keyword argument. For example, if you have a field named "content", you can double the weight of this document for searches in the "content" field like this:: writer.add_document(content="a b c", _title_boost=2.0) You can boost every field at once using the ``_boost`` keyword. For example, to boost fields "a" and "b" by 2.0, and field "c" by 3.0:: writer.add_document(a="alfa", b="bravo", c="charlie", _boost=2.0, _c_boost=3.0) Note that some scoring algroithms, including Whoosh's default BM25F, do not work with term weights less than 1, so you should generally not use a boost factor less than 1. See also :meth:`Writer.update_document`. """ raise NotImplementedError @abstractmethod def add_reader(self, reader): raise NotImplementedError def _doc_boost(self, fields, default=1.0): if "_boost" in fields: return float(fields["_boost"]) else: return default def _field_boost(self, fields, fieldname, default=1.0): boostkw = "_%s_boost" % fieldname if boostkw in fields: return float(fields[boostkw]) else: return default def _unique_fields(self, fields): # Check which of the supplied fields are unique unique_fields = [name for name, field in self.schema.items() if name in fields and field.unique] return unique_fields def update_document(self, **fields): """The keyword arguments map field names to the values to index/store. This method adds a new document to the index, and automatically deletes any documents with the same values in any fields marked "unique" in the schema:: schema = fields.Schema(path=fields.ID(unique=True, stored=True), content=fields.TEXT) myindex = index.create_in("index", schema) w = myindex.writer() w.add_document(path=u"/", content=u"Mary had a lamb") w.commit() w = myindex.writer() w.update_document(path=u"/", content=u"Mary had a little lamb") w.commit() assert myindex.doc_count() == 1 It is safe to use ``update_document`` in place of ``add_document``; if there is no existing document to replace, it simply does an add. You cannot currently pass a list or tuple of values to a "unique" field. Because this method has to search for documents with the same unique fields and delete them before adding the new document, it is slower than using ``add_document``. * Marking more fields "unique" in the schema will make each ``update_document`` call slightly slower. * When you are updating multiple documents, it is faster to batch delete all changed documents and then use ``add_document`` to add the replacements instead of using ``update_document``. Note that this method will only replace a *committed* document; currently it cannot replace documents you've added to the IndexWriter but haven't yet committed. 
For example, if you do this: >>> writer.update_document(unique_id=u"1", content=u"Replace me") >>> writer.update_document(unique_id=u"1", content=u"Replacement") ...this will add two documents with the same value of ``unique_id``, instead of the second document replacing the first. See :meth:`Writer.add_document` for information on ``_stored_``, ``__boost``, and ``_boost`` keyword arguments. """ # Delete the set of documents matching the unique terms unique_fields = self._unique_fields(fields) if unique_fields: with self.searcher() as s: uniqueterms = [(name, fields[name]) for name in unique_fields] docs = s._find_unique(uniqueterms) for docnum in docs: self.delete_document(docnum) # Add the given fields self.add_document(**fields) def commit(self): """Finishes writing and unlocks the index. """ pass def cancel(self): """Cancels any documents/deletions added by this object and unlocks the index. """ pass # Codec-based writer class SegmentWriter(IndexWriter): def __init__(self, ix, poolclass=None, timeout=0.0, delay=0.1, _lk=True, limitmb=128, docbase=0, codec=None, compound=True, **kwargs): # Lock the index self.writelock = None if _lk: self.writelock = ix.lock("WRITELOCK") if not try_for(self.writelock.acquire, timeout=timeout, delay=delay): raise LockError if codec is None: from whoosh.codec import default_codec codec = default_codec() self.codec = codec # Get info from the index self.storage = ix.storage self.indexname = ix.indexname info = ix._read_toc() self.generation = info.generation + 1 self.schema = info.schema self.segments = info.segments self.docnum = self.docbase = docbase self._setup_doc_offsets() # Internals self._tempstorage = self.storage.temp_storage("%s.tmp" % self.indexname) newsegment = codec.new_segment(self.storage, self.indexname) self.newsegment = newsegment self.compound = compound and newsegment.should_assemble() self.is_closed = False self._added = False self.pool = PostingPool(self._tempstorage, self.newsegment, limitmb=limitmb) # Set up writers self.perdocwriter = codec.per_document_writer(self.storage, newsegment) self.fieldwriter = codec.field_writer(self.storage, newsegment) self.merge = True self.optimize = False self.mergetype = None def __repr__(self): return "<%s %r>" % (self.__class__.__name__, self.newsegment) def _check_state(self): if self.is_closed: raise IndexingError("This writer is closed") def _setup_doc_offsets(self): self._doc_offsets = [] base = 0 for s in self.segments: self._doc_offsets.append(base) base += s.doc_count_all() def _document_segment(self, docnum): #Returns the index.Segment object containing the given document #number. offsets = self._doc_offsets if len(offsets) == 1: return 0 return bisect_right(offsets, docnum) - 1 def _segment_and_docnum(self, docnum): #Returns an (index.Segment, segment_docnum) pair for the segment #containing the given document number. 
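        # A small worked example with hypothetical segment sizes: if the index
        # has segments of 10, 15 and 5 documents, _setup_doc_offsets builds
        # self._doc_offsets == [0, 10, 25], and then:
        #
        #   >>> from bisect import bisect_right
        #   >>> offsets = [0, 10, 25]
        #   >>> bisect_right(offsets, 12) - 1   # global docnum 12...
        #   1                                   # ...lives in segment 1
        #   >>> 12 - offsets[1]                 # ...as that segment's docnum 2
        #   2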
segmentnum = self._document_segment(docnum) offset = self._doc_offsets[segmentnum] segment = self.segments[segmentnum] return segment, docnum - offset def _process_posts(self, items, startdoc, docmap): schema = self.schema for fieldname, text, docnum, weight, vbytes in items: if fieldname not in schema: continue if docmap is not None: newdoc = docmap[docnum] else: newdoc = startdoc + docnum yield (fieldname, text, newdoc, weight, vbytes) def temp_storage(self): return self._tempstorage def add_field(self, fieldname, fieldspec, **kwargs): self._check_state() if self._added: raise Exception("Can't modify schema after adding data to writer") super(SegmentWriter, self).add_field(fieldname, fieldspec, **kwargs) def remove_field(self, fieldname): self._check_state() if self._added: raise Exception("Can't modify schema after adding data to writer") super(SegmentWriter, self).remove_field(fieldname) def has_deletions(self): """ Returns True if the current index has documents that are marked deleted but haven't been optimized out of the index yet. """ return any(s.has_deletions() for s in self.segments) def delete_document(self, docnum, delete=True): self._check_state() if docnum >= sum(seg.doc_count_all() for seg in self.segments): raise IndexingError("No document ID %r in this index" % docnum) segment, segdocnum = self._segment_and_docnum(docnum) segment.delete_document(segdocnum, delete=delete) def deleted_count(self): """ :returns: the total number of deleted documents in the index. """ return sum(s.deleted_count() for s in self.segments) def is_deleted(self, docnum): segment, segdocnum = self._segment_and_docnum(docnum) return segment.is_deleted(segdocnum) def reader(self, reuse=None): from whoosh.index import FileIndex self._check_state() return FileIndex._reader(self.storage, self.schema, self.segments, self.generation, reuse=reuse) def iter_postings(self): return self.pool.iter_postings() def add_postings_to_pool(self, reader, startdoc, docmap): items = self._process_posts(reader.iter_postings(), startdoc, docmap) add_post = self.pool.add for item in items: add_post(item) # For fields with separate spelling, copy the words from the graph into # the posting pool for fieldname, fieldobj in self.schema.items(): if (fieldobj.separate_spelling() and reader.has_word_graph(fieldname)): gr = reader._get_graph() cursor = gr.cursor(fieldname) for word in cursor.flatten(): # Adding a post where docnum=None marks it as a separate # spelling word add_post((fieldname, word, -1, -1, emptybytes)) def write_postings(self, lengths, items, startdoc, docmap): items = self._process_posts(items, startdoc, docmap) self.fieldwriter.add_postings(self.schema, lengths, items) def write_per_doc(self, fieldnames, reader): # Very bad hack: reader should be an IndexReader, but may be a # PerDocumentReader if this is called from multiproc, where the code # tries to be efficient by merging per-doc and terms separately. # TODO: fix this! 
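        # Note on docmap (illustrative numbers only, not from any real index):
        # when the source reader has deletions, the documents it yields are
        # renumbered compactly as they are copied, so a fixed offset is not
        # enough. For example, if the reader holds docs 0..4 with doc 2
        # deleted and self.docnum starts at 100, the loop below produces
        # docmap == {0: 100, 1: 101, 3: 102, 4: 103}, which
        # add_postings_to_pool() later uses to rewrite posting docnums.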
schema = self.schema if reader.has_deletions(): docmap = {} else: docmap = None pdw = self.perdocwriter # Open all column readers cols = {} for fieldname in fieldnames: fieldobj = schema[fieldname] coltype = fieldobj.column_type if coltype and reader.has_column(fieldname): creader = reader.column_reader(fieldname, coltype) if isinstance(creader, columns.TranslatingColumnReader): creader = creader.raw_column() cols[fieldname] = creader for docnum, stored in reader.iter_docs(): if docmap is not None: docmap[docnum] = self.docnum pdw.start_doc(self.docnum) for fieldname in fieldnames: fieldobj = schema[fieldname] length = reader.doc_field_length(docnum, fieldname) pdw.add_field(fieldname, fieldobj, stored.get(fieldname), length) if fieldobj.vector and reader.has_vector(docnum, fieldname): v = reader.vector(docnum, fieldname, fieldobj.vector) pdw.add_vector_matcher(fieldname, fieldobj, v) if fieldname in cols: cv = cols[fieldname][docnum] pdw.add_column_value(fieldname, fieldobj.column_type, cv) pdw.finish_doc() self.docnum += 1 return docmap def add_reader(self, reader): self._check_state() basedoc = self.docnum ndxnames = set(fname for fname in reader.indexed_field_names() if fname in self.schema) fieldnames = set(self.schema.names()) | ndxnames docmap = self.write_per_doc(fieldnames, reader) self.add_postings_to_pool(reader, basedoc, docmap) self._added = True def _check_fields(self, schema, fieldnames): # Check if the caller gave us a bogus field for name in fieldnames: if name not in schema: raise UnknownFieldError("No field named %r in %s" % (name, schema)) def add_document(self, **fields): self._check_state() perdocwriter = self.perdocwriter schema = self.schema docnum = self.docnum add_post = self.pool.add docboost = self._doc_boost(fields) fieldnames = sorted([name for name in fields.keys() if not name.startswith("_")]) self._check_fields(schema, fieldnames) perdocwriter.start_doc(docnum) for fieldname in fieldnames: value = fields.get(fieldname) if value is None: continue field = schema[fieldname] length = 0 if field.indexed: # TODO: Method for adding progressive field values, ie # setting start_pos/start_char? 
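                # Boost resolution (see _doc_boost/_field_boost above): an
                # explicit "_<fieldname>_boost" keyword wins, otherwise the
                # document-wide "_boost" value is used, and failing that the
                # boost is 1.0. For example, add_document(a=u"x", _boost=2.0,
                # _a_boost=3.0) indexes field "a" with boost 3.0 and every
                # other field with boost 2.0.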
fieldboost = self._field_boost(fields, fieldname, docboost) # Ask the field to return a list of (text, weight, vbytes) # tuples items = field.index(value) # Only store the length if the field is marked scorable scorable = field.scorable # Add the terms to the pool for tbytes, freq, weight, vbytes in items: weight *= fieldboost if scorable: length += freq add_post((fieldname, tbytes, docnum, weight, vbytes)) if field.separate_spelling(): # For fields which use different morphemes for spelling, # insert fake postings for the spellable words, where # docnum=-1 means "this is a spelling word" # TODO: think of something less hacktacular for word in field.spellable_words(value): word = utf8encode(word)[0] add_post((fieldname, word, -1, -1, emptybytes)) vformat = field.vector if vformat: analyzer = field.analyzer # Call the format's word_values method to get posting values vitems = vformat.word_values(value, analyzer, mode="index") # Remove unused frequency field from the tuple vitems = sorted((text, weight, vbytes) for text, _, weight, vbytes in vitems) perdocwriter.add_vector_items(fieldname, field, vitems) # Allow a custom value for stored field/column customval = fields.get("_stored_%s" % fieldname, value) # Add the stored value and length for this field to the per- # document writer sv = customval if field.stored else None perdocwriter.add_field(fieldname, field, sv, length) column = field.column_type if column and customval is not None: cv = field.to_column_value(customval) perdocwriter.add_column_value(fieldname, column, cv) perdocwriter.finish_doc() self._added = True self.docnum += 1 def doc_count(self): return self.docnum - self.docbase def get_segment(self): newsegment = self.newsegment newsegment.set_doc_count(self.docnum) return newsegment def per_document_reader(self): if not self.perdocwriter.is_closed: raise Exception("Per-doc writer is still open") return self.codec.per_document_reader(self.storage, self.get_segment()) # The following methods break out the commit functionality into smaller # pieces to allow MpWriter to call them individually def _merge_segments(self, mergetype, optimize, merge): # The writer supports two ways of setting mergetype/optimize/merge: # as attributes or as keyword arguments to commit(). Originally there # were just the keyword arguments, but then I added the ability to use # the writer as a context manager using "with", so the user no longer # explicitly called commit(), hence the attributes mergetype = mergetype if mergetype is not None else self.mergetype optimize = optimize if optimize is not None else self.optimize merge = merge if merge is not None else self.merge if mergetype: pass elif optimize: mergetype = OPTIMIZE elif not merge: mergetype = NO_MERGE else: mergetype = MERGE_SMALL # Call the merge policy function. 
The policy may choose to merge # other segments into this writer's pool return mergetype(self, self.segments) def _flush_segment(self): self.perdocwriter.close() if self.codec.length_stats: pdr = self.per_document_reader() else: pdr = None postings = self.pool.iter_postings() self.fieldwriter.add_postings(self.schema, pdr, postings) self.fieldwriter.close() if pdr: pdr.close() def _close_segment(self): if not self.perdocwriter.is_closed: self.perdocwriter.close() if not self.fieldwriter.is_closed: self.fieldwriter.close() self.pool.cleanup() def _assemble_segment(self): if self.compound: # Assemble the segment files into a compound file newsegment = self.get_segment() newsegment.create_compound_file(self.storage) newsegment.compound = True def _partial_segment(self): # For use by a parent multiprocessing writer: Closes out the segment # but leaves the pool files intact so the parent can access them self._check_state() self.perdocwriter.close() self.fieldwriter.close() # Don't call self.pool.cleanup()! We want to grab the pool files. return self.get_segment() def _finalize_segment(self): # Finish writing segment self._flush_segment() # Close segment files self._close_segment() # Assemble compound segment if necessary self._assemble_segment() return self.get_segment() def _commit_toc(self, segments): from whoosh.index import TOC, clean_files # Write a new TOC with the new segment list (and delete old files) toc = TOC(self.schema, segments, self.generation) toc.write(self.storage, self.indexname) # Delete leftover files clean_files(self.storage, self.indexname, self.generation, segments) def _finish(self): self._tempstorage.destroy() if self.writelock: self.writelock.release() self.is_closed = True #self.storage.close() # Finalization methods def commit(self, mergetype=None, optimize=None, merge=None): """Finishes writing and saves all additions and changes to disk. There are four possible ways to use this method:: # Merge small segments but leave large segments, trying to # balance fast commits with fast searching: writer.commit() # Merge all segments into a single segment: writer.commit(optimize=True) # Don't merge any existing segments: writer.commit(merge=False) # Use a custom merge function writer.commit(mergetype=my_merge_function) :param mergetype: a custom merge function taking a Writer object and segment list as arguments, and returning a new segment list. If you supply a ``mergetype`` function, the values of the ``optimize`` and ``merge`` arguments are ignored. :param optimize: if True, all existing segments are merged with the documents you've added to this writer (and the value of the ``merge`` argument is ignored). :param merge: if False, do not merge small segments. """ self._check_state() # Merge old segments if necessary finalsegments = self._merge_segments(mergetype, optimize, merge) if self._added: # Flush the current segment being written and add it to the # list of remaining segments returned by the merge policy # function finalsegments.append(self._finalize_segment()) else: # Close segment files self._close_segment() # Write TOC self._commit_toc(finalsegments) # Final cleanup self._finish() def cancel(self): self._check_state() self._close_segment() self._finish() # Writer wrappers class AsyncWriter(threading.Thread, IndexWriter): """Convenience wrapper for a writer object that might fail due to locking (i.e. the ``filedb`` writer). This object will attempt once to obtain the underlying writer, and if it's successful, will simply pass method calls on to it. 
If this object *can't* obtain a writer immediately, it will *buffer* delete, add, and update method calls in memory until you call ``commit()``. At that point, this object will start running in a separate thread, trying to obtain the writer over and over, and once it obtains it, "replay" all the buffered method calls on it. In a typical scenario where you're adding a single or a few documents to the index as the result of a Web transaction, this lets you just create the writer, add, and commit, without having to worry about index locks, retries, etc. For example, to get an aynchronous writer, instead of this: >>> writer = myindex.writer() Do this: >>> from whoosh.writing import AsyncWriter >>> writer = AsyncWriter(myindex) """ def __init__(self, index, delay=0.25, writerargs=None): """ :param index: the :class:`whoosh.index.Index` to write to. :param delay: the delay (in seconds) between attempts to instantiate the actual writer. :param writerargs: an optional dictionary specifying keyword arguments to to be passed to the index's ``writer()`` method. """ threading.Thread.__init__(self) self.running = False self.index = index self.writerargs = writerargs or {} self.delay = delay self.events = [] try: self.writer = self.index.writer(**self.writerargs) except LockError: self.writer = None def reader(self): return self.index.reader() def searcher(self, **kwargs): from whoosh.searching import Searcher return Searcher(self.reader(), fromindex=self.index, **kwargs) def _record(self, method, args, kwargs): if self.writer: getattr(self.writer, method)(*args, **kwargs) else: self.events.append((method, args, kwargs)) def run(self): self.running = True writer = self.writer while writer is None: try: writer = self.index.writer(**self.writerargs) except LockError: time.sleep(self.delay) for method, args, kwargs in self.events: getattr(writer, method)(*args, **kwargs) writer.commit(*self.commitargs, **self.commitkwargs) def delete_document(self, *args, **kwargs): self._record("delete_document", args, kwargs) def add_document(self, *args, **kwargs): self._record("add_document", args, kwargs) def update_document(self, *args, **kwargs): self._record("update_document", args, kwargs) def add_field(self, *args, **kwargs): self._record("add_field", args, kwargs) def remove_field(self, *args, **kwargs): self._record("remove_field", args, kwargs) def delete_by_term(self, *args, **kwargs): self._record("delete_by_term", args, kwargs) def commit(self, *args, **kwargs): if self.writer: self.writer.commit(*args, **kwargs) else: self.commitargs, self.commitkwargs = args, kwargs self.start() def cancel(self, *args, **kwargs): if self.writer: self.writer.cancel(*args, **kwargs) # Ex post factor functions def add_spelling(ix, fieldnames, commit=True): """Adds spelling files to an existing index that was created without them, and modifies the schema so the given fields have the ``spelling`` attribute. Only works on filedb indexes. >>> ix = index.open_dir("testindex") >>> add_spelling(ix, ["content", "tags"]) :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object. :param fieldnames: a list of field names to create word graphs for. :param force: if True, overwrites existing word graph files. This is only useful for debugging. 
""" from whoosh.automata import fst from whoosh.reading import SegmentReader writer = ix.writer() storage = writer.storage schema = writer.schema segments = writer.segments for segment in segments: ext = segment.codec().FST_EXT r = SegmentReader(storage, schema, segment) f = segment.create_file(storage, ext) gw = fst.GraphWriter(f) for fieldname in fieldnames: gw.start_field(fieldname) for word in r.lexicon(fieldname): gw.insert(word) gw.finish_field() gw.close() for fieldname in fieldnames: schema[fieldname].spelling = True if commit: writer.commit(merge=False) # Buffered writer class class BufferedWriter(IndexWriter): """Convenience class that acts like a writer but buffers added documents to a buffer before dumping the buffered documents as a batch into the actual index. In scenarios where you are continuously adding single documents very rapidly (for example a web application where lots of users are adding content simultaneously), using a BufferedWriter is *much* faster than opening and committing a writer for each document you add. If you're adding batches of documents at a time, you can just use a regular writer. (This class may also be useful for batches of ``update_document`` calls. In a normal writer, ``update_document`` calls cannot update documents you've added *in that writer*. With ``BufferedWriter``, this will work.) To use this class, create it from your index and *keep it open*, sharing it between threads. >>> from whoosh.writing import BufferedWriter >>> writer = BufferedWriter(myindex, period=120, limit=20) >>> # Then you can use the writer to add and update documents >>> writer.add_document(...) >>> writer.add_document(...) >>> writer.add_document(...) >>> # Before the writer goes out of scope, call close() on it >>> writer.close() .. note:: This object stores documents in memory and may keep an underlying writer open, so you must explicitly call the :meth:`~BufferedWriter.close` method on this object before it goes out of scope to release the write lock and make sure any uncommitted changes are saved. You can read/search the combination of the on-disk index and the buffered documents in memory by calling ``BufferedWriter.reader()`` or ``BufferedWriter.searcher()``. This allows quasi-real-time search, where documents are available for searching as soon as they are buffered in memory, before they are committed to disk. .. tip:: By using a searcher from the shared writer, multiple *threads* can search the buffered documents. Of course, other *processes* will only see the documents that have been written to disk. If you want indexed documents to become available to other processes as soon as possible, you have to use a traditional writer instead of a ``BufferedWriter``. You can control how often the ``BufferedWriter`` flushes the in-memory index to disk using the ``period`` and ``limit`` arguments. ``period`` is the maximum number of seconds between commits. ``limit`` is the maximum number of additions to buffer between commits. You don't need to call ``commit()`` on the ``BufferedWriter`` manually. Doing so will just flush the buffered documents to disk early. You can continue to make changes after calling ``commit()``, and you can call ``commit()`` multiple times. """ def __init__(self, index, period=60, limit=10, writerargs=None, commitargs=None): """ :param index: the :class:`whoosh.index.Index` to write to. :param period: the maximum amount of time (in seconds) between commits. Set this to ``0`` or ``None`` to not use a timer. 
Do not set this any lower than a few seconds. :param limit: the maximum number of documents to buffer before committing. :param writerargs: dictionary specifying keyword arguments to be passed to the index's ``writer()`` method when creating a writer. """ self.index = index self.period = period self.limit = limit self.writerargs = writerargs or {} self.commitargs = commitargs or {} self.lock = threading.RLock() self.writer = self.index.writer(**self.writerargs) self._make_ram_index() self.bufferedcount = 0 # Start timer if self.period: self.timer = threading.Timer(self.period, self.commit) self.timer.start() def _make_ram_index(self): from whoosh.codec.memory import MemoryCodec self.codec = MemoryCodec() def _get_ram_reader(self): return self.codec.reader(self.schema) @property def schema(self): return self.writer.schema def reader(self, **kwargs): from whoosh.reading import MultiReader reader = self.writer.reader() with self.lock: ramreader = self._get_ram_reader() # If there are in-memory docs, combine the readers if ramreader.doc_count(): if reader.is_atomic(): reader = MultiReader([reader, ramreader]) else: reader.add_reader(ramreader) return reader def searcher(self, **kwargs): from whoosh.searching import Searcher return Searcher(self.reader(), fromindex=self.index, **kwargs) def close(self): self.commit(restart=False) def commit(self, restart=True): if self.period: self.timer.cancel() with self.lock: ramreader = self._get_ram_reader() self._make_ram_index() if self.bufferedcount: self.writer.add_reader(ramreader) self.writer.commit(**self.commitargs) self.bufferedcount = 0 if restart: self.writer = self.index.writer(**self.writerargs) if self.period: self.timer = threading.Timer(self.period, self.commit) self.timer.start() def add_reader(self, reader): # Pass through to the underlying on-disk index self.writer.add_reader(reader) self.commit() def add_document(self, **fields): with self.lock: # Hijack a writer to make the calls into the codec with self.codec.writer(self.writer.schema) as w: w.add_document(**fields) self.bufferedcount += 1 if self.bufferedcount >= self.limit: self.commit() def update_document(self, **fields): with self.lock: IndexWriter.update_document(self, **fields) def delete_document(self, docnum, delete=True): with self.lock: base = self.index.doc_count_all() if docnum < base: self.writer.delete_document(docnum, delete=delete) else: ramsegment = self.codec.segment ramsegment.delete_document(docnum - base, delete=delete) def is_deleted(self, docnum): base = self.index.doc_count_all() if docnum < base: return self.writer.is_deleted(docnum) else: return self._get_ram_writer().is_deleted(docnum - base) # Backwards compatibility with old name BatchWriter = BufferedWriter Whoosh-2.5.7/src/Whoosh.egg-info/0000755000076500000240000000000012277504634016626 5ustar mattstaff00000000000000Whoosh-2.5.7/src/Whoosh.egg-info/dependency_links.txt0000644000076500000240000000000112277504634022674 0ustar mattstaff00000000000000 Whoosh-2.5.7/src/Whoosh.egg-info/PKG-INFO0000644000076500000240000000703512277504634017730 0ustar mattstaff00000000000000Metadata-Version: 1.1 Name: Whoosh Version: 2.5.7 Summary: Fast, pure-Python full text indexing, search, and spell checking library. Home-page: http://bitbucket.org/mchaput/whoosh Author: Matt Chaput Author-email: matt@whoosh.ca License: Two-clause BSD license Description: About Whoosh ============ Whoosh is a fast, featureful full-text indexing and searching library implemented in pure Python. 
Programmers can use it to easily add search functionality to their applications and websites. Every part of how Whoosh works can be extended or replaced to meet your needs exactly. Some of Whoosh's features include: * Pythonic API. * Pure-Python. No compilation or binary packages needed, no mysterious crashes. * Fielded indexing and search. * Fast indexing and retrieval -- faster than any other pure-Python, scoring, full-text search solution I know of. * Pluggable scoring algorithm (including BM25F), text analysis, storage, posting format, etc. * Powerful query language. * Pure Python spell-checker (as far as I know, the only one). Whoosh might be useful in the following circumstances: * Anywhere a pure-Python solution is desirable to avoid having to build/compile native libraries (or force users to build/compile them). * As a research platform (at least for programmers that find Python easier to read and work with than Java ;) * When an easy-to-use Pythonic interface is more important to you than raw speed. Whoosh was created and is maintained by Matt Chaput. It was originally created for use in the online help system of Side Effects Software's 3D animation software Houdini. Side Effects Software Inc. graciously agreed to open-source the code. This software is licensed under the terms of the simplified BSD (A.K.A. "two clause" or "FreeBSD") license. See LICENSE.txt for information. Installing Whoosh ================= If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install`` or ``pip`` to download and install Whoosh automatically:: $ easy_install Whoosh or $ pip install Whoosh Learning more ============= * Read the online documentation at http://packages.python.org/Whoosh/ * Join the Whoosh mailing list at http://groups.google.com/group/whoosh * File bug reports and view the Whoosh wiki at http://bitbucket.org/mchaput/whoosh/ Getting the source ================== Download source releases from PyPI at http://pypi.python.org/pypi/Whoosh/ You can check out the latest version of the source code using Mercurial:: hg clone http://bitbucket.org/mchaput/whoosh Keywords: index search text spell Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: BSD License Classifier: Natural Language :: English Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python :: 2.5 Classifier: Programming Language :: Python :: 3 Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Topic :: Text Processing :: Indexing Whoosh-2.5.7/src/Whoosh.egg-info/SOURCES.txt0000644000076500000240000002024612277504634020516 0ustar mattstaff00000000000000LICENSE.txt MANIFEST.in README.txt setup.cfg setup.py benchmark/dcvgr10.txt.gz benchmark/dictionary.py benchmark/enron.py benchmark/marc21.py benchmark/reuters.py benchmark/reuters21578.txt.gz docs/build/html/_sources/analysis.txt docs/build/html/_sources/batch.txt docs/build/html/_sources/dates.txt docs/build/html/_sources/facets.txt docs/build/html/_sources/fieldcaches.txt docs/build/html/_sources/glossary.txt docs/build/html/_sources/highlight.txt docs/build/html/_sources/index.txt docs/build/html/_sources/indexing.txt docs/build/html/_sources/intro.txt docs/build/html/_sources/keywords.txt docs/build/html/_sources/nested.txt docs/build/html/_sources/ngrams.txt docs/build/html/_sources/parsing.txt docs/build/html/_sources/query.txt docs/build/html/_sources/querylang.txt 
docs/build/html/_sources/quickstart.txt docs/build/html/_sources/recipes.txt docs/build/html/_sources/schema.txt docs/build/html/_sources/searching.txt docs/build/html/_sources/spelling.txt docs/build/html/_sources/stemming.txt docs/build/html/_sources/threads.txt docs/build/html/_sources/api/analysis.txt docs/build/html/_sources/api/api.txt docs/build/html/_sources/api/collectors.txt docs/build/html/_sources/api/columns.txt docs/build/html/_sources/api/fields.txt docs/build/html/_sources/api/formats.txt docs/build/html/_sources/api/highlight.txt docs/build/html/_sources/api/idsets.txt docs/build/html/_sources/api/index.txt docs/build/html/_sources/api/matching.txt docs/build/html/_sources/api/qparser.txt docs/build/html/_sources/api/query.txt docs/build/html/_sources/api/reading.txt docs/build/html/_sources/api/scoring.txt docs/build/html/_sources/api/searching.txt docs/build/html/_sources/api/sorting.txt docs/build/html/_sources/api/spelling.txt docs/build/html/_sources/api/util.txt docs/build/html/_sources/api/writing.txt docs/build/html/_sources/api/codec/base.txt docs/build/html/_sources/api/filedb/filestore.txt docs/build/html/_sources/api/filedb/filetables.txt docs/build/html/_sources/api/filedb/structfile.txt docs/build/html/_sources/api/lang/morph_en.txt docs/build/html/_sources/api/lang/porter.txt docs/build/html/_sources/api/lang/wordnet.txt docs/build/html/_sources/api/support/charset.txt docs/build/html/_sources/api/support/levenshtein.txt docs/build/html/_sources/releases/0_3.txt docs/build/html/_sources/releases/1_0.txt docs/build/html/_sources/releases/2_0.txt docs/build/html/_sources/releases/index.txt docs/build/html/_sources/tech/backend.txt docs/build/html/_sources/tech/filedb.txt docs/build/html/_sources/tech/index.txt docs/source/analysis.rst docs/source/batch.rst docs/source/conf.py docs/source/dates.rst docs/source/facets.rst docs/source/fieldcaches.rst docs/source/glossary.rst docs/source/highlight.rst docs/source/index.rst docs/source/indexing.rst docs/source/intro.rst docs/source/keywords.rst docs/source/nested.rst docs/source/ngrams.rst docs/source/parsing.rst docs/source/query.rst docs/source/querylang.rst docs/source/quickstart.rst docs/source/recipes.rst docs/source/schema.rst docs/source/searching.rst docs/source/spelling.rst docs/source/stemming.rst docs/source/threads.rst docs/source/api/analysis.rst docs/source/api/api.rst docs/source/api/collectors.rst docs/source/api/columns.rst docs/source/api/fields.rst docs/source/api/formats.rst docs/source/api/highlight.rst docs/source/api/idsets.rst docs/source/api/index.rst docs/source/api/matching.rst docs/source/api/qparser.rst docs/source/api/query.rst docs/source/api/reading.rst docs/source/api/scoring.rst docs/source/api/searching.rst docs/source/api/sorting.rst docs/source/api/spelling.rst docs/source/api/util.rst docs/source/api/writing.rst docs/source/api/codec/base.rst docs/source/api/filedb/filestore.rst docs/source/api/filedb/filetables.rst docs/source/api/filedb/structfile.rst docs/source/api/lang/morph_en.rst docs/source/api/lang/porter.rst docs/source/api/lang/wordnet.rst docs/source/api/support/charset.rst docs/source/api/support/levenshtein.rst docs/source/releases/0_3.rst docs/source/releases/1_0.rst docs/source/releases/2_0.rst docs/source/releases/index.rst docs/source/tech/backend.rst docs/source/tech/filedb.rst docs/source/tech/index.rst files/whoosh.svg files/whoosh_16.png files/whoosh_35.png files/whoosh_64.png files/whoosh_small.svg src/Whoosh.egg-info/PKG-INFO 
src/Whoosh.egg-info/SOURCES.txt src/Whoosh.egg-info/dependency_links.txt src/Whoosh.egg-info/top_level.txt src/Whoosh.egg-info/zip-safe src/whoosh/__init__.py src/whoosh/classify.py src/whoosh/collectors.py src/whoosh/columns.py src/whoosh/compat.py src/whoosh/externalsort.py src/whoosh/fields.py src/whoosh/formats.py src/whoosh/highlight.py src/whoosh/idsets.py src/whoosh/index.py src/whoosh/legacy.py src/whoosh/multiproc.py src/whoosh/reading.py src/whoosh/scoring.py src/whoosh/searching.py src/whoosh/sorting.py src/whoosh/spelling.py src/whoosh/system.py src/whoosh/writing.py src/whoosh/analysis/__init__.py src/whoosh/analysis/acore.py src/whoosh/analysis/analyzers.py src/whoosh/analysis/filters.py src/whoosh/analysis/intraword.py src/whoosh/analysis/morph.py src/whoosh/analysis/ngrams.py src/whoosh/analysis/tokenizers.py src/whoosh/automata/__init__.py src/whoosh/automata/fst.py src/whoosh/automata/glob.py src/whoosh/automata/nfa.py src/whoosh/codec/__init__.py src/whoosh/codec/base.py src/whoosh/codec/memory.py src/whoosh/codec/plaintext.py src/whoosh/codec/whoosh2.py src/whoosh/codec/whoosh3.py src/whoosh/filedb/__init__.py src/whoosh/filedb/compound.py src/whoosh/filedb/filestore.py src/whoosh/filedb/filetables.py src/whoosh/filedb/gae.py src/whoosh/filedb/structfile.py src/whoosh/lang/__init__.py src/whoosh/lang/dmetaphone.py src/whoosh/lang/isri.py src/whoosh/lang/lovins.py src/whoosh/lang/morph_en.py src/whoosh/lang/paicehusk.py src/whoosh/lang/phonetic.py src/whoosh/lang/porter.py src/whoosh/lang/porter2.py src/whoosh/lang/stopwords.py src/whoosh/lang/wordnet.py src/whoosh/lang/snowball/__init__.py src/whoosh/lang/snowball/bases.py src/whoosh/lang/snowball/danish.py src/whoosh/lang/snowball/dutch.py src/whoosh/lang/snowball/english.py src/whoosh/lang/snowball/finnish.py src/whoosh/lang/snowball/french.py src/whoosh/lang/snowball/german.py src/whoosh/lang/snowball/hungarian.py src/whoosh/lang/snowball/italian.py src/whoosh/lang/snowball/norwegian.py src/whoosh/lang/snowball/portugese.py src/whoosh/lang/snowball/romanian.py src/whoosh/lang/snowball/russian.py src/whoosh/lang/snowball/spanish.py src/whoosh/lang/snowball/swedish.py src/whoosh/matching/__init__.py src/whoosh/matching/binary.py src/whoosh/matching/combo.py src/whoosh/matching/mcore.py src/whoosh/matching/wrappers.py src/whoosh/qparser/__init__.py src/whoosh/qparser/common.py src/whoosh/qparser/dateparse.py src/whoosh/qparser/default.py src/whoosh/qparser/plugins.py src/whoosh/qparser/syntax.py src/whoosh/qparser/taggers.py src/whoosh/query/__init__.py src/whoosh/query/compound.py src/whoosh/query/nested.py src/whoosh/query/positional.py src/whoosh/query/qcolumns.py src/whoosh/query/qcore.py src/whoosh/query/ranges.py src/whoosh/query/spans.py src/whoosh/query/terms.py src/whoosh/query/wrappers.py src/whoosh/support/__init__.py src/whoosh/support/base85.py src/whoosh/support/bench.py src/whoosh/support/charset.py src/whoosh/support/levenshtein.py src/whoosh/support/relativedelta.py src/whoosh/support/unicode.py src/whoosh/util/__init__.py src/whoosh/util/cache.py src/whoosh/util/filelock.py src/whoosh/util/loading.py src/whoosh/util/numeric.py src/whoosh/util/numlists.py src/whoosh/util/testing.py src/whoosh/util/text.py src/whoosh/util/times.py src/whoosh/util/varints.py src/whoosh/util/versions.py tests/test_analysis.py tests/test_bits.py tests/test_classify.py tests/test_codecs.py tests/test_collector.py tests/test_columns.py tests/test_compound.py tests/test_dateparse.py tests/test_dawg.py tests/test_fields.py 
tests/test_flexible.py tests/test_highlighting.py tests/test_indexing.py tests/test_matching.py tests/test_misc.py tests/test_mpwriter.py tests/test_nested.py tests/test_parse_plugins.py tests/test_parsing.py tests/test_postings.py tests/test_quality.py tests/test_queries.py tests/test_reading.py tests/test_results.py tests/test_searching.py tests/test_sorting.py tests/test_spans.py tests/test_spelling.py tests/test_tables.py tests/test_vectors.py tests/test_weightings.py tests/test_writing.pyWhoosh-2.5.7/src/Whoosh.egg-info/top_level.txt0000644000076500000240000000000712277504634021355 0ustar mattstaff00000000000000whoosh Whoosh-2.5.7/src/Whoosh.egg-info/zip-safe0000644000076500000240000000000112254367261020254 0ustar mattstaff00000000000000 Whoosh-2.5.7/tests/0000755000076500000240000000000012277504634014240 5ustar mattstaff00000000000000Whoosh-2.5.7/tests/test_analysis.py0000644000076500000240000004522512277504454017504 0ustar mattstaff00000000000000# coding=utf-8 from __future__ import with_statement import pytest from whoosh import analysis, fields, qparser from whoosh.compat import b, u, unichr from whoosh.compat import dumps from whoosh.filedb.filestore import RamStorage def test_regextokenizer(): value = u("AAAaaaBBBbbbCCCcccDDDddd") rex = analysis.RegexTokenizer("[A-Z]+") assert [t.text for t in rex(value)] == ["AAA", "BBB", "CCC", "DDD"] rex = analysis.RegexTokenizer("[A-Z]+", gaps=True) assert [t.text for t in rex(value)] == ["aaa", "bbb", "ccc", "ddd"] def test_path_tokenizer(): value = u("/alfa/bravo/charlie/delta/") pt = analysis.PathTokenizer() assert [t.text for t in pt(value)] == ["/alfa", "/alfa/bravo", "/alfa/bravo/charlie", "/alfa/bravo/charlie/delta"] def test_path_tokenizer2(): path_field = fields.TEXT(analyzer=analysis.PathTokenizer()) st = RamStorage() schema = fields.Schema(path=path_field) index = st.create_index(schema) with index.writer() as writer: writer.add_document(path=u('/alfa/brvo/charlie/delta/')) writer.add_document(path=u('/home/user/file.txt')) assert not index.is_empty() with index.reader() as reader: items = list(reader.all_terms()) assert 'path' in [field for field, value in items] assert b('/alfa') in [value for field, value in items] def test_composition1(): ca = analysis.RegexTokenizer() | analysis.LowercaseFilter() assert ca.__class__.__name__ == "CompositeAnalyzer" assert ca[0].__class__.__name__ == "RegexTokenizer" assert ca[1].__class__.__name__ == "LowercaseFilter" assert [t.text for t in ca(u("ABC 123"))] == ["abc", "123"] def test_composition2(): ca = analysis.RegexTokenizer() | analysis.LowercaseFilter() sa = ca | analysis.StopFilter() assert len(sa), 3 assert sa.__class__.__name__ == "CompositeAnalyzer" assert sa[0].__class__.__name__ == "RegexTokenizer" assert sa[1].__class__.__name__ == "LowercaseFilter" assert sa[2].__class__.__name__ == "StopFilter" assert [t.text for t in sa(u("The ABC 123"))], ["abc", "123"] def test_composition3(): sa = analysis.RegexTokenizer() | analysis.StopFilter() assert sa.__class__.__name__ == "CompositeAnalyzer" def test_composing_functions(): tokenizer = analysis.RegexTokenizer() def filter(tokens): for t in tokens: t.text = t.text.upper() yield t with pytest.raises(TypeError): tokenizer | filter def test_shared_composition(): shared = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter() ana1 = shared | analysis.NgramFilter(3) ana2 = shared | analysis.DoubleMetaphoneFilter() assert [t.text for t in ana1(u("hello"))] == ["hel", "ell", "llo"] assert [t.text for t in ana2(u("hello"))] == ["HL"] def 
test_multifilter(): f1 = analysis.LowercaseFilter() f2 = analysis.PassFilter() mf = analysis.MultiFilter(a=f1, b=f2) ana = analysis.RegexTokenizer(r"\S+") | mf text = u("ALFA BRAVO CHARLIE") assert [t.text for t in ana(text, mode="a")] == ["alfa", "bravo", "charlie"] assert [t.text for t in ana(text, mode="b")] == ["ALFA", "BRAVO", "CHARLIE"] def test_tee_filter(): target = u("Alfa Bravo Charlie") f1 = analysis.LowercaseFilter() f2 = analysis.ReverseTextFilter() ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2) result = " ".join([t.text for t in ana(target)]) assert result == "alfa aflA bravo ovarB charlie eilrahC" class ucfilter(analysis.Filter): def __call__(self, tokens): for t in tokens: t.text = t.text.upper() yield t f2 = analysis.ReverseTextFilter() | ucfilter() ana = analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2) result = " ".join([t.text for t in ana(target)]) assert result == "alfa AFLA bravo OVARB charlie EILRAHC" f1 = analysis.PassFilter() f2 = analysis.BiWordFilter() ana = (analysis.RegexTokenizer(r"\S+") | analysis.TeeFilter(f1, f2) | analysis.LowercaseFilter()) result = " ".join([t.text for t in ana(target)]) assert result == "alfa alfa-bravo bravo bravo-charlie charlie" def test_intraword(): iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True) ana = analysis.RegexTokenizer(r"\S+") | iwf def check(text, ls): assert [(t.pos, t.text) for t in ana(text)] == ls check(u("PowerShot)"), [(0, "Power"), (1, "Shot"), (1, "PowerShot")]) check(u("A's+B's&C's"), [(0, "A"), (1, "B"), (2, "C"), (2, "ABC")]) check(u("Super-Duper-XL500-42-AutoCoder!"), [(0, "Super"), (1, "Duper"), (2, "XL"), (2, "SuperDuperXL"), (3, "500"), (4, "42"), (4, "50042"), (5, "Auto"), (6, "Coder"), (6, "AutoCoder")]) def test_intraword_chars(): iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True) ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter() target = u("WiKiWo-rd") tokens = [(t.text, t.startchar, t.endchar) for t in ana(target, chars=True)] assert tokens == [("wi", 0, 2), ("ki", 2, 4), ("wo", 4, 6), ("rd", 7, 9), ("wikiword", 0, 9)] target = u("Zo WiKiWo-rd") tokens = [(t.text, t.startchar, t.endchar) for t in ana(target, chars=True)] assert tokens == [("zo", 0, 2), ("wi", 3, 5), ("ki", 5, 7), ("wo", 7, 9), ("rd", 10, 12), ("wikiword", 3, 12)] def test_intraword_possessive(): iwf = analysis.IntraWordFilter(mergewords=True, mergenums=True) ana = analysis.RegexTokenizer(r"\S+") | iwf | analysis.LowercaseFilter() target = u("O'Malley's-Bar") tokens = [(t.text, t.startchar, t.endchar) for t in ana(target, chars=True)] assert tokens == [("o", 0, 1), ("malley", 2, 8), ("bar", 11, 14), ("omalleybar", 0, 14)] def test_word_segments(): wordset = set(u("alfa bravo charlie delta").split()) cwf = analysis.CompoundWordFilter(wordset, keep_compound=True) ana = analysis.RegexTokenizer(r"\S+") | cwf target = u("alfacharlie bravodelta delto bravo subalfa") tokens = [t.text for t in ana(target)] assert tokens == ["alfacharlie", "alfa", "charlie", "bravodelta", "bravo", "delta", "delto", "bravo", "subalfa"] cwf = analysis.CompoundWordFilter(wordset, keep_compound=False) ana = analysis.RegexTokenizer(r"\S+") | cwf target = u("alfacharlie bravodelta delto bravo subalfa") tokens = [t.text for t in ana(target)] assert tokens == ["alfa", "charlie", "bravo", "delta", "delto", "bravo", "subalfa"] def test_biword(): ana = analysis.RegexTokenizer(r"\w+") | analysis.BiWordFilter() result = [t.copy() for t in ana(u("the sign of four"), chars=True, positions=True)] 
assert ["the-sign", "sign-of", "of-four"] == [t.text for t in result] assert [(0, 8), (4, 11), (9, 16)] == [(t.startchar, t.endchar) for t in result] assert [0, 1, 2] == [t.pos for t in result] result = [t.copy() for t in ana(u("single"))] assert len(result) == 1 assert result[0].text == "single" def test_shingles(): ana = analysis.RegexTokenizer(r"\w+") | analysis.ShingleFilter(3, " ") source = u("better a witty fool than a foolish wit") results = [t.copy() for t in ana(source, positions=True, chars=True)] assert [t.text for t in results] == [u('better a witty'), u('a witty fool'), u('witty fool than'), u('fool than a'), u('than a foolish'), u('a foolish wit')] assert [t.pos for t in results] == list(range(len(results))) for t in results: assert t.text == source[t.startchar:t.endchar] def test_unicode_blocks(): from whoosh.support.unicode import blocks, blockname, blocknum assert blockname(u('a')) == 'Basic Latin' assert blockname(unichr(0x0b80)) == 'Tamil' assert blockname(unichr(2048)) is None assert blocknum(u('a')) == 0 assert blocknum(unichr(0x0b80)) == 22 assert blocknum(unichr(2048)) is None assert blocknum(u('a')) == blocks.Basic_Latin # @UndefinedVariable assert blocknum(unichr(0x0b80)) == blocks.Tamil # @UndefinedVariable def test_double_metaphone(): from whoosh.lang.dmetaphone import double_metaphone names = {'maurice': ('MRS', None), 'aubrey': ('APR', None), 'cambrillo': ('KMPRL', 'KMPR'), 'heidi': ('HT', None), 'katherine': ('K0RN', 'KTRN'), 'Thumbail': ('0MPL', 'TMPL'), 'catherine': ('K0RN', 'KTRN'), 'richard': ('RXRT', 'RKRT'), 'bob': ('PP', None), 'eric': ('ARK', None), 'geoff': ('JF', 'KF'), 'Through': ('0R', 'TR'), 'Schwein': ('XN', 'XFN'), 'dave': ('TF', None), 'ray': ('R', None), 'steven': ('STFN', None), 'bryce': ('PRS', None), 'randy': ('RNT', None), 'bryan': ('PRN', None), 'Rapelje': ('RPL', None), 'brian': ('PRN', None), 'otto': ('AT', None), 'auto': ('AT', None), 'Dallas': ('TLS', None), 'maisey': ('MS', None), 'zhang': ('JNK', None), 'Chile': ('XL', None), 'Jose': ('HS', None), 'Arnow': ('ARN', 'ARNF'), 'solilijs': ('SLLS', None), 'Parachute': ('PRKT', None), 'Nowhere': ('NR', None), 'Tux': ('TKS', None)} dmn = name = None for name in names.keys(): dmn = double_metaphone(name) assert dmn == names[name] mf = (analysis.RegexTokenizer() | analysis.LowercaseFilter() | analysis.DoubleMetaphoneFilter()) results = [(t.text, t.boost) for t in mf(u("Spruce View"))] assert results == [('SPRS', 1.0), ('F', 1.0), ('FF', 0.5)] mf = (analysis.RegexTokenizer() | analysis.LowercaseFilter() | analysis.DoubleMetaphoneFilter(combine=True)) results = [(t.text, t.boost) for t in mf(u("Spruce View"))] assert results == [('spruce', 1.0), ('SPRS', 1.0), ('view', 1.0), ('F', 1.0), ('FF', 0.5)] namefield = fields.TEXT(analyzer=mf) texts = list(namefield.process_text(u("Spruce View"), mode="query")) assert texts == [u('spruce'), 'SPRS', u('view'), 'F', 'FF'] def test_substitution(): mf = analysis.RegexTokenizer(r"\S+") | analysis.SubstitutionFilter("-", "") assert ([t.text for t in mf(u("one-two th-re-ee four"))] == ["onetwo", "threee", "four"]) mf = (analysis.RegexTokenizer(r"\S+") | analysis.SubstitutionFilter("([^=]*)=(.*)", r"\2=\1")) assert [t.text for t in mf(u("a=b c=d ef"))] == ["b=a", "d=c", "ef"] def test_delimited_attribute(): ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter() results = [(t.text, t.boost) for t in ana(u("image render^2 file^0.5"))] assert results == [("image", 1.0), ("render", 2.0), ("file", 0.5)] def test_porter2(): from 
whoosh.lang.porter2 import stem plurals = ['caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned', 'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational', 'traditional', 'reference', 'colonizer', 'plotted'] singles = [stem(w) for w in plurals] assert singles == ['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre', 'own', 'humbl', 'size', 'meet', 'state', 'siez', 'item', 'sensat', 'tradit', 'refer', 'colon', 'plot'] assert stem("bill's") == "bill" assert stem("y's") == "y" #def test_pystemmer(): # Stemmer = pytest.importorskip("Stemmer") # # ana = (analysis.RegexTokenizer() # | analysis.LowercaseFilter() # | analysis.PyStemmerFilter()) # schema = fields.Schema(text=fields.TEXT(analyzer=ana)) # st = RamStorage() # # ix = st.create_index(schema) # with ix.writer() as w: # w.add_document(text=u("rains falling strangely")) # # ix = st.open_index() # with ix.writer() as w: # w.add_document(text=u("pains stalling strongly")) # # ix = st.open_index() # with ix.reader() as r: # assert (list(r.field_terms("text")) # == ["fall", "pain", "rain", "stall", "strang", "strong"]) def test_url(): sample = u("Visit http://bitbucket.org/mchaput/whoosh or " + "urn:isbn:5930502 or http://www.apple.com/.") anas = [analysis.SimpleAnalyzer(analysis.url_pattern), analysis.StandardAnalyzer(analysis.url_pattern, stoplist=None)] for ana in anas: ts = [t.text for t in ana(sample)] assert ts == [u('visit'), u('http://bitbucket.org/mchaput/whoosh'), u('or'), u('urn:isbn:5930502'), u('or'), u('http://www.apple.com/')] def test_name_field(): ana = (analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter() | analysis.DoubleMetaphoneFilter(combine=True)) namefield = fields.TEXT(analyzer=ana, multitoken_query="or") schema = fields.Schema(id=fields.STORED, name=namefield) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=u("one"), name=u("Leif Ericson")) w.commit() s = ix.searcher() qp = qparser.QueryParser("name", schema) q = qp.parse(u("leaf eriksen"), normalize=False) r = s.search(q) assert len(r) == 1 def test_start_pos(): from whoosh import formats ana = analysis.RegexTokenizer(r"\S+") | analysis.LowercaseFilter() kw = {"positions": True} tks = formats.tokens(u("alfa bravo charlie delta"), ana, kw) assert [t.pos for t in tks] == [0, 1, 2, 3] kw["start_pos"] = 3 ts = [t.copy() for t in formats.tokens(u("A B C D").split(), ana, kw)] assert " ".join([t.text for t in ts]) == "A B C D" assert [t.pos for t in ts] == [3, 4, 5, 6] def test_frowny_face(): # See https://bitbucket.org/mchaput/whoosh/issue/166/ ana = analysis.RegexTokenizer(r"\S+") | analysis.IntraWordFilter() # text is all delimiters tokens = [t.text for t in ana(u(":-("))] assert tokens == [] # text has consecutive delimiters tokens = [t.text for t in ana(u("LOL:)"))] assert tokens == ["LOL"] def test_ngrams(): s = u("abcdefg h ij klm") tk = analysis.RegexTokenizer(r"\S+") def dotest(f): ana = tk | f tokens = ana(s, positions=True, chars=True) return "/".join(t.text for t in tokens) f = analysis.NgramFilter(3, 4) assert dotest(f) == "abc/abcd/bcd/bcde/cde/cdef/def/defg/efg/klm" f = analysis.NgramFilter(3, 4, at="start") assert dotest(f) == "abc/abcd/klm" f = analysis.NgramFilter(3, 4, at="end") assert dotest(f) == "defg/efg/klm" ana = tk | analysis.NgramFilter(2, 5, at="end") tokens = [(t.text, t.startchar, t.endchar) for t in ana(s, chars=True)] assert tokens == [("cdefg", 2, 7), ("defg", 3, 7), ("efg", 4, 7), ("fg", 5, 7), ("ij", 10, 12), ("klm", 13, 16), ("lm", 14, 16)] 
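# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the upstream Whoosh test suite).
# The n-gram tests above build analyzers by piping a tokenizer into filters
# with the ``|`` operator.  The helper below restates that pattern as a
# minimal, self-contained example, using only APIs already exercised in this
# file (RegexTokenizer, LowercaseFilter, NgramFilter); the function name is
# hypothetical and nothing in the suite calls it.
def _example_ngram_analyzer_sketch():
    # Tokenize on whitespace, lowercase each token, then emit the 3- and
    # 4-character grams of every token.
    ana = (analysis.RegexTokenizer(r"\S+")
           | analysis.LowercaseFilter()
           | analysis.NgramFilter(3, 4))
    # Following the ordering demonstrated by test_ngrams above, "Search"
    # should yield: ["sea", "sear", "ear", "earc", "arc", "arch", "rch"]
    return [t.text for t in ana(u("Search"))]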
@pytest.mark.skipif("sys.version_info < (2,6)") def test_language_analyzer(): domain = [("da", u("Jeg gik mig over s\xf8 og land"), [u('gik'), u('s\xf8'), u('land')]), ("nl", u("Daar komt een muisje aangelopen"), [u('komt'), u('muisj'), u('aangelop')]), ("de", u("Berlin war ihm zu gro\xdf, da baut' er sich ein Schlo\xdf."), [u('berlin'), u('gross'), u('baut'), u('schloss')]), ("es", u("Por el mar corren las liebres"), ['mar', 'corr', 'liebr']), ] for lang, source, target in domain: ana = analysis.LanguageAnalyzer(lang) words = [t.text for t in ana(source)] assert words == target @pytest.mark.skipif("sys.version_info < (2,6)") def test_la_pickleability(): ana = analysis.LanguageAnalyzer("en") _ = dumps(ana, -1) def test_charset_pickeability(): from whoosh.support import charset charmap = charset.charset_table_to_dict(charset.default_charset) ana = analysis.StandardAnalyzer() | analysis.CharsetFilter(charmap) _ = dumps(ana, -1) ana = analysis.CharsetTokenizer(charmap) _ = dumps(ana, -1) def test_shingle_stopwords(): # Note that the stop list is None here ana = (analysis.RegexTokenizer() | analysis.StopFilter(stoplist=None, minsize=3) | analysis.ShingleFilter(size=3)) texts = [t.text for t in ana(u("some other stuff and then some things To Check "))] assert texts == ["some-other-stuff", "other-stuff-and", "stuff-and-then", "and-then-some", "then-some-things", "some-things-Check"] # Use a stop list here ana = (analysis.RegexTokenizer() | analysis.LowercaseFilter() | analysis.StopFilter() | analysis.ShingleFilter(size=3)) texts = [t.text for t in ana(u("some other stuff and then some things To Check "))] assert texts == ["some-other-stuff", "other-stuff-then", "stuff-then-some", "then-some-things", "some-things-check"] def test_biword_stopwords(): # Note that the stop list is None here ana = (analysis.RegexTokenizer() | analysis.StopFilter(stoplist=None, minsize=3) | analysis.BiWordFilter()) texts = [t.text for t in ana(u("stuff and then some"))] assert texts == ["stuff-and", "and-then", "then-some"] # Use a stop list here ana = (analysis.RegexTokenizer() | analysis.LowercaseFilter() | analysis.StopFilter() | analysis.BiWordFilter()) texts = [t.text for t in ana(u("stuff and then some"))] assert texts == ["stuff-then", "then-some"] @pytest.mark.skipif("sys.version_info < (2,6)") def test_stop_lang(): stopper = analysis.RegexTokenizer() | analysis.StopFilter() ls = [token.text for token in stopper(u("this is a test"))] assert ls == [u("test")] es_stopper = analysis.RegexTokenizer() | analysis.StopFilter(lang="es") ls = [token.text for token in es_stopper(u("el lapiz es en la mesa"))] assert ls == ["lapiz", "mesa"] def test_issue358(): t = analysis.RegexTokenizer("\w+") with pytest.raises(analysis.CompositionError): _ = t | analysis.StandardAnalyzer() def test_ngramwords_tokenizer(): tk = analysis.CommaSeparatedTokenizer() tags = fields.NGRAMWORDS(minsize=3, maxsize=50, tokenizer=tk, stored=True, queryor=True) schema = fields.Schema(tags=tags) Whoosh-2.5.7/tests/test_bits.py0000644000076500000240000001043312254366350016607 0ustar mattstaff00000000000000from whoosh.filedb.filestore import RamStorage from whoosh.idsets import BitSet, OnDiskBitSet, SortedIntSet def test_bit_basics(c=BitSet): b = c() assert not b assert 12 not in b b.update([0, 2, 4, 6, 7]) assert b assert ([(n in b) for n in range(10)] == [True, False, True, False, True, False, True, True, False, False]) b.add(9) assert 9 in b assert len(b) == 6 assert list(b.invert(10)) == [1, 3, 5, 8] b.discard(6) assert list(b) == [0, 2, 4, 7, 9] 
assert len(b) == 5 def test_len(c=BitSet): b = c() b.add(3) b.add(5) b.add(1024) assert len(b) == 3 b.add(5) assert len(b) == 3 b.discard(1000) assert len(b) == 3 b.discard(5) assert len(b) == 2 def test_union(c=BitSet): assert c([2, 4, 5]) | c([3, 9]) == c([2, 3, 4, 5, 9]) b = c([2, 4, 5]) b.update([3, 9]) assert list(b) == [2, 3, 4, 5, 9] b = c([2, 4, 5]) b.update(c([3, 9])) assert list(b) == [2, 3, 4, 5, 9] b = c([1, 2]) b.update([1, 5, 9]) assert list(b) == [1, 2, 5, 9] def test_intersection(c=BitSet): assert c([2, 4, 5]) & c([3, 9]) == c() assert c([2, 4, 5]) & c([4, 5, 9]) == c([4, 5]) b = c([2, 4, 5]) assert b.intersection([4, 5, 9]) == c([4, 5]) b.intersection_update([4, 5, 9]) assert list(b) == [4, 5] b = c([2, 4, 5]) b.intersection_update(c([4, 5, 9])) assert list(b) == [4, 5] def test_difference(c=BitSet): assert c([1, 3, 50, 72]) - c([3, 72]) == c([1, 50]) assert list(c([1, 3, 50, 72]).difference([3, 72])) == [1, 50] b = c([1, 3, 50, 72]) b.difference_update(c([3, 72])) assert list(b) == [1, 50] b = c([1, 3, 50, 72]) b.difference_update([3, 72]) assert list(b) == [1, 50] def test_copy(c=BitSet): b = c([1, 5, 100, 60]) assert b == b.copy() def test_clear(c=BitSet): b = c([1, 5, 100, 60]) b.clear() assert list(b) == [] def test_isdisjoint(c=BitSet): b = c([1, 7, 20, 100]) assert b.isdisjoint(c([2, 8, 25])) assert b.isdisjoint([2, 8, 25]) assert not b.isdisjoint(c([2, 7, 25])) assert not b.isdisjoint([1, 8, 25]) def test_before_after(c=BitSet): b = c([10, 11, 30, 50, 80]) assert b.after(0) == 10 assert b.after(7) == 10 assert b.after(8) == 10 assert b.after(10) == 11 assert b.after(11) == 30 assert b.after(30) == 50 assert b.after(33) == 50 assert b.after(38) == 50 assert b.after(41) == 50 assert b.after(42) == 50 assert b.after(45) == 50 assert b.after(47) == 50 assert b.after(50) == 80 assert b.after(80) is None assert b.before(0) is None assert b.before(99) == 80 assert b.before(81) == 80 assert b.before(80) == 50 assert b.before(50) == 30 assert b.before(48) == 30 assert b.before(46) == 30 assert b.before(45) == 30 assert b.before(44) == 30 assert b.before(42) == 30 assert b.before(38) == 30 assert b.before(36) == 30 assert b.before(34) == 30 assert b.before(33) == 30 assert b.before(32) == 30 assert b.before(30) == 11 assert b.before(11) == 10 assert b.before(10) is None b = c([7]) assert b.after(0) == 7 b = c([8]) assert b.after(0) == 8 b = c([9]) assert b.after(0) == 9 b = c([7]) assert b.before(16) == 7 b = c([8]) assert b.before(16) == 8 b = c([9]) assert b.before(16) == 9 b = c([49]) assert b.after(0) == 49 def test_sortedintset(): test_bit_basics(SortedIntSet) test_len(SortedIntSet) test_union(SortedIntSet) test_intersection(SortedIntSet) test_difference(SortedIntSet) test_copy(SortedIntSet) test_clear(SortedIntSet) test_isdisjoint(SortedIntSet) test_before_after(SortedIntSet) def test_ondisk(): bs = BitSet([10, 11, 30, 50, 80]) st = RamStorage() f = st.create_file("test") size = bs.to_disk(f) f.close() f = st.open_file("test") b = OnDiskBitSet(f, 0, size) assert list(b) == list(bs) assert b.after(0) == 10 assert b.after(10) == 11 assert b.after(80) is None assert b.after(99) is None assert b.before(0) is None assert b.before(99) == 80 assert b.before(80) == 50 assert b.before(10) is None f.seek(0) b = BitSet.from_disk(f, size) assert list(b) == list(bs) Whoosh-2.5.7/tests/test_classify.py0000644000076500000240000001246412254366350017471 0ustar mattstaff00000000000000from __future__ import with_statement from whoosh import analysis, classify, fields, formats, query from 
whoosh.compat import u, text_type from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempIndex domain = [u("A volume that is a signed distance field used for collision calculations. The turbulence is damped near the collision object to prevent particles from passing through."), u("When particles cross the SDF boundary they have their velocities reversed according to the SDF normal and are pushed outside of the SDF."), u("The distance at which the particles start to slow down due to a collision object."), u("There are several different ways to update a particle system in response to an external velocity field. They are broadly categorized as Force, Velocity, and Position updates."), u("Instead of applying a force in the direction of the velocity field, the force is applied relative to the difference between the particle's velocity and the velocity field. This effectively adds an implicit drag that causes the particles to match the velocity field."), u("In Velocity Blend mode, the amount to mix in the field velocity every timestep."), u("In Velocity Blend mode, the amount to add the curlnoise velocity to the particle's velocity. This can be useful in addition to advectbyvolume to layer turbulence on a velocity field."), ] text = u("How do I use a velocity field for particles") def create_index(): analyzer = analysis.StandardAnalyzer() vector_format = formats.Frequency() schema = fields.Schema(path=fields.ID(stored=True), content=fields.TEXT(analyzer=analyzer, vector=vector_format)) ix = RamStorage().create_index(schema) w = ix.writer() from string import ascii_lowercase for letter, content in zip(ascii_lowercase, domain): w.add_document(path=u("/%s") % letter, content=content) w.commit() return ix def test_add_text(): ix = create_index() with ix.reader() as r: exp = classify.Expander(r, "content") exp.add_text(text) assert ([t[0] for t in exp.expanded_terms(3)] == ["particles", "velocity", "field"]) def test_keyterms(): ix = create_index() with ix.searcher() as s: docnum = s.document_number(path="/a") keys = list(s.key_terms([docnum], "content", numterms=3)) assert ([t[0] for t in keys] == [u("collision"), u("calculations"), u("damped")]) def test_keyterms_from_text(): ix = create_index() with ix.searcher() as s: keys = list(s.key_terms_from_text("content", text)) assert [t[0] for t in keys] == ["particles", "velocity", "field"] def test_more_like_this(): docs = [u("alfa bravo charlie delta echo foxtrot golf"), u("delta echo foxtrot golf hotel india juliet"), u("echo foxtrot golf hotel india juliet kilo"), u("foxtrot golf hotel india juliet kilo lima"), u("golf hotel india juliet kilo lima mike"), u("foxtrot golf hotel india alfa bravo charlie")] def _check(schema, **kwargs): ix = RamStorage().create_index(schema) with ix.writer() as w: for i, text in enumerate(docs): w.add_document(id=text_type(i + 1), text=text) with ix.searcher() as s: docnum = s.document_number(id=u("1")) r = s.more_like(docnum, "text", **kwargs) assert [hit["id"] for hit in r] == ["6", "2", "3"] schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT(stored=True)) _check(schema) ana = analysis.StandardAnalyzer() schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT(analyzer=ana, vector=formats.Frequency())) _check(schema) schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) _check(schema, text=docs[0]) def test_more_like(): schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT(stored=True)) with TempIndex(schema, "morelike") as ix: with 
ix.writer() as w: w.add_document(id=u("1"), text=u("alfa bravo charlie")) w.add_document(id=u("2"), text=u("bravo charlie delta")) w.add_document(id=u("3"), text=u("echo")) w.add_document(id=u("4"), text=u("delta echo foxtrot")) w.add_document(id=u("5"), text=u("echo echo echo")) w.add_document(id=u("6"), text=u("foxtrot golf hotel")) w.add_document(id=u("7"), text=u("golf hotel india")) with ix.searcher() as s: docnum = s.document_number(id="3") r = s.more_like(docnum, "text") assert [hit["id"] for hit in r] == ["5", "4"] def test_empty_more_like(): schema = fields.Schema(text=fields.TEXT) with TempIndex(schema, "emptymore") as ix: with ix.searcher() as s: assert s.doc_count() == 0 q = query.Term("a", u("b")) r = s.search(q) assert r.scored_length() == 0 assert r.key_terms("text") == [] ex = classify.Expander(s.reader(), "text") assert ex.expanded_terms(1) == [] Whoosh-2.5.7/tests/test_codecs.py0000644000076500000240000005246212254366350017116 0ustar mattstaff00000000000000from __future__ import with_statement import random from array import array import pytest from whoosh import analysis, fields, formats, query from whoosh.compat import u, b, text_type from whoosh.compat import array_tobytes, xrange from whoosh.codec import default_codec from whoosh.filedb.filestore import RamStorage from whoosh.util.numeric import byte_to_length, length_to_byte from whoosh.util.testing import TempStorage def _make_codec(**kwargs): st = RamStorage() codec = default_codec(**kwargs) seg = codec.new_segment(st, "test") return st, codec, seg class FakeLengths(object): def __init__(self, **lens): self.lens = lens def doc_field_length(self, docnum, fieldname): if fieldname in self.lens: if docnum < len(self.lens[fieldname]): return self.lens[fieldname][docnum] return 1 def test_termkey(): st, codec, seg = _make_codec() tw = codec.field_writer(st, seg) fieldobj = fields.TEXT() tw.start_field("alfa", fieldobj) tw.start_term(b("bravo")) tw.add(0, 1.0, b(""), 3) tw.finish_term() tw.start_term(b('\xc3\xa6\xc3\xaf\xc5\xc3\xba')) tw.add(0, 4.0, b(""), 3) tw.finish_term() tw.finish_field() tw.start_field("text", fieldobj) tw.start_term(b('\xe6\xa5\xe6\xac\xe8\xaa')) tw.add(0, 7.0, b(""), 9) tw.finish_term() tw.finish_field() tw.close() tr = codec.terms_reader(st, seg) assert ("alfa", b("bravo")) in tr assert ("alfa", b('\xc3\xa6\xc3\xaf\xc5\xc3\xba')) in tr assert ("text", b('\xe6\xa5\xe6\xac\xe8\xaa')) in tr tr.close() def test_random_termkeys(): def random_fieldname(): return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20)) def random_btext(): a = array("H", (random.randint(0, 0xd7ff) for _ in xrange(1, 20))) return array_tobytes(a).decode("utf-16") domain = sorted(set([(random_fieldname(), random_btext().encode("utf-8")) for _ in xrange(1000)])) st, codec, seg = _make_codec() fieldobj = fields.TEXT() tw = codec.field_writer(st, seg) # Stupid ultra-low-level hand-adding of postings just to check handling of # random fieldnames and term texts lastfield = None for fieldname, text in domain: if lastfield and fieldname != lastfield: tw.finish_field() lastfield = None if lastfield is None: tw.start_field(fieldname, fieldobj) lastfield = fieldname tw.start_term(text) tw.add(0, 1.0, b(""), 1) tw.finish_term() if lastfield: tw.finish_field() tw.close() tr = codec.terms_reader(st, seg) for term in domain: assert term in tr def test_stored_fields(): codec = default_codec() fieldobj = fields.TEXT(stored=True) with TempStorage("storedfields") as st: seg = codec.new_segment(st, "test") dw = 
codec.per_document_writer(st, seg) dw.start_doc(0) dw.add_field("a", fieldobj, "hello", 1) dw.add_field("b", fieldobj, "there", 1) dw.finish_doc() dw.start_doc(1) dw.add_field("a", fieldobj, "one", 1) dw.add_field("b", fieldobj, "two", 1) dw.add_field("c", fieldobj, "three", 1) dw.finish_doc() dw.start_doc(2) dw.finish_doc() dw.start_doc(3) dw.add_field("a", fieldobj, "alfa", 1) dw.add_field("b", fieldobj, "bravo", 1) dw.finish_doc() dw.close() seg.set_doc_count(4) pdr = codec.per_document_reader(st, seg) assert pdr.doc_count_all() == 4 assert pdr.stored_fields(0) == {"a": "hello", "b": "there"} # Note: access out of order assert pdr.stored_fields(3), {"a": "alfa", "b": "bravo"} assert pdr.stored_fields(1) == {"a": "one", "b": "two", "c": "three"} sfs = list(pdr.all_stored_fields()) assert len(sfs) == 4 assert sfs == [{"a": "hello", "b": "there"}, {"a": "one", "b": "two", "c": "three"}, {}, {"a": "alfa", "b": "bravo"}, ] pdr.close() def test_termindex(): terms = [("a", "alfa"), ("a", "bravo"), ("a", "charlie"), ("a", "delta"), ("b", "able"), ("b", "baker"), ("b", "dog"), ("b", "easy")] st, codec, seg = _make_codec() schema = fields.Schema(a=fields.TEXT, b=fields.TEXT) tw = codec.field_writer(st, seg) postings = ((fname, b(text), 0, i, b("")) for (i, (fname, text)) in enumerate(terms)) tw.add_postings(schema, FakeLengths(), postings) tw.close() tr = codec.terms_reader(st, seg) for i, (fieldname, text) in enumerate(terms): assert (fieldname, b(text)) in tr ti = tr.term_info(fieldname, b(text)) assert ti.weight() == i assert ti.doc_frequency() == 1 def test_w2_block(): from whoosh.codec.whoosh2 import W2Codec st = RamStorage() codec = W2Codec() seg = codec.new_segment(st, "test") schema = fields.Schema(a=fields.TEXT) fw = codec.field_writer(st, seg) # This is a very convoluted, backwards way to get postings into a file but # it was the easiest low-level method available when this test was written # :( fl = FakeLengths(a=[2, 5, 3, 4, 1]) fw.add_postings(schema, fl, [("a", b("b"), 0, 2.0, b("test1")), ("a", b("b"), 1, 5.0, b("test2")), ("a", b("b"), 2, 3.0, b("test3")), ("a", b("b"), 3, 4.0, b("test4")), ("a", b("b"), 4, 1.0, b("test5"))]) fw.close() tr = codec.terms_reader(st, seg) m = tr.matcher("a", b("b"), schema["a"].format) block = m.block block.read_ids() assert block.min_length() == 1 assert block.max_length() == 5 assert block.max_weight() == 5.0 assert block.min_id() == 0 assert block.max_id() == 4 assert list(block.ids) == [0, 1, 2, 3, 4] assert list(block.read_weights()) == [2.0, 5.0, 3.0, 4.0, 1.0] assert list(block.read_values()) == [b("test1"), b("test2"), b("test3"), b("test4"), b("test5")] seg = codec.new_segment(st, "test") fw = codec.field_writer(st, seg) fl = FakeLengths(a=[1, 2, 6, 1, 1, 420]) fw.add_postings(schema, fl, [("a", b("b"), 0, 1.0, ""), ("a", b("b"), 1, 2.0, ""), ("a", b("b"), 2, 12.0, ""), ("a", b("b"), 5, 6.5, "")]) fw.close() def blen(n): return byte_to_length(length_to_byte(n)) tr = codec.terms_reader(st, seg) m = tr.matcher("a", b("b"), schema["a"].format) block = m.block block.read_ids() assert len(block) == 4 assert list(block.ids) == [0, 1, 2, 5] assert list(block.weights) == [1.0, 2.0, 12.0, 6.5] assert block.values is None assert block.min_length() == 1 assert block.max_length() == blen(420) assert block.max_weight() == 12.0 ti = tr.term_info("a", b("b")) assert ti.weight() == 21.5 assert ti.doc_frequency() == 4 assert ti.min_length() == 1 assert ti.max_length() == blen(420) assert ti.max_weight() == 12.0 def test_docwriter_one(): field = 
fields.TEXT(stored=True) st, codec, seg = _make_codec() dw = codec.per_document_writer(st, seg) dw.start_doc(0) dw.add_field("text", field, "Testing one two three", 4) dw.finish_doc() dw.close() seg.set_doc_count(1) pdr = codec.per_document_reader(st, seg) assert pdr.doc_field_length(0, "text") == 4 assert pdr.stored_fields(0) == {"text": "Testing one two three"} def test_docwriter_two(): field = fields.TEXT(stored=True) st, codec, seg = _make_codec() dw = codec.per_document_writer(st, seg) dw.start_doc(0) dw.add_field("title", field, ("a", "b"), 2) dw.add_field("text", field, "Testing one two three", 4) dw.finish_doc() dw.start_doc(1) dw.add_field("title", field, "The second document", 3) dw.add_field("text", field, 500, 1) dw.finish_doc() dw.close() seg.set_doc_count(2) pdr = codec.per_document_reader(st, seg) assert pdr.doc_field_length(0, "title") == 2 assert pdr.doc_field_length(0, "text") == 4 assert pdr.doc_field_length(1, "title") == 3 assert pdr.doc_field_length(1, "text") == 1 assert (pdr.stored_fields(0) == {"title": ("a", "b"), "text": "Testing one two three"}) assert (pdr.stored_fields(1) == {"title": "The second document", "text": 500}) def test_vector(): field = fields.TEXT(vector=True) st, codec, seg = _make_codec() dw = codec.per_document_writer(st, seg) dw.start_doc(0) dw.add_field("title", field, None, 1) dw.add_vector_items("title", field, [(u("alfa"), 1.0, b("t1")), (u("bravo"), 2.0, b("t2"))]) dw.finish_doc() dw.close() seg.set_doc_count(1) pdr = codec.per_document_reader(st, seg) assert pdr.stored_fields(0) == {} m = pdr.vector(0, "title", field.vector) assert m.is_active() ps = [] while m.is_active(): ps.append((m.id(), m.weight(), m.value())) m.next() assert ps == [(u("alfa"), 1.0, b("t1")), (u("bravo"), 2.0, b("t2"))] def test_vector_values(): field = fields.TEXT(vector=formats.Frequency()) st, codec, seg = _make_codec() content = u("alfa bravo charlie alfa") dw = codec.per_document_writer(st, seg) dw.start_doc(0) vals = ((t, w, v) for t, _, w, v in sorted(field.vector.word_values(content, field.analyzer))) dw.add_vector_items("f1", field, vals) dw.finish_doc() dw.close() vr = codec.per_document_reader(st, seg) m = vr.vector(0, "f1", field.vector) assert (list(m.items_as("frequency")) == [("alfa", 2), ("bravo", 1), ("charlie", 1)]) def test_no_lengths(): f1 = fields.ID() st, codec, seg = _make_codec() dw = codec.per_document_writer(st, seg) dw.start_doc(0) dw.add_field("name", f1, None, None) dw.finish_doc() dw.start_doc(1) dw.add_field("name", f1, None, None) dw.finish_doc() dw.start_doc(2) dw.add_field("name", f1, None, None) dw.finish_doc() dw.close() seg.set_doc_count(3) pdr = codec.per_document_reader(st, seg) assert pdr.doc_field_length(0, "name") == 0 assert pdr.doc_field_length(1, "name") == 0 assert pdr.doc_field_length(2, "name") == 0 def test_store_zero(): f1 = fields.ID(stored=True) st, codec, seg = _make_codec() dw = codec.per_document_writer(st, seg) dw.start_doc(0) dw.add_field("name", f1, 0, None) dw.finish_doc() dw.close() seg.set_doc_count(1) sr = codec.per_document_reader(st, seg) assert sr.stored_fields(0) == {"name": 0} def test_fieldwriter_single_term(): field = fields.TEXT() st, codec, seg = _make_codec() fw = codec.field_writer(st, seg) fw.start_field("text", field) fw.start_term(b("alfa")) fw.add(0, 1.5, b("test"), 1) fw.finish_term() fw.finish_field() fw.close() tr = codec.terms_reader(st, seg) assert ("text", b("alfa")) in tr ti = tr.term_info("text", b("alfa")) assert ti.weight() == 1.5 assert ti.doc_frequency() == 1 assert 
ti.min_length() == 1 assert ti.max_length() == 1 assert ti.max_weight() == 1.5 assert ti.min_id() == 0 assert ti.max_id() == 0 def test_fieldwriter_two_terms(): field = fields.TEXT() st, codec, seg = _make_codec() fw = codec.field_writer(st, seg) fw.start_field("text", field) fw.start_term(b("alfa")) fw.add(0, 2.0, b("test1"), 2) fw.add(1, 1.0, b("test2"), 1) fw.finish_term() fw.start_term(b("bravo")) fw.add(0, 3.0, b("test3"), 3) fw.add(2, 2.0, b("test4"), 2) fw.finish_term() fw.finish_field() fw.close() tr = codec.terms_reader(st, seg) assert ("text", b("alfa")) in tr ti = tr.term_info("text", b("alfa")) assert ti.weight() == 3.0 assert ti.doc_frequency() == 2 assert ti.min_length() == 1 assert ti.max_length() == 2 assert ti.max_weight() == 2.0 assert ti.min_id() == 0 assert ti.max_id() == 1 assert ("text", b("bravo")) in tr ti = tr.term_info("text", b("bravo")) assert ti.weight() == 5.0 assert ti.doc_frequency() == 2 assert ti.min_length() == 2 assert ti.max_length() == 3 assert ti.max_weight() == 3.0 assert ti.min_id() == 0 assert ti.max_id() == 2 m = tr.matcher("text", b("bravo"), field.format) assert list(m.all_ids()) == [0, 2] def test_fieldwriter_multiblock(): field = fields.TEXT() st, codec, seg = _make_codec(blocklimit=2) fw = codec.field_writer(st, seg) fw.start_field("text", field) fw.start_term(b("alfa")) fw.add(0, 2.0, b("test1"), 2) fw.add(1, 5.0, b("test2"), 5) fw.add(2, 3.0, b("test3"), 3) fw.add(3, 4.0, b("test4"), 4) fw.add(4, 1.0, b("test5"), 1) fw.finish_term() fw.finish_field() fw.close() tr = codec.terms_reader(st, seg) ti = tr.term_info("text", b("alfa")) assert ti.weight() == 15.0 assert ti.doc_frequency() == 5 assert ti.min_length() == 1 assert ti.max_length() == 5 assert ti.max_weight() == 5.0 assert ti.min_id() == 0 assert ti.max_id() == 4 ps = [] m = tr.matcher("text", b("alfa"), field.format) while m.is_active(): ps.append((m.id(), m.weight(), m.value())) m.next() assert ps == [(0, 2.0, b("test1")), (1, 5.0, b("test2")), (2, 3.0, b("test3")), (3, 4.0, b("test4")), (4, 1.0, b("test5"))] def test_term_values(): field = fields.TEXT(phrase=False) st, codec, seg = _make_codec() content = u("alfa bravo charlie alfa") fw = codec.field_writer(st, seg) fw.start_field("f1", field) for text, freq, weight, val in sorted(field.index(content)): fw.start_term(text) fw.add(0, weight, val, freq) fw.finish_term() fw.finish_field() fw.close() tr = codec.terms_reader(st, seg) ps = [(term, ti.weight(), ti.doc_frequency()) for term, ti in tr.items()] assert ps == [(("f1", b("alfa")), 2.0, 1), (("f1", b("bravo")), 1.0, 1), (("f1", b("charlie")), 1.0, 1)] def test_skip(): _docnums = [1, 3, 12, 34, 43, 67, 68, 102, 145, 212, 283, 291, 412, 900, 905, 1024, 1800, 2048, 15000] st, codec, seg = _make_codec() fieldobj = fields.TEXT() fw = codec.field_writer(st, seg) fw.start_field("f1", fieldobj) fw.start_term(b("test")) for n in _docnums: fw.add(n, 1.0, b(''), None) fw.finish_term() fw.finish_field() fw.close() tr = codec.terms_reader(st, seg) m = tr.matcher("f1", b("test"), fieldobj.format) assert m.id() == 1 m.skip_to(220) assert m.id() == 283 m.skip_to(1) assert m.id() == 283 m.skip_to(1000) assert m.id() == 1024 m.skip_to(1800) assert m.id() == 1800 def test_spelled_field(): field = fields.TEXT(spelling=True) st, codec, seg = _make_codec() fw = codec.field_writer(st, seg) fw.start_field("text", field) fw.start_term(b("special")) fw.add(0, 1.0, b("test1"), 1) fw.finish_term() fw.start_term(b("specific")) fw.add(1, 1.0, b("test2"), 1) fw.finish_term() fw.finish_field() fw.close() gr = 
codec.graph_reader(st, seg) assert gr.has_root("text") cur = gr.cursor("text") strings = list(cur.flatten_strings()) assert type(strings[0]) == text_type assert strings == ["special", "specific"] def test_special_spelled_field(): from whoosh.analysis import StemmingAnalyzer field = fields.TEXT(analyzer=StemmingAnalyzer(), spelling=True) st, codec, seg = _make_codec() fw = codec.field_writer(st, seg) fw.start_field("text", field) fw.start_term(b("special")) fw.add(0, 1.0, b("test1"), 1) fw.finish_term() fw.start_term(b("specific")) fw.add(1, 1.0, b("test2"), 1) fw.finish_term() fw.add_spell_word("text", u("specials")) fw.add_spell_word("text", u("specifically")) fw.finish_field() fw.close() tr = codec.terms_reader(st, seg) assert list(tr.terms()) == [("text", b("special")), ("text", b("specific"))] cur = codec.graph_reader(st, seg).cursor("text") assert list(cur.flatten_strings()) == ["specials", "specifically"] def test_plaintext_codec(): pytest.importorskip("ast") from whoosh.codec.plaintext import PlainTextCodec from whoosh.codec.whoosh3 import W3Codec ana = analysis.StemmingAnalyzer() schema = fields.Schema(a=fields.TEXT(vector=True, sortable=True), b=fields.STORED, c=fields.NUMERIC(stored=True, sortable=True), d=fields.TEXT(analyzer=ana, spelling=True)) st = RamStorage() ix = st.create_index(schema) with ix.writer(codec=W3Codec()) as w: w.add_document(a=u("alfa bravo charlie"), b="hello", c=100, d=u("quelling whining echoing")) w.add_document(a=u("bravo charlie delta"), b=1000, c=200, d=u("rolling timing yelling")) w.add_document(a=u("charlie delta echo"), b=5.5, c=300, d=u("using opening pulling")) w.add_document(a=u("delta echo foxtrot"), b=True, c=-100, d=u("aching selling dipping")) w.add_document(a=u("echo foxtrot india"), b=None, c=-200, d=u("filling going hopping")) with ix.reader() as r: assert r.has_column("a") c = r.column_reader("a") assert c[2] == u("charlie delta echo") w = ix.writer(codec=PlainTextCodec()) w.commit(optimize=True) with ix.searcher() as s: reader = s.reader() r = s.search(query.Term("a", "delta")) assert len(r) == 3 assert [hit["b"] for hit in r] == [1000, 5.5, True] assert (" ".join(s.field_terms("a")) == "alfa bravo charlie delta echo foxtrot india") assert reader.doc_field_length(2, "a") == 3 cfield = schema["c"] assert type(cfield), fields.NUMERIC sortables = list(cfield.sortable_terms(reader, "c")) assert sortables assert ([cfield.from_bytes(t) for t in sortables] == [-200, -100, 100, 200, 300]) assert reader.has_column("a") c = reader.column_reader("a") assert c[2] == u("charlie delta echo") assert reader.has_column("c") c = reader.column_reader("c") assert list(c) == [100, 200, 300, -100, -200] assert s.has_vector(2, "a") v = s.vector(2, "a") assert " ".join(v.all_ids()) == "charlie delta echo" def test_memory_codec(): from whoosh.codec import memory from whoosh.searching import Searcher ana = analysis.StemmingAnalyzer() schema = fields.Schema(a=fields.TEXT(vector=True), b=fields.STORED, c=fields.NUMERIC(stored=True, sortable=True), d=fields.TEXT(analyzer=ana, spelling=True)) codec = memory.MemoryCodec() with codec.writer(schema) as w: w.add_document(a=u("alfa bravo charlie"), b="hello", c=100, d=u("quelling whining echoing")) w.add_document(a=u("bravo charlie delta"), b=1000, c=200, d=u("rolling timing yelling")) w.add_document(a=u("charlie delta echo"), b=5.5, c=300, d=u("using opening pulling")) w.add_document(a=u("delta echo foxtrot"), b=True, c=-100, d=u("aching selling dipping")) w.add_document(a=u("echo foxtrot india"), b=None, c=-200, 
d=u("filling going hopping")) reader = codec.reader(schema) s = Searcher(reader) assert ("a", "delta") in reader q = query.Term("a", "delta") r = s.search(q) assert len(r) == 3 assert [hit["b"] for hit in r] == [1000, 5.5, True] assert (" ".join(s.field_terms("a")) == "alfa bravo charlie delta echo foxtrot india") cfield = schema["c"] c_sortables = cfield.sortable_terms(reader, "c") c_values = [cfield.from_bytes(t) for t in c_sortables] assert c_values, [-200, -100, 100, 200, 300] assert reader.has_column("c") c_values = list(reader.column_reader("c")) assert c_values == [100, 200, 300, -100, -200] assert s.has_vector(2, "a") v = s.vector(2, "a") assert " ".join(v.all_ids()) == "charlie delta echo" assert reader.has_word_graph("d") gr = reader.word_graph("d") assert (" ".join(gr.flatten_strings()) == "aching dipping echoing filling going hopping opening " "pulling quelling rolling selling timing using whining " "yelling") def test_memory_multiwrite(): from whoosh.codec import memory domain = ["alfa bravo charlie delta", "bravo charlie delta echo", "charlie delta echo foxtrot", "delta echo foxtrot india", "echo foxtrot india juliet"] schema = fields.Schema(line=fields.TEXT(stored=True)) codec = memory.MemoryCodec() for line in domain: with codec.writer(schema) as w: w.add_document(line=u(line)) reader = codec.reader(schema) assert [sf["line"] for sf in reader.all_stored_fields()] == domain assert (" ".join(reader.field_terms("line")) == "alfa bravo charlie delta echo foxtrot india juliet") Whoosh-2.5.7/tests/test_collector.py0000644000076500000240000000763112254366350017642 0ustar mattstaff00000000000000from __future__ import with_statement import pytest from whoosh import collectors, fields, query, searching from whoosh.compat import b, u, xrange from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempIndex def test_add(): schema = fields.Schema(id=fields.STORED, text=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, text=u("alfa bravo charlie")) w.add_document(id=2, text=u("alfa bravo delta")) w.add_document(id=3, text=u("alfa charlie echo")) w.commit() with ix.searcher() as s: assert s.doc_frequency("text", u("charlie")) == 2 r = s.search(query.Term("text", u("charlie"))) assert [hit["id"] for hit in r] == [1, 3] assert len(r) == 2 def test_filter_that_matches_no_document(): schema = fields.Schema(id=fields.STORED, text=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, text=u("alfa bravo charlie")) w.add_document(id=2, text=u("alfa bravo delta")) w.commit() with ix.searcher() as s: r = s.search( query.Every(), filter=query.Term("text", u("echo"))) assert [hit["id"] for hit in r] == [] assert len(r) == 0 def test_timelimit(): schema = fields.Schema(text=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() for _ in xrange(50): w.add_document(text=u("alfa")) w.commit() import time from whoosh import collectors, matching class SlowMatcher(matching.WrappingMatcher): def next(self): time.sleep(0.02) self.child.next() class SlowQuery(query.WrappingQuery): def matcher(self, searcher, context=None): return SlowMatcher(self.child.matcher(searcher, context)) with ix.searcher() as s: oq = query.Term("text", u("alfa")) sq = SlowQuery(oq) col = collectors.TimeLimitCollector(s.collector(limit=None), timelimit=0.1) with pytest.raises(searching.TimeLimit): s.search_with_collector(sq, col) col = collectors.TimeLimitCollector(s.collector(limit=40), timelimit=0.1) with 
pytest.raises(collectors.TimeLimit): s.search_with_collector(sq, col) col = collectors.TimeLimitCollector(s.collector(limit=None), timelimit=0.25) try: s.search_with_collector(sq, col) assert False # Shouldn't get here except collectors.TimeLimit: r = col.results() assert r.scored_length() > 0 col = collectors.TimeLimitCollector(s.collector(limit=None), timelimit=0.5) s.search_with_collector(oq, col) assert col.results().runtime < 0.5 @pytest.mark.skipif("not hasattr(__import__('signal'), 'SIGALRM')") def test_timelimit_alarm(): import time from whoosh import matching class SlowMatcher(matching.Matcher): def __init__(self): self._id = 0 def id(self): return self._id def is_active(self): return self._id == 0 def next(self): time.sleep(10) self._id = 1 def score(self): return 1.0 class SlowQuery(query.Query): def matcher(self, searcher, context=None): return SlowMatcher() schema = fields.Schema(text=fields.TEXT) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(text=u("Hello")) with ix.searcher() as s: q = SlowQuery() t = time.time() c = s.collector() c = collectors.TimeLimitCollector(c, 0.2) with pytest.raises(searching.TimeLimit): _ = s.search_with_collector(q, c) assert time.time() - t < 0.5 Whoosh-2.5.7/tests/test_columns.py0000644000076500000240000002225512254366350017333 0ustar mattstaff00000000000000from __future__ import with_statement import inspect, random, sys from whoosh import columns, fields, query from whoosh.codec.whoosh3 import W3Codec from whoosh.compat import b, u, BytesIO, bytes_type, text_type from whoosh.compat import izip, xrange, dumps, loads from whoosh.filedb import compound from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempIndex, TempStorage def test_pickleability(): # Ignore base classes ignore = (columns.Column, columns.WrappedColumn, columns.ListColumn) # Required arguments init_args = {"ClampedNumericColumn": (columns.NumericColumn("B"),), "FixedBytesColumn": (5,), "FixedBytesListColumn": (5,), "NumericColumn": ("i",), "PickleColumn": (columns.VarBytesColumn(),), "StructColumn": ("=if", (0, 0.0)), } coltypes = [c for _, c in inspect.getmembers(columns, inspect.isclass) if issubclass(c, columns.Column) and not c in ignore] for coltype in coltypes: args = init_args.get(coltype.__name__, ()) try: inst = coltype(*args) except TypeError: e = sys.exc_info()[1] raise TypeError("Error instantiating %r: %s" % (coltype, e)) _ = loads(dumps(inst, -1)) def test_multistream(): domain = [("a", "12345"), ("b", "abc"), ("c", "AaBbC"), ("a", "678"), ("c", "cDdEeF"), ("b", "defgh"), ("b", "ijk"), ("c", "fGgHh"), ("a", "9abc")] st = RamStorage() msw = compound.CompoundWriter(st) files = dict((name, msw.create_file(name)) for name in "abc") for name, data in domain: files[name].write(b(data)) f = st.create_file("test") msw.save_as_compound(f) f = st.open_file("test") msr = compound.CompoundStorage(f) assert msr.open_file("a").read() == b("123456789abc") assert msr.open_file("b").read() == b("abcdefghijk") assert msr.open_file("c").read() == b("AaBbCcDdEeFfGgHh") def test_random_multistream(): letters = "abcdefghijklmnopqrstuvwxyz" def randstring(n): s = "".join(random.choice(letters) for _ in xrange(n)) return s.encode("latin1") domain = {} for _ in xrange(100): name = randstring(random.randint(5, 10)) value = randstring(2500) domain[name] = value outfiles = dict((name, BytesIO(value)) for name, value in domain.items()) with TempStorage() as st: msw = compound.CompoundWriter(st, buffersize=1024) mfiles = {} for name in 
domain: mfiles[name] = msw.create_file(name) while outfiles: name = random.choice(list(outfiles.keys())) v = outfiles[name].read(1000) mfiles[name].write(v) if len(v) < 1000: del outfiles[name] f = st.create_file("test") msw.save_as_compound(f) f = st.open_file("test") msr = compound.CompoundStorage(f) for name, value in domain.items(): assert msr.open_file(name).read() == value msr.close() def _rt(c, values, default): # Continuous st = RamStorage() f = st.create_file("test1") f.write(b("hello")) w = c.writer(f) for docnum, v in enumerate(values): w.add(docnum, v) w.finish(len(values)) length = f.tell() - 5 f.close() f = st.open_file("test1") r = c.reader(f, 5, length, len(values)) assert values == list(r) for x in range(len(values)): assert values[x] == r[x] f.close() # Sparse doccount = len(values) * 7 + 15 target = [default] * doccount f = st.create_file("test2") f.write(b("hello")) w = c.writer(f) for docnum, v in izip(xrange(10, doccount, 7), values): target[docnum] = v w.add(docnum, v) w.finish(doccount) length = f.tell() - 5 f.close() f = st.open_file("test2") r = c.reader(f, 5, length, doccount) assert target == list(r) for x in range(doccount): assert target[x] == r[x] lr = r.load() assert target == list(lr) f.close() def test_roundtrip(): _rt(columns.VarBytesColumn(), [b("a"), b("ccc"), b("bbb"), b("e"), b("dd")], b("")) _rt(columns.FixedBytesColumn(5), [b("aaaaa"), b("eeeee"), b("ccccc"), b("bbbbb"), b("eeeee")], b("\x00") * 5) _rt(columns.RefBytesColumn(), [b("a"), b("ccc"), b("bb"), b("ccc"), b("a"), b("bb")], b("")) _rt(columns.RefBytesColumn(3), [b("aaa"), b("bbb"), b("ccc"), b("aaa"), b("bbb"), b("ccc")], b("\x00") * 3) _rt(columns.StructColumn("ifH", (0, 0.0, 0)), [(100, 1.5, 15000), (-100, -5.0, 0), (5820, 6.5, 462), (-57829, -1.5, 6), (0, 0, 0)], (0, 0.0, 0)) numcol = columns.NumericColumn _rt(numcol("b"), [10, -20, 30, -25, 15], 0) _rt(numcol("B"), [10, 20, 30, 25, 15], 0) _rt(numcol("h"), [1000, -2000, 3000, -15000, 32000], 0) _rt(numcol("H"), [1000, 2000, 3000, 15000, 50000], 0) _rt(numcol("i"), [2 ** 16, -(2 ** 20), 2 ** 24, -(2 ** 28), 2 ** 30], 0) _rt(numcol("I"), [2 ** 16, 2 ** 20, 2 ** 24, 2 ** 28, 2 ** 31 & 0xFFFFFFFF], 0) _rt(numcol("q"), [10, -20, 30, -25, 15], 0) _rt(numcol("Q"), [2 ** 35, 2 ** 40, 2 ** 48, 2 ** 52, 2 ** 63], 0) _rt(numcol("f"), [1.5, -2.5, 3.5, -4.5, 1.25], 0) _rt(numcol("d"), [1.5, -2.5, 3.5, -4.5, 1.25], 0) c = columns.BitColumn(compress_at=10) _rt(c, [bool(random.randint(0, 1)) for _ in xrange(70)], False) _rt(c, [bool(random.randint(0, 1)) for _ in xrange(90)], False) c = columns.PickleColumn(columns.VarBytesColumn()) _rt(c, [None, True, False, 100, -7, "hello"], None) def test_multivalue(): schema = fields.Schema(s=fields.TEXT(sortable=True), n=fields.NUMERIC(sortable=True)) ix = RamStorage().create_index(schema) with ix.writer(codec=W3Codec()) as w: w.add_document(s=u("alfa foxtrot charlie").split(), n=[100, 200, 300]) w.add_document(s=u("juliet bravo india").split(), n=[10, 20, 30]) with ix.reader() as r: scr = r.column_reader("s") assert list(scr) == ["alfa", "juliet"] ncr = r.column_reader("n") assert list(ncr) == [100, 10] def test_column_field(): schema = fields.Schema(a=fields.TEXT(sortable=True), b=fields.COLUMN(columns.RefBytesColumn())) with TempIndex(schema, "columnfield") as ix: with ix.writer(codec=W3Codec()) as w: w.add_document(a=u("alfa bravo"), b=b("charlie delta")) w.add_document(a=u("bravo charlie"), b=b("delta echo")) w.add_document(a=u("charlie delta"), b=b("echo foxtrot")) with ix.reader() as r: assert 
r.has_column("a") assert r.has_column("b") cra = r.column_reader("a") assert cra[0] == u("alfa bravo") assert type(cra[0]) == text_type crb = r.column_reader("b") assert crb[0] == b("charlie delta") assert type(crb[0]) == bytes_type def test_column_query(): schema = fields.Schema(id=fields.STORED, a=fields.ID(sortable=True), b=fields.NUMERIC(sortable=True)) with TempIndex(schema, "columnquery") as ix: with ix.writer(codec=W3Codec()) as w: w.add_document(id=1, a=u("alfa"), b=10) w.add_document(id=2, a=u("bravo"), b=20) w.add_document(id=3, a=u("charlie"), b=30) w.add_document(id=4, a=u("delta"), b=40) w.add_document(id=5, a=u("echo"), b=50) w.add_document(id=6, a=u("foxtrot"), b=60) with ix.searcher() as s: def check(q): return [s.stored_fields(docnum)["id"] for docnum in q.docs(s)] q = query.ColumnQuery("a", u("bravo")) assert check(q) == [2] q = query.ColumnQuery("b", 30) assert check(q) == [3] q = query.ColumnQuery("a", lambda v: v != u("delta")) assert check(q) == [1, 2, 3, 5, 6] q = query.ColumnQuery("b", lambda v: v > 30) assert check(q) == [4, 5, 6] def test_ref_switch(): import warnings col = columns.RefBytesColumn() def rw(size): st = RamStorage() f = st.create_file("test") cw = col.writer(f) for i in xrange(size): cw.add(i, hex(i).encode("latin1")) cw.finish(size) length = f.tell() f.close() f = st.open_file("test") cr = col.reader(f, 0, length, size) for i in xrange(size): v = cr[i] # Column ignores additional unique values after 65535 if i <= 65535 - 1: assert v == hex(i).encode("latin1") else: assert v == b('') f.close() rw(255) # warnings.catch_warnings is not available in Python 2.5 if hasattr(warnings, "catch_warnings"): # Column warns on additional unique values after 65535 with warnings.catch_warnings(record=True) as w: # Cause all warnings to always be triggered. 
warnings.simplefilter("always") rw(65537) assert len(w) == 2 assert issubclass(w[-1].category, UserWarning) Whoosh-2.5.7/tests/test_compound.py0000644000076500000240000000331512254366350017473 0ustar mattstaff00000000000000from __future__ import with_statement from whoosh.compat import b from whoosh.filedb.compound import CompoundStorage from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempStorage def _test_simple_compound(st): alist = [1, 2, 3, 5, -5, -4, -3, -2] blist = [1, 12, 67, 8, 2, 1023] clist = [100, -100, 200, -200] with st.create_file("a") as af: for x in alist: af.write_int(x) with st.create_file("b") as bf: for x in blist: bf.write_varint(x) with st.create_file("c") as cf: for x in clist: cf.write_int(x) f = st.create_file("f") CompoundStorage.assemble(f, st, ["a", "b", "c"]) f = CompoundStorage(st.open_file("f")) with f.open_file("a") as af: for x in alist: assert x == af.read_int() assert af.read() == b('') with f.open_file("b") as bf: for x in blist: assert x == bf.read_varint() assert bf.read() == b('') with f.open_file("c") as cf: for x in clist: assert x == cf.read_int() assert cf.read() == b('') def test_simple_compound_mmap(): with TempStorage("compound") as st: assert st.supports_mmap _test_simple_compound(st) def test_simple_compound_nomap(): st = RamStorage() _test_simple_compound(st) #def test_unclosed_mmap(): # with TempStorage("unclosed") as st: # assert st.supports_mmap # with st.create_file("a") as af: # af.write("alfa") # with st.create_file("b") as bf: # bf.write("bravo") # f = st.create_file("f") # CompoundStorage.assemble(f, st, ["a", "b"]) # # f = CompoundStorage(st, "f") Whoosh-2.5.7/tests/test_dateparse.py0000644000076500000240000003763212254366350017630 0ustar mattstaff00000000000000from whoosh.qparser.dateparse import * basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) english = English() def assert_adatetime(at, **kwargs): assert at.__class__ is adatetime for key in adatetime.units: val = getattr(at, key) target = kwargs.get(key) assert val == target, "at.%s=%r not %r in %r" % (key, val, target, at) def assert_timespan(ts, sargs, eargs): assert_adatetime(ts.start, **sargs) def assert_unamb(ts, **kwargs): assert_unamb_span(ts, kwargs, kwargs) def assert_unamb_span(ts, sargs, eargs): startdt = adatetime(**sargs).floor() enddt = adatetime(**eargs).ceil() assert ts.start == startdt, "start %s != %s" % (ts.start, startdt) assert ts.end == enddt, "end %s != %s" % (ts.end, enddt) def assert_datespan(ts, startdate, enddate): assert ts.__class__ is timespan assert ts.start == startdate assert ts.end == enddate # def test_simple_dateparse(t=english.simple): assert_adatetime(t.date_from("2005", basedate), year=2005) assert_adatetime(t.date_from("200505", basedate), year=2005, month=5) assert_adatetime(t.date_from("20050510", basedate), year=2005, month=5, day=10) assert_adatetime(t.date_from("2005051001", basedate), year=2005, month=5, day=10, hour=1) assert_adatetime(t.date_from("200505100108", basedate), year=2005, month=5, day=10, hour=1, minute=8) assert_adatetime(t.date_from("20050510010835", basedate), year=2005, month=5, day=10, hour=1, minute=8, second=35) assert_adatetime(t.date_from("2005-05", basedate), year=2005, month=5) assert_adatetime(t.date_from("2005 05 10", basedate), year=2005, month=5, day=10) assert_adatetime(t.date_from("2005.05.10.01", basedate), year=2005, month=5, day=10, hour=1) assert_adatetime(t.date_from("2005/05/10 01:08", basedate), year=2005, month=5, day=10, hour=1, minute=8) 
assert_adatetime(t.date_from("2005.05.10 01:08:35", basedate), year=2005, month=5, day=10, hour=1, minute=8, second=35) assert t.date_from("2005 02 31", basedate) is None assert t.date_from("2005-13-32", basedate) is None def test_time(t=english.time): assert_adatetime(t.date_from("13:05", basedate), hour=13, minute=5) assert t.date_from("28:91", basedate) is None assert_adatetime(t.date_from("3pm", basedate), hour=15) assert_adatetime(t.date_from("3 pm", basedate), hour=15) assert_adatetime(t.date_from("10pm", basedate), hour=22) assert_adatetime(t.date_from("10 pm", basedate), hour=22) assert_adatetime(t.date_from("3am", basedate), hour=3) assert_adatetime(t.date_from("3:15 am", basedate), hour=3, minute=15) assert_adatetime(t.date_from("5:10pm", basedate), hour=17, minute=10) assert_adatetime(t.date_from("12:45am", basedate), hour=0, minute=45) assert_adatetime(t.date_from("12:45pm", basedate), hour=12, minute=45) assert_adatetime(t.date_from("5:45:05 pm", basedate), hour=17, minute=45, second=5) assert_adatetime(t.date_from("noon", basedate), hour=12, minute=0, second=0, microsecond=0) assert_adatetime(t.date_from("midnight", basedate), hour=0, minute=0, second=0, microsecond=0) assert t.date_from("15 am", basedate) is None assert t.date_from("24:00", basedate) is None assert t.date_from("12:65", basedate) is None def test_dmy(d=english.dmy): assert_adatetime(d.date_from("25 may 2011", basedate), year=2011, month=5, day=25) assert_adatetime(d.date_from("may 2 2011", basedate), year=2011, month=5, day=2) assert_adatetime(d.date_from("2011 25 may", basedate), year=2011, month=5, day=25) assert_adatetime(d.date_from("2011 may 5", basedate), year=2011, month=5, day=5) assert_adatetime(d.date_from("apr", basedate), month=4) assert_adatetime(d.date_from("september", basedate), month=9) assert_adatetime(d.date_from("2001", basedate), year=2001) assert_adatetime(d.date_from("july 2525", basedate), year=2525, month=7) assert_adatetime(d.date_from("nov 30", basedate), month=11, day=30) assert d.date_from("25 2525", basedate) is None assert_adatetime(d.date_from("25 may, 2011", basedate), year=2011, month=5, day=25) assert_adatetime(d.date_from("may 2nd, 2011", basedate), year=2011, month=5, day=2) assert_adatetime(d.date_from("2011, 25 may", basedate), year=2011, month=5, day=25) assert_adatetime(d.date_from("2011, may 5th", basedate), year=2011, month=5, day=5) assert_adatetime(d.date_from("today", basedate), year=2010, month=9, day=20) assert_adatetime(d.date_from("tomorrow", basedate), year=2010, month=9, day=21) assert_adatetime(d.date_from("yesterday", basedate), year=2010, month=9, day=19) assert_adatetime(d.date_from("this month", basedate), year=2010, month=9) assert_adatetime(d.date_from("this year", basedate), year=2010) assert d.date_from("now", basedate) == basedate def test_plustime(rt=english.plusdate): assert rt.date_from("+1hr", basedate) == basedate + timedelta(hours=1) assert rt.date_from("+5mins", basedate) == basedate + timedelta(minutes=5) assert rt.date_from("+20s", basedate) == basedate + timedelta(seconds=20) assert rt.date_from("- 2 h", basedate) == basedate + timedelta(hours=-2) assert rt.date_from("- 25 minutes", basedate) == basedate + timedelta(minutes=-25) assert rt.date_from("-400 secs", basedate) == basedate + timedelta(seconds=-400) assert rt.date_from("+1hr 5m", basedate) == basedate + timedelta(hours=1, minutes=5) assert rt.date_from("-8hr 12m", basedate) == basedate + timedelta(hours=-8, minutes=-12) assert rt.date_from("+1hr 5s", basedate) == basedate + 
timedelta(hours=1, seconds=5) assert rt.date_from("+1hr 12m 5s", basedate) == basedate + timedelta(hours=1, minutes=12, seconds=5) assert rt.date_from("-1hr 5s", basedate) == basedate + timedelta(hours=-1, seconds=-5) assert rt.date_from("-1hr 12m 5s", basedate) == basedate + timedelta(hours=-1, minutes=-12, seconds=-5) def test_relative_days(): # "next monday" on monday assert relative_days(0, 0, 1) == 7 # "last monday" on monday assert relative_days(0, 0, -1) == -7 # "next tuesday" on wednesday assert relative_days(2, 1, 1) == 6 # "last tuesday" on wednesay assert relative_days(2, 1, -1) == -1 # "last monday" on sunday assert relative_days(6, 0, -1) == -6 # "next monday" on sunday assert relative_days(6, 0, 1) == 1 # "next wednesday" on tuesday assert relative_days(1, 2, 1) == 1 # "last wednesday" on tuesday assert relative_days(1, 2, -1) == -6 # "last wednesday" on thursday assert relative_days(3, 2, -1) == -1 # "next wednesday" on thursday assert relative_days(3, 2, 1) == 6 # "last wednesday" on tuesday assert relative_days(1, 2, -1) == -6 # "next wednesday" on tuesday assert relative_days(1, 2, 1) == 1 def test_dayname(p=english.dayname): assert_adatetime(p.date_from("next tuesday", basedate), year=2010, month=9, day=21) assert_adatetime(p.date_from("last tuesday", basedate), year=2010, month=9, day=14) assert_adatetime(p.date_from("next sunday", basedate), year=2010, month=9, day=26) assert_adatetime(p.date_from("last sun", basedate), year=2010, month=9, day=19) assert_adatetime(p.date_from("next th", basedate), year=2010, month=9, day=23) def test_reldate(p=english.plusdate): assert p.date_from("+1y", basedate) == basedate + relativedelta(years=1) assert p.date_from("+2mo", basedate) == basedate + relativedelta(months=2) assert p.date_from("+3w", basedate) == basedate + relativedelta(weeks=3) assert p.date_from("+5d", basedate) == basedate + relativedelta(days=5) assert p.date_from("+5days", basedate) == basedate + relativedelta(days=5) assert p.date_from("-6yr", basedate) == basedate + relativedelta(years=-6) assert p.date_from("- 7 mons", basedate) == basedate + relativedelta(months=-7) assert p.date_from("-8 wks", basedate) == basedate + relativedelta(weeks=-8) assert p.date_from("- 9 dy", basedate) == basedate + relativedelta(days=-9) assert p.date_from("+1y 12mo 400d", basedate) == basedate + relativedelta(years=1, months=12, days=400) assert p.date_from("-7mo 8d", basedate) == basedate + relativedelta(months=-7, days=-8) assert p.date_from("+5wks 2d", basedate) == basedate + relativedelta(weeks=5, days=2) assert p.date_from("-1y 1w", basedate) == basedate + relativedelta(years=-1, weeks=-1) assert p.date_from("+1y 2d 5h 12s", basedate) == basedate + relativedelta(years=1, days=2, hours=5, seconds=12) def test_bundle_subs(p=english.bundle): test_time(p) test_dmy(p) test_plustime(p) test_dayname(p) test_reldate(p) def test_bundle(p=english.bundle): assert_adatetime(p.date_from("mar 29 1972 2:45am", basedate), year=1972, month=3, day=29, hour=2, minute=45) assert_adatetime(p.date_from("16:10:45 14 February 2005", basedate), year=2005, month=2, day=14, hour=16, minute=10, second=45) assert_adatetime(p.date_from("1985 sept 12 12:01", basedate), year=1985, month=9, day=12, hour=12, minute=1) assert_adatetime(p.date_from("5pm 21st oct 2005", basedate), year=2005, month=10, day=21, hour=17) assert_adatetime(p.date_from("5:59:59pm next thur", basedate), year=2010, month=9, day=23, hour=17, minute=59, second=59) def test_ranges(p=english.torange): assert_timespan(p.date_from("last 
tuesday to next tuesday", basedate), dict(year=2010, month=9, day=14), dict(year=2010, month=9, day=21)) assert_timespan(p.date_from("last monday to dec 25", basedate), dict(year=2010, month=9, day=13), dict(year=None, month=12, day=25)) assert_timespan(p.date_from("oct 25 to feb 14", basedate), dict(year=None, month=10, day=25), dict(year=None, month=2, day=14)) assert_timespan(p.date_from("3am oct 12 to 5pm", basedate), dict(year=None, month=10, day=12, hour=3), dict(year=None, month=None, day=None, hour=17)) assert_timespan(p.date_from("3am feb 12 to 5pm today", basedate), dict(year=None, month=2, day=12, hour=3), dict(year=2010, month=9, day=20, hour=17)) assert_timespan(p.date_from("feb to oct", basedate), dict(year=None, month=2), dict(year=None, month=10)) assert_timespan(p.date_from("oct 25 2005 11am to 5pm tomorrow", basedate), dict(year=2005, month=10, day=25, hour=11), dict(year=2010, month=9, day=21, hour=17)) assert_timespan(p.date_from("oct 5 2005 to november 20", basedate), dict(year=2005, month=10, day=5), dict(year=None, month=11, day=20)) assert_timespan(p.date_from("2007 to 2010", basedate), dict(year=2007, month=None, day=None), dict(year=2010, month=None, day=None)) assert_timespan(p.date_from("2007 to oct 12", basedate), dict(year=2007, month=None, day=None), dict(year=None, month=10, day=12)) assert_datespan(p.date_from("-2d to +1w", basedate), basedate + relativedelta(days=-2), basedate + relativedelta(weeks=1)) def test_all(): p = english.all test_bundle_subs(p) test_bundle(p) test_ranges(p) def test_final_dates(p=english): assert_unamb(p.date_from("5:10pm", basedate), year=2010, month=9, day=20, hour=17, minute=10) assert p.date_from("may 32 2005", basedate) is None assert p.date_from("2005 may 32", basedate) is None assert p.date_from("2005-13-32", basedate) is None def test_final_ranges(p=english): assert_unamb_span(p.date_from("feb to nov", basedate), dict(year=2010, month=2), dict(year=2010, month=11)) # 2005 to 10 oct 2009 -> jan 1 2005 to oct 31 2009 assert_unamb_span(p.date_from("2005 to 10 oct 2009", basedate), dict(year=2005), dict(year=2009, month=10, day=10)) # jan 12 to oct 10 2009 -> jan 12 2009 to oct 10 2009 assert_unamb_span(p.date_from("jan 12 to oct 10 2009", basedate), dict(year=2009, month=1, day=12), dict(year=2009, month=10, day=10)) # jan to oct 2009 -> jan 1 2009 to oct 31 2009 assert_unamb_span(p.date_from("jan to oct 2009", basedate), dict(year=2009, month=1), dict(year=2009, month=10, day=31)) # mar 2005 to oct -> mar 1 2005 to oct 31 basedate.year assert_unamb_span(p.date_from("mar 2005 to oct", basedate), dict(year=2005, month=3), dict(year=2010, month=10, day=31)) # jan 10 to jan 25 -> jan 10 basedate.year to jan 25 basedate.year assert_unamb_span(p.date_from("jan 10 to jan 25", basedate), dict(year=2010, month=1, day=10), dict(year=2010, month=1, day=25)) # jan 2005 to feb 2009 -> jan 1 2005 to feb 28 2009 assert_unamb_span(p.date_from("jan 2005 to feb 2009", basedate), dict(year=2005, month=1), dict(year=2009, month=2)) # jan 5000 to mar -> jan 1 5000 to mar 5000 assert_unamb_span(p.date_from("jan 5000 to mar", basedate), dict(year=5000, month=1), dict(year=5000, month=3)) # jun 5000 to jan -> jun 1 5000 to jan 31 5001 assert_unamb_span(p.date_from("jun 5000 to jan", basedate), dict(year=5000, month=6), dict(year=5001, month=1)) # oct 2010 to feb -> oct 1 2010 to feb 28 2011 assert_unamb_span(p.date_from("oct 2010 to feb", basedate), dict(year=2010, month=10), dict(year=2011, month=2)) assert_unamb_span(p.date_from("5pm to 3am", 
basedate), dict(year=2010, month=9, day=20, hour=17), dict(year=2010, month=9, day=21, hour=3)) assert_unamb_span(p.date_from("5am to 3 am tomorrow", basedate), dict(year=2010, month=9, day=20, hour=5), dict(year=2010, month=9, day=21, hour=3)) assert_unamb_span(p.date_from("3am to 5 pm tomorrow", basedate), dict(year=2010, month=9, day=21, hour=3), dict(year=2010, month=9, day=21, hour=17)) assert_unamb_span(p.date_from("-2hrs to +20min", basedate), dict(year=2010, month=9, day=20, hour=13, minute=16, second=6, microsecond=454000), dict(year=2010, month=9, day=20, hour=15, minute=36, second=6, microsecond=454000)) # Swap assert_unamb_span(p.date_from("oct 25 2009 to feb 14 2008", basedate), dict(year=2008, month=2, day=14), dict(year=2009, month=10, day=25)) assert_unamb_span(p.date_from("oct 25 5000 to tomorrow", basedate), dict(year=2010, month=9, day=21), dict(year=5000, month=10, day=25)) Whoosh-2.5.7/tests/test_dawg.py0000644000076500000240000003055512254366350016577 0ustar mattstaff00000000000000from __future__ import with_statement import pytest import random from array import array from whoosh.automata import fst from whoosh.compat import b, u, xrange, array_tobytes from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempStorage def gwrite(keys, st=None): st = st or RamStorage() f = st.create_file("test") gw = fst.GraphWriter(f) gw.start_field("_") for key in keys: gw.insert(key) gw.finish_field() gw.close() return st def greader(st): return fst.GraphReader(st.open_file("test")) def enlist(string): return string.split() # def test_empty_fieldname(): gw = fst.GraphWriter(RamStorage().create_file("test")) with pytest.raises(ValueError): gw.start_field("") with pytest.raises(ValueError): gw.start_field(None) with pytest.raises(ValueError): gw.start_field(0) def test_empty_key(): gw = fst.GraphWriter(RamStorage().create_file("test")) gw.start_field("_") with pytest.raises(KeyError): gw.insert(b("")) with pytest.raises(KeyError): gw.insert("") with pytest.raises(KeyError): gw.insert(u("")) with pytest.raises(KeyError): gw.insert([]) def test_keys_out_of_order(): f = RamStorage().create_file("test") gw = fst.GraphWriter(f) gw.start_field("test") gw.insert("alfa") with pytest.raises(KeyError): gw.insert("abba") def test_duplicate_keys(): st = gwrite(enlist("alfa bravo bravo bravo charlie")) cur = fst.Cursor(greader(st)) assert list(cur.flatten_strings()) == ["alfa", "bravo", "charlie"] def test_inactive_raise(): st = gwrite(enlist("alfa bravo charlie")) cur = fst.Cursor(greader(st)) while cur.is_active(): cur.next_arc() pytest.raises(fst.InactiveCursor, cur.label) pytest.raises(fst.InactiveCursor, cur.prefix) pytest.raises(fst.InactiveCursor, cur.prefix_bytes) pytest.raises(fst.InactiveCursor, list, cur.peek_key()) pytest.raises(fst.InactiveCursor, cur.peek_key_bytes) pytest.raises(fst.InactiveCursor, cur.stopped) pytest.raises(fst.InactiveCursor, cur.value) pytest.raises(fst.InactiveCursor, cur.accept) pytest.raises(fst.InactiveCursor, cur.at_last_arc) pytest.raises(fst.InactiveCursor, cur.next_arc) pytest.raises(fst.InactiveCursor, cur.follow) pytest.raises(fst.InactiveCursor, cur.switch_to, b("a")) pytest.raises(fst.InactiveCursor, cur.skip_to, b("a")) pytest.raises(fst.InactiveCursor, list, cur.flatten()) pytest.raises(fst.InactiveCursor, list, cur.flatten_v()) pytest.raises(fst.InactiveCursor, list, cur.flatten_strings()) pytest.raises(fst.InactiveCursor, cur.find_path, b("a")) def test_types(): st = RamStorage() types = ((fst.IntValues, 100, 0), 
(fst.BytesValues, b('abc'), b('')), (fst.ArrayValues("i"), array("i", [0, 123, 42]), array("i")), (fst.IntListValues, [0, 6, 97], [])) for t, v, z in types: assert t.common(None, v) is None assert t.common(v, None) is None assert t.common(None, None) is None assert t.subtract(v, None) == v assert t.subtract(None, v) is None assert t.subtract(None, None) is None assert t.add(v, None) == v assert t.add(None, v) == v assert t.add(None, None) is None f = st.create_file("test") t.write(f, v) t.write(f, z) f.close() f = st.open_file("test") assert t.read(f) == v assert t.read(f) == z assert fst.IntValues.common(100, 20) == 20 assert fst.IntValues.add(20, 80) == 100 assert fst.IntValues.subtract(100, 80) == 20 assert fst.BytesValues.common(b("abc"), b("abc")) == b("abc") assert fst.BytesValues.common(b("abcde"), b("abfgh")) == b("ab") assert fst.BytesValues.common(b("abcde"), b("ab")) == b("ab") assert fst.BytesValues.common(b("ab"), b("abcde")) == b("ab") assert fst.BytesValues.common(None, b("abcde")) is None assert fst.BytesValues.common(b("ab"), None) is None a1 = array("i", [0, 12, 123, 42]) a2 = array("i", [0, 12, 420]) cm = array("i", [0, 12]) assert fst.ArrayValues.common(a1, a1) == a1 assert fst.ArrayValues.common(a1, a2) == cm assert fst.ArrayValues.common(a2, a1) == cm assert fst.ArrayValues.common(None, a1) is None assert fst.ArrayValues.common(a2, None) is None def _fst_roundtrip(domain, t): with TempStorage() as st: f = st.create_file("test") gw = fst.GraphWriter(f, vtype=t) gw.start_field("_") for key, value in domain: gw.insert(key, value) gw.finish_field() gw.close() f = st.open_file("test") gr = fst.GraphReader(f, vtype=t) cur = fst.Cursor(gr) assert list(cur.flatten_v()) == domain f.close() def test_fst_int(): domain = [(b("aaab"), 0), (b("aabc"), 12), (b("abcc"), 23), (b("bcab"), 30), (b("bcbc"), 31), (b("caaa"), 70), (b("cbba"), 80), (b("ccca"), 101)] _fst_roundtrip(domain, fst.IntValues) def test_fst_bytes(): domain = [(b("aaab"), b("000")), (b("aabc"), b("001")), (b("abcc"), b("010")), (b("bcab"), b("011")), (b("bcbc"), b("100")), (b("caaa"), b("101")), (b("cbba"), b("110")), (b("ccca"), b("111"))] _fst_roundtrip(domain, fst.BytesValues) def test_fst_array(): domain = [(b("000"), array("i", [10, 231, 36, 40])), (b("001"), array("i", [1, 22, 12, 15])), (b("010"), array("i", [18, 16, 18, 20])), (b("011"), array("i", [52, 3, 4, 5])), (b("100"), array("i", [353, 4, 56, 62])), (b("101"), array("i", [3, 42, 5, 6])), (b("110"), array("i", [894, 9, 101, 11])), (b("111"), array("i", [1030, 200, 1000, 2000])), ] _fst_roundtrip(domain, fst.ArrayValues("i")) def test_fst_intlist(): domain = [(b("000"), [1, 2, 3, 4]), (b("001"), [1, 2, 12, 15]), (b("010"), [1, 16, 18, 20]), (b("011"), [2, 3, 4, 5]), (b("100"), [3, 4, 5, 6]), (b("101"), [3, 4, 5, 6]), (b("110"), [8, 9, 10, 11]), (b("111"), [100, 200, 1000, 2000]), ] _fst_roundtrip(domain, fst.IntListValues) def test_fst_nones(): domain = [(b("000"), [1, 2, 3, 4]), (b("001"), None), (b("010"), [1, 16, 18, 20]), (b("011"), None), (b("100"), [3, 4, 5, 6]), (b("101"), None), (b("110"), [8, 9, 10, 11]), (b("111"), None), ] _fst_roundtrip(domain, fst.IntListValues) def test_fst_accept(): domain = [(b("a"), [1, 2, 3, 4]), (b("aa"), [1, 2, 12, 15]), (b("aaa"), [1, 16, 18, 20]), (b("aaaa"), [2, 3, 4, 5]), (b("b"), [3, 4, 5, 6]), (b("bb"), [3, 4, 5, 6]), (b("bbb"), [8, 9, 10, 11]), (b("bbbb"), [100, 200, 1000, 2000]), ] _fst_roundtrip(domain, fst.IntListValues) def test_words(): words = enlist("alfa alpaca amtrak bellow fellow fiona zebulon") with 
TempStorage() as st: gwrite(words, st) gr = greader(st) cur = fst.Cursor(gr) assert list(cur.flatten_strings()) == words gr.close() def test_random(): def randstring(): length = random.randint(1, 5) a = array("B", (random.randint(0, 255) for _ in xrange(length))) return array_tobytes(a) keys = sorted(randstring() for _ in xrange(100)) with TempStorage() as st: gwrite(keys, st) gr = greader(st) cur = fst.Cursor(gr) s1 = cur.flatten() s2 = sorted(set(keys)) for i, (k1, k2) in enumerate(zip(s1, s2)): assert k1 == k2, "%s: %r != %r" % (i, k1, k2) sample = list(keys) random.shuffle(sample) for key in sample: cur.reset() cur.find_path(key) assert cur.prefix_bytes() == key gr.close() def test_shared_suffix(): st = gwrite(enlist("blowing blue glowing")) gr = greader(st) cur1 = fst.Cursor(gr) cur2 = fst.Cursor(gr) cur1.find_path(b("blo")) cur2.find_path(b("glo")) assert cur1.stack[-1].target == cur2.stack[-1].target def test_fields(): with TempStorage() as st: f = st.create_file("test") gw = fst.GraphWriter(f) gw.start_field("f1") gw.insert("a") gw.insert("aa") gw.insert("ab") gw.finish_field() gw.start_field("f2") gw.insert("ba") gw.insert("baa") gw.insert("bab") gw.close() gr = fst.GraphReader(st.open_file("test")) cur1 = fst.Cursor(gr, gr.root("f1")) cur2 = fst.Cursor(gr, gr.root("f2")) assert list(cur1.flatten_strings()) == ["a", "aa", "ab"] assert list(cur2.flatten_strings()) == ["ba", "baa", "bab"] gr.close() def test_within(): with TempStorage() as st: gwrite(enlist("0 00 000 001 01 010 011 1 10 100 101 11 110 111"), st) gr = greader(st) s = set(fst.within(gr, "01", k=1)) gr.close() assert s == set(["0", "00", "01", "011", "010", "001", "10", "101", "1", "11"]) def test_within_match(): st = gwrite(enlist("abc def ghi")) gr = greader(st) assert set(fst.within(gr, "def")) == set(["def"]) def test_within_insert(): st = gwrite(enlist("00 01 10 11")) gr = greader(st) s = set(fst.within(gr, "0")) assert s == set(["00", "01", "10"]) def test_within_delete(): st = gwrite(enlist("abc def ghi")) gr = greader(st) assert set(fst.within(gr, "df")) == set(["def"]) st = gwrite(enlist("0")) gr = greader(st) assert list(fst.within(gr, "01")) == ["0"] def test_within_replace(): st = gwrite(enlist("abc def ghi")) gr = greader(st) assert set(fst.within(gr, "dez")) == set(["def"]) st = gwrite(enlist("00 01 10 11")) gr = greader(st) s = set(fst.within(gr, "00")) assert s == set(["00", "10", "01"]) def test_within_transpose(): st = gwrite(enlist("abc def ghi")) gr = greader(st) s = set(fst.within(gr, "dfe")) assert s == set(["def"]) def test_within_k2(): st = gwrite(enlist("abc bac cba")) gr = greader(st) s = set(fst.within(gr, "cb", k=2)) assert s == set(["abc", "cba"]) def test_within_prefix(): st = gwrite(enlist("aabc aadc babc badc")) gr = greader(st) s = set(fst.within(gr, "aaxc", prefix=2)) assert s == set(["aabc", "aadc"]) def test_skip(): st = gwrite(enlist("abcd abfg cdqr1 cdqr12 cdxy wxyz")) gr = greader(st) cur = gr.cursor() while not cur.stopped(): cur.follow() assert cur.prefix_bytes() == b("abcd") assert cur.accept() cur = gr.cursor() while not cur.stopped(): cur.follow() assert cur.prefix_bytes() == b("abcd") cur.skip_to(b("cdaa")) assert cur.peek_key_bytes() == b("cdqr1") assert cur.prefix_bytes() == b("cdq") cur = gr.cursor() while not cur.stopped(): cur.follow() cur.skip_to(b("z")) assert not cur.is_active() def test_insert_bytes(): # This test is only meaningful on Python 3 domain = [b("alfa"), b("bravo"), b("charlie")] st = RamStorage() gw = fst.GraphWriter(st.create_file("test")) 
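    # The write-then-read pattern used throughout this module, shown as a
    # minimal sketch built only from the gwrite/greader/enlist helpers defined
    # near the top of this file (no names are assumed beyond those helpers and
    # fst.Cursor, which the surrounding tests already use):
    #
    #     st = gwrite(enlist("alfa bravo charlie"))
    #     cur = fst.Cursor(greader(st))
    #     assert list(cur.flatten_strings()) == ["alfa", "bravo", "charlie"]
    #
    # Keys are expected in sorted order (see test_keys_out_of_order) and
    # duplicate keys are collapsed (see test_duplicate_keys).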
gw.start_field("test") for key in domain: gw.insert(key) gw.close() cur = fst.GraphReader(st.open_file("test")).cursor() assert list(cur.flatten()) == domain def test_insert_unicode(): domain = [u("\u280b\u2817\u2801\u281d\u2809\u2811"), u("\u65e5\u672c"), u("\uc774\uc124\ud76c"), ] st = RamStorage() gw = fst.GraphWriter(st.create_file("test")) gw.start_field("test") for key in domain: gw.insert(key) gw.close() cur = fst.GraphReader(st.open_file("test")).cursor() assert list(cur.flatten_strings()) == domain def test_within_unicode(): domain = [u("\u280b\u2817\u2801\u281d\u2809\u2811"), u("\u65e5\u672c"), u("\uc774\uc124\ud76c"), ] st = RamStorage() gw = fst.GraphWriter(st.create_file("test")) gw.start_field("test") for key in domain: gw.insert(key) gw.close() gr = fst.GraphReader(st.open_file("test")) s = list(fst.within(gr, u("\uc774.\ud76c"))) assert s == [u("\uc774\uc124\ud76c")] Whoosh-2.5.7/tests/test_fields.py0000644000076500000240000004504412254366350017122 0ustar mattstaff00000000000000from __future__ import with_statement from datetime import datetime, timedelta import pytest from whoosh import fields, qparser, query from whoosh.compat import long_type, u, b, xrange from whoosh.filedb.filestore import RamStorage from whoosh.util import times def test_schema_eq(): a = fields.Schema() b = fields.Schema() assert a == b a = fields.Schema(id=fields.ID) b = a.copy() assert a["id"] == b["id"] assert a == b c = fields.Schema(id=fields.TEXT) assert a != c def test_creation1(): s = fields.Schema() s.add("content", fields.TEXT(phrase=True)) s.add("title", fields.TEXT(stored=True)) s.add("path", fields.ID(stored=True)) s.add("tags", fields.KEYWORD(stored=True)) s.add("quick", fields.NGRAM) s.add("note", fields.STORED) assert s.names() == ["content", "note", "path", "quick", "tags", "title"] assert "content" in s assert "buzz" not in s assert isinstance(s["tags"], fields.KEYWORD) def test_creation2(): s = fields.Schema(a=fields.ID(stored=True), b=fields.ID, c=fields.KEYWORD(scorable=True)) assert s.names() == ["a", "b", "c"] assert "a" in s assert "b" in s assert "c" in s def test_declarative(): class MySchema(fields.SchemaClass): content = fields.TEXT title = fields.TEXT path = fields.ID date = fields.DATETIME ix = RamStorage().create_index(MySchema) assert ix.schema.names() == ["content", "date", "path", "title"] ix = RamStorage().create_index(MySchema()) assert ix.schema.names() == ["content", "date", "path", "title"] with pytest.raises(fields.FieldConfigurationError): RamStorage().create_index(object()) def test_declarative_inherit(): class Parent(fields.SchemaClass): path = fields.ID date = fields.DATETIME class Child(Parent): content = fields.TEXT class Grandchild(Child): title = fields.TEXT s = Grandchild() assert s.names() == ["content", "date", "path", "title"] def test_badnames(): s = fields.Schema() with pytest.raises(fields.FieldConfigurationError): s.add("_test", fields.ID) with pytest.raises(fields.FieldConfigurationError): s.add("a f", fields.ID) #def test_numeric_support(): # intf = fields.NUMERIC(int, shift_step=0) # longf = fields.NUMERIC(int, bits=64, shift_step=0) # floatf = fields.NUMERIC(float, shift_step=0) # # def roundtrip(obj, num): # assert obj.from_bytes(obj.to_bytes(num)), num) # # roundtrip(intf, 0) # roundtrip(intf, 12345) # roundtrip(intf, -12345) # roundtrip(longf, 0) # roundtrip(longf, 85020450482) # roundtrip(longf, -85020450482) # roundtrip(floatf, 0) # roundtrip(floatf, 582.592) # roundtrip(floatf, -582.592) # roundtrip(floatf, -99.42) # # from random 
import shuffle # # def roundtrip_sort(obj, start, end, step): # count = start # rng = [] # while count < end: # rng.append(count) # count += step # # scrabled = list(rng) # shuffle(scrabled) # round = [obj.from_text(t) for t # in sorted([obj.to_text(n) for n in scrabled])] # assert round, rng) # # roundtrip_sort(intf, -100, 100, 1) # roundtrip_sort(longf, -58902, 58249, 43) # roundtrip_sort(floatf, -99.42, 99.83, 2.38) def test_index_numeric(): schema = fields.Schema(a=fields.NUMERIC(int, 32, signed=False), b=fields.NUMERIC(int, 32, signed=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(a=1, b=1) with ix.searcher() as s: assert list(s.lexicon("a")) == \ [b('\x00\x00\x00\x00\x01'), b('\x04\x00\x00\x00\x00'), b('\x08\x00\x00\x00\x00'), b('\x0c\x00\x00\x00\x00'), b('\x10\x00\x00\x00\x00'), b('\x14\x00\x00\x00\x00'), b('\x18\x00\x00\x00\x00'), b('\x1c\x00\x00\x00\x00')] assert list(s.lexicon("b")) == \ [b('\x00\x80\x00\x00\x01'), b('\x04\x08\x00\x00\x00'), b('\x08\x00\x80\x00\x00'), b('\x0c\x00\x08\x00\x00'), b('\x10\x00\x00\x80\x00'), b('\x14\x00\x00\x08\x00'), b('\x18\x00\x00\x00\x80'), b('\x1c\x00\x00\x00\x08')] def test_numeric(): schema = fields.Schema(id=fields.ID(stored=True), integer=fields.NUMERIC(int), floating=fields.NUMERIC(float)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=u("a"), integer=5820, floating=1.2) w.add_document(id=u("b"), integer=22, floating=2.3) w.add_document(id=u("c"), integer=78, floating=3.4) w.add_document(id=u("d"), integer=13, floating=4.5) w.add_document(id=u("e"), integer=9, floating=5.6) w.commit() with ix.searcher() as s: qp = qparser.QueryParser("integer", schema) q = qp.parse(u("5820")) r = s.search(q) assert len(r) == 1 assert r[0]["id"] == "a" with ix.searcher() as s: r = s.search(qp.parse("floating:4.5")) assert len(r) == 1 assert r[0]["id"] == "d" q = qp.parse("integer:*") assert q.__class__ == query.Every assert q.field() == "integer" q = qp.parse("integer:5?6") assert q == query.NullQuery def test_decimal_numeric(): from decimal import Decimal f = fields.NUMERIC(int, decimal_places=4) schema = fields.Schema(id=fields.ID(stored=True), deci=f) ix = RamStorage().create_index(schema) # assert f.from_text(f.to_text(Decimal("123.56"))), Decimal("123.56")) w = ix.writer() w.add_document(id=u("a"), deci=Decimal("123.56")) w.add_document(id=u("b"), deci=Decimal("0.536255")) w.add_document(id=u("c"), deci=Decimal("2.5255")) w.add_document(id=u("d"), deci=Decimal("58")) w.commit() with ix.searcher() as s: qp = qparser.QueryParser("deci", schema) q = qp.parse(u("123.56")) r = s.search(q) assert len(r) == 1 assert r[0]["id"] == "a" r = s.search(qp.parse(u("0.536255"))) assert len(r) == 1 assert r[0]["id"] == "b" def test_numeric_parsing(): schema = fields.Schema(id=fields.ID(stored=True), number=fields.NUMERIC) qp = qparser.QueryParser("number", schema) q = qp.parse(u("[10 to *]")) assert q == query.NullQuery q = qp.parse(u("[to 400]")) assert q.__class__ is query.NumericRange assert q.start is None assert q.end == 400 q = qp.parse(u("[10 to]")) assert q.__class__ is query.NumericRange assert q.start == 10 assert q.end is None q = qp.parse(u("[10 to 400]")) assert q.__class__ is query.NumericRange assert q.start == 10 assert q.end == 400 def test_numeric_ranges(): schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC) ix = RamStorage().create_index(schema) w = ix.writer() for i in xrange(400): w.add_document(id=i, num=i) w.commit() with ix.searcher() as s: qp = qparser.QueryParser("num", 
schema) def check(qs, target): q = qp.parse(qs) result = [s.stored_fields(d)["id"] for d in q.docs(s)] assert result == target # Note that range() is always inclusive-exclusive check("[10 to 390]", list(range(10, 390 + 1))) check("[100 to]", list(range(100, 400))) check("[to 350]", list(range(0, 350 + 1))) check("[16 to 255]", list(range(16, 255 + 1))) check("{10 to 390]", list(range(11, 390 + 1))) check("[10 to 390}", list(range(10, 390))) check("{10 to 390}", list(range(11, 390))) check("{16 to 255}", list(range(17, 255))) def test_numeric_ranges_unsigned(): values = [1, 10, 100, 1000, 2, 20, 200, 2000, 9, 90, 900, 9000] schema = fields.Schema(num2=fields.NUMERIC(stored=True, signed=False)) ix = RamStorage().create_index(schema) with ix.writer() as w: for v in values: w.add_document(num2=v) with ix.searcher() as s: q = query.NumericRange("num2", 55, None, True, False) r = s.search(q, limit=None) for hit in r: assert int(hit["num2"]) >= 55 def test_decimal_ranges(): from decimal import Decimal schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC(int, decimal_places=2)) ix = RamStorage().create_index(schema) w = ix.writer() count = Decimal("0.0") inc = Decimal("0.2") for _ in xrange(500): w.add_document(id=str(count), num=count) count += inc w.commit() with ix.searcher() as s: qp = qparser.QueryParser("num", schema) def check(qs, start, end): q = qp.parse(qs) result = [s.stored_fields(d)["id"] for d in q.docs(s)] target = [] count = Decimal(start) limit = Decimal(end) while count <= limit: target.append(str(count)) count += inc assert result == target check("[10.2 to 80.8]", "10.2", "80.8") check("{10.2 to 80.8]", "10.4", "80.8") check("[10.2 to 80.8}", "10.2", "80.6") check("{10.2 to 80.8}", "10.4", "80.6") def test_numeric_errors(): f = fields.NUMERIC(int, bits=16, signed=True) schema = fields.Schema(f=f) with pytest.raises(ValueError): list(f.index(-32769)) with pytest.raises(ValueError): list(f.index(32768)) def test_nontext_document(): schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC, date=fields.DATETIME, even=fields.BOOLEAN) ix = RamStorage().create_index(schema) dt = datetime.now() w = ix.writer() for i in xrange(50): w.add_document(id=i, num=i, date=dt + timedelta(days=i), even=not(i % 2)) w.commit() with ix.searcher() as s: def check(kwargs, target): result = [d['id'] for d in s.documents(**kwargs)] assert result == target check({"num": 49}, [49]) check({"date": dt + timedelta(days=30)}, [30]) check({"even": True}, list(range(0, 50, 2))) def test_nontext_update(): schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC(unique=True), date=fields.DATETIME(unique=True)) ix = RamStorage().create_index(schema) dt = datetime.now() w = ix.writer() for i in xrange(10): w.add_document(id=i, num=i, date=dt + timedelta(days=i)) w.commit() w = ix.writer() w.update_document(num=8, id="a") w.update_document(num=2, id="b") w.update_document(num=4, id="c") w.update_document(date=dt + timedelta(days=5), id="d") w.update_document(date=dt + timedelta(days=1), id="e") w.update_document(date=dt + timedelta(days=7), id="f") w.commit() def test_datetime(): dtf = fields.DATETIME(stored=True) schema = fields.Schema(id=fields.ID(stored=True), date=dtf) st = RamStorage() ix = st.create_index(schema) w = ix.writer() for month in xrange(1, 12): for day in xrange(1, 28): w.add_document(id=u("%s-%s") % (month, day), date=datetime(2010, month, day, 14, 0, 0)) w.commit() with ix.searcher() as s: qp = qparser.QueryParser("id", schema) r = s.search(qp.parse("date:20100523")) assert len(r) == 
1 assert r[0]["id"] == "5-23" assert r[0]["date"].__class__ is datetime assert r[0]["date"].month == 5 assert r[0]["date"].day == 23 r = s.search(qp.parse("date:'2010 02'")) assert len(r) == 27 q = qp.parse(u("date:[2010-05 to 2010-08]")) startdt = datetime(2010, 5, 1, 0, 0, 0, 0) enddt = datetime(2010, 8, 31, 23, 59, 59, 999999) assert q.__class__ is query.NumericRange assert q.start == times.datetime_to_long(startdt) assert q.end == times.datetime_to_long(enddt) def test_boolean(): schema = fields.Schema(id=fields.ID(stored=True), done=fields.BOOLEAN(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=u("a"), done=True) w.add_document(id=u("b"), done=False) w.add_document(id=u("c"), done=True) w.add_document(id=u("d"), done=False) w.add_document(id=u("e"), done=True) w.commit() with ix.searcher() as s: qp = qparser.QueryParser("id", schema) r = s.search(qp.parse("done:true")) assert sorted([d["id"] for d in r]) == ["a", "c", "e"] assert all(d["done"] for d in r) r = s.search(qp.parse("done:yes")) assert sorted([d["id"] for d in r]) == ["a", "c", "e"] assert all(d["done"] for d in r) q = qp.parse("done:false") assert q.__class__ == query.Term assert q.text is False assert schema["done"].to_bytes(False) == b("f") r = s.search(q) assert sorted([d["id"] for d in r]) == ["b", "d"] assert not any(d["done"] for d in r) r = s.search(qp.parse("done:no")) assert sorted([d["id"] for d in r]) == ["b", "d"] assert not any(d["done"] for d in r) def test_boolean2(): schema = fields.Schema(t=fields.TEXT(stored=True), b=fields.BOOLEAN(stored=True)) ix = RamStorage().create_index(schema) writer = ix.writer() writer.add_document(t=u('some kind of text'), b=False) writer.add_document(t=u('some other kind of text'), b=False) writer.add_document(t=u('some more text'), b=False) writer.add_document(t=u('some again'), b=True) writer.commit() with ix.searcher() as s: qf = qparser.QueryParser('b', None).parse(u('f')) qt = qparser.QueryParser('b', None).parse(u('t')) r = s.search(qf) assert len(r) == 3 assert [d["b"] for d in s.search(qt)] == [True] assert [d["b"] for d in s.search(qf)] == [False] * 3 def test_boolean3(): schema = fields.Schema(t=fields.TEXT(stored=True, field_boost=5), b=fields.BOOLEAN(stored=True), c=fields.TEXT) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(t=u("with hardcopy"), b=True, c=u("alfa")) w.add_document(t=u("no hardcopy"), b=False, c=u("bravo")) with ix.searcher() as s: q = query.Term("b", schema["b"].to_bytes(True)) ts = [hit["t"] for hit in s.search(q)] assert ts == ["with hardcopy"] def test_boolean_strings(): schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(i=0, b="true") w.add_document(i=1, b="True") w.add_document(i=2, b="false") w.add_document(i=3, b="False") w.add_document(i=4, b=u("true")) w.add_document(i=5, b=u("True")) w.add_document(i=6, b=u("false")) w.add_document(i=7, b=u("False")) with ix.searcher() as s: qp = qparser.QueryParser("b", ix.schema) def check(qs, nums): q = qp.parse(qs) r = s.search(q, limit=None) assert [hit["i"] for hit in r] == nums trues = [0, 1, 4, 5] falses = [2, 3, 6, 7] check("true", trues) check("True", trues) check("false", falses) check("False", falses) check("t", trues) check("f", falses) def test_boolean_find_deleted(): # "Random" string of ones and zeros representing deleted and undeleted domain = "1110001010001110010101000101001011101010001011111101000101010101" schema = 
fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True)) ix = RamStorage().create_index(schema) count = 0 # Create multiple segments just in case for _ in xrange(5): w = ix.writer() for c in domain: w.add_document(i=count, b=(c == "1")) w.commit(merge=False) # Delete documents where "b" is True with ix.writer() as w: w.delete_by_term("b", "t") with ix.searcher() as s: # Double check that documents with b=True are all deleted reader = s.reader() for docnum in xrange(s.doc_count_all()): b = s.stored_fields(docnum)["b"] assert b == reader.is_deleted(docnum) # Try doing a search for documents where b=True qp = qparser.QueryParser("b", ix.schema) q = qp.parse("b:t") r = s.search(q, limit=None) assert len(r) == 0 # Make sure Every query doesn't match deleted docs r = s.search(qp.parse("*"), limit=None) assert not any(hit["b"] for hit in r) assert not any(reader.is_deleted(hit.docnum) for hit in r) r = s.search(qp.parse("*:*"), limit=None) assert not any(hit["b"] for hit in r) assert not any(reader.is_deleted(hit.docnum) for hit in r) # Make sure Not query doesn't match deleted docs q = qp.parse("NOT b:t") r = s.search(q, limit=None) assert not any(hit["b"] for hit in r) assert not any(reader.is_deleted(hit.docnum) for hit in r) r = s.search(q, limit=5) assert not any(hit["b"] for hit in r) assert not any(reader.is_deleted(hit.docnum) for hit in r) def test_boolean_multifield(): schema = fields.Schema(name=fields.TEXT(stored=True), bit=fields.BOOLEAN(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(name=u('audi'), bit=True) w.add_document(name=u('vw'), bit=False) w.add_document(name=u('porsche'), bit=False) w.add_document(name=u('ferrari'), bit=True) w.add_document(name=u('citroen'), bit=False) with ix.searcher() as s: qp = qparser.MultifieldParser(["name", "bit"], schema) q = qp.parse(u("boop")) r = s.search(q) assert sorted(hit["name"] for hit in r) == ["audi", "ferrari"] assert len(r) == 2 def test_missing_field(): schema = fields.Schema() ix = RamStorage().create_index(schema) with ix.searcher() as s: with pytest.raises(KeyError): s.document_numbers(id=u("test")) def test_token_boost(): from whoosh.analysis import RegexTokenizer, DoubleMetaphoneFilter ana = RegexTokenizer() | DoubleMetaphoneFilter() field = fields.TEXT(analyzer=ana, phrase=False) results = sorted(field.index(u("spruce view"))) assert results == [(b('F'), 1, 1.0, b('\x00\x00\x00\x01')), (b('FF'), 1, 0.5, b('\x00\x00\x00\x01')), (b('SPRS'), 1, 1.0, b('\x00\x00\x00\x01')), ] Whoosh-2.5.7/tests/test_flexible.py0000644000076500000240000000714612254366350017447 0ustar mattstaff00000000000000from __future__ import with_statement from whoosh import fields from whoosh.compat import u, b from whoosh.util.testing import TempIndex def test_addfield(): schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) with TempIndex(schema, "addfield") as ix: w = ix.writer() w.add_document(id=u("a"), content=u("alfa")) w.add_document(id=u("b"), content=u("bravo")) w.add_document(id=u("c"), content=u("charlie")) w.commit() ix.add_field("added", fields.KEYWORD(stored=True)) w = ix.writer() w.add_document(id=u("d"), content=u("delta"), added=u("fourth")) w.add_document(id=u("e"), content=u("echo"), added=u("fifth")) w.commit(merge=False) with ix.searcher() as s: assert ("id", "d") in s.reader() assert s.document(id="d") == {"id": "d", "added": "fourth"} assert s.document(id="b") == {"id": "b"} def test_addfield_spelling(): schema = fields.Schema(id=fields.ID(stored=True), 
content=fields.TEXT) with TempIndex(schema, "addfield") as ix: w = ix.writer() w.add_document(id=u("a"), content=u("alfa")) w.add_document(id=u("b"), content=u("bravo")) w.add_document(id=u("c"), content=u("charlie")) w.commit() ix.add_field("added", fields.KEYWORD(stored=True, spelling=True)) w = ix.writer() w.add_document(id=u("d"), content=u("delta"), added=u("fourth")) w.add_document(id=u("e"), content=u("echo"), added=u("fifth")) w.commit(merge=False) with ix.searcher() as s: assert s.document(id=u("d")) == {"id": "d", "added": "fourth"} assert s.document(id=u("b")) == {"id": "b"} def test_removefield(): schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT, city=fields.KEYWORD(stored=True)) with TempIndex(schema, "removefield") as ix: w = ix.writer() w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad")) w.add_document(id=u("c"), content=u("charlie"), city=u("cairo")) w.add_document(id=u("d"), content=u("delta"), city=u("dakar")) w.commit() with ix.searcher() as s: assert s.document(id=u("c")) == {"id": "c", "city": "cairo"} w = ix.writer() w.remove_field("content") w.remove_field("city") w.commit() ixschema = ix._current_schema() assert ixschema.names() == ["id"] assert ixschema.stored_names() == ["id"] with ix.searcher() as s: assert ("content", b("charlie")) not in s.reader() assert s.document(id=u("c")) == {"id": u("c")} def test_optimize_away(): schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT, city=fields.KEYWORD(stored=True)) with TempIndex(schema, "optimizeaway") as ix: w = ix.writer() w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad")) w.add_document(id=u("c"), content=u("charlie"), city=u("cairo")) w.add_document(id=u("d"), content=u("delta"), city=u("dakar")) w.commit() with ix.searcher() as s: assert s.document(id=u("c")) == {"id": "c", "city": "cairo"} w = ix.writer() w.remove_field("content") w.remove_field("city") w.commit(optimize=True) with ix.searcher() as s: assert ("content", u("charlie")) not in s.reader() assert s.document(id=u("c")) == {"id": u("c")} if __name__ == "__main__": test_addfield() Whoosh-2.5.7/tests/test_highlighting.py0000644000076500000240000002441312254366350020316 0ustar mattstaff00000000000000# coding: utf-8 from __future__ import with_statement import pytest from whoosh import analysis, highlight, fields, qparser, query from whoosh.compat import u from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempStorage _doc = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet " + "kilo lima") def test_null_fragment(): terms = frozenset(("bravo", "india")) sa = analysis.StandardAnalyzer() nf = highlight.WholeFragmenter() uc = highlight.UppercaseFormatter() htext = highlight.highlight(_doc, terms, sa, nf, uc) assert htext == "alfa BRAVO charlie delta echo foxtrot golf hotel INDIA juliet kilo lima" def test_sentence_fragment(): text = u("This is the first sentence. This one doesn't have the word. " + "This sentence is the second. 
Third sentence here.") terms = ("sentence",) sa = analysis.StandardAnalyzer(stoplist=None) sf = highlight.SentenceFragmenter() uc = highlight.UppercaseFormatter() htext = highlight.highlight(text, terms, sa, sf, uc) assert htext == "This is the first SENTENCE...This SENTENCE is the second...Third SENTENCE here" def test_context_fragment(): terms = frozenset(("bravo", "india")) sa = analysis.StandardAnalyzer() cf = highlight.ContextFragmenter(surround=6) uc = highlight.UppercaseFormatter() htext = highlight.highlight(_doc, terms, sa, cf, uc) assert htext == "alfa BRAVO charlie...hotel INDIA juliet" def test_context_at_start(): terms = frozenset(["alfa"]) sa = analysis.StandardAnalyzer() cf = highlight.ContextFragmenter(surround=15) uc = highlight.UppercaseFormatter() htext = highlight.highlight(_doc, terms, sa, cf, uc) assert htext == "ALFA bravo charlie delta echo foxtrot" def test_html_format(): terms = frozenset(("bravo", "india")) sa = analysis.StandardAnalyzer() cf = highlight.ContextFragmenter(surround=6) hf = highlight.HtmlFormatter() htext = highlight.highlight(_doc, terms, sa, cf, hf) assert htext == 'alfa bravo charlie...hotel india juliet' def test_html_escape(): terms = frozenset(["bravo"]) sa = analysis.StandardAnalyzer() wf = highlight.WholeFragmenter() hf = highlight.HtmlFormatter() htext = highlight.highlight(u('alfa delta'), terms, sa, wf, hf) assert htext == 'alfa <bravo "charlie"> delta' def test_maxclasses(): terms = frozenset(("alfa", "bravo", "charlie", "delta", "echo")) sa = analysis.StandardAnalyzer() cf = highlight.ContextFragmenter(surround=6) hf = highlight.HtmlFormatter(tagname="b", termclass="t", maxclasses=2) htext = highlight.highlight(_doc, terms, sa, cf, hf) assert htext == 'alfa bravo charlie...delta echo foxtrot' def test_workflow_easy(): schema = fields.Schema(id=fields.ID(stored=True), title=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=u("1"), title=u("The man who wasn't there")) w.add_document(id=u("2"), title=u("The dog who barked at midnight")) w.add_document(id=u("3"), title=u("The invisible man")) w.add_document(id=u("4"), title=u("The girl with the dragon tattoo")) w.add_document(id=u("5"), title=u("The woman who disappeared")) w.commit() with ix.searcher() as s: # Parse the user query parser = qparser.QueryParser("title", schema=ix.schema) q = parser.parse(u("man")) r = s.search(q, terms=True) assert len(r) == 2 r.fragmenter = highlight.WholeFragmenter() r.formatter = highlight.UppercaseFormatter() outputs = [hit.highlights("title") for hit in r] assert outputs == ["The invisible MAN", "The MAN who wasn't there"] def test_workflow_manual(): schema = fields.Schema(id=fields.ID(stored=True), title=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=u("1"), title=u("The man who wasn't there")) w.add_document(id=u("2"), title=u("The dog who barked at midnight")) w.add_document(id=u("3"), title=u("The invisible man")) w.add_document(id=u("4"), title=u("The girl with the dragon tattoo")) w.add_document(id=u("5"), title=u("The woman who disappeared")) w.commit() with ix.searcher() as s: # Parse the user query parser = qparser.QueryParser("title", schema=ix.schema) q = parser.parse(u("man")) # Extract the terms the user used in the field we're interested in terms = [text for fieldname, text in q.all_terms() if fieldname == "title"] # Perform the search r = s.search(q) assert len(r) == 2 # Use the same analyzer as the field uses. 
To be sure, you can # do schema[fieldname].analyzer. Be careful not to do this # on non-text field types such as DATETIME. analyzer = schema["title"].analyzer # Since we want to highlight the full title, not extract fragments, # we'll use WholeFragmenter. nf = highlight.WholeFragmenter() # In this example we'll simply uppercase the matched terms fmt = highlight.UppercaseFormatter() outputs = [] for d in r: text = d["title"] outputs.append(highlight.highlight(text, terms, analyzer, nf, fmt)) assert outputs == ["The invisible MAN", "The MAN who wasn't there"] def test_unstored(): schema = fields.Schema(text=fields.TEXT, tags=fields.KEYWORD) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(text=u("alfa bravo charlie"), tags=u("delta echo")) w.commit() hit = ix.searcher().search(query.Term("text", "bravo"))[0] with pytest.raises(KeyError): hit.highlights("tags") def test_multifilter(): iwf_for_index = analysis.IntraWordFilter(mergewords=True, mergenums=False) iwf_for_query = analysis.IntraWordFilter(mergewords=False, mergenums=False) mf = analysis.MultiFilter(index=iwf_for_index, query=iwf_for_query) ana = analysis.RegexTokenizer() | mf | analysis.LowercaseFilter() schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(text=u("Our BabbleTron5000 is great")) w.commit() with ix.searcher() as s: assert ("text", "5000") in s.reader() hit = s.search(query.Term("text", "5000"))[0] assert hit.highlights("text") == 'Our BabbleTron5000 is great' def test_pinpoint(): domain = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet " "kilo lima mike november oskar papa quebec romeo sierra tango") schema = fields.Schema(text=fields.TEXT(stored=True, chars=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(text=domain) w.commit() assert ix.schema["text"].supports("characters") with ix.searcher() as s: r = s.search(query.Term("text", "juliet"), terms=True) hit = r[0] hi = highlight.Highlighter() hi.formatter = highlight.UppercaseFormatter() assert not hi.can_load_chars(r, "text") assert hi.highlight_hit(hit, "text") == "golf hotel india JULIET kilo lima mike november" hi.fragmenter = highlight.PinpointFragmenter() assert hi.can_load_chars(r, "text") assert hi.highlight_hit(hit, "text") == "ot golf hotel india JULIET kilo lima mike nove" hi.fragmenter.autotrim = True assert hi.highlight_hit(hit, "text") == "golf hotel india JULIET kilo lima mike" def test_highlight_wildcards(): schema = fields.Schema(text=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(text=u("alfa bravo charlie delta cookie echo")) with ix.searcher() as s: qp = qparser.QueryParser("text", ix.schema) q = qp.parse(u("c*")) r = s.search(q) assert r.scored_length() == 1 r.formatter = highlight.UppercaseFormatter() hit = r[0] assert hit.highlights("text") == "alfa bravo CHARLIE delta COOKIE echo" def test_highlight_ngrams(): schema = fields.Schema(text=fields.NGRAMWORDS(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(text=u("Multiplication and subtraction are good")) with ix.searcher() as s: qp = qparser.QueryParser("text", ix.schema) q = qp.parse(u("multiplication")) r = s.search(q) assert r.scored_length() == 1 r.fragmenter = highlight.SentenceFragmenter() r.formatter = highlight.UppercaseFormatter() snippet = r[0].highlights("text") assert snippet == "MULTIPLICATIon and subtracTION are good" def 
test_issue324(): sa = analysis.StemmingAnalyzer() result = highlight.highlight(u("Indexed!\n1"), [u("index")], sa, fragmenter=highlight.ContextFragmenter(), formatter=highlight.UppercaseFormatter()) assert result == "INDEXED!\n1" def test_whole_noterms(): schema = fields.Schema(text=fields.TEXT(stored=True), tag=fields.KEYWORD) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(text=u("alfa bravo charlie delta echo foxtrot golf"), tag=u("foo")) with ix.searcher() as s: r = s.search(query.Term("text", u("delta"))) assert len(r) == 1 r.fragmenter = highlight.WholeFragmenter() r.formatter = highlight.UppercaseFormatter() hi = r[0].highlights("text") assert hi == u("alfa bravo charlie DELTA echo foxtrot golf") r = s.search(query.Term("tag", u("foo"))) assert len(r) == 1 r.fragmenter = highlight.WholeFragmenter() r.formatter = highlight.UppercaseFormatter() hi = r[0].highlights("text") assert hi == u("") hi = r[0].highlights("text", minscore=0) assert hi == u("alfa bravo charlie delta echo foxtrot golf") Whoosh-2.5.7/tests/test_indexing.py0000644000076500000240000006104112254366764017465 0ustar mattstaff00000000000000from __future__ import with_statement import random from collections import defaultdict from datetime import datetime import pytest from whoosh import analysis, fields, index, qparser, query from whoosh.compat import b, u, xrange, text_type, PY3, permutations from whoosh.filedb.filestore import RamStorage from whoosh.writing import IndexingError from whoosh.util.numeric import length_to_byte, byte_to_length from whoosh.util.testing import TempIndex, TempStorage def test_creation(): s = fields.Schema(content=fields.TEXT(phrase=True), title=fields.TEXT(stored=True), path=fields.ID(stored=True), tags=fields.KEYWORD(stored=True), quick=fields.NGRAM, note=fields.STORED) st = RamStorage() ix = st.create_index(s) w = ix.writer() w.add_document(title=u("First"), content=u("This is the first document"), path=u("/a"), tags=u("first second third"), quick=u("First document"), note=u("This is the first document")) w.add_document(content=u("Let's try this again"), title=u("Second"), path=u("/b"), tags=u("Uno Dos Tres"), quick=u("Second document"), note=u("This is the second document")) w.commit() def test_empty_commit(): s = fields.Schema(id=fields.ID(stored=True)) with TempIndex(s, "emptycommit") as ix: w = ix.writer() w.add_document(id=u("1")) w.add_document(id=u("2")) w.add_document(id=u("3")) w.commit() w = ix.writer() w.commit() def test_version_in(): from whoosh import __version__ from whoosh import index with TempStorage("versionin") as st: assert not index.exists(st) schema = fields.Schema(text=fields.TEXT) ix = st.create_index(schema) assert index.exists(st) assert ix.is_empty() v = index.version(st) assert v[0] == __version__ assert v[1] == index._CURRENT_TOC_VERSION with ix.writer() as w: w.add_document(text=u("alfa")) assert not ix.is_empty() def test_simple_indexing(): schema = fields.Schema(text=fields.TEXT, id=fields.STORED) domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"), u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"), u("kilo"), u("lima"), u("mike"), u("november")) docs = defaultdict(list) with TempIndex(schema, "simple") as ix: with ix.writer() as w: for i in xrange(100): smp = random.sample(domain, 5) for word in smp: docs[word].append(i) w.add_document(text=u(" ").join(smp), id=i) with ix.searcher() as s: for word in domain: rset = sorted([hit["id"] for hit in s.search(query.Term("text", word), limit=None)]) assert 
rset == docs[word] def test_integrity(): s = fields.Schema(name=fields.TEXT, value=fields.TEXT) st = RamStorage() ix = st.create_index(s) w = ix.writer() w.add_document(name=u("Yellow brown"), value=u("Blue red green purple?")) w.add_document(name=u("Alpha beta"), value=u("Gamma delta epsilon omega.")) w.commit() w = ix.writer() w.add_document(name=u("One two"), value=u("Three four five.")) w.commit() tr = ix.reader() assert ix.doc_count_all() == 3 assert " ".join(tr.field_terms("name")) == "alpha beta brown one two yellow" def test_lengths(): s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True), f2=fields.KEYWORD(stored=True, scorable=True)) with TempIndex(s, "testlengths") as ix: w = ix.writer() items = u("ABCDEFG") from itertools import cycle, islice lengths = [10, 20, 2, 102, 45, 3, 420, 2] for length in lengths: w.add_document(f2=u(" ").join(islice(cycle(items), length))) w.commit() with ix.reader() as dr: ls1 = [dr.doc_field_length(i, "f1") for i in xrange(0, len(lengths))] assert ls1 == [0] * len(lengths) ls2 = [dr.doc_field_length(i, "f2") for i in xrange(0, len(lengths))] assert ls2 == [byte_to_length(length_to_byte(l)) for l in lengths] def test_many_lengths(): domain = u("alfa bravo charlie delta echo").split() schema = fields.Schema(text=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() for i, word in enumerate(domain): length = (i + 1) ** 6 w.add_document(text=" ".join(word for _ in xrange(length))) w.commit() s = ix.searcher() for i, word in enumerate(domain): target = byte_to_length(length_to_byte((i + 1) ** 6)) ti = s.term_info("text", word) assert ti.min_length() == target assert ti.max_length() == target def test_lengths_ram(): s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True), f2=fields.KEYWORD(stored=True, scorable=True)) st = RamStorage() ix = st.create_index(s) w = ix.writer() w.add_document(f1=u("A B C D E"), f2=u("X Y Z")) w.add_document(f1=u("B B B B C D D Q"), f2=u("Q R S T")) w.add_document(f1=u("D E F"), f2=u("U V A B C D E")) w.commit() dr = ix.reader() assert dr.stored_fields(0)["f1"] == "A B C D E" assert dr.doc_field_length(0, "f1") == 5 assert dr.doc_field_length(1, "f1") == 8 assert dr.doc_field_length(2, "f1") == 3 assert dr.doc_field_length(0, "f2") == 3 assert dr.doc_field_length(1, "f2") == 4 assert dr.doc_field_length(2, "f2") == 7 assert dr.field_length("f1") == 16 assert dr.field_length("f2") == 14 assert dr.max_field_length("f1") == 8 assert dr.max_field_length("f2") == 7 def test_merged_lengths(): s = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True), f2=fields.KEYWORD(stored=True, scorable=True)) with TempIndex(s, "mergedlengths") as ix: w = ix.writer() w.add_document(f1=u("A B C"), f2=u("X")) w.add_document(f1=u("B C D E"), f2=u("Y Z")) w.commit() w = ix.writer() w.add_document(f1=u("A"), f2=u("B C D E X Y")) w.add_document(f1=u("B C"), f2=u("X")) w.commit(merge=False) w = ix.writer() w.add_document(f1=u("A B X Y Z"), f2=u("B C")) w.add_document(f1=u("Y X"), f2=u("A B")) w.commit(merge=False) with ix.reader() as dr: assert dr.stored_fields(0)["f1"] == u("A B C") assert dr.doc_field_length(0, "f1") == 3 assert dr.doc_field_length(2, "f2") == 6 assert dr.doc_field_length(4, "f1") == 5 def test_frequency_keyword(): s = fields.Schema(content=fields.KEYWORD) st = RamStorage() ix = st.create_index(s) w = ix.writer() w.add_document(content=u("A B C D E")) w.add_document(content=u("B B B B C D D")) w.add_document(content=u("D E F")) w.commit() with ix.reader() as tr: assert tr.doc_frequency("content", 
u("B")) == 2 assert tr.frequency("content", u("B")) == 5 assert tr.doc_frequency("content", u("E")) == 2 assert tr.frequency("content", u("E")) == 2 assert tr.doc_frequency("content", u("A")) == 1 assert tr.frequency("content", u("A")) == 1 assert tr.doc_frequency("content", u("D")) == 3 assert tr.frequency("content", u("D")) == 4 assert tr.doc_frequency("content", u("F")) == 1 assert tr.frequency("content", u("F")) == 1 assert tr.doc_frequency("content", u("Z")) == 0 assert tr.frequency("content", u("Z")) == 0 stats = [(fname, text, ti.doc_frequency(), ti.weight()) for (fname, text), ti in tr] assert stats == [("content", b("A"), 1, 1), ("content", b("B"), 2, 5), ("content", b("C"), 2, 2), ("content", b("D"), 3, 4), ("content", b("E"), 2, 2), ("content", b("F"), 1, 1)] def test_frequency_text(): s = fields.Schema(content=fields.KEYWORD) st = RamStorage() ix = st.create_index(s) w = ix.writer() w.add_document(content=u("alfa bravo charlie delta echo")) w.add_document(content=u("bravo bravo bravo bravo charlie delta delta")) w.add_document(content=u("delta echo foxtrot")) w.commit() with ix.reader() as tr: assert tr.doc_frequency("content", u("bravo")) == 2 assert tr.frequency("content", u("bravo")) == 5 assert tr.doc_frequency("content", u("echo")) == 2 assert tr.frequency("content", u("echo")) == 2 assert tr.doc_frequency("content", u("alfa")) == 1 assert tr.frequency("content", u("alfa")) == 1 assert tr.doc_frequency("content", u("delta")) == 3 assert tr.frequency("content", u("delta")) == 4 assert tr.doc_frequency("content", u("foxtrot")) == 1 assert tr.frequency("content", u("foxtrot")) == 1 assert tr.doc_frequency("content", u("zulu")) == 0 assert tr.frequency("content", u("zulu")) == 0 stats = [(fname, text, ti.doc_frequency(), ti.weight()) for (fname, text), ti in tr] assert stats == [("content", b("alfa"), 1, 1), ("content", b("bravo"), 2, 5), ("content", b("charlie"), 2, 2), ("content", b("delta"), 3, 4), ("content", b("echo"), 2, 2), ("content", b("foxtrot"), 1, 1)] def test_deletion(): s = fields.Schema(key=fields.ID, name=fields.TEXT, value=fields.TEXT) with TempIndex(s, "deletion") as ix: w = ix.writer() w.add_document(key=u("A"), name=u("Yellow brown"), value=u("Blue red green purple?")) w.add_document(key=u("B"), name=u("Alpha beta"), value=u("Gamma delta epsilon omega.")) w.add_document(key=u("C"), name=u("One two"), value=u("Three four five.")) w.commit() w = ix.writer() assert w.delete_by_term("key", u("B")) == 1 w.commit(merge=False) assert ix.doc_count_all() == 3 assert ix.doc_count() == 2 w = ix.writer() w.add_document(key=u("A"), name=u("Yellow brown"), value=u("Blue red green purple?")) w.add_document(key=u("B"), name=u("Alpha beta"), value=u("Gamma delta epsilon omega.")) w.add_document(key=u("C"), name=u("One two"), value=u("Three four five.")) w.commit() # This will match both documents with key == B, one of which is already # deleted. This should not raise an error. 
w = ix.writer() assert w.delete_by_term("key", u("B")) == 1 w.commit() ix.optimize() assert ix.doc_count_all() == 4 assert ix.doc_count() == 4 with ix.reader() as tr: assert " ".join(tr.field_terms("name")) == "brown one two yellow" def test_writer_reuse(): s = fields.Schema(key=fields.ID) ix = RamStorage().create_index(s) w = ix.writer() w.add_document(key=u("A")) w.add_document(key=u("B")) w.add_document(key=u("C")) w.commit() # You can't re-use a commited/canceled writer pytest.raises(IndexingError, w.add_document, key=u("D")) pytest.raises(IndexingError, w.update_document, key=u("B")) pytest.raises(IndexingError, w.delete_document, 0) pytest.raises(IndexingError, w.add_reader, None) pytest.raises(IndexingError, w.add_field, "name", fields.ID) pytest.raises(IndexingError, w.remove_field, "key") pytest.raises(IndexingError, w.searcher) def test_update(): # Test update with multiple unique keys SAMPLE_DOCS = [{"id": u("test1"), "path": u("/test/1"), "text": u("Hello")}, {"id": u("test2"), "path": u("/test/2"), "text": u("There")}, {"id": u("test3"), "path": u("/test/3"), "text": u("Reader")}, ] schema = fields.Schema(id=fields.ID(unique=True, stored=True), path=fields.ID(unique=True, stored=True), text=fields.TEXT) with TempIndex(schema, "update") as ix: with ix.writer() as w: for doc in SAMPLE_DOCS: w.add_document(**doc) with ix.writer() as w: w.update_document(id=u("test2"), path=u("test/1"), text=u("Replacement")) def test_update2(): schema = fields.Schema(key=fields.ID(unique=True, stored=True), p=fields.ID(stored=True)) with TempIndex(schema, "update2") as ix: nums = list(range(21)) random.shuffle(nums) for i, n in enumerate(nums): w = ix.writer() w.update_document(key=text_type(n % 10), p=text_type(i)) w.commit() with ix.searcher() as s: results = [d["key"] for _, d in s.iter_docs()] results = " ".join(sorted(results)) assert results == "0 1 2 3 4 5 6 7 8 9" def test_update_numeric(): schema = fields.Schema(num=fields.NUMERIC(unique=True, stored=True), text=fields.ID(stored=True)) with TempIndex(schema, "updatenum") as ix: nums = list(range(5)) * 3 random.shuffle(nums) for num in nums: with ix.writer() as w: w.update_document(num=num, text=text_type(num)) with ix.searcher() as s: results = [d["text"] for _, d in s.iter_docs()] results = " ".join(sorted(results)) assert results == "0 1 2 3 4" def test_reindex(): SAMPLE_DOCS = [ {'id': u('test1'), 'text': u('This is a document. Awesome, is it not?')}, {'id': u('test2'), 'text': u('Another document. 
Astounding!')}, {'id': u('test3'), 'text': u('A fascinating article on the behavior of domestic ' 'steak knives.')}, ] schema = fields.Schema(text=fields.TEXT(stored=True), id=fields.ID(unique=True, stored=True)) with TempIndex(schema, "reindex") as ix: def reindex(): writer = ix.writer() for doc in SAMPLE_DOCS: writer.update_document(**doc) writer.commit() reindex() assert ix.doc_count_all() == 3 reindex() assert ix.doc_count_all() == 3 def test_noscorables1(): values = [u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"), u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"), u("kilo"), u("lima")] from random import choice, sample, randint times = 1000 schema = fields.Schema(id=fields.ID, tags=fields.KEYWORD) with TempIndex(schema, "noscorables1") as ix: w = ix.writer() for _ in xrange(times): w.add_document(id=choice(values), tags=u(" ").join(sample(values, randint(2, 7)))) w.commit() with ix.searcher() as s: s.search(query.Term("id", "bravo")) def test_noscorables2(): schema = fields.Schema(field=fields.ID) with TempIndex(schema, "noscorables2") as ix: writer = ix.writer() writer.add_document(field=u('foo')) writer.commit() def test_multi(): schema = fields.Schema(id=fields.ID(stored=True), content=fields.KEYWORD(stored=True)) with TempIndex(schema, "multi") as ix: writer = ix.writer() # Deleted 1 writer.add_document(id=u("1"), content=u("alfa bravo charlie")) # Deleted 1 writer.add_document(id=u("2"), content=u("bravo charlie delta echo")) # Deleted 2 writer.add_document(id=u("3"), content=u("charlie delta echo foxtrot")) writer.commit() writer = ix.writer() writer.delete_by_term("id", "1") writer.delete_by_term("id", "2") writer.add_document(id=u("4"), content=u("apple bear cherry donut")) writer.add_document(id=u("5"), content=u("bear cherry donut eggs")) # Deleted 2 writer.add_document(id=u("6"), content=u("delta echo foxtrot golf")) # no d writer.add_document(id=u("7"), content=u("echo foxtrot golf hotel")) writer.commit(merge=False) writer = ix.writer() writer.delete_by_term("id", "3") writer.delete_by_term("id", "6") writer.add_document(id=u("8"), content=u("cherry donut eggs falafel")) writer.add_document(id=u("9"), content=u("donut eggs falafel grape")) writer.add_document(id=u("A"), content=u(" foxtrot golf hotel india")) writer.commit(merge=False) assert ix.doc_count() == 6 with ix.searcher() as s: r = s.search(query.Prefix("content", u("d")), optimize=False) assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"] r = s.search(query.Prefix("content", u("d"))) assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"] r = s.search(query.Prefix("content", u("d")), limit=None) assert sorted([d["id"] for d in r]) == ["4", "5", "8", "9"] def test_deleteall(): schema = fields.Schema(text=fields.TEXT) with TempIndex(schema, "deleteall") as ix: w = ix.writer() domain = u("alfa bravo charlie delta echo").split() for i, ls in enumerate(permutations(domain)): w.add_document(text=u(" ").join(ls)) if not i % 10: w.commit() w = ix.writer() w.commit() # This is just a test, don't use this method to delete all docs IRL! 
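        # The loop below walks every document number reported by
        # doc_count_all() and removes each one with delete_document(), purely
        # to exercise per-docnum deletion across the several segments created
        # by the periodic commits above.  As the comment above warns, this is
        # not how an index is normally emptied; a more usual route (a sketch,
        # assuming you are willing to discard the existing segments) is simply
        # to recreate the index, e.g.:
        #
        #     ix = RamStorage().create_index(schema)   # fresh, empty index
        #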
doccount = ix.doc_count_all() w = ix.writer() for docnum in xrange(doccount): w.delete_document(docnum) w.commit() with ix.searcher() as s: r = s.search(query.Or([query.Term("text", u("alfa")), query.Term("text", u("bravo"))])) assert len(r) == 0 ix.optimize() assert ix.doc_count_all() == 0 with ix.reader() as r: assert list(r) == [] def test_simple_stored(): schema = fields.Schema(a=fields.ID(stored=True), b=fields.ID(stored=False)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(a=u("alfa"), b=u("bravo")) with ix.searcher() as s: sf = s.stored_fields(0) assert sf == {"a": "alfa"} def test_single(): schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT) with TempIndex(schema, "single") as ix: w = ix.writer() w.add_document(id=u("1"), text=u("alfa")) w.commit() with ix.searcher() as s: assert ("text", u("alfa")) in s.reader() assert list(s.documents(id="1")) == [{"id": "1"}] assert list(s.documents(text="alfa")) == [{"id": "1"}] assert list(s.all_stored_fields()) == [{"id": "1"}] def test_indentical_fields(): schema = fields.Schema(id=fields.STORED, f1=fields.TEXT, f2=fields.TEXT, f3=fields.TEXT) with TempIndex(schema, "identifields") as ix: w = ix.writer() w.add_document(id=1, f1=u("alfa"), f2=u("alfa"), f3=u("alfa")) w.commit() with ix.searcher() as s: assert list(s.lexicon("f1")) == [b("alfa")] assert list(s.lexicon("f2")) == [b("alfa")] assert list(s.lexicon("f3")) == [b("alfa")] assert list(s.documents(f1="alfa")) == [{"id": 1}] assert list(s.documents(f2="alfa")) == [{"id": 1}] assert list(s.documents(f3="alfa")) == [{"id": 1}] def test_multivalue(): ana = analysis.StemmingAnalyzer() schema = fields.Schema(id=fields.STORED, date=fields.DATETIME, num=fields.NUMERIC, txt=fields.TEXT(analyzer=ana)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(id=1, date=datetime(2001, 1, 1), num=5) w.add_document(id=2, date=[datetime(2002, 2, 2), datetime(2003, 3, 3)], num=[1, 2, 3, 12]) w.add_document(txt=u("a b c").split()) with ix.reader() as r: assert ("num", 3) in r assert ("date", datetime(2003, 3, 3)) in r assert " ".join(r.field_terms("txt")) == "a b c" def test_multi_language(): # Analyzer for English ana_eng = analysis.StemmingAnalyzer() # analyzer for Pig Latin def stem_piglatin(w): if w.endswith("ay"): w = w[:-2] return w ana_pig = analysis.StemmingAnalyzer(stoplist=["nday", "roay"], stemfn=stem_piglatin) # Dictionary mapping languages to analyzers analyzers = {"eng": ana_eng, "pig": ana_pig} # Fake documents corpus = [(u("eng"), u("Such stuff as dreams are made on")), (u("pig"), u("Otay ebay, roay otnay otay ebay"))] schema = fields.Schema(content=fields.TEXT(stored=True), lang=fields.ID(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: for doclang, content in corpus: ana = analyzers[doclang] # "Pre-analyze" the field into token strings words = [token.text for token in ana(content)] # Note we store the original value but index the pre-analyzed words w.add_document(lang=doclang, content=words, _stored_content=content) with ix.searcher() as s: schema = s.schema # Modify the schema to fake the correct analyzer for the language # we're searching in schema["content"].analyzer = analyzers["eng"] qp = qparser.QueryParser("content", schema) q = qp.parse("dreaming") r = s.search(q) assert len(r) == 1 assert r[0]["content"] == "Such stuff as dreams are made on" schema["content"].analyzer = analyzers["pig"] qp = qparser.QueryParser("content", schema) q = qp.parse("otnay") r = s.search(q) assert len(r) 
== 1 assert r[0]["content"] == "Otay ebay, roay otnay otay ebay" def test_doc_boost(): schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=0, a=u("alfa alfa alfa"), b=u("bravo")) w.add_document(id=1, a=u("alfa"), b=u("bear"), _a_boost=5.0) w.add_document(id=2, a=u("alfa alfa alfa alfa"), _boost=0.5) w.commit() with ix.searcher() as s: r = s.search(query.Term("a", "alfa")) assert [hit["id"] for hit in r] == [1, 0, 2] w = ix.writer() w.add_document(id=3, a=u("alfa"), b=u("bottle")) w.add_document(id=4, b=u("bravo"), _b_boost=2.0) w.commit(merge=False) with ix.searcher() as s: r = s.search(query.Term("a", "alfa")) assert [hit["id"] for hit in r] == [1, 0, 3, 2] def test_globfield_length_merge(): # Issue 343 schema = fields.Schema(title=fields.TEXT(stored=True), path=fields.ID(stored=True)) schema.add("*_text", fields.TEXT, glob=True) with TempIndex(schema, "globlenmerge") as ix: with ix.writer() as w: w.add_document(title=u("First document"), path=u("/a"), content_text=u("This is the first document we've added!")) with ix.writer() as w: w.add_document(title=u("Second document"), path=u("/b"), content_text=u("The second document is even more interesting!")) with ix.searcher() as s: docnum = s.document_number(path="/a") assert s.doc_field_length(docnum, "content_text") is not None qp = qparser.QueryParser("content", schema) q = qp.parse("content_text:document") r = s.search(q) paths = sorted(hit["path"] for hit in r) assert paths == ["/a", "/b"] def test_index_decimals(): from decimal import Decimal schema = fields.Schema(name=fields.KEYWORD(stored=True), num=fields.NUMERIC(int)) ix = RamStorage().create_index(schema) with ix.writer() as w: with pytest.raises(TypeError): w.add_document(name=u("hello"), num=Decimal("3.2")) schema = fields.Schema(name=fields.KEYWORD(stored=True), num=fields.NUMERIC(Decimal, decimal_places=5)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(name=u("hello"), num=Decimal("3.2")) Whoosh-2.5.7/tests/test_matching.py0000644000076500000240000003622412254366350017446 0ustar mattstaff00000000000000from __future__ import with_statement from random import randint, choice, sample from whoosh import fields, matching, qparser, query from whoosh.compat import b, u, xrange, permutations from whoosh.filedb.filestore import RamStorage from whoosh.query import And, Term from whoosh.util import make_binary_tree from whoosh.scoring import WeightScorer def _keys(searcher, docnums): return sorted([searcher.stored_fields(docnum)['key'] for docnum in docnums]) def test_nullmatcher(): nm = matching.NullMatcher() assert not nm.is_active() assert list(nm.all_ids()) == [] def test_listmatcher(): ids = [1, 2, 5, 9, 10] lm = matching.ListMatcher(ids) ls = [] while lm.is_active(): ls.append((lm.id(), lm.score())) lm.next() assert ls == [(1, 1.0), (2, 1.0), (5, 1.0), (9, 1.0), (10, 1.0)] lm = matching.ListMatcher(ids) assert list(lm.all_ids()) == ids lm = matching.ListMatcher(ids, position=3) ls = [] while lm.is_active(): ls.append(lm.id()) lm.next() assert ls == [9, 10] lm = matching.ListMatcher(ids) for _ in xrange(3): lm.next() lm = lm.copy() ls = [] while lm.is_active(): ls.append(lm.id()) lm.next() assert ls == [9, 10] def test_listmatcher_skip_to_quality_identical_scores(): ids = [1, 2, 5, 9, 10] lm = matching.ListMatcher(ids, scorer=WeightScorer(1.0)) lm.skip_to_quality(0.3) ls = [] while lm.is_active(): ls.append((lm.id(), lm.score())) lm.next() assert ls == [(1, 1.0), 
(2, 1.0), (5, 1.0), (9, 1.0), (10, 1.0)] def test_wrapper(): wm = matching.WrappingMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), boost=2.0) ls = [] while wm.is_active(): ls.append((wm.id(), wm.score())) wm.next() assert ls == [(1, 2.0), (2, 2.0), (5, 2.0), (9, 2.0), (10, 2.0)] ids = [1, 2, 5, 9, 10] wm = matching.WrappingMatcher(matching.ListMatcher(ids), boost=2.0) assert list(wm.all_ids()) == ids def test_filter(): lm = lambda: matching.ListMatcher(list(range(2, 10))) fm = matching.FilterMatcher(lm(), frozenset([3, 9])) assert list(fm.all_ids()) == [3, 9] fm = matching.FilterMatcher(lm(), frozenset([1, 5, 9, 13])) assert list(fm.all_ids()) == [5, 9] def test_exclude(): em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True) assert list(em.all_ids()) == [1, 5, 10] em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True) assert list(em.all_ids()) == [1, 5, 10] em = matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]), frozenset([2, 9]), exclude=True) em.next() em.next() em = em.copy() ls = [] while em.is_active(): ls.append(em.id()) em.next() assert ls == [10] def test_simple_union(): lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) um = matching.UnionMatcher(lm1, lm2) ls = [] while um.is_active(): ls.append((um.id(), um.score())) um.next() assert ls == [(0, 1.0), (1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0), (90, 1.0)] lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) um = matching.UnionMatcher(lm1, lm2) assert list(um.all_ids()) == [0, 1, 4, 10, 20, 90] lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) um = matching.UnionMatcher(lm1, lm2) um.next() um.next() um = um.copy() ls = [] while um.is_active(): ls.append(um.id()) um.next() assert ls == [4, 10, 20, 90] def test_simple_intersection(): lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) im = matching.IntersectionMatcher(lm1, lm2) ls = [] while im.is_active(): ls.append((im.id(), im.score())) im.next() assert ls == [(4, 2.0), (20, 2.0)] lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) im = matching.IntersectionMatcher(lm1, lm2) assert list(im.all_ids()) == [4, 20] lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) im = matching.IntersectionMatcher(lm1, lm2) im.next() im.next() im = im.copy() ls = [] while im.is_active(): ls.append(im.id()) im.next() assert not ls def test_andnot(): lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) anm = matching.AndNotMatcher(lm1, lm2) ls = [] while anm.is_active(): ls.append((anm.id(), anm.score())) anm.next() assert ls == [(1, 1.0), (10, 1.0), (90, 1.0)] echo_lm = matching.ListMatcher([0, 1, 2, 3, 4]) bravo_lm = matching.ListMatcher([0, 1]) anm = matching.AndNotMatcher(echo_lm, bravo_lm) assert list(anm.all_ids()) == [2, 3, 4] lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) anm = matching.AndNotMatcher(lm1, lm2) assert list(anm.all_ids()) == [1, 10, 90] lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) anm = matching.AndNotMatcher(lm1, lm2) anm.next() anm.next() anm = anm.copy() ls = [] while anm.is_active(): ls.append(anm.id()) anm.next() assert ls == [90] def test_require(): lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) rm = 
matching.RequireMatcher(lm1, lm2) ls = [] while rm.is_active(): ls.append((rm.id(), rm.score())) rm.next() assert ls == [(4, 1.0), (20, 1.0)] lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) rm = matching.RequireMatcher(lm1, lm2) assert list(rm.all_ids()) == [4, 20] lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) rm = matching.RequireMatcher(lm1, lm2) rm.next() rm.next() rm = rm.copy() ls = [] while rm.is_active(): ls.append(rm.id()) rm.next() assert not ls def test_andmaybe(): lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) amm = matching.AndMaybeMatcher(lm1, lm2) ls = [] while amm.is_active(): ls.append((amm.id(), amm.score())) amm.next() assert ls == [(1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0), (90, 1.0)] lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) amm = matching.AndMaybeMatcher(lm1, lm2) assert list(amm.all_ids()) == [1, 4, 10, 20, 90] lm1 = matching.ListMatcher([1, 4, 10, 20, 90]) lm2 = matching.ListMatcher([0, 4, 20]) amm = matching.AndMaybeMatcher(lm1, lm2) amm.next() amm.next() amm = amm.copy() ls = [] while amm.is_active(): ls.append(amm.id()) amm.next() assert ls == [10, 20, 90] def test_intersection(): schema = fields.Schema(key=fields.ID(stored=True), value=fields.TEXT(stored=True)) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(key=u("a"), value=u("alpha bravo charlie delta")) w.add_document(key=u("b"), value=u("echo foxtrot alpha bravo")) w.add_document(key=u("c"), value=u("charlie delta golf hotel")) w.commit() w = ix.writer() w.add_document(key=u("d"), value=u("india alpha bravo charlie")) w.add_document(key=u("e"), value=u("delta bravo india bravo")) w.commit() with ix.searcher() as s: q = And([Term("value", u("bravo")), Term("value", u("delta"))]) m = q.matcher(s) assert _keys(s, m.all_ids()) == ["a", "e"] q = And([Term("value", u("bravo")), Term("value", u("alpha"))]) m = q.matcher(s) assert _keys(s, m.all_ids()) == ["a", "b", "d"] def test_random_intersections(): domain = [u("alpha"), u("bravo"), u("charlie"), u("delta"), u("echo"), u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"), u("kilo"), u("lima"), u("mike")] segments = 5 docsperseg = 50 fieldlimits = (3, 10) documents = [] schema = fields.Schema(key=fields.STORED, value=fields.TEXT(stored=True)) st = RamStorage() ix = st.create_index(schema) # Create docsperseg * segments documents containing random words from # the domain list. Add the documents to the index, but also keep them # in the "documents" list for the sanity check for i in xrange(segments): w = ix.writer() for j in xrange(docsperseg): docnum = i * docsperseg + j # Create a string of random words doc = u(" ").join(choice(domain) for _ in xrange(randint(*fieldlimits))) # Add the string to the index w.add_document(key=docnum, value=doc) # Add a (docnum, string) tuple to the documents list documents.append((docnum, doc)) w.commit() assert len(ix._segments()) != 1 testcount = 20 testlimits = (2, 5) with ix.searcher() as s: for i in xrange(s.doc_count_all()): assert s.stored_fields(i).get("key") is not None for _ in xrange(testcount): # Create a random list of words and manually do an intersection of # items in "documents" that contain the words ("target"). 
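            # Editor's sketch (not called by the test): a small helper that
            # drains any matcher into a list of (docnum, score) pairs using
            # the same is_active()/id()/next() protocol the checks below use.
            def _drain(matcher):
                pairs = []
                while matcher.is_active():
                    pairs.append((matcher.id(), matcher.score()))
                    matcher.next()
                return pairs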
words = sample(domain, randint(*testlimits)) target = [] for docnum, doc in documents: if all((doc.find(w) > -1) for w in words): target.append(docnum) target.sort() # Create a query from the list of words and get two matchers from # it. q = And([Term("value", w) for w in words]) m1 = q.matcher(s) m2 = q.matcher(s) # Try getting the list of IDs from all_ids() ids1 = list(m1.all_ids()) # Try getting the list of IDs using id()/next() ids2 = [] while m2.is_active(): ids2.append(m2.id()) m2.next() # Check that the two methods return the same list assert ids1 == ids2 # Check that the IDs match the ones we manually calculated assert _keys(s, ids1) == target def test_union(): s1 = matching.ListMatcher([1, 2, 3, 4, 5, 6, 7, 8]) s2 = matching.ListMatcher([2, 4, 8, 10, 20, 30]) s3 = matching.ListMatcher([10, 100, 200]) target = [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 30, 100, 200] um = matching.UnionMatcher(s1, matching.UnionMatcher(s2, s3)) assert target == list(um.all_ids()) def test_union_scores(): s1 = matching.ListMatcher([1, 2, 3]) s2 = matching.ListMatcher([2, 4, 8]) s3 = matching.ListMatcher([2, 3, 8]) target = [(1, 1.0), (2, 3.0), (3, 2.0), (4, 1.0), (8, 2.0)] um = matching.UnionMatcher(s1, matching.UnionMatcher(s2, s3)) result = [] while um.is_active(): result.append((um.id(), um.score())) um.next() assert target == result def test_random_union(): testcount = 100 rangelimits = (2, 10) clauselimits = (2, 10) vals = list(range(100)) for _ in xrange(testcount): target = set() matchers = [] for _ in xrange(randint(*clauselimits)): nums = sample(vals, randint(*rangelimits)) target = target.union(nums) matchers.append(matching.ListMatcher(sorted(nums))) target = sorted(target) um = make_binary_tree(matching.UnionMatcher, matchers) assert list(um.all_ids()) == target def test_inverse(): s = matching.ListMatcher([1, 5, 10, 11, 13]) inv = matching.InverseMatcher(s, 15) ids = [] while inv.is_active(): ids.append(inv.id()) inv.next() assert ids == [0, 2, 3, 4, 6, 7, 8, 9, 12, 14] def test_inverse_skip(): s = matching.ListMatcher([1, 5, 10, 11, 13]) inv = matching.InverseMatcher(s, 15) inv.skip_to(8) ids = [] while inv.is_active(): ids.append(inv.id()) inv.next() assert ids == [8, 9, 12, 14] def test_empty_andnot(): pos = matching.NullMatcher() neg = matching.NullMatcher() anm = matching.AndNotMatcher(pos, neg) assert not anm.is_active() assert not list(anm.all_ids()) pos = matching.ListMatcher([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) neg = matching.NullMatcher() ans = matching.AndNotMatcher(pos, neg) ids = list(ans.all_ids()) assert ids == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] def test_random_andnot(): testcount = 100 rangesize = 100 rng = list(range(rangesize)) for _ in xrange(testcount): negs = sorted(sample(rng, randint(0, rangesize - 1))) negset = frozenset(negs) matched = [n for n in rng if n not in negset] pos = matching.ListMatcher(rng) neg = matching.ListMatcher(negs) anm = matching.AndNotMatcher(pos, neg) ids = list(anm.all_ids()) assert ids == matched def test_current_terms(): domain = u("alfa bravo charlie delta").split() schema = fields.Schema(text=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() for ls in permutations(domain, 3): w.add_document(text=" ".join(ls), _stored_text=ls) w.commit() with ix.searcher() as s: q = query.And([query.Term("text", "alfa"), query.Term("text", "charlie")]) m = q.matcher(s) while m.is_active(): assert sorted(m.matching_terms()) == [("text", b("alfa")), ("text", b("charlie"))] m.next() def test_exclusion(): from datetime import datetime schema = 
fields.Schema(id=fields.ID(stored=True), date=fields.DATETIME) ix = RamStorage().create_index(schema) dt1 = datetime(1950, 1, 1) dt2 = datetime(1960, 1, 1) with ix.writer() as w: # Make 39 documents with dates != dt1 and then make a last document # with feed == dt1. for i in xrange(40): w.add_document(id=u(str(i)), date=(dt2 if i >= 1 else dt1)) with ix.searcher() as s: qp = qparser.QueryParser("id", schema) # Find documents where date != dt1 q = qp.parse("NOT (date:(19500101000000))") r = s.search(q, limit=None) assert len(r) == 39 # Total number of matched documents assert r.scored_length() == 39 # Number of docs in the results def test_arrayunion(): l1 = matching.ListMatcher([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) l2 = matching.ListMatcher([100, 200, 300, 400, 500, 600]) aum = matching.ArrayUnionMatcher([l1, l2], 600, partsize=5) assert aum.id() == 10 aum.skip_to(45) assert aum.id() == 50 aum.skip_to(550) assert aum.id() == 600 def test_arrayunion2(): l1 = matching.ListMatcher([1, 2]) l2 = matching.ListMatcher([1, 2, 10, 20]) l3 = matching.ListMatcher([1, 5, 10, 50]) aum = matching.ArrayUnionMatcher([l1, l2, l3], 51, partsize=2) assert aum.id() == 1 assert not l1.is_active() aum.skip_to(50) assert aum.id() == 50 Whoosh-2.5.7/tests/test_misc.py0000644000076500000240000001132312254366350016600 0ustar mattstaff00000000000000from __future__ import with_statement import os, threading, time from whoosh.compat import u from whoosh.util.filelock import try_for from whoosh.util.numeric import length_to_byte, byte_to_length from whoosh.util.testing import TempStorage def test_now(): from whoosh.util import now t1 = now() t2 = now() assert t1 <= t2 def test_storage_creation(): import tempfile, uuid from whoosh import fields from whoosh.filedb.filestore import FileStorage schema = fields.Schema(text=fields.TEXT) uid = uuid.uuid4() dirpath = os.path.join(tempfile.gettempdir(), str(uid)) assert not os.path.exists(dirpath) st = FileStorage(dirpath) st.create() assert os.path.exists(dirpath) ix = st.create_index(schema) with ix.writer() as w: w.add_document(text=u("alfa bravo")) w.add_document(text=u("bracho charlie")) st.destroy() assert not os.path.exists(dirpath) def test_ramstorage(): from whoosh.filedb.filestore import RamStorage st = RamStorage() lock = st.lock("test") lock.acquire() lock.release() def test_filelock_simple(): with TempStorage("simplefilelock") as st: lock1 = st.lock("testlock") lock2 = st.lock("testlock") assert lock1 is not lock2 assert lock1.acquire() assert st.file_exists("testlock") assert not lock2.acquire() lock1.release() assert lock2.acquire() assert not lock1.acquire() lock2.release() def test_threaded_filelock(): with TempStorage("threadedfilelock") as st: lock1 = st.lock("testlock") result = [] # The thread function tries to acquire the lock and then quits def fn(): lock2 = st.lock("testlock") gotit = try_for(lock2.acquire, 1.0, 0.1) if gotit: result.append(True) lock2.release() t = threading.Thread(target=fn) # Acquire the lock in this thread lock1.acquire() # Start the other thread trying to acquire the lock t.start() # Wait for a bit time.sleep(0.15) # Release the lock lock1.release() # Wait for the other thread to finish t.join() # If the other thread got the lock, it should have appended True to the # "results" list. 
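        # Editor's sketch (not used here): the guarded-lock pattern this test
        # exercises -- acquire with a timeout via try_for(), do the work, and
        # always release in a finally block.
        def _with_storage_lock(st, name, work, timeout=1.0, delay=0.1):
            lock = st.lock(name)
            if not try_for(lock.acquire, timeout, delay):
                return None
            try:
                return work()
            finally:
                lock.release()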
assert result == [True] def test_length_byte(): source = list(range(11)) xform = [length_to_byte(n) for n in source] result = [byte_to_length(n) for n in xform] assert source == result def test_clockface_lru(): from whoosh.util.cache import clockface_lru_cache @clockface_lru_cache(5) def test(n): return n * 2 result = [test(n) for n in (1, 2, 3, 4, 5, 4, 3, 2, 10, 1)] assert result == [2, 4, 6, 8, 10, 8, 6, 4, 20, 2] assert test.cache_info() == (3, 7, 5, 5) test.cache_clear() assert test.cache_info() == (0, 0, 5, 0) def test_double_barrel_lru(): from whoosh.util.cache import lru_cache @lru_cache(5) def test(n): return n * 2 result = [test(n) for n in (1, 2, 3, 4, 5, 4, 3, 2, 10, 1)] assert result == [2, 4, 6, 8, 10, 8, 6, 4, 20, 2] # # hits, misses, maxsize and currsize # assert test.cache_info() == (4, 6, 5, 5) test.cache_clear() # assert test.cache_info() == (0, 0, 5, 0) def test_version_object(): from whoosh.util.versions import SimpleVersion as sv assert sv.parse("1") == sv(1) assert sv.parse("1.2") == sv(1, 2) assert sv.parse("1.2b") == sv(1, 2, ex="b") assert sv.parse("1.2rc") == sv(1, 2, ex="rc") assert sv.parse("1.2b3") == sv(1, 2, ex="b", exnum=3) assert sv.parse("1.2.3") == sv(1, 2, 3) assert sv.parse("1.2.3a") == sv(1, 2, 3, "a") assert sv.parse("1.2.3rc") == sv(1, 2, 3, "rc") assert sv.parse("1.2.3a4") == sv(1, 2, 3, "a", 4) assert sv.parse("1.2.3rc2") == sv(1, 2, 3, "rc", 2) assert sv.parse("999.999.999c999") == sv(999, 999, 999, "c", 999) assert sv.parse("1.2") == sv.parse("1.2") assert sv("1.2") != sv("1.3") assert sv.parse("1.0") < sv.parse("1.1") assert sv.parse("1.0") < sv.parse("2.0") assert sv.parse("1.2.3a4") < sv.parse("1.2.3a5") assert sv.parse("1.2.3a5") > sv.parse("1.2.3a4") assert sv.parse("1.2.3c99") < sv.parse("1.2.4") assert sv.parse("1.2.3a4") != sv.parse("1.2.3a5") assert sv.parse("1.2.3a5") != sv.parse("1.2.3a4") assert sv.parse("1.2.3c99") != sv.parse("1.2.4") assert sv.parse("1.2.3a4") <= sv.parse("1.2.3a5") assert sv.parse("1.2.3a5") >= sv.parse("1.2.3a4") assert sv.parse("1.2.3c99") <= sv.parse("1.2.4") assert sv.parse("1.2") <= sv.parse("1.2") assert sv(1, 2, 3).to_int() == 17213488128 assert sv.from_int(17213488128) == sv(1, 2, 3) Whoosh-2.5.7/tests/test_mpwriter.py0000644000076500000240000002071112254366764017530 0ustar mattstaff00000000000000from __future__ import with_statement import random from collections import deque import pytest from whoosh import fields, query from whoosh.compat import u, izip, xrange, permutations from whoosh.util.numeric import length_to_byte, byte_to_length from whoosh.util.testing import TempIndex def check_multi(): try: import multiprocessing import multiprocessing.synchronize # @UnusedImport except ImportError: pytest.skip() else: try: from multiprocessing import Queue Queue() except OSError: pytest.skip() else: return False def _byten(n): return byte_to_length(length_to_byte(n)) def _do_basic(writerclass): # Create the domain data # List of individual words added to the index words = [] # List of string values added to the index docs = [] # A ring buffer for creating string values buf = deque() for ls in permutations(u("abcd")): word = "".join(ls) # Remember this word is in the index (to check lexicon) words.append(word) # Add this word on to the end, pop the first word off to create N word # documents where N <= 10 buf.append(word) if len(buf) > 10: buf.popleft() # Create a copy of the buffer and shuffle it to create a document value # and add it to the list of document values doc = list(buf) random.shuffle(doc) 
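        # (The deque acts as a sliding window of at most 10 recent words;
        # shuffling the copied window randomizes word order in each document.)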
docs.append(" ".join(doc)) # Shuffle the list of document values random.shuffle(docs) schema = fields.Schema(text=fields.TEXT(stored=True, spelling=True, vector=True), row=fields.NUMERIC(stored=True)) with TempIndex(schema, storage_debug=True) as ix: # Add the domain data to the index with writerclass(ix, procs=3) as w: for i, value in enumerate(docs): w.add_document(text=value, row=i) with ix.searcher() as s: r = s.reader() # Check the lexicon for word, term in izip(words, r.field_terms("text")): assert word == term # Check the doc count assert r.doc_count_all() == len(docs) # Check the word graph assert r.has_word_graph("text") flat = [w.decode("latin1") for w in r.word_graph("text").flatten()] assert flat == words # Check there are lengths total = sum(r.doc_field_length(docnum, "text", 0) for docnum in xrange(r.doc_count_all())) assert total > 0 # Check per-doc info for i, value in enumerate(docs): pieces = value.split() docnum = s.document_number(row=i) # Check stored value sv = r.stored_fields(docnum) assert sv["text"] == value # Check vectors vr = r.vector(docnum, "text") # Get the terms and positions from the vector matcher iv = list(vr.items_as("positions")) # What the vector should look like ov = sorted((text, [i]) for i, text in enumerate(pieces)) assert iv == ov # Check field length assert r.doc_field_length(docnum, "text") == len(pieces) def test_basic_serial(): check_multi() from whoosh.multiproc import SerialMpWriter _do_basic(SerialMpWriter) def test_basic_multi(): check_multi() from whoosh.multiproc import MpWriter _do_basic(MpWriter) def test_no_add(): check_multi() from whoosh.multiproc import MpWriter schema = fields.Schema(text=fields.TEXT(stored=True, spelling=True, vector=True)) with TempIndex(schema) as ix: with ix.writer(procs=3) as w: assert type(w) == MpWriter def _do_merge(writerclass): schema = fields.Schema(key=fields.ID(stored=True, unique=True), value=fields.TEXT(stored=True, spelling=True, vector=True)) domain = {"a": "aa", "b": "bb cc", "c": "cc dd ee", "d": "dd ee ff gg", "e": "ee ff gg hh ii", "f": "ff gg hh ii jj kk", "g": "gg hh ii jj kk ll mm", "h": "hh ii jj kk ll mm nn oo", "i": "ii jj kk ll mm nn oo pp qq ww ww ww ww ww ww", "j": "jj kk ll mm nn oo pp qq rr ss", "k": "kk ll mm nn oo pp qq rr ss tt uu"} with TempIndex(schema) as ix: w = ix.writer() for key in "abc": w.add_document(key=u(key), value=u(domain[key])) w.commit() w = ix.writer() for key in "def": w.add_document(key=u(key), value=u(domain[key])) w.commit(merge=False) w = writerclass(ix, procs=3) del domain["b"] w.delete_by_term("key", u("b")) domain["e"] = "xx yy zz" w.update_document(key=u("e"), value=u(domain["e"])) for key in "ghijk": w.add_document(key=u(key), value=u(domain[key])) w.commit(optimize=True) assert len(ix._segments()) == 1 with ix.searcher() as s: r = s.reader() assert s.doc_count() == len(domain) assert "".join(r.field_terms("key")) == "acdefghijk" assert " ".join(r.field_terms("value")) == "aa cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu ww xx yy zz" for key in domain: docnum = s.document_number(key=key) assert docnum is not None length = r.doc_field_length(docnum, "value") assert length assert _byten(len(domain[key].split())) == length sf = r.stored_fields(docnum) assert domain[key] == sf["value"] words = sorted(set((" ".join(domain.values())).split())) assert words == list(r.field_terms("value")) for word in words: hits = s.search(query.Term("value", word)) for hit in hits: assert word in hit["value"].split() def test_merge_serial(): check_multi() from 
whoosh.multiproc import SerialMpWriter _do_merge(SerialMpWriter) def test_merge_multi(): check_multi() from whoosh.multiproc import MpWriter _do_merge(MpWriter) def test_no_score_no_store(): check_multi() from whoosh.multiproc import MpWriter schema = fields.Schema(a=fields.ID, b=fields.KEYWORD) domain = {} keys = list(u("abcdefghijklmnopqrstuvwx")) random.shuffle(keys) words = u("alfa bravo charlie delta").split() for i, key in enumerate(keys): domain[key] = words[i % len(words)] with TempIndex(schema) as ix: with MpWriter(ix, procs=3) as w: for key, value in domain.items(): w.add_document(a=key, b=value) with ix.searcher() as s: for word in words: r = s.search(query.Term("b", word)) assert len(r) == 6 def test_multisegment(): check_multi() from whoosh.multiproc import MpWriter schema = fields.Schema(a=fields.TEXT(stored=True, spelling=True, vector=True)) words = u("alfa bravo charlie delta echo").split() with TempIndex(schema) as ix: with ix.writer(procs=3, multisegment=True, batchsize=10) as w: assert w.__class__ == MpWriter assert w.multisegment for ls in permutations(words, 3): w.add_document(a=u(" ").join(ls)) assert len(ix._segments()) == 3 with ix.searcher() as s: for word in words: r = s.search(query.Term("a", word)) for hit in r: assert word in hit["a"].split() def test_batchsize_eq_doccount(): check_multi() schema = fields.Schema(a=fields.KEYWORD(stored=True)) with TempIndex(schema) as ix: with ix.writer(procs=4, batchsize=10) as w: for i in xrange(10): w.add_document(a=u(str(i))) def test_finish_segment(): check_multi() from whoosh.multiproc import MpWriter schema = fields.Schema(a=fields.KEYWORD(stored=True)) with TempIndex(schema) as ix: w = MpWriter(ix, procs=2, batchsize=1, multisegment=False, limitmb=0.00001) for i in range(9): w.add_document(a=u(chr(65 + i) * 50)) w.commit() Whoosh-2.5.7/tests/test_nested.py0000644000076500000240000003260512254366350017135 0ustar mattstaff00000000000000from __future__ import with_statement from whoosh import fields, query, sorting from whoosh.compat import u from whoosh.filedb.filestore import RamStorage def test_nested_parent(): schema = fields.Schema(name=fields.ID(stored=True), type=fields.ID, part=fields.ID, price=fields.NUMERIC) ix = RamStorage().create_index(schema) with ix.writer() as w: with w.group(): w.add_document(name=u("iPad"), type=u("product")) w.add_document(part=u("screen"), price=100) w.add_document(part=u("battery"), price=50) w.add_document(part=u("case"), price=20) with w.group(): w.add_document(name=u("iPhone"), type=u("product")) w.add_document(part=u("screen"), price=60) w.add_document(part=u("battery"), price=30) w.add_document(part=u("case"), price=10) with w.group(): w.add_document(name=u("Mac mini"), type=u("product")) w.add_document(part=u("hard drive"), price=50) w.add_document(part=u("case"), price=50) with ix.searcher() as s: price = s.schema["price"] pq = query.Term("type", "product") cq = query.Term("price", 50) q = query.NestedParent(pq, cq) r = s.search(q) assert sorted([hit["name"] for hit in r]) == ["Mac mini", "iPad"] def test_scoring(): schema = fields.Schema(kind=fields.ID, name=fields.KEYWORD(scorable=True, stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: with w.group(): w.add_document(kind=u("class"), name=u("Index")) w.add_document(kind=u("method"), name=u("add document")) w.add_document(kind=u("method"), name=u("add reader")) w.add_document(kind=u("method"), name=u("close")) with w.group(): w.add_document(kind=u("class"), name=u("Accumulator")) 
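            # (Each w.group() block indexes one parent document immediately
            # followed by its child documents; query.NestedParent later maps
            # matching children back to their preceding parent.)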
w.add_document(kind=u("method"), name=u("add")) w.add_document(kind=u("method"), name=u("get result")) with w.group(): w.add_document(kind=u("class"), name=u("Calculator")) w.add_document(kind=u("method"), name=u("add")) w.add_document(kind=u("method"), name=u("add all")) w.add_document(kind=u("method"), name=u("add some")) w.add_document(kind=u("method"), name=u("multiply")) w.add_document(kind=u("method"), name=u("close")) with ix.searcher() as s: q = query.NestedParent(query.Term("kind", "class"), query.Term("name", "add")) r = s.search(q) assert [hit["name"] for hit in r] == ["Calculator", "Index", "Accumulator"] def test_missing(): schema = fields.Schema(kind=fields.ID, name=fields.KEYWORD(scorable=True, stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: with w.group(): w.add_document(kind=u("class"), name=u("Index")) w.add_document(kind=u("method"), name=u("add document")) w.add_document(kind=u("method"), name=u("add reader")) w.add_document(kind=u("method"), name=u("close")) with w.group(): w.add_document(kind=u("class"), name=u("Accumulator")) w.add_document(kind=u("method"), name=u("add")) w.add_document(kind=u("method"), name=u("get result")) with w.group(): w.add_document(kind=u("class"), name=u("Calculator")) w.add_document(kind=u("method"), name=u("add")) w.add_document(kind=u("method"), name=u("add all")) w.add_document(kind=u("method"), name=u("add some")) w.add_document(kind=u("method"), name=u("multiply")) w.add_document(kind=u("method"), name=u("close")) with w.group(): w.add_document(kind=u("class"), name=u("Deleter")) w.add_document(kind=u("method"), name=u("add")) w.add_document(kind=u("method"), name=u("delete")) with ix.searcher() as s: q = query.NestedParent(query.Term("kind", "class"), query.Term("name", "add")) r = s.search(q) assert [hit["name"] for hit in r] == ["Calculator", "Index", "Accumulator", "Deleter"] with ix.writer() as w: w.delete_by_term("name", "Accumulator") w.delete_by_term("name", "Calculator") with ix.searcher() as s: pq = query.Term("kind", "class") assert len(list(pq.docs(s))) == 2 q = query.NestedParent(pq, query.Term("name", "add")) r = s.search(q) assert [hit["name"] for hit in r] == ["Index", "Deleter"] def test_nested_delete(): schema = fields.Schema(kind=fields.ID, name=fields.KEYWORD(scorable=True, stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: with w.group(): w.add_document(kind=u("class"), name=u("Index")) w.add_document(kind=u("method"), name=u("add document")) w.add_document(kind=u("method"), name=u("add reader")) w.add_document(kind=u("method"), name=u("close")) with w.group(): w.add_document(kind=u("class"), name=u("Accumulator")) w.add_document(kind=u("method"), name=u("add")) w.add_document(kind=u("method"), name=u("get result")) with w.group(): w.add_document(kind=u("class"), name=u("Calculator")) w.add_document(kind=u("method"), name=u("add")) w.add_document(kind=u("method"), name=u("add all")) w.add_document(kind=u("method"), name=u("add some")) w.add_document(kind=u("method"), name=u("multiply")) w.add_document(kind=u("method"), name=u("close")) with w.group(): w.add_document(kind=u("class"), name=u("Deleter")) w.add_document(kind=u("method"), name=u("add")) w.add_document(kind=u("method"), name=u("delete")) # Delete "Accumulator" class with ix.writer() as w: q = query.NestedParent(query.Term("kind", "class"), query.Term("name", "Accumulator")) w.delete_by_query(q) # Check that Accumulator AND ITS METHODS are deleted with ix.searcher() as s: r = 
s.search(query.Term("kind", "class")) assert sorted(hit["name"] for hit in r) == ["Calculator", "Deleter", "Index"] names = [fs["name"] for _, fs in s.iter_docs()] assert names == ["Index", "add document", "add reader", "close", "Calculator", "add", "add all", "add some", "multiply", "close", "Deleter", "add", "delete"] # Delete any class with a close method with ix.writer() as w: q = query.NestedParent(query.Term("kind", "class"), query.Term("name", "close")) w.delete_by_query(q) # Check the CLASSES AND METHODS are gone with ix.searcher() as s: names = [fs["name"] for _, fs in s.iter_docs()] assert names == ["Deleter", "add", "delete"] def test_all_parents_deleted(): schema = fields.Schema(kind=fields.ID, name=fields.KEYWORD(scorable=True, stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: with w.group(): w.add_document(kind=u("class"), name=u("Index")) w.add_document(kind=u("method"), name=u("add document")) w.add_document(kind=u("method"), name=u("add reader")) w.add_document(kind=u("method"), name=u("close")) with w.group(): w.add_document(kind=u("class"), name=u("Accumulator")) w.add_document(kind=u("method"), name=u("add")) w.add_document(kind=u("method"), name=u("get result")) with w.group(): w.add_document(kind=u("class"), name=u("Calculator")) w.add_document(kind=u("method"), name=u("add")) w.add_document(kind=u("method"), name=u("add all")) w.add_document(kind=u("method"), name=u("add some")) w.add_document(kind=u("method"), name=u("multiply")) w.add_document(kind=u("method"), name=u("close")) with w.group(): w.add_document(kind=u("class"), name=u("Deleter")) w.add_document(kind=u("method"), name=u("add")) w.add_document(kind=u("method"), name=u("delete")) with ix.writer() as w: w.delete_by_term("name", "Index") w.delete_by_term("name", "Accumulator") w.delete_by_term("name", "Calculator") w.delete_by_term("name", "Deleter") with ix.searcher() as s: q = query.NestedParent(query.Term("kind", "class"), query.Term("name", "add")) r = s.search(q) assert r.is_empty() def test_everything_is_a_parent(): schema = fields.Schema(id=fields.STORED, kind=fields.ID, name=fields.ID(stored=True)) k = u("alfa") ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(id=0, kind=k, name=u("one")) w.add_document(id=1, kind=k, name=u("two")) w.add_document(id=2, kind=k, name=u("three")) w.add_document(id=3, kind=k, name=u("four")) w.add_document(id=4, kind=k, name=u("one")) w.add_document(id=5, kind=k, name=u("two")) w.add_document(id=6, kind=k, name=u("three")) w.add_document(id=7, kind=k, name=u("four")) w.add_document(id=8, kind=k, name=u("one")) w.add_document(id=9, kind=k, name=u("two")) w.add_document(id=10, kind=k, name=u("three")) w.add_document(id=11, kind=k, name=u("four")) with ix.searcher() as s: pq = query.Term("kind", k) cq = query.Or([query.Term("name", "two"), query.Term("name", "four")]) q = query.NestedParent(pq, cq) r = s.search(q) assert [hit["id"] for hit in r] == [1, 3, 5, 7, 9, 11] def test_no_parents(): schema = fields.Schema(id=fields.STORED, kind=fields.ID, name=fields.ID(stored=True)) k = u("alfa") ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(id=0, kind=k, name=u("one")) w.add_document(id=1, kind=k, name=u("two")) w.add_document(id=2, kind=k, name=u("three")) w.add_document(id=3, kind=k, name=u("four")) w.add_document(id=4, kind=k, name=u("one")) w.add_document(id=5, kind=k, name=u("two")) w.add_document(id=6, kind=k, name=u("three")) w.add_document(id=7, kind=k, name=u("four")) 
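        # (Every document here has kind "alfa"; the search below uses a parent
        # query on kind "bravo", so NestedParent finds no parents to attach the
        # matching children to and the result set must be empty.)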
w.add_document(id=8, kind=k, name=u("one")) w.add_document(id=9, kind=k, name=u("two")) w.add_document(id=10, kind=k, name=u("three")) w.add_document(id=11, kind=k, name=u("four")) with ix.searcher() as s: pq = query.Term("kind", "bravo") cq = query.Or([query.Term("name", "two"), query.Term("name", "four")]) q = query.NestedParent(pq, cq) r = s.search(q) assert r.is_empty() def test_nested_children(): schema = fields.Schema(t=fields.ID(stored=True), track=fields.NUMERIC(stored=True), album_name=fields.TEXT(stored=True), song_name=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: with w.group(): w.add_document(t=u("album"), album_name=u("alfa bravo charlie")) w.add_document(t=u("track"), track=1, song_name=u("delta echo foxtrot")) w.add_document(t=u("track"), track=2, song_name=u("golf hotel india")) w.add_document(t=u("track"), track=3, song_name=u("juliet kilo lima")) with w.group(): w.add_document(t=u("album"), album_name=u("mike november oskar")) w.add_document(t=u("track"), track=1, song_name=u("papa quebec romeo")) w.add_document(t=u("track"), track=2, song_name=u("sierra tango ultra")) w.add_document(t=u("track"), track=3, song_name=u("victor whiskey xray")) with w.group(): w.add_document(t=u("album"), album_name=u("yankee zulu one")) w.add_document(t=u("track"), track=1, song_name=u("two three four")) w.add_document(t=u("track"), track=2, song_name=u("five six seven")) w.add_document(t=u("track"), track=3, song_name=u("eight nine ten")) with ix.searcher() as s: pq = query.Term("t", "album") aq = query.Term("album_name", "november") r = s.search(query.NestedChildren(pq, pq), limit=None) assert len(r) == 9 assert [str(hit["t"]) for hit in r] == ["track"] * 9 ncq = query.NestedChildren(pq, aq) assert list(ncq.docs(s)) == [5, 6, 7] r = s.search(ncq, limit=None) assert len(r) == 3 assert [str(hit["song_name"]) for hit in r] == ["papa quebec romeo", "sierra tango ultra", "victor whiskey xray"] zq = query.NestedChildren(pq, query.Term("album_name", "zulu")) f = sorting.StoredFieldFacet("song_name") r = s.search(zq, sortedby=f) assert [hit["track"] for hit in r] == [3, 2, 1] Whoosh-2.5.7/tests/test_parse_plugins.py0000644000076500000240000005265212254366350020532 0ustar mattstaff00000000000000from __future__ import with_statement import inspect from datetime import datetime from whoosh import analysis, fields, formats, qparser, query from whoosh.compat import u, text_type, xrange from whoosh.filedb.filestore import RamStorage from whoosh.qparser import dateparse, default, plugins, syntax from whoosh.util.times import adatetime def _plugin_classes(ignore): # Get all the subclasses of Plugin in whoosh.qparser.plugins return [c for _, c in inspect.getmembers(plugins, inspect.isclass) if plugins.Plugin in c.__bases__ and c not in ignore] def test_combos(): qs = ('w:a "hi there"^4.2 AND x:b^2.3 OR c AND (y:d OR e) ' + '(apple ANDNOT bear)^2.3') init_args = {plugins.MultifieldPlugin: (["content", "title"], {"content": 1.0, "title": 1.2}), plugins.FieldAliasPlugin: ({"content": ("text", "body")},), plugins.CopyFieldPlugin: ({"name": "phone"},), plugins.PseudoFieldPlugin: ({"name": lambda x: x}), } pis = _plugin_classes(()) for i, plugin in enumerate(pis): try: pis[i] = plugin(*init_args.get(plugin, ())) except TypeError: raise TypeError("Error instantiating %s" % plugin) count = 0 for i, first in enumerate(pis): for j in xrange(len(pis)): if i == j: continue plist = [p for p in pis[:j] if p is not first] + [first] qp = qparser.QueryParser("text", None, 
plugins=plist) qp.parse(qs) count += 1 def test_field_alias(): qp = qparser.QueryParser("content", None) qp.add_plugin(plugins.FieldAliasPlugin({"title": ("article", "caption")})) q = qp.parse("alfa title:bravo article:charlie caption:delta") assert text_type(q) == u("(content:alfa AND title:bravo AND title:charlie AND title:delta)") def test_dateparser(): schema = fields.Schema(text=fields.TEXT, date=fields.DATETIME) qp = default.QueryParser("text", schema) errs = [] def cb(arg): errs.append(arg) basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) qp.add_plugin(dateparse.DateParserPlugin(basedate, callback=cb)) q = qp.parse(u("hello date:'last tuesday'")) assert q.__class__ == query.And assert q[1].__class__ == query.DateRange assert q[1].startdate == adatetime(2010, 9, 14).floor() assert q[1].enddate == adatetime(2010, 9, 14).ceil() q = qp.parse(u("date:'3am to 5pm'")) assert q.__class__ == query.DateRange assert q.startdate == adatetime(2010, 9, 20, 3).floor() assert q.enddate == adatetime(2010, 9, 20, 17).ceil() q = qp.parse(u("date:blah")) assert q == query.NullQuery assert errs[0] == "blah" q = qp.parse(u("hello date:blarg")) assert q.__unicode__() == "(text:hello AND <_NullQuery>)" assert q[1].error == "blarg" assert errs[1] == "blarg" q = qp.parse(u("hello date:20055x10")) assert q.__unicode__() == "(text:hello AND <_NullQuery>)" assert q[1].error == "20055x10" assert errs[2] == "20055x10" q = qp.parse(u("hello date:'2005 19 32'")) assert q.__unicode__() == "(text:hello AND <_NullQuery>)" assert q[1].error == "2005 19 32" assert errs[3] == "2005 19 32" q = qp.parse(u("date:'march 24 to dec 12'")) assert q.__class__ == query.DateRange assert q.startdate == adatetime(2010, 3, 24).floor() assert q.enddate == adatetime(2010, 12, 12).ceil() q = qp.parse(u("date:('30 june' OR '10 july') quick")) assert q.__class__ == query.And assert len(q) == 2 assert q[0].__class__ == query.Or assert q[0][0].__class__ == query.DateRange assert q[0][1].__class__ == query.DateRange def test_date_range(): schema = fields.Schema(text=fields.TEXT, date=fields.DATETIME) qp = qparser.QueryParser("text", schema) basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) qp.add_plugin(dateparse.DateParserPlugin(basedate)) q = qp.parse(u("date:['30 march' to 'next wednesday']")) assert q.__class__ == query.DateRange assert q.startdate == adatetime(2010, 3, 30).floor() assert q.enddate == adatetime(2010, 9, 22).ceil() q = qp.parse(u("date:[to 'next wednesday']")) assert q.__class__ == query.DateRange assert q.startdate is None assert q.enddate == adatetime(2010, 9, 22).ceil() q = qp.parse(u("date:['30 march' to]")) assert q.__class__ == query.DateRange assert q.startdate == adatetime(2010, 3, 30).floor() assert q.enddate is None q = qp.parse(u("date:[30 march to next wednesday]")) assert q.__class__ == query.DateRange assert q.startdate == adatetime(2010, 3, 30).floor() assert q.enddate == adatetime(2010, 9, 22).ceil() q = qp.parse(u("date:[to next wednesday]")) assert q.__class__ == query.DateRange assert q.startdate is None assert q.enddate == adatetime(2010, 9, 22).ceil() q = qp.parse(u("date:[30 march to]")) assert q.__class__ == query.DateRange assert q.startdate == adatetime(2010, 3, 30).floor() assert q.enddate is None def test_daterange_multi(): schema = fields.Schema(text=fields.TEXT, start=fields.DATETIME, end=fields.DATETIME) qp = qparser.QueryParser("text", schema) basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) qp.add_plugin(dateparse.DateParserPlugin(basedate)) q = qp.parse("start:[2008 to] AND 
end:[2011 to 2011]") assert q.__class__ == query.And assert q[0].__class__ == query.DateRange assert q[1].__class__ == query.DateRange assert q[0].startdate == adatetime(2008).floor() assert q[0].enddate is None assert q[1].startdate == adatetime(2011).floor() assert q[1].enddate == adatetime(2011).ceil() def test_daterange_empty_field(): schema = fields.Schema(test=fields.DATETIME) ix = RamStorage().create_index(schema) writer = ix.writer() writer.add_document(test=None) writer.commit() with ix.searcher() as s: q = query.DateRange("test", datetime.fromtimestamp(0), datetime.today()) r = s.search(q) assert len(r) == 0 def test_free_dates(): a = analysis.StandardAnalyzer(stoplist=None) schema = fields.Schema(text=fields.TEXT(analyzer=a), date=fields.DATETIME) qp = qparser.QueryParser("text", schema) basedate = datetime(2010, 9, 20, 15, 16, 6, 454000) qp.add_plugin(dateparse.DateParserPlugin(basedate, free=True)) q = qp.parse(u("hello date:last tuesday")) assert q.__class__ == query.And assert len(q) == 2 assert q[0].__class__ == query.Term assert q[0].text == "hello" assert q[1].__class__ == query.DateRange assert q[1].startdate == adatetime(2010, 9, 14).floor() assert q[1].enddate == adatetime(2010, 9, 14).ceil() q = qp.parse(u("date:mar 29 1972 hello")) assert q.__class__ == query.And assert len(q) == 2 assert q[0].__class__ == query.DateRange assert q[0].startdate == adatetime(1972, 3, 29).floor() assert q[0].enddate == adatetime(1972, 3, 29).ceil() assert q[1].__class__ == query.Term assert q[1].text == "hello" q = qp.parse(u("date:2005 march 2")) assert q.__class__ == query.DateRange assert q.startdate == adatetime(2005, 3, 2).floor() assert q.enddate == adatetime(2005, 3, 2).ceil() q = qp.parse(u("date:'2005' march 2")) assert q.__class__ == query.And assert len(q) == 3 assert q[0].__class__ == query.DateRange assert q[0].startdate == adatetime(2005).floor() assert q[0].enddate == adatetime(2005).ceil() assert q[1].__class__ == query.Term assert q[1].fieldname == "text" assert q[1].text == "march" q = qp.parse(u("date:march 24 to dec 12")) assert q.__class__ == query.DateRange assert q.startdate == adatetime(2010, 3, 24).floor() assert q.enddate == adatetime(2010, 12, 12).ceil() q = qp.parse(u("date:5:10pm")) assert q.__class__ == query.DateRange assert q.startdate == adatetime(2010, 9, 20, 17, 10).floor() assert q.enddate == adatetime(2010, 9, 20, 17, 10).ceil() q = qp.parse(u("(date:30 june OR date:10 july) quick")) assert q.__class__ == query.And assert len(q) == 2 assert q[0].__class__ == query.Or assert q[0][0].__class__ == query.DateRange assert q[0][1].__class__ == query.DateRange def test_prefix_plugin(): schema = fields.Schema(id=fields.ID, text=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=u("1"), text=u("alfa")) w.add_document(id=u("2"), text=u("bravo")) w.add_document(id=u("3"), text=u("buono")) w.commit() with ix.searcher() as s: qp = qparser.QueryParser("text", schema) qp.remove_plugin_class(plugins.WildcardPlugin) qp.add_plugin(plugins.PrefixPlugin) q = qp.parse(u("b*")) r = s.search(q, limit=None) assert len(r) == 2 q = qp.parse(u("br*")) r = s.search(q, limit=None) assert len(r) == 1 def test_custom_tokens(): qp = qparser.QueryParser("text", None) qp.remove_plugin_class(plugins.OperatorsPlugin) cp = plugins.OperatorsPlugin(And="&", Or="\\|", AndNot="&!", AndMaybe="&~", Not="-") qp.add_plugin(cp) q = qp.parse("this | that") assert q.__class__ == query.Or assert q[0].__class__ == query.Term assert q[0].text == "this" assert 
q[1].__class__ == query.Term assert q[1].text == "that" q = qp.parse("this&!that") assert q.__class__ == query.AndNot assert q.a.__class__ == query.Term assert q.a.text == "this" assert q.b.__class__ == query.Term assert q.b.text == "that" q = qp.parse("alfa -bravo NOT charlie") assert len(q) == 4 assert q[1].__class__ == query.Not assert q[1].query.text == "bravo" assert q[2].text == "NOT" def test_copyfield(): qp = qparser.QueryParser("a", None) qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, None)) assert text_type(qp.parse("hello b:matt")) == "(a:hello AND b:matt AND c:matt)" qp = qparser.QueryParser("a", None) qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, syntax.AndMaybeGroup)) assert text_type(qp.parse("hello b:matt")) == "(a:hello AND (b:matt ANDMAYBE c:matt))" qp = qparser.QueryParser("a", None) qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, syntax.RequireGroup)) assert text_type(qp.parse("hello (there OR b:matt)")) == "(a:hello AND (a:there OR (b:matt REQUIRE c:matt)))" qp = qparser.QueryParser("a", None) qp.add_plugin(plugins.CopyFieldPlugin({"a": "c"}, syntax.OrGroup)) assert text_type(qp.parse("hello there")) == "((a:hello OR c:hello) AND (a:there OR c:there))" qp = qparser.QueryParser("a", None) qp.add_plugin(plugins.CopyFieldPlugin({"b": "c"}, mirror=True)) assert text_type(qp.parse("hello c:matt")) == "(a:hello AND (c:matt OR b:matt))" qp = qparser.QueryParser("a", None) qp.add_plugin(plugins.CopyFieldPlugin({"c": "a"}, mirror=True)) assert text_type(qp.parse("hello c:matt")) == "((a:hello OR c:hello) AND (c:matt OR a:matt))" ana = analysis.RegexAnalyzer(r"\w+") | analysis.DoubleMetaphoneFilter() fmt = formats.Frequency() schema = fields.Schema(name=fields.KEYWORD, name_phone=fields.FieldType(fmt, ana, multitoken_query="or")) qp = qparser.QueryParser("name", schema) qp.add_plugin(plugins.CopyFieldPlugin({"name": "name_phone"})) assert text_type(qp.parse(u("spruce view"))) == "((name:spruce OR name_phone:SPRS) AND (name:view OR name_phone:F OR name_phone:FF))" def test_gtlt(): schema = fields.Schema(a=fields.KEYWORD, b=fields.NUMERIC, c=fields.KEYWORD, d=fields.NUMERIC(float), e=fields.DATETIME) qp = qparser.QueryParser("a", schema) qp.add_plugin(plugins.GtLtPlugin()) qp.add_plugin(dateparse.DateParserPlugin()) q = qp.parse(u("a:hello b:>100 c:<=z there")) assert q.__class__ == query.And assert len(q) == 4 assert q[0] == query.Term("a", "hello") assert q[1] == query.NumericRange("b", 100, None, startexcl=True) assert q[2] == query.TermRange("c", None, 'z') assert q[3] == query.Term("a", "there") q = qp.parse(u("hello e:>'29 mar 2001' there")) assert q.__class__ == query.And assert len(q) == 3 assert q[0] == query.Term("a", "hello") # As of this writing, date ranges don't support startexcl/endexcl assert q[1] == query.DateRange("e", datetime(2001, 3, 29, 0, 0), None) assert q[2] == query.Term("a", "there") q = qp.parse(u("a:> alfa c:<= bravo")) assert text_type(q) == "(a:a: AND a:alfa AND a:c: AND a:bravo)" qp.remove_plugin_class(plugins.FieldsPlugin) qp.remove_plugin_class(plugins.RangePlugin) q = qp.parse(u("hello a:>500 there")) assert text_type(q) == "(a:hello AND a:a: AND a:500 AND a:there)" def test_regex(): schema = fields.Schema(a=fields.KEYWORD, b=fields.TEXT) qp = qparser.QueryParser("a", schema) qp.add_plugin(plugins.RegexPlugin()) q = qp.parse(u("a:foo-bar b:foo-bar")) assert q.__unicode__() == '(a:foo-bar AND b:foo AND b:bar)' q = qp.parse(u('a:r"foo-bar" b:r"foo-bar"')) assert q.__unicode__() == '(a:r"foo-bar" AND b:r"foo-bar")' def test_pseudofield(): 
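    # (PseudoFieldPlugin maps a made-up field prefix to a callback that
    # rewrites the parsed nodes: below, "regex:" terms become regex queries on
    # the content field, and "reverse:" terms expand to an OR of the word and
    # its reversed form.)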
schema = fields.Schema(a=fields.KEYWORD, b=fields.TEXT) def regex_maker(node): if node.has_text: node = qparser.RegexPlugin.RegexNode(node.text) node.set_fieldname("content") return node qp = qparser.QueryParser("a", schema) qp.add_plugin(qparser.PseudoFieldPlugin({"regex": regex_maker})) q = qp.parse(u("alfa regex:br.vo")) assert q.__unicode__() == '(a:alfa AND content:r"br.vo")' def rev_text(node): if node.has_text: # Create a word node for the reversed text revtext = node.text[::-1] # Reverse the text rnode = qparser.WordNode(revtext) # Duplicate the original node's start and end char rnode.set_range(node.startchar, node.endchar) # Put the original node and the reversed node in an OrGroup group = qparser.OrGroup([node, rnode]) # Need to set the fieldname here because the PseudoFieldPlugin # removes the field name syntax group.set_fieldname("reverse") return group qp = qparser.QueryParser("content", schema) qp.add_plugin(qparser.PseudoFieldPlugin({"reverse": rev_text})) q = qp.parse(u("alfa reverse:bravo")) assert q.__unicode__() == '(content:alfa AND (reverse:bravo OR reverse:ovarb))' def test_fuzzy_plugin(): ana = analysis.StandardAnalyzer("\\S+") schema = fields.Schema(f=fields.TEXT(analyzer=ana)) qp = default.QueryParser("f", schema) qp.add_plugin(plugins.FuzzyTermPlugin()) q = qp.parse("bob~") assert q.__class__ == query.FuzzyTerm assert q.field() == "f" assert q.text == "bob" assert q.maxdist == 1 q = qp.parse("Alfa Bravo~ Charlie") assert q.__class__ == query.And assert q[0].__class__ == query.Term assert q[0].text == "alfa" assert q[1].__class__ == query.FuzzyTerm assert q[1].field() == "f" assert q[1].text == "bravo" assert q[1].maxdist == 1 assert q[2].__class__ == query.Term assert q[2].text == "charlie" q = qp.parse("Alfa Bravo~2 Charlie") assert q.__class__ == query.And assert q[0].__class__ == query.Term assert q[0].text == "alfa" assert q[1].__class__ == query.FuzzyTerm assert q[1].field() == "f" assert q[1].text == "bravo" assert q[1].maxdist == 2 assert q[2].__class__ == query.Term assert q[2].text == "charlie" q = qp.parse("alfa ~2 bravo") assert q.__class__ == query.And assert q[0].__class__ == query.Term assert q[0].text == "alfa" assert q[1].__class__ == query.Term assert q[1].text == "~2" assert q[2].__class__ == query.Term assert q[2].text == "bravo" qp = default.QueryParser("f", None) q = qp.parse("'bob~'") assert q.__class__ == query.Term assert q.field() == "f" assert q.text == "bob~" def test_fuzzy_prefix(): from whoosh import scoring schema = fields.Schema(title=fields.TEXT(stored=True), content=fields.TEXT(spelling=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: # Match -> first w.add_document(title=u("First"), content=u("This is the first document we've added!")) # No match w.add_document(title=u("Second"), content=u("The second one is even more interesting! 
filst")) # Match -> first w.add_document(title=u("Third"), content=u("The world first line we've added!")) # Match -> zeroth w.add_document(title=u("Fourth"), content=u("The second one is alaways comes after zeroth!")) # Match -> fire is within 2 edits (transpose + delete) of first w.add_document(title=u("Fifth"), content=u("The fire is beautiful")) from whoosh.qparser import QueryParser, FuzzyTermPlugin #, BoundedFuzzyTermPlugin parser = QueryParser("content", ix.schema) parser.add_plugin(FuzzyTermPlugin()) q = parser.parse("first~2/3 OR zeroth", debug=False) assert isinstance(q, query.Or) ft = q[0] assert isinstance(ft, query.FuzzyTerm) assert ft.maxdist == 2 assert ft.prefixlength == 3 with ix.searcher(weighting=scoring.TF_IDF()) as searcher: results = searcher.search(q) assert len(results) == 4 assert " ".join(hit["title"] for hit in results) == "Fourth First Third Fifth" def test_function_plugin(): class FakeQuery(query.Query): def __init__(self, children, *args, **kwargs): self.children = children self.args = args self.kwargs = kwargs self.fieldname = None def __hash__(self): return hash(tuple(self.children)) ^ hash(self.args) def __unicode__(self): qs = "|".join(str(q) for q in self.children) args = ",".join(self.args) kwargs = ",".join(sorted("%s:%s" % item for item in self.kwargs.items())) return u("<%s %s %s>") % (qs, args, kwargs) __str__ = __unicode__ def fuzzy(qs, prefix=0, maxdist=2): prefix = int(prefix) maxdist = int(maxdist) return query.FuzzyTerm(qs[0].fieldname, qs[0].text, prefixlength=prefix, maxdist=maxdist) fp = plugins.FunctionPlugin({"foo": FakeQuery, "fuzzy": fuzzy}) qp = default.QueryParser("f", None) qp.add_plugin(fp) def check(qstring, target): q = qp.parse(u(qstring), normalize=False) assert str(q) == target check("alfa #foo charlie delta", "(f:alfa AND < > AND f:charlie AND f:delta)") check("alfa #foo(charlie delta) echo", "(f:alfa AND AND f:echo)") check("alfa #foo(charlie AND delta) echo", "(f:alfa AND <(f:charlie AND f:delta) > AND f:echo)") check("alfa #foo[a] charlie delta", "(f:alfa AND < a > AND f:charlie AND f:delta)") check("alfa #foo[a, b](charlie delta) echo", "(f:alfa AND AND f:echo)") check("alfa #foo[a,b,c=d](charlie AND delta) echo", "(f:alfa AND <(f:charlie AND f:delta) a,b c:d> AND f:echo)") check("alfa #foo[a,b,c=d]() (charlie AND delta)", "(f:alfa AND < a,b c:d> AND ((f:charlie AND f:delta)))") check("alfa #foo[a=1,b=2](charlie AND delta)^2.0 echo", "(f:alfa AND <(f:charlie AND f:delta) a:1,b:2,boost:2.0> AND f:echo)") check("alfa #fuzzy[maxdist=2](bravo) charlie", "(f:alfa AND f:bravo~2 AND f:charlie)") def test_sequence_plugin(): qp = default.QueryParser("f", None) qp.remove_plugin_class(plugins.PhrasePlugin) qp.add_plugin(plugins.FuzzyTermPlugin()) qp.add_plugin(plugins.SequencePlugin()) q = qp.parse(u('alfa "bravo charlie~2 (delta OR echo)" foxtrot')) assert q.__unicode__() == "(f:alfa AND (f:bravo NEAR f:charlie~2 NEAR (f:delta OR f:echo)) AND f:foxtrot)" assert q[1].__class__ == query.Sequence q = qp.parse(u('alfa "bravo charlie~2 d?lt*')) assert q[0].text == "alfa" assert q[1].text == "bravo" assert q[2].__class__ == query.FuzzyTerm assert q[3].__class__ == query.Wildcard q = qp.parse(u('alfa "bravo charlie~2" d?lt* "[a TO z] [0 TO 9]" echo')) assert q.__unicode__() == "(f:alfa AND (f:bravo NEAR f:charlie~2) AND f:d?lt* AND (f:[a TO z] NEAR f:[0 TO 9]) AND f:echo)" assert q[0].text == "alfa" assert q[1].__class__ == query.Sequence assert q[2].__class__ == query.Wildcard assert q[3].__class__ == query.Sequence assert q[3][0].__class__ 
== query.TermRange assert q[3][1].__class__ == query.TermRange assert q[4].text == "echo" q = qp.parse(u('alfa "bravo charlie~3"~2 delta')) assert q[1].__class__ == query.Sequence assert q[1].slop == 2 assert q[1][1].__class__ == query.FuzzyTerm assert q[1][1].maxdist == 3 def test_sequence_andmaybe(): qp = default.QueryParser("f", None) qp.remove_plugin_class(plugins.PhrasePlugin) qp.add_plugins([plugins.FuzzyTermPlugin(), plugins.SequencePlugin()]) q = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"')) assert isinstance(q, query.AndMaybe) assert q[0] == query.Term("f", u("Dahmen")) assert q[1] == query.Sequence([query.Term("f", u("Besov")), query.Term("f", u("Spaces"))]) Whoosh-2.5.7/tests/test_parsing.py0000644000076500000240000010070412254366764017323 0ustar mattstaff00000000000000import pytest from whoosh import analysis, fields, query from whoosh.compat import u, text_type from whoosh.qparser import default from whoosh.qparser import plugins def test_whitespace(): p = default.QueryParser("t", None, [plugins.WhitespacePlugin()]) assert repr(p.tag("hello there amiga")) == ", < >, , < >, >" def test_singlequotes(): p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), plugins.SingleQuotePlugin()]) assert repr(p.process("a 'b c' d")) == ", , >" def test_prefix(): p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), plugins.PrefixPlugin()]) assert repr(p.process("a b* c")) == ", , >" def test_range(): p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), plugins.RangePlugin()]) ns = p.tag("a [b to c} d") assert repr(ns) == ", < >, , < >, >" assert repr(p.process("a {b to]")) == ", >" assert repr(p.process("[to c] d")) == ", >" assert repr(p.process("[to]")) == ">" def test_sq_range(): p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), plugins.SingleQuotePlugin(), plugins.RangePlugin()]) assert repr(p.process("['a b' to ']']")) == ">" def test_phrase(): p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), plugins.PhrasePlugin()]) assert repr(p.process('a "b c"')) == ", >" assert repr(p.process('"b c" d')) == ", >" assert repr(p.process('"b c"')) == ">" q = p.parse('alfa "bravo charlie"~2 delta') assert q[1].__class__ == query.Phrase assert q[1].words == ["bravo", "charlie"] assert q[1].slop == 2 def test_groups(): p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), plugins.GroupPlugin()]) ns = p.process("a ((b c) d) e") assert repr(ns) == ", , >, >, >" def test_fieldnames(): p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), plugins.FieldsPlugin(), plugins.GroupPlugin()]) ns = p.process("a:b c d:(e f:(g h)) i j:") assert repr(ns) == ", , , , <'f':'h'>>>, , >" assert repr(p.process("a:b:")) == ">" def test_operators(): p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), plugins.OperatorsPlugin()]) ns = p.process("a OR b") assert repr(ns) == ", >>" def test_boost(): p = default.QueryParser("t", None, [plugins.WhitespacePlugin(), plugins.GroupPlugin(), plugins.BoostPlugin()]) ns = p.tag("a^3") assert repr(ns) == ", <^ 3.0>>" ns = p.filterize(ns) assert repr(ns) == ">" assert repr(p.process("a (b c)^2.5")) == ", , ^2.5>>" assert repr(p.process("a (b c)^.5 d")) == ", , ^0.5>, >" assert repr(p.process("^2 a")) == ", >" assert repr(p.process("a^2^3")) == ">" # def test_empty_querystring(): s = fields.Schema(content=fields.TEXT, title=fields.TEXT, id=fields.ID) qp = default.QueryParser("content", s) q = qp.parse(u("")) assert q == query.NullQuery def test_fields(): s = fields.Schema(content=fields.TEXT, 
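# --- Illustrative sketch (not part of the original test suite) ---
# SequencePlugin replaces PhrasePlugin so quoted groups may contain arbitrary
# subqueries (fuzzy terms, ranges, wildcards) joined with NEAR, with an
# optional slop after the closing quote.
from whoosh import query
from whoosh.compat import u
from whoosh.qparser import default, plugins

qp = default.QueryParser("f", None)
qp.remove_plugin_class(plugins.PhrasePlugin)
qp.add_plugin(plugins.FuzzyTermPlugin())
qp.add_plugin(plugins.SequencePlugin())
q = qp.parse(u('alfa "bravo charlie~3"~2 delta'))
assert q[1].__class__ == query.Sequence
assert q[1].slop == 2 and q[1][1].maxdist == 3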
title=fields.TEXT, id=fields.ID) qp = default.QueryParser("content", s) q = qp.parse(u("test")) assert q.__class__ == query.Term assert q.fieldname == "content" assert q.text == "test" mq = default.MultifieldParser(("title", "content"), s) q = mq.parse(u("test")) assert q.__class__ == query.Or assert q[0].__class__ == query.Term assert q[1].__class__ == query.Term assert q[0].fieldname == "title" assert q[1].fieldname == "content" assert q[0].text == "test" assert q[1].text == "test" q = mq.parse(u("title:test")) assert q.__class__ == query.Term assert q.fieldname == "title" assert q.text == "test" def test_multifield(): schema = fields.Schema(content=fields.TEXT, title=fields.TEXT, cat=fields.KEYWORD, date=fields.DATETIME) qs = u("a (b c cat:d) OR (b c cat:e)") qp = default.MultifieldParser(['x', 'y'], schema) q = qp.parse(qs) assert text_type(q) == "((x:a OR y:a) AND (((x:b OR y:b) AND (x:c OR y:c) AND cat:d) OR ((x:b OR y:b) AND (x:c OR y:c) AND cat:e)))" def test_fieldname_chars(): s = fields.Schema(abc123=fields.TEXT, nisbah=fields.KEYWORD) qp = default.QueryParser("content", s) fieldmap = {'nisbah': [u('\u0646\u0633\u0628\u0629')], 'abc123': ['xyz']} qp.add_plugin(plugins.FieldAliasPlugin(fieldmap)) q = qp.parse(u("abc123:456")) assert q.__class__ == query.Term assert q.fieldname == u('abc123') assert q.text == u('456') q = qp.parse(u("abc123:456 def")) assert text_type(q) == u("(abc123:456 AND content:def)") q = qp.parse(u('\u0646\u0633\u0628\u0629:\u0627\u0644\u0641\u0644\u0633' '\u0637\u064a\u0646\u064a')) assert q.__class__ == query.Term assert q.fieldname == u('nisbah') assert q.text == u('\u0627\u0644\u0641\u0644\u0633\u0637\u064a\u0646\u064a') q = qp.parse(u("abc123 (xyz:123 OR qrs)")) assert text_type(q) == "(content:abc123 AND (abc123:123 OR content:qrs))" def test_colonspace(): s = fields.Schema(content=fields.TEXT, url=fields.ID) qp = default.QueryParser("content", s) q = qp.parse(u("url:test")) assert q.__class__ == query.Term assert q.fieldname == "url" assert q.text == "test" q = qp.parse(u("url: test")) assert q.__class__ == query.And assert q[0].__class__ == query.Term assert q[1].__class__ == query.Term assert q[0].fieldname == "content" assert q[1].fieldname == "content" assert q[0].text == "url" assert q[1].text == "test" q = qp.parse(u("url:")) assert q.__class__ == query.Term assert q.fieldname == "content" assert q.text == "url" s = fields.Schema(foo=fields.KEYWORD) qp = default.QueryParser("foo", s) q = qp.parse(u("blah:")) assert q.__class__ == query.Term assert q.fieldname == "foo" assert q.text == "blah:" def test_andor(): qp = default.QueryParser("a", None) q = qp.parse("a AND b OR c AND d OR e AND f") assert text_type(q) == "((a:a AND a:b) OR (a:c AND a:d) OR (a:e AND a:f))" q = qp.parse("aORb") assert q == query.Term("a", "aORb") q = qp.parse("aOR b") assert q == query.And([query.Term("a", "aOR"), query.Term("a", "b")]) q = qp.parse("a ORb") assert q == query.And([query.Term("a", "a"), query.Term("a", "ORb")]) assert qp.parse("OR") == query.Term("a", "OR") def test_andnot(): qp = default.QueryParser("content", None) q = qp.parse(u("this ANDNOT that")) assert q.__class__ == query.AndNot assert q.a.__class__ == query.Term assert q.b.__class__ == query.Term assert q.a.text == "this" assert q.b.text == "that" q = qp.parse(u("foo ANDNOT bar baz")) assert q.__class__ == query.And assert len(q) == 2 assert q[0].__class__ == query.AndNot assert q[1].__class__ == query.Term q = qp.parse(u("foo fie ANDNOT bar baz")) assert q.__class__ == query.And assert len(q) == 
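# --- Illustrative sketch (not part of the original test suite) ---
# MultifieldParser expands an unqualified term into an OR over several fields,
# while an explicit "field:term" still targets only that field.
from whoosh import fields, query
from whoosh.compat import u
from whoosh.qparser import default

schema = fields.Schema(title=fields.TEXT, content=fields.TEXT)
mq = default.MultifieldParser(["title", "content"], schema)
q = mq.parse(u("test"))        # -> (title:test OR content:test)
assert q.__class__ == query.Or
q = mq.parse(u("title:test"))  # -> title:test
assert q.__class__ == query.Term and q.fieldname == "title"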
3 assert q[0].__class__ == query.Term assert q[1].__class__ == query.AndNot assert q[2].__class__ == query.Term q = qp.parse(u("a AND b ANDNOT c")) assert q.__class__ == query.AndNot assert text_type(q) == "((content:a AND content:b) ANDNOT content:c)" def test_boost_query(): qp = default.QueryParser("content", None) q = qp.parse(u("this^3 fn:that^0.5 5.67 hi^5x")) assert q[0].boost == 3.0 assert q[1].boost == 0.5 assert q[1].fieldname == "fn" assert q[2].text == "5.67" assert q[3].text == "hi^5x" q = qp.parse("alfa (bravo OR charlie)^2.5 ^3") assert len(q) == 3 assert q[0].boost == 1.0 assert q[1].boost == 2.5 assert q[2].text == "^3" def test_boosts(): qp = default.QueryParser("t", None) q = qp.parse("alfa ((bravo^2)^3)^4 charlie") assert q.__unicode__() == "(t:alfa AND t:bravo^24.0 AND t:charlie)" def test_wild(): qp = default.QueryParser("t", None, [plugins.WhitespacePlugin(), plugins.WildcardPlugin()]) assert repr(qp.process("a b*c? d")) == ", , >" assert repr(qp.process("a * ? d")) == ", , , >" # qp = default.QueryParser("content", None) q = qp.parse(u("hello *the?e* ?star*s? test")) assert len(q) == 4 assert q[0].__class__ == query.Term assert q[0].text == "hello" assert q[1].__class__ == query.Wildcard assert q[1].text == "*the?e*" assert q[2].__class__ == query.Wildcard assert q[2].text == "?star*s?" assert q[3].__class__ == query.Term assert q[3].text == "test" # qp = default.QueryParser("content", None) q = qp.parse(u("*the?e*")) assert q.__class__ == query.Wildcard assert q.text == "*the?e*" def test_parse_fieldname_underscores(): s = fields.Schema(my_name=fields.ID(stored=True), my_value=fields.TEXT) qp = default.QueryParser("my_value", schema=s) q = qp.parse(u("my_name:Green")) assert q.__class__ == query.Term assert q.fieldname == "my_name" assert q.text == "Green" def test_endstar(): qp = default.QueryParser("text", None) q = qp.parse(u("word*")) assert q.__class__ == query.Prefix assert q.text == "word" q = qp.parse(u("first* second")) assert q[0].__class__ == query.Prefix assert q[0].text == "first" def test_singlequotes_query(): qp = default.QueryParser("text", None) q = qp.parse("hell's hot 'i stab at thee'") assert q.__class__.__name__ == 'And' assert len(q) == 3 assert q[0].__class__ == query.Term assert q[1].__class__ == query.Term assert q[2].__class__ == query.Term assert q[0].text == "hell's" assert q[1].text == "hot" assert q[2].text == "i stab at thee" q = qp.parse("alfa zulu:'bravo charlie' delta") assert q.__class__.__name__ == 'And' assert len(q) == 3 assert q[0].__class__ == query.Term assert q[1].__class__ == query.Term assert q[2].__class__ == query.Term assert (q[0].fieldname, q[0].text) == ("text", "alfa") assert (q[1].fieldname, q[1].text) == ("zulu", "bravo charlie") assert (q[2].fieldname, q[2].text) == ("text", "delta") q = qp.parse("The rest 'is silence") assert q.__class__ == query.And assert len(q) == 4 assert [t.text for t in q.subqueries] == ["The", "rest", "'is", "silence"] q = qp.parse("I don't like W's stupid face") assert q.__class__ == query.And assert len(q) == 6 assert [t.text for t in q.subqueries] == ["I", "don't", "like", "W's", "stupid", "face"] q = qp.parse("I forgot the drinkin' in '98") assert q.__class__ == query.And assert len(q) == 6 assert [t.text for t in q.subqueries] == ["I", "forgot", "the", "drinkin'", "in", "'98"] # def test_escaping(): # qp = default.QueryParser("text", None) # # q = qp.parse(r'big\small') # assert q.__class__, query.Term, q) # assert q.text == "bigsmall" # # q = qp.parse(r'big\\small') # assert 
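# --- Illustrative sketch (not part of the original test suite) ---
# A trailing "^n" boosts the preceding term or group; nested boosts multiply
# (2 * 3 * 4 = 24), while a "^" that isn't followed by a valid number stays
# part of the term text.
from whoosh.compat import u
from whoosh.qparser import default

qp = default.QueryParser("t", None)
q = qp.parse(u("alfa ((bravo^2)^3)^4 charlie"))
assert q[1].boost == 24.0
q = qp.parse(u("hi^5x"))
assert q.text == "hi^5x"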
q.__class__ == query.Term # assert q.text == r'big\small' # # q = qp.parse(r'http\:example') # assert q.__class__ == query.Term # assert q.fieldname == "text" # assert q.text == "http:example" # # q = qp.parse(r'hello\ there') # assert q.__class__ == query.Term # assert q.text == "hello there" # # q = qp.parse(r'\[start\ TO\ end\]') # assert q.__class__ == query.Term # assert q.text == "[start TO end]" # # schema = fields.Schema(text=fields.TEXT) # qp = default.QueryParser("text", None) # q = qp.parse(r"http\:\/\/www\.example\.com") # assert q.__class__ == query.Term # assert q.text == "http://www.example.com" # # q = qp.parse(u("\u005c\u005c")) # assert q.__class__ == query.Term # assert q.text == "\\" # def test_escaping_wildcards(): # qp = default.QueryParser("text", None) # # q = qp.parse(u("a*b*c?d")) # assert q.__class__ == query.Wildcard # assert q.text == "a*b*c?d" # # q = qp.parse(u("a*b\u005c*c?d")) # assert q.__class__ == query.Wildcard # assert q.text == "a*b*c?d" # # q = qp.parse(u("a*b\u005c\u005c*c?d")) # assert q.__class__ == query.Wildcard # assert q.text, u('a*b\u005c*c?d')) # # q = qp.parse(u("ab*")) # assert q.__class__ == query.Prefix # assert q.text, u("ab")) # # q = qp.parse(u("ab\u005c\u005c*")) # assert q.__class__ == query.Wildcard # assert q.text, u("ab\u005c*")) def test_phrase_phrase(): qp = default.QueryParser("content", None) q = qp.parse('"alfa bravo" "charlie delta echo"^2.2 test:"foxtrot golf"') assert q[0].__class__ == query.Phrase assert q[0].words == ["alfa", "bravo"] assert q[1].__class__ == query.Phrase assert q[1].words == ["charlie", "delta", "echo"] assert q[1].boost == 2.2 assert q[2].__class__ == query.Phrase assert q[2].words == ["foxtrot", "golf"] assert q[2].fieldname == "test" def test_weird_characters(): qp = default.QueryParser("content", None) q = qp.parse(u(".abcd@gmail.com")) assert q.__class__ == query.Term assert q.text == ".abcd@gmail.com" q = qp.parse(u("r*")) assert q.__class__ == query.Prefix assert q.text == "r" q = qp.parse(u(".")) assert q.__class__ == query.Term assert q.text == "." q = qp.parse(u("?")) assert q.__class__ == query.Wildcard assert q.text == "?" def test_euro_chars(): schema = fields.Schema(text=fields.TEXT) qp = default.QueryParser("text", schema) q = qp.parse(u("stra\xdfe")) assert q.__class__ == query.Term assert q.text == u("stra\xdfe") def test_star(): schema = fields.Schema(text=fields.TEXT(stored=True)) qp = default.QueryParser("text", schema) q = qp.parse(u("*")) assert q.__class__ == query.Every assert q.fieldname == "text" q = qp.parse(u("*h?ll*")) assert q.__class__ == query.Wildcard assert q.text == "*h?ll*" q = qp.parse(u("h?pe")) assert q.__class__ == query.Wildcard assert q.text == "h?pe" q = qp.parse(u("*? blah")) assert q.__class__ == query.And assert q[0].__class__ == query.Wildcard assert q[0].text == "*?" 
assert q[1].__class__ == query.Term assert q[1].text == "blah" q = qp.parse(u("*ending")) assert q.__class__ == query.Wildcard assert q.text == "*ending" q = qp.parse(u("*q")) assert q.__class__ == query.Wildcard assert q.text == "*q" def test_star_field(): schema = fields.Schema(text=fields.TEXT) qp = default.QueryParser("text", schema) q = qp.parse(u("*:*")) assert q.__class__ == query.Every assert q.fieldname is None # This gets parsed to a term with text="*:test" which is then analyzed down # to just "test" q = qp.parse(u("*:test")) assert q.__class__ == query.Term assert q.fieldname == "text" assert q.text == "test" def test_range_query(): schema = fields.Schema(name=fields.ID(stored=True), text=fields.TEXT(stored=True)) qp = default.QueryParser("text", schema) q = qp.parse(u("[alfa to bravo}")) assert q.__class__ == query.TermRange assert q.start == "alfa" assert q.end == "bravo" assert q.startexcl is False assert q.endexcl is True q = qp.parse(u("['hello there' to 'what ever']")) assert q.__class__ == query.TermRange assert q.start == "hello there" assert q.end == "what ever" assert q.startexcl is False assert q.endexcl is False q = qp.parse(u("name:{'to' to 'b'}")) assert q.__class__ == query.TermRange assert q.start == "to" assert q.end == "b" assert q.startexcl is True assert q.endexcl is True q = qp.parse(u("name:{'a' to 'to']")) assert q.__class__ == query.TermRange assert q.start == "a" assert q.end == "to" assert q.startexcl is True assert q.endexcl is False q = qp.parse(u("name:[a to to]")) assert q.__class__ == query.TermRange assert q.start == "a" assert q.end == "to" q = qp.parse(u("name:[to to b]")) assert q.__class__ == query.TermRange assert q.start == "to" assert q.end == "b" q = qp.parse(u("[alfa to alfa]")) assert q.__class__ == query.Term assert q.text == "alfa" q = qp.parse(u("Ind* AND name:[d TO]")) assert q.__class__ == query.And assert q[0].__class__ == query.Prefix assert q[1].__class__ == query.TermRange assert q[0].text == "ind" assert q[1].start == "d" assert q[1].fieldname == "name" q = qp.parse(u("name:[d TO]")) assert q.__class__ == query.TermRange assert q.start == "d" assert q.fieldname == "name" def test_numeric_range(): schema = fields.Schema(id=fields.STORED, number=fields.NUMERIC) qp = default.QueryParser("number", schema) teststart = 40 testend = 100 q = qp.parse("[%s to *]" % teststart) assert q == query.NullQuery q = qp.parse("[%s to]" % teststart) assert q.__class__ == query.NumericRange assert q.start == teststart assert q.end is None q = qp.parse("[to %s]" % testend) assert q.__class__ == query.NumericRange assert q.start is None assert q.end == testend q = qp.parse("[%s to %s]" % (teststart, testend)) assert q.__class__ == query.NumericRange assert q.start == teststart assert q.end == testend def test_regressions(): qp = default.QueryParser("f", None) # From 0.3.18, these used to require escaping. Mostly good for # regression testing. 
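# --- Illustrative sketch (not part of the original test suite) ---
# Square brackets are inclusive, curly braces exclusive, and either end of a
# range may be left open.
from whoosh import fields, query
from whoosh.compat import u
from whoosh.qparser import default

schema = fields.Schema(name=fields.ID, text=fields.TEXT)
qp = default.QueryParser("text", schema)
q = qp.parse(u("[alfa to bravo}"))
assert q.__class__ == query.TermRange
assert (q.start, q.end, q.startexcl, q.endexcl) == ("alfa", "bravo", False, True)
q = qp.parse(u("name:[d TO]"))  # open-ended on the right
assert q.__class__ == query.TermRange and q.end is None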
assert qp.parse(u("re-inker")) == query.Term("f", "re-inker") assert qp.parse(u("0.7 wire")) == query.And([query.Term("f", "0.7"), query.Term("f", "wire")]) assert (qp.parse(u("daler-rowney pearl 'bell bronze'")) == query.And([query.Term("f", "daler-rowney"), query.Term("f", "pearl"), query.Term("f", "bell bronze")])) q = qp.parse(u('22" BX')) assert q, query.And([query.Term("f", '22"') == query.Term("f", "BX")]) def test_empty_ranges(): schema = fields.Schema(name=fields.TEXT, num=fields.NUMERIC, date=fields.DATETIME) qp = default.QueryParser("text", schema) for fname in ("name", "date"): q = qp.parse(u("%s:[to]") % fname) assert q.__class__ == query.Every def test_empty_numeric_range(): schema = fields.Schema(id=fields.ID, num=fields.NUMERIC) qp = default.QueryParser("num", schema) q = qp.parse("num:[to]") assert q.__class__ == query.NumericRange assert q.start is None assert q.end is None def test_numrange_multi(): schema = fields.Schema(text=fields.TEXT, start=fields.NUMERIC, end=fields.NUMERIC) qp = default.QueryParser("text", schema) q = qp.parse("start:[2008 to]") assert q.__class__ == query.NumericRange assert q.fieldname == "start" assert q.start == 2008 assert q.end is None q = qp.parse("start:[2011 to 2012]") assert q.__class__.__name__ == "NumericRange" assert q.fieldname == "start" assert q.start == 2011 assert q.end == 2012 q = qp.parse("start:[2008 to] AND end:[2011 to 2012]") assert q.__class__ == query.And assert q[0].__class__ == query.NumericRange assert q[1].__class__ == query.NumericRange assert q[0].start == 2008 assert q[0].end is None assert q[1].start == 2011 assert q[1].end == 2012 def test_nonexistant_fieldnames(): # Need an analyzer that won't mangle a URL a = analysis.SimpleAnalyzer("\\S+") schema = fields.Schema(id=fields.ID, text=fields.TEXT(analyzer=a)) qp = default.QueryParser("text", schema) q = qp.parse(u("id:/code http://localhost/")) assert q.__class__ == query.And assert q[0].__class__ == query.Term assert q[0].fieldname == "id" assert q[0].text == "/code" assert q[1].__class__ == query.Term assert q[1].fieldname == "text" assert q[1].text == "http://localhost/" def test_stopped(): schema = fields.Schema(text=fields.TEXT) qp = default.QueryParser("text", schema) q = qp.parse(u("a b")) assert q == query.NullQuery def test_analyzing_terms(): ana = analysis.StemmingAnalyzer() schema = fields.Schema(text=fields.TEXT(analyzer=ana)) qp = default.QueryParser("text", schema) q = qp.parse(u("Indexed!")) assert q.__class__ == query.Term assert q.text == "index" def test_simple_parsing(): parser = default.SimpleParser("x", None) q = parser.parse(u("alfa bravo charlie delta")) assert text_type(q) == "(x:alfa OR x:bravo OR x:charlie OR x:delta)" q = parser.parse(u("alfa +bravo charlie delta")) assert text_type(q) == "(x:bravo ANDMAYBE (x:alfa OR x:charlie OR x:delta))" q = parser.parse(u("alfa +bravo -charlie delta")) assert text_type(q) == "((x:bravo ANDMAYBE (x:alfa OR x:delta)) ANDNOT x:charlie)" q = parser.parse(u("- alfa +bravo + delta")) assert text_type(q) == "((x:bravo AND x:delta) ANDNOT x:alfa)" def test_dismax(): parser = default.DisMaxParser({"body": 0.8, "title": 2.5}, None) q = parser.parse(u("alfa bravo charlie")) assert text_type(q) == "(DisMax(body:alfa^0.8 title:alfa^2.5) OR DisMax(body:bravo^0.8 title:bravo^2.5) OR DisMax(body:charlie^0.8 title:charlie^2.5))" q = parser.parse(u("alfa +bravo charlie")) assert text_type(q) == "(DisMax(body:bravo^0.8 title:bravo^2.5) ANDMAYBE (DisMax(body:alfa^0.8 title:alfa^2.5) OR DisMax(body:charlie^0.8 
title:charlie^2.5)))" q = parser.parse(u("alfa -bravo charlie")) assert text_type(q) == "((DisMax(body:alfa^0.8 title:alfa^2.5) OR DisMax(body:charlie^0.8 title:charlie^2.5)) ANDNOT DisMax(body:bravo^0.8 title:bravo^2.5))" q = parser.parse(u("alfa -bravo +charlie")) assert text_type(q) == "((DisMax(body:charlie^0.8 title:charlie^2.5) ANDMAYBE DisMax(body:alfa^0.8 title:alfa^2.5)) ANDNOT DisMax(body:bravo^0.8 title:bravo^2.5))" def test_many_clauses(): qs = "1" + (" OR 1" * 1000) parser = default.QueryParser("content", None) parser.parse(qs) def test_roundtrip(): parser = default.QueryParser("a", None) q = parser.parse(u("a OR ((b AND c AND d AND e) OR f OR g) ANDNOT h")) assert text_type(q) == "((a:a OR (a:b AND a:c AND a:d AND a:e) OR a:f OR a:g) ANDNOT a:h)" def test_ngrams(): schema = fields.Schema(grams=fields.NGRAM) parser = default.QueryParser('grams', schema) parser.remove_plugin_class(plugins.WhitespacePlugin) q = parser.parse(u("Hello There")) assert q.__class__ == query.And assert len(q) == 8 assert [sq.text for sq in q] == ["hell", "ello", "llo ", "lo t", "o th", " the", "ther", "here"] def test_ngramwords(): schema = fields.Schema(grams=fields.NGRAMWORDS(queryor=True)) parser = default.QueryParser('grams', schema) q = parser.parse(u("Hello Tom")) assert q.__class__ == query.And assert q[0].__class__ == query.Or assert q[1].__class__ == query.Term assert q[0][0].text == "hell" assert q[0][1].text == "ello" assert q[1].text == "tom" def test_multitoken_default(): textfield = fields.TEXT() assert textfield.multitoken_query == "default" schema = fields.Schema(text=textfield) parser = default.QueryParser('text', schema) qstring = u("chaw-bacon") texts = list(schema["text"].process_text(qstring)) assert texts == ["chaw", "bacon"] q = parser.parse(qstring) assert q.__class__ == query.And assert len(q) == 2 assert q[0].__class__ == query.Term assert q[0].text == "chaw" assert q[1].__class__ == query.Term assert q[1].text == "bacon" def test_multitoken_or(): textfield = fields.TEXT() textfield.multitoken_query = "or" schema = fields.Schema(text=textfield) parser = default.QueryParser('text', schema) qstring = u("chaw-bacon") texts = list(schema["text"].process_text(qstring)) assert texts == ["chaw", "bacon"] q = parser.parse(qstring) assert q.__class__ == query.Or assert len(q) == 2 assert q[0].__class__ == query.Term assert q[0].text == "chaw" assert q[1].__class__ == query.Term assert q[1].text == "bacon" def test_multitoken_phrase(): textfield = fields.TEXT() textfield.multitoken_query = "phrase" schema = fields.Schema(text=textfield) parser = default.QueryParser("text", schema) qstring = u("chaw-bacon") texts = list(schema["text"].process_text(qstring)) assert texts == ["chaw", "bacon"] q = parser.parse(qstring) assert q.__class__ == query.Phrase def test_singlequote_multitoken(): schema = fields.Schema(text=fields.TEXT(multitoken_query="or")) parser = default.QueryParser("text", schema) q = parser.parse(u("foo bar")) assert q.__unicode__() == "(text:foo AND text:bar)" q = parser.parse(u("'foo bar'")) # single quotes assert q.__unicode__() == "(text:foo OR text:bar)" def test_operator_queries(): qp = default.QueryParser("f", None) q = qp.parse("a AND b OR c AND d") assert text_type(q) == "((f:a AND f:b) OR (f:c AND f:d))" q = qp.parse("a OR b OR c OR d") assert text_type(q) == "(f:a OR f:b OR f:c OR f:d)" q = qp.parse("a ANDMAYBE b ANDNOT c REQUIRE d") assert text_type(q) == "((f:a ANDMAYBE (f:b ANDNOT f:c)) REQUIRE f:d)" #def test_associativity(): # left_andmaybe = 
(syntax.InfixOperator("ANDMAYBE", syntax.AndMaybeGroup, True), 0) # right_andmaybe = (syntax.InfixOperator("ANDMAYBE", syntax.AndMaybeGroup, False), 0) # not_ = (syntax.PrefixOperator("NOT", syntax.NotGroup), 0) # # def make_parser(*ops): # parser = default.QueryParser("f", None) # parser.replace_plugin(plugins.CompoundsPlugin(ops, clean=True)) # return parser # # p = make_parser(left_andmaybe) # q = p.parse("a ANDMAYBE b ANDMAYBE c ANDMAYBE d") # assert text_type(q), "(((f:a ANDMAYBE f:b) ANDMAYBE f:c) ANDMAYBE f:d)") # # p = make_parser(right_andmaybe) # q = p.parse("a ANDMAYBE b ANDMAYBE c ANDMAYBE d") # assert text_type(q), "(f:a ANDMAYBE (f:b ANDMAYBE (f:c ANDMAYBE f:d)))") # # p = make_parser(not_) # q = p.parse("a NOT b NOT c NOT d", normalize=False) # assert text_type(q), "(f:a AND NOT f:b AND NOT f:c AND NOT f:d)") # # p = make_parser(left_andmaybe) # q = p.parse("(a ANDMAYBE b) ANDMAYBE (c ANDMAYBE d)") # assert text_type(q), "((f:a ANDMAYBE f:b) ANDMAYBE (f:c ANDMAYBE f:d))") # # p = make_parser(right_andmaybe) # q = p.parse("(a ANDMAYBE b) ANDMAYBE (c ANDMAYBE d)") # assert text_type(q), "((f:a ANDMAYBE f:b) ANDMAYBE (f:c ANDMAYBE f:d))") def test_not_assoc(): qp = default.QueryParser("text", None) q = qp.parse(u("a AND NOT b OR c")) assert text_type(q) == "((text:a AND NOT text:b) OR text:c)" qp = default.QueryParser("text", None) q = qp.parse(u("a NOT (b OR c)")) assert text_type(q) == "(text:a AND NOT (text:b OR text:c))" def test_fieldname_space(): qp = default.QueryParser("a", None) q = qp.parse("Man Ray: a retrospective") assert text_type(q) == "(a:Man AND a:Ray: AND a:a AND a:retrospective)" def test_fieldname_fieldname(): qp = default.QueryParser("a", None) q = qp.parse("a:b:") assert q == query.Term("a", "b:") def test_paren_fieldname(): schema = fields.Schema(kind=fields.ID, content=fields.TEXT) qp = default.QueryParser("content", schema) q = qp.parse(u("(kind:1d565 OR kind:7c584) AND (stuff)")) assert text_type(q) == "((kind:1d565 OR kind:7c584) AND content:stuff)" q = qp.parse(u("kind:(1d565 OR 7c584) AND (stuff)")) assert text_type(q) == "((kind:1d565 OR kind:7c584) AND content:stuff)" def test_star_paren(): qp = default.QueryParser("content", None) q = qp.parse(u("(*john*) AND (title:blog)")) assert q.__class__ == query.And assert q[0].__class__ == query.Wildcard assert q[1].__class__ == query.Term assert q[0].fieldname == "content" assert q[1].fieldname == "title" assert q[0].text == "*john*" assert q[1].text == "blog" def test_dash(): ana = analysis.StandardAnalyzer("[^ \t\r\n()*?]+") schema = fields.Schema(title=fields.TEXT(analyzer=ana), text=fields.TEXT(analyzer=ana), time=fields.ID) qtext = u("*Ben-Hayden*") qp = default.QueryParser("text", schema) q = qp.parse(qtext) assert q.__class__ == query.Wildcard assert q.fieldname == "text" assert q.text == "*ben-hayden*" qp = default.MultifieldParser(["title", "text", "time"], schema) q = qp.parse(qtext) assert q.__unicode__() == "(title:*ben-hayden* OR text:*ben-hayden* OR time:*Ben-Hayden*)" def test_bool_True(): schema = fields.Schema(text=fields.TEXT, bool=fields.BOOLEAN) qp = default.QueryParser("text", schema) q = qp.parse("bool:True") assert q.__class__ == query.Term assert q.fieldname == "bool" assert q.text is True def test_not_order(): schema = fields.Schema(id=fields.STORED, count=fields.KEYWORD(lowercase=True), cats=fields.KEYWORD(lowercase=True)) qp = default.QueryParser("count", schema) q1 = qp.parse(u("(NOT (count:0) AND cats:1)")) assert q1.__class__ == query.And assert q1[0].__class__ == query.Not 
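# --- Illustrative sketch (not part of the original test suite) ---
# A field name can be applied to a whole parenthesized group:
# "kind:(a OR b)" parses the same as "(kind:a OR kind:b)".
from whoosh import fields
from whoosh.compat import u, text_type
from whoosh.qparser import default

schema = fields.Schema(kind=fields.ID, content=fields.TEXT)
qp = default.QueryParser("content", schema)
q = qp.parse(u("kind:(1d565 OR 7c584) AND (stuff)"))
assert text_type(q) == "((kind:1d565 OR kind:7c584) AND content:stuff)"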
assert q1[1].__class__ == query.Term assert q1.__unicode__() == '(NOT count:0 AND cats:1)' q2 = qp.parse(u("(cats:1 AND NOT (count:0))")) assert q2.__class__ == query.And assert q2[0].__class__ == query.Term assert q2[1].__class__ == query.Not assert q2.__unicode__() == '(cats:1 AND NOT count:0)' def test_spacespace_and(): qp = default.QueryParser("f", None) # one blank before/after AND q = qp.parse("A AND B") assert q.__class__ == query.And assert len(q) == 2 assert q[0] == query.Term("f", "A") assert q[1] == query.Term("f", "B") # two blanks before AND q = qp.parse("A AND B") assert q.__class__ == query.And assert len(q) == 2 assert q[0] == query.Term("f", "A") assert q[1] == query.Term("f", "B") def test_unicode_num(): schema = fields.Schema(num=fields.NUMERIC) parser = default.QueryParser(u("num"), schema=schema) q = parser.parse(u("num:1")) _ = text_type(q) def test_phrase_andmaybe(): qp = default.QueryParser("f", None) q = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"')) assert isinstance(q, query.AndMaybe) assert q[0] == query.Term("f", u("Dahmen")) assert q[1] == query.Phrase("f", [u("Besov"), u("Spaces")]) def test_phrase_boost(): qp = default.QueryParser("f", None) q = qp.parse(u('Dahmen ANDMAYBE "Besov Spaces"^9')) assert isinstance(q, query.AndMaybe) assert q[0] == query.Term("f", u("Dahmen")) assert q[1] == query.Phrase("f", [u("Besov"), u("Spaces")], boost=9) def test_andmaybe_none(): schema = fields.Schema(f=fields.TEXT, year=fields.NUMERIC) qp = default.QueryParser("f", schema) _ = qp.parse(u("Dahmen ANDMAYBE @year:[2000 TO]")) def test_quoted_prefix(): qp = default.QueryParser("f", None) expr = r"(^|(?<=[ (]))(?P\w+|[*]):" qp.replace_plugin(plugins.FieldsPlugin(expr)) q = qp.parse(u('foo url:http://apple.com:8080/bar* baz')) assert isinstance(q, query.And) assert q[0] == query.Term("f", "foo") assert q[1] == query.Prefix("url", "http://apple.com:8080/bar") assert q[2] == query.Term("f", "baz") assert len(q) == 3 Whoosh-2.5.7/tests/test_postings.py0000644000076500000240000001065712254366350017524 0ustar mattstaff00000000000000from __future__ import with_statement from whoosh import analysis, fields from whoosh.compat import xrange, u from whoosh.codec import default_codec from whoosh.formats import Existence, Frequency from whoosh.formats import Positions, PositionBoosts from whoosh.formats import Characters, CharacterBoosts from whoosh.util.testing import TempStorage def _roundtrip(content, format_, astype, ana=None): with TempStorage("roundtrip") as st: codec = default_codec() seg = codec.new_segment(st, "") ana = ana or analysis.StandardAnalyzer() field = fields.FieldType(format=format_, analyzer=ana) fw = codec.field_writer(st, seg) fw.start_field("f1", field) for text, _, weight, valuestring in sorted(field.index(content)): fw.start_term(text) fw.add(0, weight, valuestring, None) fw.finish_term() fw.finish_field() fw.close() tr = codec.terms_reader(st, seg) ps = [] for fieldname, btext in tr.terms(): m = tr.matcher(fieldname, btext, format_) ps.append((field.from_bytes(btext), m.value_as(astype))) tr.close() return ps def test_existence_postings(): content = u("alfa bravo charlie") assert _roundtrip(content, Existence(), "frequency") == [("alfa", 1), ("bravo", 1), ("charlie", 1)] def test_frequency_postings(): content = u("alfa bravo charlie bravo alfa alfa") assert _roundtrip(content, Frequency(), "frequency") == [("alfa", 3), ("bravo", 2), ("charlie", 1)] def test_position_postings(): content = u("alfa bravo charlie bravo alfa alfa") assert _roundtrip(content, 
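# --- Illustrative sketch (not part of the original test suite) ---
# Replacing FieldsPlugin with a stricter expression (the field name must follow
# the start of input, a space, or an open paren) keeps the colons inside a URL
# from being read as field prefixes. The "(?P<text>...)" group name below is a
# reconstruction of the named group the plugin's pattern is assumed to use.
from whoosh import query
from whoosh.compat import u
from whoosh.qparser import default, plugins

qp = default.QueryParser("f", None)
expr = r"(^|(?<=[ (]))(?P<text>\w+|[*]):"
qp.replace_plugin(plugins.FieldsPlugin(expr))
q = qp.parse(u("foo url:http://apple.com:8080/bar* baz"))
assert q[1] == query.Prefix("url", "http://apple.com:8080/bar")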
Positions(), "positions") == [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])] assert _roundtrip(content, Positions(), "frequency") == [("alfa", 3), ("bravo", 2), ("charlie", 1)] def test_character_postings(): content = u("alfa bravo charlie bravo alfa alfa") assert _roundtrip(content, Characters(), "characters") == [("alfa", [(0, 0, 4), (4, 25, 29), (5, 30, 34)]), ("bravo", [(1, 5, 10), (3, 19, 24)]), ("charlie", [(2, 11, 18)])] assert _roundtrip(content, Characters(), "positions") == [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])] assert _roundtrip(content, Characters(), "frequency") == [("alfa", 3), ("bravo", 2), ("charlie", 1)] def test_posboost_postings(): pbs = PositionBoosts() ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter() content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa") assert _roundtrip(content, pbs, "position_boosts", ana) == [("alfa", [(0, 2), (4, 1), (5, 1)]), ("bravo", [(1, 0.1), (3, 0.5)]), ("charlie", [(2, 2)])] assert _roundtrip(content, pbs, "positions", ana) == [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])] assert _roundtrip(content, pbs, "frequency", ana) == [("alfa", 3), ("bravo", 2), ("charlie", 1)] def test_charboost_postings(): cbs = CharacterBoosts() ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter() content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa") assert _roundtrip(content, cbs, "character_boosts", ana) == [("alfa", [(0, 0, 4, 2), (4, 37, 41, 1), (5, 42, 46, 1)]), ("bravo", [(1, 7, 12, 0.1), (3, 27, 32, 0.5)]), ("charlie", [(2, 17, 24, 2)])] assert _roundtrip(content, cbs, "position_boosts", ana) == [("alfa", [(0, 2), (4, 1), (5, 1)]), ("bravo", [(1, 0.1), (3, 0.5)]), ("charlie", [(2, 2)])] assert _roundtrip(content, cbs, "characters", ana) == [("alfa", [(0, 0, 4), (4, 37, 41), (5, 42, 46)]), ("bravo", [(1, 7, 12), (3, 27, 32)]), ("charlie", [(2, 17, 24)])] assert _roundtrip(content, cbs, "positions", ana) == [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])] assert _roundtrip(content, cbs, "frequency", ana) == [("alfa", 3), ("bravo", 2), ("charlie", 1)] Whoosh-2.5.7/tests/test_quality.py0000644000076500000240000001262712254366350017345 0ustar mattstaff00000000000000from __future__ import with_statement import random from whoosh import fields, matching, scoring from whoosh.compat import b, u, xrange from whoosh.filedb.filestore import RamStorage from whoosh.util.numeric import length_to_byte, byte_to_length def _discreet(length): return byte_to_length(length_to_byte(length)) def test_max_field_length(): st = RamStorage() schema = fields.Schema(t=fields.TEXT) ix = st.create_index(schema) for i in xrange(1, 200, 7): w = ix.writer() w.add_document(t=u(" ").join(["word"] * i)) w.commit() with ix.reader() as r: assert r.max_field_length("t") == _discreet(i) def test_minmax_field_length(): st = RamStorage() schema = fields.Schema(t=fields.TEXT) ix = st.create_index(schema) least = 999999 most = 0 for _ in xrange(1, 200, 7): w = ix.writer() count = random.randint(1, 100) least = min(count, least) most = max(count, most) w.add_document(t=u(" ").join(["word"] * count)) w.commit() with ix.reader() as r: assert r.min_field_length("t") == _discreet(least) assert r.max_field_length("t") == _discreet(most) def test_term_stats(): schema = fields.Schema(t=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(t=u("alfa bravo charlie delta echo")) w.add_document(t=u("bravo charlie delta echo foxtrot")) w.add_document(t=u("charlie delta 
echo foxtrot golf")) w.add_document(t=u("delta echo foxtrot")) w.add_document(t=u("echo foxtrot golf hotel india juliet")) w.add_document(t=u("foxtrot alfa alfa alfa")) w.commit() with ix.reader() as r: ti = r.term_info("t", u("alfa")) assert ti.weight() == 4.0 assert ti.doc_frequency() == 2 assert ti.min_length() == 4 assert ti.max_length() == 5 assert ti.max_weight() == 3.0 assert r.term_info("t", u("echo")).min_length() == 3 assert r.doc_field_length(3, "t") == 3 assert r.min_field_length("t") == 3 assert r.max_field_length("t") == 6 w = ix.writer() w.add_document(t=u("alfa")) w.add_document(t=u("bravo charlie")) w.add_document(t=u("echo foxtrot tango bravo")) w.add_document(t=u("golf hotel")) w.add_document(t=u("india")) w.add_document(t=u("juliet alfa bravo charlie delta echo foxtrot")) w.commit(merge=False) with ix.reader() as r: ti = r.term_info("t", u("alfa")) assert ti.weight() == 6.0 assert ti.doc_frequency() == 4 assert ti.min_length() == 1 assert ti.max_length() == 7 assert ti.max_weight() == 3.0 assert r.term_info("t", u("echo")).min_length() == 3 assert r.min_field_length("t") == 1 assert r.max_field_length("t") == 7 def test_min_max_id(): schema = fields.Schema(id=fields.STORED, t=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=0, t=u("alfa bravo charlie")) w.add_document(id=1, t=u("bravo charlie delta")) w.add_document(id=2, t=u("charlie delta echo")) w.add_document(id=3, t=u("delta echo foxtrot")) w.add_document(id=4, t=u("echo foxtrot golf")) w.commit() with ix.reader() as r: ti = r.term_info("t", u("delta")) assert ti.min_id() == 1 assert ti.max_id() == 3 ti = r.term_info("t", u("alfa")) assert ti.min_id() == 0 assert ti.max_id() == 0 ti = r.term_info("t", u("foxtrot")) assert ti.min_id() == 3 assert ti.max_id() == 4 w = ix.writer() w.add_document(id=5, t=u("foxtrot golf hotel")) w.add_document(id=6, t=u("golf hotel alfa")) w.add_document(id=7, t=u("hotel alfa bravo")) w.add_document(id=8, t=u("alfa bravo charlie")) w.commit(merge=False) with ix.reader() as r: ti = r.term_info("t", u("delta")) assert ti.min_id() == 1 assert ti.max_id() == 3 ti = r.term_info("t", u("alfa")) assert ti.min_id() == 0 assert ti.max_id() == 8 ti = r.term_info("t", u("foxtrot")) assert ti.min_id() == 3 assert ti.max_id() == 5 def test_replacements(): sc = scoring.WeightScorer(0.25) a = matching.ListMatcher([1, 2, 3], [0.25, 0.25, 0.25], scorer=sc) b = matching.ListMatcher([1, 2, 3], [0.25, 0.25, 0.25], scorer=sc) um = matching.UnionMatcher(a, b) a2 = a.replace(0.5) assert a2.__class__ == matching.NullMatcherClass um2 = um.replace(0.5) assert um2.__class__ == matching.IntersectionMatcher um2 = um.replace(0.6) assert um2.__class__ == matching.NullMatcherClass wm = matching.WrappingMatcher(um, boost=2.0) wm = wm.replace(0.5) assert wm.__class__ == matching.WrappingMatcher assert wm.boost == 2.0 assert wm.child.__class__ == matching.IntersectionMatcher ls1 = matching.ListMatcher([1, 2, 3], [0.1, 0.1, 0.1], scorer=scoring.WeightScorer(0.1)) ls2 = matching.ListMatcher([1, 2, 3], [0.2, 0.2, 0.2], scorer=scoring.WeightScorer(0.2)) ls3 = matching.ListMatcher([1, 2, 3], [0.3, 0.3, 0.3], scorer=scoring.WeightScorer(0.3)) mm = matching.MultiMatcher([ls1, ls2, ls3], [0, 4, 8]) mm = mm.replace(0.25) assert mm.current == 2 dm = matching.DisjunctionMaxMatcher(ls1, ls2) dm = dm.replace(0.15) assert dm is ls2 Whoosh-2.5.7/tests/test_queries.py0000644000076500000240000004703312254366350017331 0ustar mattstaff00000000000000from __future__ import with_statement import copy 
import pytest from whoosh import fields, qparser, query from whoosh.compat import b, u from whoosh.filedb.filestore import RamStorage from whoosh.qparser import QueryParser from whoosh.query import And from whoosh.query import AndMaybe from whoosh.query import ConstantScoreQuery from whoosh.query import DateRange from whoosh.query import DisjunctionMax from whoosh.query import Every from whoosh.query import FuzzyTerm from whoosh.query import Not from whoosh.query import NullQuery from whoosh.query import NumericRange from whoosh.query import Or from whoosh.query import Phrase from whoosh.query import Prefix from whoosh.query import Require from whoosh.query import Term from whoosh.query import TermRange from whoosh.query import Variations from whoosh.query import Wildcard from whoosh.query.spans import SpanContains from whoosh.query.spans import SpanFirst from whoosh.query.spans import SpanNear from whoosh.query.spans import SpanNot from whoosh.query.spans import SpanOr from whoosh.util.testing import TempIndex def test_all_terms(): q = QueryParser("a", None).parse(u('hello b:there c:"my friend"')) ts = q.all_terms(phrases=False) assert sorted(ts) == [("a", "hello"), ("b", "there")] ts = q.all_terms(phrases=True) assert sorted(ts) == [("a", "hello"), ("b", "there"), ("c", "friend"), ("c", "my")] def test_existing_terms(): s = fields.Schema(key=fields.ID, value=fields.TEXT) ix = RamStorage().create_index(s) w = ix.writer() w.add_document(key=u("a"), value=u("alfa bravo charlie delta echo")) w.add_document(key=u("b"), value=u("foxtrot golf hotel india juliet")) w.commit() r = ix.reader() q = QueryParser("value", None).parse(u('alfa hotel tango "sierra bravo"')) ts = q.existing_terms(r, phrases=False) assert sorted(ts) == [("value", b("alfa")), ("value", b("hotel"))] ts = q.existing_terms(r) assert sorted(ts) == [("value", b("alfa")), ("value", b("bravo")), ("value", b("hotel"))] def test_wildcard_existing_terms(): s = fields.Schema(key=fields.ID, value=fields.TEXT) ix = RamStorage().create_index(s) w = ix.writer() w.add_document(key=u("a"), value=u("alfa bravo bear charlie delta")) w.add_document(key=u("a"), value=u("boggle echo render rendering renders")) w.commit() r = ix.reader() qp = QueryParser("value", ix.schema) def words(terms): z = [] for t in terms: assert t[0] == "value" z.append(t[1]) return b(" ").join(sorted(z)) q = qp.parse(u("b*")) ts = q.existing_terms(r) assert ts == set() ts = q.existing_terms(r, expand=True) assert words(ts) == b("bear boggle bravo") q = qp.parse(u("[a TO f]")) ts = q.existing_terms(r) assert ts == set() ts = q.existing_terms(r, expand=True) assert words(ts) == b("alfa bear boggle bravo charlie delta echo") q = query.Variations("value", "render") ts = q.existing_terms(r, expand=False) assert ts == set([("value", b("render"))]) ts = q.existing_terms(r, expand=True) assert words(ts) == b("render rendering renders") def test_replace(): q = And([Or([Term("a", "b"), Term("b", "c")], boost=1.2), Variations("a", "b", boost=2.0)]) q = q.replace("a", "b", "BB") assert q == And([Or([Term("a", "BB"), Term("b", "c")], boost=1.2), Variations("a", "BB", boost=2.0)]) def test_apply(): def visit(q): if isinstance(q, (Term, Variations, FuzzyTerm)): q.text = q.text.upper() return q return q.apply(visit) before = And([Not(Term("a", u("b"))), Variations("a", u("c")), Not(FuzzyTerm("a", u("d")))]) after = visit(before) assert after == And([Not(Term("a", u("B"))), Variations("a", u("C")), Not(FuzzyTerm("a", u("D")))]) def term2var(q): if isinstance(q, Term): return 
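# --- Illustrative sketch (not part of the original test suite) ---
# A parsed query can report the terms it mentions and which of those actually
# occur in an index; expand=True also expands prefixes, wildcards and ranges
# against the reader.
from whoosh import fields
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

schema = fields.Schema(value=fields.TEXT)
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(value=u("alfa bravo charlie"))

q = QueryParser("value", ix.schema).parse(u("alfa b* zulu"))
with ix.reader() as r:
    print(sorted(q.existing_terms(r)))               # only ("value", b"alfa")
    print(sorted(q.existing_terms(r, expand=True)))  # b* expands to bravo as well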
Variations(q.fieldname, q.text) else: return q.apply(term2var) q = And([Term("f", "alfa"), Or([Term("f", "bravo"), Not(Term("f", "charlie"))])]) q = term2var(q) assert q == And([Variations('f', 'alfa'), Or([Variations('f', 'bravo'), Not(Variations('f', 'charlie'))])]) def test_accept(): def boost_phrases(q): if isinstance(q, Phrase): q.boost *= 2.0 return q before = And([Term("a", u("b")), Or([Term("c", u("d")), Phrase("a", [u("e"), u("f")])]), Phrase("a", [u("g"), u("h")], boost=0.25)]) after = before.accept(boost_phrases) assert after == And([Term("a", u("b")), Or([Term("c", u("d")), Phrase("a", [u("e"), u("f")], boost=2.0)]), Phrase("a", [u("g"), u("h")], boost=0.5)]) before = Phrase("a", [u("b"), u("c")], boost=2.5) after = before.accept(boost_phrases) assert after == Phrase("a", [u("b"), u("c")], boost=5.0) def test_simplify(): s = fields.Schema(k=fields.ID, v=fields.TEXT) ix = RamStorage().create_index(s) w = ix.writer() w.add_document(k=u("1"), v=u("aardvark apple allan alfa bear bee")) w.add_document(k=u("2"), v=u("brie glue geewhiz goop julia")) w.commit() r = ix.reader() q1 = And([Prefix("v", "b", boost=2.0), Term("v", "juliet")]) q2 = And([Or([Term('v', 'bear', boost=2.0), Term('v', 'bee', boost=2.0), Term('v', 'brie', boost=2.0)]), Term('v', 'juliet')]) assert q1.simplify(r) == q2 def test_merge_ranges(): q = And([TermRange("f1", u("a"), None), TermRange("f1", None, u("z"))]) assert q.normalize() == TermRange("f1", u("a"), u("z")) q = And([NumericRange("f1", None, u("aaaaa")), NumericRange("f1", u("zzzzz"), None)]) assert q.normalize() == q q = And([TermRange("f1", u("a"), u("z")), TermRange("f1", "b", "x")]) assert q.normalize() == TermRange("f1", u("a"), u("z")) q = And([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))]) assert q.normalize() == TermRange("f1", u("f"), u("m")) q = Or([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))]) assert q.normalize() == TermRange("f1", u("a"), u("q")) q = Or([TermRange("f1", u("m"), None), TermRange("f1", None, u("n"))]) assert q.normalize() == Every("f1") q = And([Every("f1"), Term("f1", "a"), Variations("f1", "b")]) assert q.normalize() == Every("f1") q = Or([Term("f1", u("q")), TermRange("f1", u("m"), None), TermRange("f1", None, u("n"))]) assert q.normalize() == Every("f1") q = And([Or([Term("f1", u("a")), Term("f1", u("b"))]), Every("f1")]) assert q.normalize() == Every("f1") q = And([Term("f1", u("a")), And([Or([Every("f1")])])]) assert q.normalize() == Every("f1") def test_normalize_compound(): def oq(): return Or([Term("a", u("a")), Term("a", u("b"))]) def nq(level): if level == 0: return oq() else: return Or([nq(level - 1), nq(level - 1), nq(level - 1)]) q = nq(5) q = q.normalize() assert q == Or([Term("a", u("a")), Term("a", u("b"))]) def test_duplicates(): q = And([Term("a", u("b")), Term("a", u("b"))]) assert q.normalize() == Term("a", u("b")) q = And([Prefix("a", u("b")), Prefix("a", u("b"))]) assert q.normalize() == Prefix("a", u("b")) q = And([Variations("a", u("b")), And([Variations("a", u("b")), Term("a", u("b"))])]) assert q.normalize() == And([Variations("a", u("b")), Term("a", u("b"))]) q = And([Term("a", u("b")), Prefix("a", u("b")), Term("a", u("b"), boost=1.1)]) assert q.normalize() == q # Wildcard without * or ? 
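# --- Illustrative sketch (not part of the original test suite) ---
# Query trees can be rewritten functionally: apply() maps a function over a
# node's children (as in the term2var helper above), while accept() applies
# the function to every node of the tree.
from whoosh import query
from whoosh.compat import u

def term2var(q):
    # Replace every Term leaf with a Variations query
    if isinstance(q, query.Term):
        return query.Variations(q.fieldname, q.text)
    return q.apply(term2var)

q = query.And([query.Term("f", u("alfa")),
               query.Not(query.Term("f", u("bravo")))])
print(term2var(q))  # And([Variations(...), Not(Variations(...))])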
normalizes to Term q = And([Wildcard("a", u("b")), And([Wildcard("a", u("b")), Term("a", u("b"))])]) assert q.normalize() == Term("a", u("b")) # TODO: FIX THIS def test_query_copy_hash(): def do(q1, q2): q1a = copy.deepcopy(q1) assert q1 == q1a assert hash(q1) == hash(q1a) assert q1 != q2 do(Term("a", u("b"), boost=1.1), Term("a", u("b"), boost=1.5)) do(And([Term("a", u("b")), Term("c", u("d"))], boost=1.1), And([Term("a", u("b")), Term("c", u("d"))], boost=1.5)) do(Or([Term("a", u("b"), boost=1.1), Term("c", u("d"))]), Or([Term("a", u("b"), boost=1.8), Term("c", u("d"))], boost=1.5)) do(DisjunctionMax([Term("a", u("b"), boost=1.8), Term("c", u("d"))]), DisjunctionMax([Term("a", u("b"), boost=1.1), Term("c", u("d"))], boost=1.5)) do(Not(Term("a", u("b"), boost=1.1)), Not(Term("a", u("b"), boost=1.5))) do(Prefix("a", u("b"), boost=1.1), Prefix("a", u("b"), boost=1.5)) do(Wildcard("a", u("b*x?"), boost=1.1), Wildcard("a", u("b*x?"), boost=1.5)) do(FuzzyTerm("a", u("b"), constantscore=True), FuzzyTerm("a", u("b"), constantscore=False)) do(FuzzyTerm("a", u("b"), boost=1.1), FuzzyTerm("a", u("b"), boost=1.5)) do(TermRange("a", u("b"), u("c")), TermRange("a", u("b"), u("d"))) do(TermRange("a", None, u("c")), TermRange("a", None, None)) do(TermRange("a", u("b"), u("c"), boost=1.1), TermRange("a", u("b"), u("c"), boost=1.5)) do(TermRange("a", u("b"), u("c"), constantscore=True), TermRange("a", u("b"), u("c"), constantscore=False)) do(NumericRange("a", 1, 5), NumericRange("a", 1, 6)) do(NumericRange("a", None, 5), NumericRange("a", None, None)) do(NumericRange("a", 3, 6, boost=1.1), NumericRange("a", 3, 6, boost=1.5)) do(NumericRange("a", 3, 6, constantscore=True), NumericRange("a", 3, 6, constantscore=False)) # do(DateRange) do(Variations("a", u("render")), Variations("a", u("renders"))) do(Variations("a", u("render"), boost=1.1), Variations("a", u("renders"), boost=1.5)) do(Phrase("a", [u("b"), u("c"), u("d")]), Phrase("a", [u("b"), u("c"), u("e")])) do(Phrase("a", [u("b"), u("c"), u("d")], boost=1.1), Phrase("a", [u("b"), u("c"), u("d")], boost=1.5)) do(Phrase("a", [u("b"), u("c"), u("d")], slop=1), Phrase("a", [u("b"), u("c"), u("d")], slop=2)) # do(Ordered) do(Every(), Every("a")) do(Every("a"), Every("b")) do(Every("a", boost=1.1), Every("a", boost=1.5)) do(NullQuery, Term("a", u("b"))) do(ConstantScoreQuery(Term("a", u("b"))), ConstantScoreQuery(Term("a", u("c")))) do(ConstantScoreQuery(Term("a", u("b")), score=2.0), ConstantScoreQuery(Term("a", u("c")), score=2.1)) do(Require(Term("a", u("b")), Term("c", u("d"))), Require(Term("a", u("b"), boost=1.1), Term("c", u("d")))) # do(Require) # do(AndMaybe) # do(AndNot) # do(Otherwise) do(SpanFirst(Term("a", u("b")), limit=1), SpanFirst(Term("a", u("b")), limit=2)) do(SpanNear(Term("a", u("b")), Term("c", u("d"))), SpanNear(Term("a", u("b")), Term("c", u("e")))) do(SpanNear(Term("a", u("b")), Term("c", u("d")), slop=1), SpanNear(Term("a", u("b")), Term("c", u("d")), slop=2)) do(SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=1), SpanNear(Term("a", u("b")), Term("c", u("d")), mindist=2)) do(SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=True), SpanNear(Term("a", u("b")), Term("c", u("d")), ordered=False)) do(SpanNot(Term("a", u("b")), Term("a", u("c"))), SpanNot(Term("a", u("b")), Term("a", u("d")))) do(SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("d"))]), SpanOr([Term("a", u("b")), Term("a", u("c")), Term("a", u("e"))])) do(SpanContains(Term("a", u("b")), Term("a", u("c"))), SpanContains(Term("a", u("b")), Term("a", 
u("d")))) # do(SpanBefore) # do(SpanCondition) def test_requires(): a = Term("f", u("a")) b = Term("f", u("b")) assert And([a, b]).requires() == set([a, b]) assert Or([a, b]).requires() == set() assert AndMaybe(a, b).requires() == set([a]) assert a.requires() == set([a]) def test_highlight_daterange(): from datetime import datetime schema = fields.Schema(id=fields.ID(unique=True, stored=True), title=fields.TEXT(stored=True), content=fields.TEXT(stored=True), released=fields.DATETIME(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.update_document( id=u('1'), title=u('Life Aquatic'), content=u('A nautic film crew sets out to kill a gigantic shark.'), released=datetime(2004, 12, 25) ) w.update_document( id=u('2'), title=u('Darjeeling Limited'), content=u('Three brothers meet in India for a life changing train ' + 'journey.'), released=datetime(2007, 10, 27) ) w.commit() s = ix.searcher() r = s.search(Term('content', u('train')), terms=True) assert len(r) == 1 assert r[0]["id"] == "2" assert r[0].highlights("content") == 'for a life changing train journey' r = s.search(DateRange('released', datetime(2007, 1, 1), None)) assert len(r) == 1 assert r[0].highlights("content") == '' def test_patterns(): domain = u("aaron able acre adage aether after ago ahi aim ajax akimbo " "alembic all amiga amount ampere").split() schema = fields.Schema(word=fields.KEYWORD(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: for word in domain: w.add_document(word=word) with ix.reader() as r: assert list(r.field_terms("word")) == domain assert list(r.expand_prefix("word", "al")) == [b("alembic"), b("all")] q = query.Prefix("word", "al") assert q.simplify(r).__unicode__() == "(word:alembic OR word:all)" q = query.Wildcard("word", "a*[ae]") assert q.simplify(r).__unicode__() == "(word:able OR word:acre OR word:adage OR word:amiga OR word:ampere)" assert q._find_prefix(q.text) == "a" q = query.Regex("word", "am.*[ae]") assert q.simplify(r).__unicode__() == "(word:amiga OR word:ampere)" assert q._find_prefix(q.text) == "am" q = query.Regex("word", "able|ago") assert q.simplify(r).__unicode__() == "(word:able OR word:ago)" assert q._find_prefix(q.text) == "" # special case: ? 
may mean "zero occurences" q = query.Regex("word", "ah?i") assert q.simplify(r).__unicode__() == "(word:ahi OR word:aim)" assert q._find_prefix(q.text) == "a" # special case: * may mean "zero occurences" q = query.Regex("word", "ah*i") assert q.simplify(r).__unicode__() == "(word:ahi OR word:aim)" assert q._find_prefix(q.text) == "a" def test_or_nots1(): # Issue #285 schema = fields.Schema(a=fields.KEYWORD(stored=True), b=fields.KEYWORD(stored=True)) st = RamStorage() ix = st.create_index(schema) with ix.writer() as w: w.add_document(a=u("alfa"), b=u("charlie")) with ix.searcher() as s: q = query.And([query.Term("a", "alfa"), query.Or([query.Not(query.Term("b", "bravo")), query.Not(query.Term("b", "charlie")) ]) ]) r = s.search(q) assert len(r) == 1 def test_or_nots2(): # Issue #286 schema = fields.Schema(a=fields.KEYWORD(stored=True), b=fields.KEYWORD(stored=True)) st = RamStorage() ix = st.create_index(schema) with ix.writer() as w: w.add_document(b=u("bravo")) with ix.searcher() as s: q = query.Or([query.Term("a", "alfa"), query.Not(query.Term("b", "alfa")) ]) r = s.search(q) assert len(r) == 1 def test_or_nots3(): schema = fields.Schema(title=fields.TEXT(stored=True), itemtype=fields.ID(stored=True)) with TempIndex(schema, "ornot") as ix: w = ix.writer() w.add_document(title=u("a1"), itemtype=u("a")) w.add_document(title=u("a2"), itemtype=u("a")) w.add_document(title=u("b1"), itemtype=u("b")) w.commit() q = Term('itemtype', 'a') | Not(Term('itemtype', 'a')) with ix.searcher() as s: r = " ".join([hit["title"] for hit in s.search(q)]) assert r == "a1 a2 b1" def test_ornot_andnot(): schema = fields.Schema(id=fields.NUMERIC(stored=True), a=fields.KEYWORD()) st = RamStorage() ix = st.create_index(schema) with ix.writer() as w: w.add_document(id=0, a=u("word1 word1")) w.add_document(id=1, a=u("word1 word2")) w.add_document(id=2, a=u("word1 foo")) w.add_document(id=3, a=u("foo word2")) w.add_document(id=4, a=u("foo bar")) with ix.searcher() as s: qp = qparser.QueryParser("a", ix.schema) q1 = qp.parse(u("NOT word1 NOT word2")) q2 = qp.parse(u("NOT (word1 OR word2)")) r1 = [hit["id"] for hit in s.search(q1, sortedby="id")] r2 = [hit["id"] for hit in s.search(q2, sortedby="id")] assert r1 == r2 == [4] def test_none_in_compounds(): with pytest.raises(query.QueryError): _ = query.And([query.Term("a", "b"), None, query.Term("c", "d")]) def test_issue_355(): schema = fields.Schema(seats=fields.NUMERIC(bits=8, stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(seats=0) w.add_document(seats=10) w.add_document(seats=20) with ix.searcher() as s: # Passing a bytestring for a numeric field q = Term("seats", b("maker")) r1 = [hit["seats"] for hit in s.search(q, limit=5)] # Passing a unicode string for a numeric field q = Term("seats", u("maker")) r2 = [hit["seats"] for hit in s.search(q, limit=5)] # Passing a value too large for the numeric field q = Term("seats", 260) r3 = [hit["seats"] for hit in s.search(q, limit=5)] assert r1 == r2 == r3 == [] def test_sequence(): schema = fields.Schema(id=fields.STORED, text=fields.TEXT) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(id=0, text=u("alfa bravo charlie delta echo")) w.add_document(id=1, text=u("bravo charlie delta echo alfa")) w.add_document(id=2, text=u("charlie delta echo bravo")) w.add_document(id=3, text=u("delta echo charlie")) w.add_document(id=4, text=u("echo delta")) with ix.searcher() as s: seq = query.Sequence([query.Term("text", u("echo")), query.Term("text", u("alfa"))]) q 
= query.And([query.Term("text", "bravo"), seq]) r = s.search(q, limit=4) assert len(r) == 1 assert r[0]["id"] == 1 def test_andmaybe(): schema = fields.Schema(id=fields.STORED, text=fields.TEXT) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(id=0, text=u("alfa bravo charlie delta echo")) w.add_document(id=1, text=u("bravo charlie delta echo alfa")) w.add_document(id=2, text=u("charlie delta echo bravo")) w.add_document(id=3, text=u("delta echo charlie")) w.add_document(id=4, text=u("echo delta")) qp = qparser.QueryParser("text", schema) q = qp.parse(u('bravo ANDMAYBE "echo alfa"')) with ix.searcher() as s: r = s.search(q) assert len(r) == 3 assert [hit["id"] for hit in r] == [1, 2, 0] Whoosh-2.5.7/tests/test_reading.py0000644000076500000240000002742412254366350017267 0ustar mattstaff00000000000000from __future__ import with_statement import random, threading, time from whoosh import analysis, fields, formats, reading from whoosh.compat import b, u, xrange from whoosh.reading import SegmentReader from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempIndex def _create_index(): s = fields.Schema(f1=fields.KEYWORD(stored=True), f2=fields.KEYWORD, f3=fields.KEYWORD) st = RamStorage() ix = st.create_index(s) return ix def _one_segment_index(): ix = _create_index() w = ix.writer() w.add_document(f1=u("A B C"), f2=u("1 2 3"), f3=u("X Y Z")) w.add_document(f1=u("D E F"), f2=u("4 5 6"), f3=u("Q R S")) w.add_document(f1=u("A E C"), f2=u("1 4 6"), f3=u("X Q S")) w.add_document(f1=u("A A A"), f2=u("2 3 5"), f3=u("Y R Z")) w.add_document(f1=u("A B"), f2=u("1 2"), f3=u("X Y")) w.commit() return ix def _multi_segment_index(): ix = _create_index() w = ix.writer() w.add_document(f1=u("A B C"), f2=u("1 2 3"), f3=u("X Y Z")) w.add_document(f1=u("D E F"), f2=u("4 5 6"), f3=u("Q R S")) w.commit() w = ix.writer() w.add_document(f1=u("A E C"), f2=u("1 4 6"), f3=u("X Q S")) w.add_document(f1=u("A A A"), f2=u("2 3 5"), f3=u("Y R Z")) w.commit(merge=False) w = ix.writer() w.add_document(f1=u("A B"), f2=u("1 2"), f3=u("X Y")) w.commit(merge=False) return ix def _stats(r): return [(fname, text, ti.doc_frequency(), ti.weight()) for (fname, text), ti in r] def _fstats(r): return [(text, ti.doc_frequency(), ti.weight()) for text, ti in r] def test_readers(): target = [("f1", b('A'), 4, 6), ("f1", b('B'), 2, 2), ("f1", b('C'), 2, 2), ("f1", b('D'), 1, 1), ("f1", b('E'), 2, 2), ("f1", b('F'), 1, 1), ("f2", b('1'), 3, 3), ("f2", b('2'), 3, 3), ("f2", b('3'), 2, 2), ("f2", b('4'), 2, 2), ("f2", b('5'), 2, 2), ("f2", b('6'), 2, 2), ("f3", b('Q'), 2, 2), ("f3", b('R'), 2, 2), ("f3", b('S'), 2, 2), ("f3", b('X'), 3, 3), ("f3", b('Y'), 3, 3), ("f3", b('Z'), 2, 2)] target = sorted(target) stored = [{"f1": "A B C"}, {"f1": "D E F"}, {"f1": "A E C"}, {"f1": "A A A"}, {"f1": "A B"}] def t(ix): r = ix.reader() assert list(r.all_stored_fields()) == stored assert sorted(_stats(r)) == target ix = _one_segment_index() assert len(ix._segments()) == 1 t(ix) ix = _multi_segment_index() assert len(ix._segments()) == 3 t(ix) def test_term_inspection(): schema = fields.Schema(title=fields.TEXT(stored=True), content=fields.TEXT) st = RamStorage() ix = st.create_index(schema) writer = ix.writer() writer.add_document(title=u("My document"), content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE")) writer.add_document(title=u("My other document"), content=u("AA AB BB CC EE EE AX AX DD")) writer.commit() reader = ix.reader() assert " ".join(reader.field_terms("content")) == "aa ab ax bb cc dd 
ee" assert list(reader.expand_prefix("content", "a")) == [b('aa'), b('ab'), b('ax')] assert set(reader.all_terms()) == set([('content', b('aa')), ('content', b('ab')), ('content', b('ax')), ('content', b('bb')), ('content', b('cc')), ('content', b('dd')), ('content', b('ee')), ('title', b('document')), ('title', b('my')), ('title', b('other'))]) # (text, doc_freq, index_freq) assert _fstats(reader.iter_field("content")) == [(b('aa'), 2, 6), (b('ab'), 1, 1), (b('ax'), 1, 2), (b('bb'), 2, 5), (b('cc'), 2, 3), (b('dd'), 2, 2), (b('ee'), 2, 4)] assert _fstats(reader.iter_field("content", prefix="c")) == [(b('cc'), 2, 3), (b('dd'), 2, 2), (b('ee'), 2, 4)] assert list(reader.most_frequent_terms("content")) == [(6, b('aa')), (5, b('bb')), (4, b('ee')), (3, b('cc')), (2, b('dd'))] assert list(reader.most_frequent_terms("content", prefix="a")) == [(6, b('aa')), (2, b('ax')), (1, b('ab'))] assert list(reader.most_distinctive_terms("content", 3)) == [(1.3862943611198906, b('ax')), (0.6931471805599453, b('ab')), (0.0, b('ee'))] def test_vector_postings(): s = fields.Schema(id=fields.ID(stored=True, unique=True), content=fields.TEXT(vector=formats.Positions())) st = RamStorage() ix = st.create_index(s) writer = ix.writer() writer.add_document(id=u('1'), content=u('the quick brown fox jumped over the ' + 'lazy dogs')) writer.commit() r = ix.reader() terms = list(r.vector_as("weight", 0, "content")) assert terms == [(u('brown'), 1.0), (u('dogs'), 1.0), (u('fox'), 1.0), (u('jumped'), 1.0), (u('lazy'), 1.0), (u('over'), 1.0), (u('quick'), 1.0)] def test_stored_fields(): s = fields.Schema(a=fields.ID(stored=True), b=fields.STORED, c=fields.KEYWORD, d=fields.TEXT(stored=True)) st = RamStorage() ix = st.create_index(s) writer = ix.writer() writer.add_document(a=u("1"), b="a", c=u("zulu"), d=u("Alfa")) writer.add_document(a=u("2"), b="b", c=u("yankee"), d=u("Bravo")) writer.add_document(a=u("3"), b="c", c=u("xray"), d=u("Charlie")) writer.commit() with ix.searcher() as sr: assert sr.stored_fields(0) == {"a": u("1"), "b": "a", "d": u("Alfa")} assert sr.stored_fields(2) == {"a": u("3"), "b": "c", "d": u("Charlie")} assert sr.document(a=u("1")) == {"a": u("1"), "b": "a", "d": u("Alfa")} assert sr.document(a=u("2")) == {"a": u("2"), "b": "b", "d": u("Bravo")} def test_stored_fields2(): schema = fields.Schema(content=fields.TEXT(stored=True), title=fields.TEXT(stored=True), summary=fields.STORED, path=fields.ID(stored=True)) storedkeys = ["content", "path", "summary", "title"] assert storedkeys == schema.stored_names() ix = RamStorage().create_index(schema) writer = ix.writer() writer.add_document(content=u("Content of this document."), title=u("This is the title"), summary=u("This is the summary"), path=u("/main")) writer.add_document(content=u("Second document."), title=u("Second title"), summary=u("Summary numero due"), path=u("/second")) writer.add_document(content=u("Third document."), title=u("Title 3"), summary=u("Summary treo"), path=u("/san")) writer.commit() with ix.searcher() as s: doc = s.document(path="/main") assert doc is not None assert ([doc[k] for k in sorted(doc.keys())] == ["Content of this document.", "/main", "This is the summary", "This is the title"]) ix.close() def test_all_stored_fields(): # all_stored_fields() should yield all stored fields, even for deleted # documents schema = fields.Schema(a=fields.ID(stored=True), b=fields.STORED) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(a=u("alfa"), b=u("bravo")) w.add_document(a=u("apple"), b=u("bear")) 
w.add_document(a=u("alpaca"), b=u("beagle")) w.add_document(a=u("aim"), b=u("box")) w = ix.writer() w.delete_by_term("a", "apple") w.delete_by_term("a", "aim") w.commit(merge=False) with ix.searcher() as s: assert s.doc_count_all() == 4 assert s.doc_count() == 2 sfs = list((sf["a"], sf["b"]) for sf in s.all_stored_fields()) assert sfs == [("alfa", "bravo"), ("apple", "bear"), ("alpaca", "beagle"), ("aim", "box")] def test_first_id(): schema = fields.Schema(path=fields.ID(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(path=u("/a")) w.add_document(path=u("/b")) w.add_document(path=u("/c")) w.commit() r = ix.reader() docid = r.first_id("path", u("/b")) assert r.stored_fields(docid) == {"path": "/b"} ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(path=u("/a")) w.add_document(path=u("/b")) w.add_document(path=u("/c")) w.commit(merge=False) w = ix.writer() w.add_document(path=u("/d")) w.add_document(path=u("/e")) w.add_document(path=u("/f")) w.commit(merge=False) w = ix.writer() w.add_document(path=u("/g")) w.add_document(path=u("/h")) w.add_document(path=u("/i")) w.commit(merge=False) r = ix.reader() assert r.__class__ == reading.MultiReader docid = r.first_id("path", u("/e")) assert r.stored_fields(docid) == {"path": "/e"} class RecoverReader(threading.Thread): def __init__(self, ix): threading.Thread.__init__(self) self.ix = ix def run(self): for _ in xrange(50): r = self.ix.reader() r.close() class RecoverWriter(threading.Thread): domain = u("alfa bravo charlie deleta echo foxtrot golf hotel india") domain = domain.split() def __init__(self, ix): threading.Thread.__init__(self) self.ix = ix def run(self): for _ in xrange(10): w = self.ix.writer() w.add_document(text=random.sample(self.domain, 4)) w.commit() time.sleep(0.01) def test_delete_recovery(): schema = fields.Schema(text=fields.TEXT) with TempIndex(schema, "delrecover") as ix: rw = RecoverWriter(ix) rr = RecoverReader(ix) rw.start() rr.start() rw.join() rr.join() def test_nonexclusive_read(): schema = fields.Schema(text=fields.TEXT) with TempIndex(schema, "readlock") as ix: for num in u("one two three four five").split(): w = ix.writer() w.add_document(text=u("Test document %s") % num) w.commit(merge=False) def fn(): for _ in xrange(5): r = ix.reader() assert list(r.field_terms("text")) == ["document", "five", "four", "one", "test", "three", "two"] r.close() ths = [threading.Thread(target=fn) for _ in xrange(5)] for th in ths: th.start() for th in ths: th.join() def test_doc_count(): schema = fields.Schema(id=fields.NUMERIC) ix = RamStorage().create_index(schema) w = ix.writer() for i in xrange(10): w.add_document(id=i) w.commit() r = ix.reader() assert r.doc_count() == 10 assert r.doc_count_all() == 10 w = ix.writer() w.delete_document(2) w.delete_document(4) w.delete_document(6) w.delete_document(8) w.commit() r = ix.reader() assert r.doc_count() == 6 assert r.doc_count_all() == 10 w = ix.writer() for i in xrange(10, 15): w.add_document(id=i) w.commit(merge=False) r = ix.reader() assert r.doc_count() == 11 assert r.doc_count_all() == 15 w = ix.writer() w.delete_document(10) w.delete_document(12) w.delete_document(14) w.commit(merge=False) r = ix.reader() assert r.doc_count() == 8 assert r.doc_count_all() == 15 ix.optimize() r = ix.reader() assert r.doc_count() == 8 assert r.doc_count_all() == 8 def test_reader_subclasses(): from whoosh.util.testing import check_abstract_methods check_abstract_methods(reading.IndexReader, SegmentReader) 
check_abstract_methods(reading.IndexReader, reading.MultiReader) check_abstract_methods(reading.IndexReader, reading.EmptyReader) Whoosh-2.5.7/tests/test_results.py0000644000076500000240000005444612254366764017374 0ustar mattstaff00000000000000from __future__ import with_statement import pytest from whoosh import analysis, fields, formats, highlight, qparser, query from whoosh.codec.whoosh3 import W3Codec from whoosh.compat import u, xrange, text_type, permutations from whoosh.filedb.filestore import RamStorage from whoosh.util.testing import TempStorage, TempIndex def test_score_retrieval(): schema = fields.Schema(title=fields.TEXT(stored=True), content=fields.TEXT(stored=True)) storage = RamStorage() ix = storage.create_index(schema) writer = ix.writer() writer.add_document(title=u("Miss Mary"), content=u("Mary had a little white lamb its fleece" " was white as snow")) writer.add_document(title=u("Snow White"), content=u("Snow white lived in the forest with seven" " dwarfs")) writer.commit() with ix.searcher() as s: results = s.search(query.Term("content", "white")) assert len(results) == 2 assert results[0]['title'] == u("Miss Mary") assert results[1]['title'] == u("Snow White") assert results.score(0) is not None assert results.score(0) != 0 assert results.score(0) != 1 def test_resultcopy(): schema = fields.Schema(a=fields.TEXT(stored=True)) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(a=u("alfa bravo charlie")) w.add_document(a=u("bravo charlie delta")) w.add_document(a=u("charlie delta echo")) w.add_document(a=u("delta echo foxtrot")) w.commit() with ix.searcher() as s: r = s.search(qparser.QueryParser("a", None).parse(u("charlie"))) assert len(r) == 3 rcopy = r.copy() assert r.top_n == rcopy.top_n def test_resultslength(): schema = fields.Schema(id=fields.ID(stored=True), value=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=u("1"), value=u("alfa alfa alfa alfa alfa")) w.add_document(id=u("2"), value=u("alfa alfa alfa alfa")) w.add_document(id=u("3"), value=u("alfa alfa alfa")) w.add_document(id=u("4"), value=u("alfa alfa")) w.add_document(id=u("5"), value=u("alfa")) w.add_document(id=u("6"), value=u("bravo")) w.commit() with ix.searcher() as s: r = s.search(query.Term("value", u("alfa")), limit=3) assert len(r) == 5 assert r.scored_length() == 3 assert r[10:] == [] def test_combine(): schema = fields.Schema(id=fields.ID(stored=True), value=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=u("1"), value=u("alfa bravo charlie all")) w.add_document(id=u("2"), value=u("bravo charlie echo all")) w.add_document(id=u("3"), value=u("charlie echo foxtrot all")) w.add_document(id=u("4"), value=u("echo foxtrot india all")) w.add_document(id=u("5"), value=u("foxtrot india juliet all")) w.add_document(id=u("6"), value=u("india juliet alfa all")) w.add_document(id=u("7"), value=u("juliet alfa bravo all")) w.add_document(id=u("8"), value=u("charlie charlie charlie all")) w.commit() with ix.searcher() as s: def idsof(r): return "".join(hit["id"] for hit in r) def check(r1, methodname, r2, ids): getattr(r1, methodname)(r2) assert idsof(r1) == ids def rfor(t): return s.search(query.Term("value", t)) assert idsof(rfor(u("foxtrot"))) == "345" check(rfor(u("foxtrot")), "extend", rfor("charlie"), "345812") check(rfor(u("foxtrot")), "filter", rfor("juliet"), "5") check(rfor(u("charlie")), "filter", rfor("foxtrot"), "3") check(rfor(u("all")), "filter", rfor("foxtrot"), "345") check(rfor(u("all")), 
"upgrade", rfor("india"), "45612378") check(rfor(u("charlie")), "upgrade_and_extend", rfor("echo"), "23814") def test_results_filter(): schema = fields.Schema(id=fields.STORED, words=fields.KEYWORD(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id="1", words=u("bravo top")) w.add_document(id="2", words=u("alfa top")) w.add_document(id="3", words=u("alfa top")) w.add_document(id="4", words=u("alfa bottom")) w.add_document(id="5", words=u("bravo bottom")) w.add_document(id="6", words=u("charlie bottom")) w.add_document(id="7", words=u("charlie bottom")) w.commit() with ix.searcher() as s: def check(r, target): result = "".join(s.stored_fields(d)["id"] for d in r.docs()) assert result == target r = s.search(query.Term("words", u("alfa"))) r.filter(s.search(query.Term("words", u("bottom")))) check(r, "4") def test_extend_empty(): schema = fields.Schema(id=fields.STORED, words=fields.KEYWORD) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, words=u("alfa bravo charlie")) w.add_document(id=2, words=u("bravo charlie delta")) w.add_document(id=3, words=u("charlie delta echo")) w.add_document(id=4, words=u("delta echo foxtrot")) w.add_document(id=5, words=u("echo foxtrot golf")) w.commit() with ix.searcher() as s: # Get an empty results object r1 = s.search(query.Term("words", u("hotel"))) # Copy it r1c = r1.copy() # Get a non-empty results object r2 = s.search(query.Term("words", u("delta"))) # Copy it r2c = r2.copy() # Extend r1 with r2 r1c.extend(r2c) assert [hit["id"] for hit in r1c] == [2, 3, 4] assert r1c.scored_length() == 3 def test_extend_filtered(): schema = fields.Schema(id=fields.STORED, text=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, text=u("alfa bravo charlie")) w.add_document(id=2, text=u("bravo charlie delta")) w.add_document(id=3, text=u("juliet delta echo")) w.add_document(id=4, text=u("delta bravo alfa")) w.add_document(id=5, text=u("foxtrot sierra tango")) w.commit() hits = lambda result: [hit["id"] for hit in result] with ix.searcher() as s: r1 = s.search(query.Term("text", u("alfa")), filter=set([1, 4])) assert r1.allowed == set([1, 4]) assert len(r1.top_n) == 0 r2 = s.search(query.Term("text", u("bravo"))) assert len(r2.top_n) == 3 assert hits(r2) == [1, 2, 4] r3 = r1.copy() assert r3.allowed == set([1, 4]) assert len(r3.top_n) == 0 r3.extend(r2) assert len(r3.top_n) == 3 assert hits(r3) == [1, 2, 4] def test_pages(): from whoosh.scoring import Frequency schema = fields.Schema(id=fields.ID(stored=True), c=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=u("1"), c=u("alfa alfa alfa alfa alfa alfa")) w.add_document(id=u("2"), c=u("alfa alfa alfa alfa alfa")) w.add_document(id=u("3"), c=u("alfa alfa alfa alfa")) w.add_document(id=u("4"), c=u("alfa alfa alfa")) w.add_document(id=u("5"), c=u("alfa alfa")) w.add_document(id=u("6"), c=u("alfa")) w.commit() with ix.searcher(weighting=Frequency) as s: q = query.Term("c", u("alfa")) r = s.search(q) assert [d["id"] for d in r] == ["1", "2", "3", "4", "5", "6"] r = s.search_page(q, 2, pagelen=2) assert [d["id"] for d in r] == ["3", "4"] r = s.search_page(q, 2, pagelen=4) assert r.total == 6 assert r.pagenum == 2 assert r.pagelen == 2 def test_pages_with_filter(): from whoosh.scoring import Frequency schema = fields.Schema(id=fields.ID(stored=True), type=fields.TEXT(), c=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=u("1"), type=u("odd"), 
c=u("alfa alfa alfa alfa alfa alfa")) w.add_document(id=u("2"), type=u("even"), c=u("alfa alfa alfa alfa alfa")) w.add_document(id=u("3"), type=u("odd"), c=u("alfa alfa alfa alfa")) w.add_document(id=u("4"), type=u("even"), c=u("alfa alfa alfa")) w.add_document(id=u("5"), type=u("odd"), c=u("alfa alfa")) w.add_document(id=u("6"), type=u("even"), c=u("alfa")) w.commit() with ix.searcher(weighting=Frequency) as s: q = query.Term("c", u("alfa")) filterq = query.Term("type", u("even")) r = s.search(q, filter=filterq) assert [d["id"] for d in r] == ["2", "4", "6"] r = s.search_page(q, 2, pagelen=2, filter=filterq) assert [d["id"] for d in r] == ["6"] def test_extra_slice(): schema = fields.Schema(key=fields.ID(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() for char in u("abcdefghijklmnopqrstuvwxyz"): w.add_document(key=char) w.commit() with ix.searcher() as s: r = s.search(query.Every(), limit=5) assert r[6:7] == [] def test_page_counts(): from whoosh.scoring import Frequency schema = fields.Schema(id=fields.ID(stored=True)) st = RamStorage() ix = st.create_index(schema) w = ix.writer() for i in xrange(10): w.add_document(id=text_type(i)) w.commit() with ix.searcher(weighting=Frequency) as s: q = query.Every("id") r = s.search(q) assert len(r) == 10 with pytest.raises(ValueError): s.search_page(q, 0) r = s.search_page(q, 1, 5) assert len(r) == 10 assert r.pagecount == 2 r = s.search_page(q, 1, 5) assert len(r) == 10 assert r.pagecount == 2 r = s.search_page(q, 2, 5) assert len(r) == 10 assert r.pagecount == 2 assert r.pagenum == 2 r = s.search_page(q, 1, 10) assert len(r) == 10 assert r.pagecount == 1 assert r.pagenum == 1 def test_resultspage(): schema = fields.Schema(id=fields.STORED, content=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) domain = ("alfa", "bravo", "bravo", "charlie", "delta") w = ix.writer() for i, lst in enumerate(permutations(domain, 3)): w.add_document(id=text_type(i), content=u(" ").join(lst)) w.commit() with ix.searcher() as s: q = query.Term("content", u("bravo")) r = s.search(q, limit=10) tops = list(r) rp = s.search_page(q, 1, pagelen=5) assert rp.scored_length() == 5 assert list(rp) == tops[0:5] assert rp[10:] == [] rp = s.search_page(q, 2, pagelen=5) assert list(rp) == tops[5:10] rp = s.search_page(q, 1, pagelen=10) assert len(rp) == 54 assert rp.pagecount == 6 rp = s.search_page(q, 6, pagelen=10) assert len(list(rp)) == 4 assert rp.is_last_page() with pytest.raises(ValueError): s.search_page(q, 0) assert s.search_page(q, 10).pagenum == 6 rp = s.search_page(query.Term("content", "glonk"), 1) assert len(rp) == 0 assert rp.is_last_page() def test_highlight_setters(): schema = fields.Schema(text=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(text=u("Hello")) w.commit() r = ix.searcher().search(query.Term("text", "hello")) hl = highlight.Highlighter() ucf = highlight.UppercaseFormatter() r.highlighter = hl r.formatter = ucf assert hl.formatter is ucf def test_snippets(): ana = analysis.StemmingAnalyzer() schema = fields.Schema(text=fields.TEXT(stored=True, analyzer=ana)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(text=u("Lay out the rough animation by creating the important poses where they occur on the timeline.")) w.add_document(text=u("Set key frames on everything that's key-able. This is for control and predictability: you don't want to accidentally leave something un-keyed. 
This is also much faster than selecting the parameters to key.")) w.add_document(text=u("Use constant (straight) or sometimes linear transitions between keyframes in the channel editor. This makes the character jump between poses.")) w.add_document(text=u("Keying everything gives quick, immediate results. But it can become difficult to tweak the animation later, especially for complex characters.")) w.add_document(text=u("Copy the current pose to create the next one: pose the character, key everything, then copy the keyframe in the playbar to another frame, and key everything at that frame.")) w.commit() target = ["Set KEY frames on everything that's KEY-able", "Copy the current pose to create the next one: pose the character, KEY everything, then copy the keyframe in the playbar to another frame, and KEY everything at that frame", "KEYING everything gives quick, immediate results"] with ix.searcher() as s: qp = qparser.QueryParser("text", ix.schema) q = qp.parse(u("key")) r = s.search(q, terms=True) r.fragmenter = highlight.SentenceFragmenter() r.formatter = highlight.UppercaseFormatter() assert sorted([hit.highlights("text", top=1) for hit in r]) == sorted(target) def test_keyterms(): ana = analysis.StandardAnalyzer() vectorformat = formats.Frequency() schema = fields.Schema(path=fields.ID, content=fields.TEXT(analyzer=ana, vector=vectorformat)) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(path=u("a"), content=u("This is some generic content")) w.add_document(path=u("b"), content=u("This is some distinctive content")) w.commit() with ix.searcher() as s: docnum = s.document_number(path=u("b")) keyterms = list(s.key_terms([docnum], "content")) assert len(keyterms) > 0 assert keyterms[0][0] == "distinctive" r = s.search(query.Term("path", u("b"))) keyterms2 = list(r.key_terms("content")) assert len(keyterms2) > 0 assert keyterms2[0][0] == "distinctive" def test_lengths(): schema = fields.Schema(id=fields.STORED, text=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, text=u("alfa bravo charlie delta echo")) w.add_document(id=2, text=u("bravo charlie delta echo foxtrot")) w.add_document(id=3, text=u("charlie needle echo foxtrot golf")) w.add_document(id=4, text=u("delta echo foxtrot golf hotel")) w.add_document(id=5, text=u("echo needle needle hotel india")) w.add_document(id=6, text=u("foxtrot golf hotel india juliet")) w.add_document(id=7, text=u("golf needle india juliet kilo")) w.add_document(id=8, text=u("hotel india juliet needle lima")) w.commit() with ix.searcher() as s: q = query.Or([query.Term("text", u("needle")), query.Term("text", u("charlie"))]) r = s.search(q, limit=2) assert not r.has_exact_length() assert r.estimated_length() == 7 assert r.estimated_min_length() == 3 assert r.scored_length() == 2 assert len(r) == 6 def test_lengths2(): schema = fields.Schema(text=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) count = 0 for _ in xrange(3): w = ix.writer() for ls in permutations(u("alfa bravo charlie").split()): if "bravo" in ls and "charlie" in ls: count += 1 w.add_document(text=u(" ").join(ls)) w.commit(merge=False) with ix.searcher() as s: q = query.Or([query.Term("text", u("bravo")), query.Term("text", u("charlie"))]) r = s.search(q, limit=None) assert len(r) == count r = s.search(q, limit=3) assert len(r) == count def test_stability(): schema = fields.Schema(text=fields.TEXT) ix = RamStorage().create_index(schema) domain = u("alfa bravo charlie delta").split() w = ix.writer() for ls 
in permutations(domain, 3): w.add_document(text=u(" ").join(ls)) w.commit() with ix.searcher() as s: q = query.Term("text", u("bravo")) last = [] for i in xrange(s.doc_frequency("text", u("bravo"))): # Only un-optimized results are stable r = s.search(q, limit=i + 1, optimize=False) docnums = [hit.docnum for hit in r] assert docnums[:-1] == last last = docnums def test_terms(): schema = fields.Schema(text=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(text=u("alfa sierra tango")) w.add_document(text=u("bravo charlie delta")) w.add_document(text=u("charlie delta echo")) w.add_document(text=u("delta echo foxtrot")) w.commit() qp = qparser.QueryParser("text", ix.schema) q = qp.parse(u("(bravo AND charlie) OR foxtrot OR missing")) r = ix.searcher().search(q, terms=True) fieldobj = schema["text"] def txts(tset): return sorted(fieldobj.from_bytes(t[1]) for t in tset) assert txts(r.matched_terms()) == ["bravo", "charlie", "foxtrot"] for hit in r: value = hit["text"] for txt in txts(hit.matched_terms()): assert txt in value def test_hit_column(): # Not stored schema = fields.Schema(text=fields.TEXT()) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(text=u("alfa bravo charlie")) with ix.searcher() as s: r = s.search(query.Term("text", "alfa")) assert len(r) == 1 hit = r[0] with pytest.raises(KeyError): _ = hit["text"] # With column schema = fields.Schema(text=fields.TEXT(sortable=True)) ix = RamStorage().create_index(schema) with ix.writer(codec=W3Codec()) as w: w.add_document(text=u("alfa bravo charlie")) with ix.searcher() as s: r = s.search(query.Term("text", "alfa")) assert len(r) == 1 hit = r[0] assert hit["text"] == u("alfa bravo charlie") def test_closed_searcher(): from whoosh.reading import ReaderClosed schema = fields.Schema(key=fields.KEYWORD(stored=True, sortable=True, spelling=True)) with TempStorage() as st: ix = st.create_index(schema) with ix.writer() as w: w.add_document(key=u("alfa")) w.add_document(key=u("bravo")) w.add_document(key=u("charlie")) w.add_document(key=u("delta")) w.add_document(key=u("echo")) s = ix.searcher() r = s.search(query.TermRange("key", "b", "d")) s.close() assert s.is_closed with pytest.raises(ReaderClosed): assert r[0]["key"] == "bravo" with pytest.raises(ReaderClosed): s.reader().column_reader("key") with pytest.raises(ReaderClosed): s.reader().has_word_graph("key") with pytest.raises(ReaderClosed): s.suggest("key", "brovo") s = ix.searcher() r = s.search(query.TermRange("key", "b", "d")) assert r[0] assert r[0]["key"] == "bravo" c = s.reader().column_reader("key") assert c[1] == "bravo" assert s.reader().has_word_graph("key") assert s.suggest("key", "brovo") == ["bravo"] def test_paged_highlights(): schema = fields.Schema(text=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(text=u("alfa bravo charlie delta echo foxtrot")) w.add_document(text=u("bravo charlie delta echo foxtrot golf")) w.add_document(text=u("charlie delta echo foxtrot golf hotel")) w.add_document(text=u("delta echo foxtrot golf hotel india")) w.add_document(text=u("echo foxtrot golf hotel india juliet")) w.add_document(text=u("foxtrot golf hotel india juliet kilo")) with ix.searcher() as s: q = query.Term("text", u("alfa")) page = s.search_page(q, 1, pagelen=3) page.results.fragmenter = highlight.WholeFragmenter() page.results.formatter = highlight.UppercaseFormatter() hi = page[0].highlights("text") assert hi == u("ALFA bravo charlie delta echo foxtrot") 
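# ---------------------------------------------------------------------------
# Editor's illustrative sketch -- NOT part of the original Whoosh 2.5.7 test
# suite.  It condenses the paging + highlighting pattern exercised by
# test_paged_highlights() above into a minimal, self-contained example.
# Every call used here (RamStorage().create_index, ix.writer, search_page,
# highlight.WholeFragmenter, highlight.UppercaseFormatter, Hit.highlights,
# ResultsPage.pagenum / is_last_page) appears verbatim in the surrounding
# tests; the helper name and the document strings are invented for the
# example only.
def _example_paged_highlights_sketch():
    from whoosh import fields, highlight, query
    from whoosh.compat import u
    from whoosh.filedb.filestore import RamStorage

    # Index a few small documents in an in-memory index.
    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("alfa bravo charlie"))
        w.add_document(text=u("bravo charlie delta"))
        w.add_document(text=u("charlie delta echo"))

    with ix.searcher() as s:
        q = query.Term("text", u("charlie"))
        # Ask for page 1 with two hits per page (three docs match, so this
        # is not the last page)...
        page = s.search_page(q, 1, pagelen=2)
        assert page.pagenum == 1
        assert not page.is_last_page()
        # ...then configure highlighting on the page's underlying Results
        # object, exactly as test_paged_highlights() does.
        page.results.fragmenter = highlight.WholeFragmenter()
        page.results.formatter = highlight.UppercaseFormatter()
        # Each hit on the page highlights like a normal Hit: the matched
        # term comes back uppercased inside the whole stored field.
        assert "CHARLIE" in page[0].highlights("text")
# ---------------------------------------------------------------------------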
def test_phrase_keywords(): schema = fields.Schema(text=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(text=u("alfa bravo charlie delta")) w.add_document(text=u("bravo charlie delta echo")) w.add_document(text=u("charlie delta echo foxtrot")) w.add_document(text=u("delta echo foxtrot alfa")) w.add_document(text=u("echo foxtrot alfa bravo")) with ix.searcher() as s: q = query.Phrase("text", u("alfa bravo").split()) r = s.search(q) assert len(r) == 2 kts = " ".join(t for t, score in r.key_terms("text")) assert kts == "alfa bravo charlie foxtrot delta" def test_every_keywords(): schema = fields.Schema(title=fields.TEXT, content=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(title=u("alfa"), content=u("bravo")) w.add_document(title=u("charlie"), content=u("delta")) with ix.searcher() as s: q = qparser.QueryParser("content", ix.schema).parse("*") assert isinstance(q, query.Every) r = s.search(q, terms=True) assert len(r) == 2 hit = r[0] assert hit["content"] == "bravo" assert hit.highlights("content") == "" def test_filter_by_result(): schema = fields.Schema(title=fields.TEXT(stored=True), content=fields.TEXT(stored=True)) with TempIndex(schema, "filter") as ix: words = u("foo bar baz qux barney").split() with ix.writer() as w: for x in xrange(100): t = u("even" if x % 2 == 0 else "odd") c = words[x % len(words)] w.add_document(title=t, content=c) with ix.searcher() as searcher: fq = query.Term("title", "even") filter_result = searcher.search(fq) assert filter_result.docset is None q = query.Term("content", "foo") # filter_result.docs() result = searcher.search(q, filter=filter_result) assert all(x["title"] == "even" and x["content"] == "foo" for x in result) Whoosh-2.5.7/tests/test_searching.py0000644000076500000240000016736612277504411017630 0ustar mattstaff00000000000000#encoding: utf-8 from __future__ import with_statement import copy from datetime import datetime, timedelta import pytest from whoosh import analysis, fields, index, qparser, query, searching, scoring from whoosh.codec.whoosh3 import W3Codec from whoosh.compat import b, u, text_type from whoosh.compat import xrange, permutations, izip_longest from whoosh.filedb.filestore import RamStorage def make_index(): s = fields.Schema(key=fields.ID(stored=True), name=fields.TEXT, value=fields.TEXT) st = RamStorage() ix = st.create_index(s) w = ix.writer() w.add_document(key=u("A"), name=u("Yellow brown"), value=u("Blue red green render purple?")) w.add_document(key=u("B"), name=u("Alpha beta"), value=u("Gamma delta epsilon omega.")) w.add_document(key=u("C"), name=u("One two"), value=u("Three rendered four five.")) w.add_document(key=u("D"), name=u("Quick went"), value=u("Every red town.")) w.add_document(key=u("E"), name=u("Yellow uptown"), value=u("Interest rendering outer photo!")) w.commit() return ix def _get_keys(stored_fields): return sorted([d.get("key") for d in stored_fields]) def _docs(q, s): return _get_keys([s.stored_fields(docnum) for docnum in q.docs(s)]) def _run_query(q, target): ix = make_index() with ix.searcher() as s: assert target == _docs(q, s) def test_empty_index(): schema = fields.Schema(key=fields.ID(stored=True), value=fields.TEXT) st = RamStorage() with pytest.raises(index.EmptyIndexError): st.open_index(schema=schema) def test_docs_method(): ix = make_index() with ix.searcher() as s: assert _get_keys(s.documents(name="yellow")) == ["A", "E"] assert _get_keys(s.documents(value="red")) == ["A", "D"] 
assert _get_keys(s.documents()) == ["A", "B", "C", "D", "E"] def test_term(): _run_query(query.Term("name", u("yellow")), [u("A"), u("E")]) _run_query(query.Term("value", u("zeta")), []) _run_query(query.Term("value", u("red")), [u("A"), u("D")]) def test_require(): _run_query(query.Require(query.Term("value", u("red")), query.Term("name", u("yellow"))), [u("A")]) def test_and(): _run_query(query.And([query.Term("value", u("red")), query.Term("name", u("yellow"))]), [u("A")]) # Missing _run_query(query.And([query.Term("value", u("ochre")), query.Term("name", u("glonk"))]), []) def test_or(): _run_query(query.Or([query.Term("value", u("red")), query.Term("name", u("yellow"))]), [u("A"), u("D"), u("E")]) # Missing _run_query(query.Or([query.Term("value", u("ochre")), query.Term("name", u("glonk"))]), []) _run_query(query.Or([]), []) def test_ors(): domain = u("alfa bravo charlie delta").split() s = fields.Schema(num=fields.STORED, text=fields.TEXT) st = RamStorage() ix = st.create_index(s) with ix.writer() as w: for i, ls in enumerate(permutations(domain)): w.add_document(num=i, text=" ".join(ls)) with ix.searcher() as s: qs = [query.Term("text", word) for word in domain] for i in xrange(1, len(domain)): q = query.Or(qs[:i]) r1 = [(hit.docnum, hit.score) for hit in s.search(q, limit=None)] q.binary_matcher = True r2 = [(hit.docnum, hit.score) for hit in s.search(q, limit=None)] for item1, item2 in izip_longest(r1, r2): assert item1[0] == item2[0] assert item1[1] == item2[1] def test_not(): _run_query(query.And([query.Or([query.Term("value", u("red")), query.Term("name", u("yellow"))]), query.Not(query.Term("name", u("quick")))]), [u("A"), u("E")]) def test_topnot(): _run_query(query.Not(query.Term("value", "red")), [u("B"), "C", "E"]) _run_query(query.Not(query.Term("name", "yellow")), [u("B"), u("C"), u("D")]) def test_andnot(): _run_query(query.AndNot(query.Term("name", u("yellow")), query.Term("value", u("purple"))), [u("E")]) def test_andnot2(): schema = fields.Schema(a=fields.ID(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(a=u("bravo")) w.add_document(a=u("echo")) w.add_document(a=u("juliet")) w.commit() w = ix.writer() w.add_document(a=u("kilo")) w.add_document(a=u("foxtrot")) w.add_document(a=u("charlie")) w.commit(merge=False) w = ix.writer() w.delete_by_term("a", u("echo")) w.add_document(a=u("alfa")) w.add_document(a=u("india")) w.add_document(a=u("delta")) w.commit(merge=False) with ix.searcher() as s: q = query.TermRange("a", u("bravo"), u("k")) qr = [hit["a"] for hit in s.search(q)] assert " ".join(sorted(qr)) == "bravo charlie delta foxtrot india juliet" oq = query.Or([query.Term("a", "bravo"), query.Term("a", "delta")]) oqr = [hit["a"] for hit in s.search(oq)] assert " ".join(sorted(oqr)) == "bravo delta" anq = query.AndNot(q, oq) m = anq.matcher(s) r = s.search(anq) assert list(anq.docs(s)) == sorted(hit.docnum for hit in r) assert " ".join(sorted(hit["a"] for hit in r)) == "charlie foxtrot india juliet" def test_variations(): _run_query(query.Variations("value", u("render")), [u("A"), u("C"), u("E")]) def test_wildcard(): _run_query(query.Or([query.Wildcard('value', u('*red*')), query.Wildcard('name', u('*yellow*'))]), [u("A"), u("C"), u("D"), u("E")]) # Missing _run_query(query.Wildcard('value', 'glonk*'), []) def test_not2(): schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT) storage = RamStorage() ix = storage.create_index(schema) writer = ix.writer() writer.add_document(name=u("a"), value=u("alfa bravo charlie 
delta echo")) writer.add_document(name=u("b"), value=u("bravo charlie delta echo foxtrot")) writer.add_document(name=u("c"), value=u("charlie delta echo foxtrot golf")) writer.add_document(name=u("d"), value=u("delta echo golf hotel india")) writer.add_document(name=u("e"), value=u("echo golf hotel india juliet")) writer.commit() with ix.searcher() as s: p = qparser.QueryParser("value", None) results = s.search(p.parse("echo NOT golf")) assert sorted([d["name"] for d in results]) == ["a", "b"] results = s.search(p.parse("echo NOT bravo")) assert sorted([d["name"] for d in results]) == ["c", "d", "e"] ix.delete_by_term("value", u("bravo")) with ix.searcher() as s: results = s.search(p.parse("echo NOT charlie")) assert sorted([d["name"] for d in results]) == ["d", "e"] # def test_or_minmatch(): # schema = fields.Schema(k=fields.STORED, v=fields.TEXT) # st = RamStorage() # ix = st.create_index(schema) # # w = ix.writer() # w.add_document(k=1, v=u("alfa bravo charlie delta echo")) # w.add_document(k=2, v=u("bravo charlie delta echo foxtrot")) # w.add_document(k=3, v=u("charlie delta echo foxtrot golf")) # w.add_document(k=4, v=u("delta echo foxtrot golf hotel")) # w.add_document(k=5, v=u("echo foxtrot golf hotel india")) # w.add_document(k=6, v=u("foxtrot golf hotel india juliet")) # w.commit() # # s = ix.searcher() # q = Or([Term("v", "echo"), Term("v", "foxtrot")], minmatch=2) # r = s.search(q) # assert sorted(d["k"] for d in r), [2, 3, 4, 5]) def test_range(): schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(id=u("A"), content=u("alfa bravo charlie delta echo")) w.add_document(id=u("B"), content=u("bravo charlie delta echo foxtrot")) w.add_document(id=u("C"), content=u("charlie delta echo foxtrot golf")) w.add_document(id=u("D"), content=u("delta echo foxtrot golf hotel")) w.add_document(id=u("E"), content=u("echo foxtrot golf hotel india")) w.commit() with ix.searcher() as s: qp = qparser.QueryParser("content", schema) q = qp.parse(u("charlie [delta TO foxtrot]")) assert q.__class__ == query.And assert q[0].__class__ == query.Term assert q[1].__class__ == query.TermRange assert q[1].start == "delta" assert q[1].end == "foxtrot" assert not q[1].startexcl assert not q[1].endexcl ids = sorted([d['id'] for d in s.search(q)]) assert ids == [u('A'), u('B'), u('C')] q = qp.parse(u("foxtrot {echo TO hotel]")) assert q.__class__ == query.And assert q[0].__class__ == query.Term assert q[1].__class__ == query.TermRange assert q[1].start == "echo" assert q[1].end == "hotel" assert q[1].startexcl assert not q[1].endexcl ids = sorted([d['id'] for d in s.search(q)]) assert ids == [u('B'), u('C'), u('D'), u('E')] q = qp.parse(u("{bravo TO delta}")) assert q.__class__ == query.TermRange assert q.start == "bravo" assert q.end == "delta" assert q.startexcl assert q.endexcl ids = sorted([d['id'] for d in s.search(q)]) assert ids == [u('A'), u('B'), u('C')] # Shouldn't match anything q = qp.parse(u("[1 to 10]")) assert q.__class__ == query.TermRange assert len(s.search(q)) == 0 def test_range_clusiveness(): schema = fields.Schema(id=fields.ID(stored=True)) st = RamStorage() ix = st.create_index(schema) w = ix.writer() for letter in u("abcdefg"): w.add_document(id=letter) w.commit() with ix.searcher() as s: def check(startexcl, endexcl, string): q = query.TermRange("id", "b", "f", startexcl, endexcl) r = "".join(sorted(d['id'] for d in s.search(q))) assert r == string check(False, False, "bcdef") check(True, 
False, "cdef") check(True, True, "cde") check(False, True, "bcde") def test_open_ranges(): schema = fields.Schema(id=fields.ID(stored=True)) st = RamStorage() ix = st.create_index(schema) w = ix.writer() for letter in u("abcdefg"): w.add_document(id=letter) w.commit() with ix.searcher() as s: qp = qparser.QueryParser("id", schema) def check(qstring, result): q = qp.parse(qstring) r = "".join(sorted([d['id'] for d in s.search(q)])) assert r == result check(u("[b TO]"), "bcdefg") check(u("[TO e]"), "abcde") check(u("[b TO d]"), "bcd") check(u("{b TO]"), "cdefg") check(u("[TO e}"), "abcd") check(u("{b TO d}"), "c") def test_open_numeric_ranges(): domain = range(0, 1000, 7) schema = fields.Schema(num=fields.NUMERIC(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() for i in domain: w.add_document(num=i) w.commit() qp = qparser.QueryParser("num", schema) with ix.searcher() as s: q = qp.parse("[100 to]") r = [hit["num"] for hit in s.search(q, limit=None)] assert r == [n for n in domain if n >= 100] q = qp.parse("[to 500]") r = [hit["num"] for hit in s.search(q, limit=None)] assert r == [n for n in domain if n <= 500] def test_open_date_ranges(): basedate = datetime(2011, 1, 24, 6, 25, 0, 0) domain = [basedate + timedelta(days=n) for n in xrange(-20, 20)] schema = fields.Schema(date=fields.DATETIME(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() for d in domain: w.add_document(date=d) w.commit() with ix.searcher() as s: # Without date parser qp = qparser.QueryParser("date", schema) q = qp.parse("[2011-01-10 to]") r = [hit["date"] for hit in s.search(q, limit=None)] assert len(r) > 0 target = [d for d in domain if d >= datetime(2011, 1, 10, 6, 25)] assert r == target q = qp.parse("[to 2011-01-30]") r = [hit["date"] for hit in s.search(q, limit=None)] assert len(r) > 0 target = [d for d in domain if d <= datetime(2011, 1, 30, 6, 25)] assert r == target # With date parser from whoosh.qparser.dateparse import DateParserPlugin qp.add_plugin(DateParserPlugin(basedate)) q = qp.parse("[10 jan 2011 to]") r = [hit["date"] for hit in s.search(q, limit=None)] assert len(r) > 0 target = [d for d in domain if d >= datetime(2011, 1, 10, 6, 25)] assert r == target q = qp.parse("[to 30 jan 2011]") r = [hit["date"] for hit in s.search(q, limit=None)] assert len(r) > 0 target = [d for d in domain if d <= datetime(2011, 1, 30, 6, 25)] assert r == target def test_negated_unlimited_ranges(): # Whoosh should treat u("[to]") as if it was "*" schema = fields.Schema(id=fields.ID(stored=True), num=fields.NUMERIC, date=fields.DATETIME) ix = RamStorage().create_index(schema) w = ix.writer() from string import ascii_letters domain = text_type(ascii_letters) dt = datetime.now() for i, letter in enumerate(domain): w.add_document(id=letter, num=i, date=dt + timedelta(days=i)) w.commit() with ix.searcher() as s: qp = qparser.QueryParser("id", schema) nq = qp.parse(u("NOT [to]")) assert nq.__class__ == query.Not q = nq.query assert q.__class__ == query.Every assert "".join(h["id"] for h in s.search(q, limit=None)) == domain assert not list(nq.docs(s)) nq = qp.parse(u("NOT num:[to]")) assert nq.__class__ == query.Not q = nq.query assert q.__class__ == query.NumericRange assert q.start is None assert q.end is None assert "".join(h["id"] for h in s.search(q, limit=None)) == domain assert not list(nq.docs(s)) nq = qp.parse(u("NOT date:[to]")) assert nq.__class__ == query.Not q = nq.query assert q.__class__ == query.Every assert "".join(h["id"] for h in s.search(q, limit=None)) == domain assert 
not list(nq.docs(s)) def test_keyword_or(): schema = fields.Schema(a=fields.ID(stored=True), b=fields.KEYWORD) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(a=u("First"), b=u("ccc ddd")) w.add_document(a=u("Second"), b=u("aaa ddd")) w.add_document(a=u("Third"), b=u("ccc eee")) w.commit() qp = qparser.QueryParser("b", schema) with ix.searcher() as s: qr = qp.parse(u("b:ccc OR b:eee")) assert qr.__class__ == query.Or r = s.search(qr) assert len(r) == 2 assert r[0]["a"] == "Third" assert r[1]["a"] == "First" def test_merged(): sc = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) st = RamStorage() ix = st.create_index(sc) w = ix.writer() w.add_document(id=u("alfa"), content=u("alfa")) w.add_document(id=u("bravo"), content=u("bravo")) w.add_document(id=u("charlie"), content=u("charlie")) w.add_document(id=u("delta"), content=u("delta")) w.commit() with ix.searcher() as s: r = s.search(query.Term("content", u("bravo"))) assert len(r) == 1 assert r[0]["id"] == "bravo" w = ix.writer() w.add_document(id=u("echo"), content=u("echo")) w.commit() assert len(ix._segments()) == 1 with ix.searcher() as s: r = s.search(query.Term("content", u("bravo"))) assert len(r) == 1 assert r[0]["id"] == "bravo" def test_multireader(): sc = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT) st = RamStorage() ix = st.create_index(sc) w = ix.writer() w.add_document(id=u("alfa"), content=u("alfa")) w.add_document(id=u("bravo"), content=u("bravo")) w.add_document(id=u("charlie"), content=u("charlie")) w.add_document(id=u("delta"), content=u("delta")) w.add_document(id=u("echo"), content=u("echo")) w.add_document(id=u("foxtrot"), content=u("foxtrot")) w.add_document(id=u("golf"), content=u("golf")) w.add_document(id=u("hotel"), content=u("hotel")) w.add_document(id=u("india"), content=u("india")) w.commit() with ix.searcher() as s: r = s.search(query.Term("content", u("bravo"))) assert len(r) == 1 assert r[0]["id"] == "bravo" w = ix.writer() w.add_document(id=u("juliet"), content=u("juliet")) w.add_document(id=u("kilo"), content=u("kilo")) w.add_document(id=u("lima"), content=u("lima")) w.add_document(id=u("mike"), content=u("mike")) w.add_document(id=u("november"), content=u("november")) w.add_document(id=u("oscar"), content=u("oscar")) w.add_document(id=u("papa"), content=u("papa")) w.add_document(id=u("quebec"), content=u("quebec")) w.add_document(id=u("romeo"), content=u("romeo")) w.commit() assert len(ix._segments()) == 2 #r = ix.reader() #assert r.__class__.__name__ == "MultiReader" #pr = r.postings("content", u("bravo")) with ix.searcher() as s: r = s.search(query.Term("content", u("bravo"))) assert len(r) == 1 assert r[0]["id"] == "bravo" def test_posting_phrase(): schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT) storage = RamStorage() ix = storage.create_index(schema) writer = ix.writer() writer.add_document(name=u("A"), value=u("Little Miss Muffet sat on a tuffet")) writer.add_document(name=u("B"), value=u("Miss Little Muffet tuffet")) writer.add_document(name=u("C"), value=u("Miss Little Muffet tuffet sat")) writer.add_document(name=u("D"), value=u("Gibberish blonk falunk miss muffet sat " + "tuffet garbonzo")) writer.add_document(name=u("E"), value=u("Blah blah blah pancakes")) writer.commit() with ix.searcher() as s: def names(results): return sorted([fields['name'] for fields in results]) q = query.Phrase("value", [u("little"), u("miss"), u("muffet"), u("sat"), u("tuffet")]) m = q.matcher(s) assert m.__class__.__name__ == 
"SpanNear2Matcher" r = s.search(q) assert names(r) == ["A"] assert len(r) == 1 q = query.Phrase("value", [u("miss"), u("muffet"), u("sat"), u("tuffet")]) assert names(s.search(q)) == ["A", "D"] q = query.Phrase("value", [u("falunk"), u("gibberish")]) r = s.search(q) assert not names(r) assert len(r) == 0 q = query.Phrase("value", [u("gibberish"), u("falunk")], slop=2) assert names(s.search(q)) == ["D"] q = query.Phrase("value", [u("blah")] * 4) assert not names(s.search(q)) # blah blah blah blah q = query.Phrase("value", [u("blah")] * 3) m = q.matcher(s) assert names(s.search(q)) == ["E"] def test_phrase_score(): schema = fields.Schema(name=fields.ID(stored=True), value=fields.TEXT) storage = RamStorage() ix = storage.create_index(schema) writer = ix.writer() writer.add_document(name=u("A"), value=u("Little Miss Muffet sat on a tuffet")) writer.add_document(name=u("D"), value=u("Gibberish blonk falunk miss muffet sat " + "tuffet garbonzo")) writer.add_document(name=u("E"), value=u("Blah blah blah pancakes")) writer.add_document(name=u("F"), value=u("Little miss muffet little miss muffet")) writer.commit() with ix.searcher() as s: q = query.Phrase("value", [u("little"), u("miss"), u("muffet")]) m = q.matcher(s) assert m.id() == 0 score1 = m.weight() assert score1 > 0 m.next() assert m.id() == 3 assert m.weight() > score1 def test_stop_phrase(): schema = fields.Schema(title=fields.TEXT(stored=True)) storage = RamStorage() ix = storage.create_index(schema) writer = ix.writer() writer.add_document(title=u("Richard of York")) writer.add_document(title=u("Lily the Pink")) writer.commit() with ix.searcher() as s: qp = qparser.QueryParser("title", schema) q = qp.parse(u("richard of york")) assert q.__unicode__() == "(title:richard AND title:york)" assert len(s.search(q)) == 1 #q = qp.parse(u("lily the pink")) #assert len(s.search(q)), 1) assert len(s.find("title", u("lily the pink"))) == 1 def test_phrase_order(): tfield = fields.TEXT(stored=True, analyzer=analysis.SimpleAnalyzer()) schema = fields.Schema(text=tfield) storage = RamStorage() ix = storage.create_index(schema) writer = ix.writer() for ls in permutations(["ape", "bay", "can", "day"], 4): writer.add_document(text=u(" ").join(ls)) writer.commit() with ix.searcher() as s: def result(q): r = s.search(q, limit=None, sortedby=None) return sorted([d['text'] for d in r]) q = query.Phrase("text", ["bay", "can", "day"]) assert result(q) == [u('ape bay can day'), u('bay can day ape')] def test_phrase_sameword(): schema = fields.Schema(id=fields.STORED, text=fields.TEXT) storage = RamStorage() ix = storage.create_index(schema) writer = ix.writer() writer.add_document(id=1, text=u("The film Linda Linda Linda is good")) writer.add_document(id=2, text=u("The model Linda Evangelista is pretty")) writer.commit() with ix.searcher() as s: r = s.search(query.Phrase("text", ["linda", "linda", "linda"]), limit=None) assert len(r) == 1 assert r[0]["id"] == 1 def test_phrase_multi(): schema = fields.Schema(id=fields.STORED, text=fields.TEXT) ix = RamStorage().create_index(schema) domain = u("alfa bravo charlie delta echo").split() w = None for i, ls in enumerate(permutations(domain)): if w is None: w = ix.writer() w.add_document(id=i, text=u(" ").join(ls)) if not i % 30: w.commit() w = None if w is not None: w.commit() with ix.searcher() as s: q = query.Phrase("text", ["alfa", "bravo"]) _ = s.search(q) def test_missing_field_scoring(): schema = fields.Schema(name=fields.TEXT(stored=True), hobbies=fields.TEXT(stored=True)) storage = RamStorage() ix = 
storage.create_index(schema) writer = ix.writer() writer.add_document(name=u('Frank'), hobbies=u('baseball, basketball')) writer.commit() r = ix.reader() assert r.field_length("hobbies") == 2 assert r.field_length("name") == 1 r.close() writer = ix.writer() writer.add_document(name=u('Jonny')) writer.commit() with ix.searcher() as s: r = s.reader() assert len(ix._segments()) == 1 assert r.field_length("hobbies") == 2 assert r.field_length("name") == 2 parser = qparser.MultifieldParser(['name', 'hobbies'], schema) q = parser.parse(u("baseball")) result = s.search(q) assert len(result) == 1 def test_search_fieldname_underscores(): s = fields.Schema(my_name=fields.ID(stored=True), my_value=fields.TEXT) st = RamStorage() ix = st.create_index(s) w = ix.writer() w.add_document(my_name=u("Green"), my_value=u("It's not easy being green")) w.add_document(my_name=u("Red"), my_value=u("Hopping mad like a playground ball")) w.commit() qp = qparser.QueryParser("my_value", schema=s) with ix.searcher() as s: r = s.search(qp.parse(u("my_name:Green"))) assert r[0]['my_name'] == "Green" def test_short_prefix(): s = fields.Schema(name=fields.ID, value=fields.TEXT) qp = qparser.QueryParser("value", schema=s) q = qp.parse(u("s*")) assert q.__class__.__name__ == "Prefix" assert q.text == "s" def test_weighting(): from whoosh.scoring import Weighting, BaseScorer schema = fields.Schema(id=fields.ID(stored=True), n_comments=fields.STORED) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(id=u("1"), n_comments=5) w.add_document(id=u("2"), n_comments=12) w.add_document(id=u("3"), n_comments=2) w.add_document(id=u("4"), n_comments=7) w.commit() # Fake Weighting implementation class CommentWeighting(Weighting): def scorer(self, searcher, fieldname, text, qf=1): return self.CommentScorer(searcher.stored_fields) class CommentScorer(BaseScorer): def __init__(self, stored_fields): self.stored_fields = stored_fields def score(self, matcher): sf = self.stored_fields(matcher.id()) ncomments = sf.get("n_comments", 0) return ncomments with ix.searcher(weighting=CommentWeighting()) as s: q = query.TermRange("id", u("1"), u("4"), constantscore=False) r = s.search(q) ids = [fs["id"] for fs in r] assert ids == ["2", "4", "1", "3"] def test_dismax(): schema = fields.Schema(id=fields.STORED, f1=fields.TEXT, f2=fields.TEXT, f3=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, f1=u("alfa bravo charlie delta"), f2=u("alfa alfa alfa"), f3=u("alfa echo foxtrot hotel india")) w.commit() with ix.searcher(weighting=scoring.Frequency()) as s: assert list(s.documents(f1="alfa")) == [{"id": 1}] assert list(s.documents(f2="alfa")) == [{"id": 1}] assert list(s.documents(f3="alfa")) == [{"id": 1}] qs = [query.Term("f1", "alfa"), query.Term("f2", "alfa"), query.Term("f3", "alfa")] dm = query.DisjunctionMax(qs) r = s.search(dm) assert r.score(0) == 3.0 def test_deleted_wildcard(): schema = fields.Schema(id=fields.ID(stored=True)) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(id=u("alfa")) w.add_document(id=u("bravo")) w.add_document(id=u("charlie")) w.add_document(id=u("delta")) w.add_document(id=u("echo")) w.add_document(id=u("foxtrot")) w.commit() w = ix.writer() w.delete_by_term("id", "bravo") w.delete_by_term("id", "delta") w.delete_by_term("id", "echo") w.commit() with ix.searcher() as s: r = s.search(query.Every("id")) assert sorted([d['id'] for d in r]) == ["alfa", "charlie", "foxtrot"] def test_missing_wildcard(): schema = 
fields.Schema(id=fields.ID(stored=True), f1=fields.TEXT, f2=fields.TEXT) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(id=u("1"), f1=u("alfa"), f2=u("apple")) w.add_document(id=u("2"), f1=u("bravo")) w.add_document(id=u("3"), f1=u("charlie"), f2=u("candy")) w.add_document(id=u("4"), f2=u("donut")) w.add_document(id=u("5")) w.commit() with ix.searcher() as s: r = s.search(query.Every("id")) assert sorted([d['id'] for d in r]) == ["1", "2", "3", "4", "5"] r = s.search(query.Every("f1")) assert sorted([d['id'] for d in r]) == ["1", "2", "3"] r = s.search(query.Every("f2")) assert sorted([d['id'] for d in r]) == ["1", "3", "4"] def test_finalweighting(): from whoosh.scoring import Frequency schema = fields.Schema(id=fields.ID(stored=True), summary=fields.TEXT, n_comments=fields.STORED) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(id=u("1"), summary=u("alfa bravo"), n_comments=5) w.add_document(id=u("2"), summary=u("alfa"), n_comments=12) w.add_document(id=u("3"), summary=u("bravo"), n_comments=2) w.add_document(id=u("4"), summary=u("bravo bravo"), n_comments=7) w.commit() class CommentWeighting(Frequency): use_final = True def final(self, searcher, docnum, score): ncomments = searcher.stored_fields(docnum).get("n_comments", 0) return ncomments with ix.searcher(weighting=CommentWeighting()) as s: q = qparser.QueryParser("summary", None).parse("alfa OR bravo") r = s.search(q) ids = [fs["id"] for fs in r] assert ["2", "4", "1", "3"] == ids def test_outofdate(): schema = fields.Schema(id=fields.ID(stored=True)) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(id=u("1")) w.add_document(id=u("2")) w.commit() s = ix.searcher() assert s.up_to_date() w = ix.writer() w.add_document(id=u("3")) w.add_document(id=u("4")) assert s.up_to_date() w.commit() assert not s.up_to_date() s = s.refresh() assert s.up_to_date() s.close() def test_find_missing(): schema = fields.Schema(id=fields.ID, text=fields.KEYWORD(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=u("1"), text=u("alfa")) w.add_document(id=u("2"), text=u("bravo")) w.add_document(text=u("charlie")) w.add_document(id=u("4"), text=u("delta")) w.add_document(text=u("echo")) w.add_document(id=u("6"), text=u("foxtrot")) w.add_document(text=u("golf")) w.commit() with ix.searcher() as s: qp = qparser.QueryParser("text", schema) q = qp.parse(u("NOT id:*")) r = s.search(q, limit=None) assert list(h["text"] for h in r) == ["charlie", "echo", "golf"] def test_ngram_phrase(): f = fields.NGRAM(minsize=2, maxsize=2, phrase=True) schema = fields.Schema(text=f, path=fields.ID(stored=True)) ix = RamStorage().create_index(schema) writer = ix.writer() writer.add_document(text=u('\u9AD8\u6821\u307E\u3067\u306F\u6771\u4EAC' '\u3067\u3001\u5927\u5B66\u304B\u3089\u306F' '\u4EAC\u5927\u3067\u3059\u3002'), path=u('sample')) writer.commit() with ix.searcher() as s: p = qparser.QueryParser("text", schema) q = p.parse(u('\u6771\u4EAC\u5927\u5B66')) assert len(s.search(q)) == 1 q = p.parse(u('"\u6771\u4EAC\u5927\u5B66"')) assert len(s.search(q)) == 0 q = p.parse(u('"\u306F\u6771\u4EAC\u3067"')) assert len(s.search(q)) == 1 def test_ordered(): domain = u("alfa bravo charlie delta echo foxtrot").split(" ") schema = fields.Schema(f=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) writer = ix.writer() for ls in permutations(domain): writer.add_document(f=u(" ").join(ls)) writer.commit() with ix.searcher() as s: q = 
query.Ordered([query.Term("f", u("alfa")), query.Term("f", u("charlie")), query.Term("f", u("echo"))]) r = s.search(q) for hit in r: ls = hit["f"].split() assert "alfa" in ls assert "charlie" in ls assert "echo" in ls a = ls.index("alfa") c = ls.index("charlie") e = ls.index("echo") assert a < c and c < e, repr(ls) def test_otherwise(): schema = fields.Schema(id=fields.STORED, f=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, f=u("alfa one two")) w.add_document(id=2, f=u("alfa three four")) w.add_document(id=3, f=u("bravo four five")) w.add_document(id=4, f=u("bravo six seven")) w.commit() with ix.searcher() as s: q = query.Otherwise(query.Term("f", u("alfa")), query.Term("f", u("six"))) assert [d["id"] for d in s.search(q)] == [1, 2] q = query.Otherwise(query.Term("f", u("tango")), query.Term("f", u("four"))) assert [d["id"] for d in s.search(q)] == [2, 3] q = query.Otherwise(query.Term("f", u("tango")), query.Term("f", u("nine"))) assert [d["id"] for d in s.search(q)] == [] def test_fuzzyterm(): schema = fields.Schema(id=fields.STORED, f=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, f=u("alfa bravo charlie delta")) w.add_document(id=2, f=u("bravo charlie delta echo")) w.add_document(id=3, f=u("charlie delta echo foxtrot")) w.add_document(id=4, f=u("delta echo foxtrot golf")) w.commit() with ix.searcher() as s: q = query.FuzzyTerm("f", "brave") assert [d["id"] for d in s.search(q)] == [1, 2] def test_fuzzyterm2(): schema = fields.Schema(id=fields.STORED, f=fields.TEXT(spelling=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, f=u("alfa bravo charlie delta")) w.add_document(id=2, f=u("bravo charlie delta echo")) w.add_document(id=3, f=u("charlie delta echo foxtrot")) w.add_document(id=4, f=u("delta echo foxtrot golf")) w.commit() with ix.searcher() as s: assert list(s.reader().terms_within("f", u("brave"), 1)) == ["bravo"] q = query.FuzzyTerm("f", "brave") assert [d["id"] for d in s.search(q)] == [1, 2] def test_multireader_not(): schema = fields.Schema(id=fields.STORED, f=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=0, f=u("alfa bravo chralie")) w.add_document(id=1, f=u("bravo chralie delta")) w.add_document(id=2, f=u("charlie delta echo")) w.add_document(id=3, f=u("delta echo foxtrot")) w.add_document(id=4, f=u("echo foxtrot golf")) w.commit() with ix.searcher() as s: q = query.And([query.Term("f", "delta"), query.Not(query.Term("f", "delta"))]) r = s.search(q) assert len(r) == 0 ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=5, f=u("alfa bravo chralie")) w.add_document(id=6, f=u("bravo chralie delta")) w.commit(merge=False) w = ix.writer() w.add_document(id=7, f=u("charlie delta echo")) w.add_document(id=8, f=u("delta echo foxtrot")) w.commit(merge=False) w = ix.writer() w.add_document(id=9, f=u("echo foxtrot golf")) w.add_document(id=10, f=u("foxtrot golf delta")) w.commit(merge=False) assert len(ix._segments()) > 1 with ix.searcher() as s: q = query.And([query.Term("f", "delta"), query.Not(query.Term("f", "delta"))]) r = s.search(q) assert len(r) == 0 def test_boost_phrase(): schema = fields.Schema(title=fields.TEXT(field_boost=5.0, stored=True), text=fields.TEXT) ix = RamStorage().create_index(schema) domain = u("alfa bravo charlie delta").split() w = ix.writer() for ls in permutations(domain): t = u(" ").join(ls) w.add_document(title=t, text=t) w.commit() q = query.Or([query.Term("title", u("alfa")), 
query.Term("title", u("bravo")), query.Phrase("text", [u("bravo"), u("charlie"), u("delta")]) ]) def boost_phrases(q): if isinstance(q, query.Phrase): q.boost *= 1000.0 return q else: return q.apply(boost_phrases) q = boost_phrases(q) with ix.searcher() as s: r = s.search(q, limit=None) for hit in r: if "bravo charlie delta" in hit["title"]: assert hit.score > 100.0 def test_filter(): schema = fields.Schema(id=fields.STORED, path=fields.ID, text=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, path=u("/a/1"), text=u("alfa bravo charlie")) w.add_document(id=2, path=u("/b/1"), text=u("bravo charlie delta")) w.add_document(id=3, path=u("/c/1"), text=u("charlie delta echo")) w.commit(merge=False) w = ix.writer() w.add_document(id=4, path=u("/a/2"), text=u("delta echo alfa")) w.add_document(id=5, path=u("/b/2"), text=u("echo alfa bravo")) w.add_document(id=6, path=u("/c/2"), text=u("alfa bravo charlie")) w.commit(merge=False) w = ix.writer() w.add_document(id=7, path=u("/a/3"), text=u("bravo charlie delta")) w.add_document(id=8, path=u("/b/3"), text=u("charlie delta echo")) w.add_document(id=9, path=u("/c/3"), text=u("delta echo alfa")) w.commit(merge=False) with ix.searcher() as s: fq = query.Or([query.Prefix("path", "/a"), query.Prefix("path", "/b")]) r = s.search(query.Term("text", "alfa"), filter=fq) assert [d["id"] for d in r] == [1, 4, 5] r = s.search(query.Term("text", "bravo"), filter=fq) assert [d["id"] for d in r] == [1, 2, 5, 7, ] def test_fieldboost(): schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=0, a=u("alfa bravo charlie"), b=u("echo foxtrot india")) w.add_document(id=1, a=u("delta bravo charlie"), b=u("alfa alfa alfa")) w.add_document(id=2, a=u("alfa alfa alfa"), b=u("echo foxtrot india")) w.add_document(id=3, a=u("alfa sierra romeo"), b=u("alfa tango echo")) w.add_document(id=4, a=u("bravo charlie delta"), b=u("alfa foxtrot india")) w.add_document(id=5, a=u("alfa alfa echo"), b=u("tango tango tango")) w.add_document(id=6, a=u("alfa bravo echo"), b=u("alfa alfa tango")) w.commit() def field_booster(fieldname, factor=2.0): "Returns a function which will boost the given field in a query tree" def booster_fn(obj): if obj.is_leaf() and obj.field() == fieldname: obj = copy.deepcopy(obj) obj.boost *= factor return obj else: return obj return booster_fn with ix.searcher() as s: q = query.Or([query.Term("a", u("alfa")), query.Term("b", u("alfa"))]) q = q.accept(field_booster("a", 100.0)) assert text_type(q) == text_type("(a:alfa^100.0 OR b:alfa)") r = s.search(q) assert [hit["id"] for hit in r] == [2, 5, 6, 3, 0, 1, 4] def test_andmaybe_quality(): schema = fields.Schema(id=fields.STORED, title=fields.TEXT(stored=True), year=fields.NUMERIC) ix = RamStorage().create_index(schema) domain = [(u('Alpha Bravo Charlie Delta'), 2000), (u('Echo Bravo Foxtrot'), 2000), (u('Bravo Golf Hotel'), 2002), (u('Bravo India'), 2002), (u('Juliet Kilo Bravo'), 2004), (u('Lima Bravo Mike'), 2004)] w = ix.writer() for title, year in domain: w.add_document(title=title, year=year) w.commit() with ix.searcher() as s: qp = qparser.QueryParser("title", ix.schema) q = qp.parse(u("title:bravo ANDMAYBE year:2004")) titles = [hit["title"] for hit in s.search(q, limit=None)[:2]] assert "Juliet Kilo Bravo" in titles titles = [hit["title"] for hit in s.search(q, limit=2)] assert "Juliet Kilo Bravo" in titles def test_collect_limit(): schema = fields.Schema(id=fields.STORED, 
text=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id="a", text=u("alfa bravo charlie delta echo")) w.add_document(id="b", text=u("bravo charlie delta echo foxtrot")) w.add_document(id="c", text=u("charlie delta echo foxtrot golf")) w.add_document(id="d", text=u("delta echo foxtrot golf hotel")) w.add_document(id="e", text=u("echo foxtrot golf hotel india")) w.commit() with ix.searcher() as s: r = s.search(query.Term("text", u("golf")), limit=10) assert len(r) == 3 count = 0 for _ in r: count += 1 assert count == 3 w = ix.writer() w.add_document(id="f", text=u("foxtrot golf hotel india juliet")) w.add_document(id="g", text=u("golf hotel india juliet kilo")) w.add_document(id="h", text=u("hotel india juliet kilo lima")) w.add_document(id="i", text=u("india juliet kilo lima mike")) w.add_document(id="j", text=u("juliet kilo lima mike november")) w.commit(merge=False) with ix.searcher() as s: r = s.search(query.Term("text", u("golf")), limit=20) assert len(r) == 5 count = 0 for _ in r: count += 1 assert count == 5 def test_scorer(): schema = fields.Schema(key=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(key=u("alfa alfa alfa")) w.add_document(key=u("alfa alfa alfa alfa")) w.add_document(key=u("alfa alfa")) w.commit() w = ix.writer() w.add_document(key=u("alfa alfa alfa alfa alfa alfa")) w.add_document(key=u("alfa")) w.add_document(key=u("alfa alfa alfa alfa alfa")) w.commit(merge=False) # dw = scoring.DebugModel() # s = ix.searcher(weighting=dw) # r = s.search(query.Term("key", "alfa")) # log = dw.log # assert log, [('key', 'alfa', 0, 3.0, 3), # ('key', 'alfa', 1, 4.0, 4), # ('key', 'alfa', 2, 2.0, 2), # ('key', 'alfa', 0, 6.0, 6), # ('key', 'alfa', 1, 1.0, 1), # ('key', 'alfa', 2, 5.0, 5)]) def test_pos_scorer(): ana = analysis.SimpleAnalyzer() schema = fields.Schema(id=fields.STORED, key=fields.TEXT(analyzer=ana)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=0, key=u("0 0 1 0 0 0")) w.add_document(id=1, key=u("0 0 0 1 0 0")) w.add_document(id=2, key=u("0 1 0 0 0 0")) w.commit() w = ix.writer() w.add_document(id=3, key=u("0 0 0 0 0 1")) w.add_document(id=4, key=u("1 0 0 0 0 0")) w.add_document(id=5, key=u("0 0 0 0 1 0")) w.commit(merge=False) def pos_score_fn(searcher, fieldname, text, matcher): poses = matcher.value_as("positions") return 1.0 / (poses[0] + 1) pos_weighting = scoring.FunctionWeighting(pos_score_fn) s = ix.searcher(weighting=pos_weighting) r = s.search(query.Term("key", "1")) assert [hit["id"] for hit in r] == [4, 2, 0, 1, 5, 3] # def test_too_many_prefix_positions(): # schema = fields.Schema(id=fields.STORED, text=fields.TEXT) # ix = RamStorage().create_index(schema) # with ix.writer() as w: # for i in xrange(200): # text = u("a%s" % i) # w.add_document(id=i, text=text) # # q = query.Prefix("text", u("a")) # q.TOO_MANY_CLAUSES = 100 # # with ix.searcher() as s: # m = q.matcher(s) # assert m.supports("positions") # items = list(m.items_as("positions")) # assert [(i, [0]) for i in xrange(200)] == items def test_collapse(): from whoosh import collectors # id, text, size, tag domain = [("a", "blah blah blah", 5, "x"), ("b", "blah", 3, "y"), ("c", "blah blah blah blah", 2, "z"), ("d", "blah blah", 4, "x"), ("e", "bloop", 1, "-"), ("f", "blah blah blah blah blah", 6, "x"), ("g", "blah", 8, "w"), ("h", "blah blah", 7, "=")] schema = fields.Schema(id=fields.STORED, text=fields.TEXT, size=fields.NUMERIC, tag=fields.KEYWORD(sortable=True)) ix = 
RamStorage().create_index(schema) with ix.writer(codec=W3Codec()) as w: for id, text, size, tag in domain: w.add_document(id=u(id), text=u(text), size=size, tag=u(tag)) with ix.searcher() as s: q = query.Term("text", "blah") r = s.search(q, limit=None) assert " ".join(hit["id"] for hit in r) == "f c a d h b g" col = s.collector(limit=3) col = collectors.CollapseCollector(col, "tag") s.search_with_collector(q, col) r = col.results() assert " ".join(hit["id"] for hit in r) == "f c h" col = s.collector(limit=None) col = collectors.CollapseCollector(col, "tag") s.search_with_collector(q, col) r = col.results() assert " ".join(hit["id"] for hit in r) == "f c h b g" r = s.search(query.Every(), sortedby="size") assert " ".join(hit["id"] for hit in r) == "e c b d a f h g" col = s.collector(sortedby="size") col = collectors.CollapseCollector(col, "tag") s.search_with_collector(query.Every(), col) r = col.results() assert " ".join(hit["id"] for hit in r) == "e c b d h g" def test_collapse_nocolumn(): from whoosh import collectors # id, text, size, tag domain = [("a", "blah blah blah", 5, "x"), ("b", "blah", 3, "y"), ("c", "blah blah blah blah", 2, "z"), ("d", "blah blah", 4, "x"), ("e", "bloop", 1, "-"), ("f", "blah blah blah blah blah", 6, "x"), ("g", "blah", 8, "w"), ("h", "blah blah", 7, "=")] schema = fields.Schema(id=fields.STORED, text=fields.TEXT, size=fields.NUMERIC, tag=fields.KEYWORD) ix = RamStorage().create_index(schema) with ix.writer() as w: for id, text, size, tag in domain: w.add_document(id=u(id), text=u(text), size=size, tag=u(tag)) with ix.searcher() as s: q = query.Term("text", "blah") r = s.search(q, limit=None) assert " ".join(hit["id"] for hit in r) == "f c a d h b g" col = s.collector(limit=3) col = collectors.CollapseCollector(col, "tag") s.search_with_collector(q, col) r = col.results() assert " ".join(hit["id"] for hit in r) == "f c h" col = s.collector(limit=None) col = collectors.CollapseCollector(col, "tag") s.search_with_collector(q, col) r = col.results() assert " ".join(hit["id"] for hit in r) == "f c h b g" r = s.search(query.Every(), sortedby="size") assert " ".join(hit["id"] for hit in r) == "e c b d a f h g" col = s.collector(sortedby="size") col = collectors.CollapseCollector(col, "tag") s.search_with_collector(query.Every(), col) r = col.results() assert " ".join(hit["id"] for hit in r) == "e c b d h g" def test_collapse_length(): domain = u("alfa apple agnostic aplomb arc " "bravo big braid beer " "charlie crouch car " "delta dog " "echo " "foxtrot fold flip " "golf gym goop" ).split() schema = fields.Schema(key=fields.ID(sortable=True), word=fields.ID(stored=True)) ix = RamStorage().create_index(schema) with ix.writer(codec=W3Codec()) as w: for word in domain: w.add_document(key=word[0], word=word) with ix.searcher() as s: q = query.Every() def check(r): words = " ".join(hit["word"] for hit in r) assert words == "alfa bravo charlie delta echo foxtrot golf" assert r.scored_length() == 7 assert len(r) == 7 r = s.search(q, collapse="key", collapse_limit=1, limit=None) check(r) r = s.search(q, collapse="key", collapse_limit=1, limit=50) check(r) r = s.search(q, collapse="key", collapse_limit=1, limit=10) check(r) def test_collapse_length_nocolumn(): domain = u("alfa apple agnostic aplomb arc " "bravo big braid beer " "charlie crouch car " "delta dog " "echo " "foxtrot fold flip " "golf gym goop" ).split() schema = fields.Schema(key=fields.ID(), word=fields.ID(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: for word in domain: 
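            # Each word is keyed by its first letter, so collapsing on "key" below
            # should keep only one word per initial letter.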
w.add_document(key=word[0], word=word) with ix.searcher() as s: q = query.Every() def check(r): words = " ".join(hit["word"] for hit in r) assert words == "alfa bravo charlie delta echo foxtrot golf" assert r.scored_length() == 7 assert len(r) == 7 r = s.search(q, collapse="key", collapse_limit=1, limit=None) check(r) r = s.search(q, collapse="key", collapse_limit=1, limit=50) check(r) r = s.search(q, collapse="key", collapse_limit=1, limit=10) check(r) def test_collapse_order(): from whoosh import sorting schema = fields.Schema(id=fields.STORED, price=fields.NUMERIC(sortable=True), rating=fields.NUMERIC(sortable=True), tag=fields.ID(sortable=True)) ix = RamStorage().create_index(schema) with ix.writer(codec=W3Codec()) as w: w.add_document(id="a", price=10, rating=1, tag=u("x")) w.add_document(id="b", price=80, rating=3, tag=u("y")) w.add_document(id="c", price=60, rating=1, tag=u("z")) w.add_document(id="d", price=30, rating=2) w.add_document(id="e", price=50, rating=3, tag=u("x")) w.add_document(id="f", price=20, rating=1, tag=u("y")) w.add_document(id="g", price=50, rating=2, tag=u("z")) w.add_document(id="h", price=90, rating=5) w.add_document(id="i", price=50, rating=5, tag=u("x")) w.add_document(id="j", price=40, rating=1, tag=u("y")) w.add_document(id="k", price=50, rating=4, tag=u("z")) w.add_document(id="l", price=70, rating=2) with ix.searcher() as s: def check(kwargs, target): r = s.search(query.Every(), limit=None, **kwargs) assert " ".join(hit["id"] for hit in r) == target price = sorting.FieldFacet("price", reverse=True) rating = sorting.FieldFacet("rating", reverse=True) tag = sorting.FieldFacet("tag") check(dict(sortedby=price), "h b l c e g i k j d f a") check(dict(sortedby=price, collapse=tag), "h b l c e d") check(dict(sortedby=price, collapse=tag, collapse_order=rating), "h b l i k d") def test_collapse_order_nocolumn(): from whoosh import sorting schema = fields.Schema(id=fields.STORED, price=fields.NUMERIC(), rating=fields.NUMERIC(), tag=fields.ID()) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(id="a", price=10, rating=1, tag=u("x")) w.add_document(id="b", price=80, rating=3, tag=u("y")) w.add_document(id="c", price=60, rating=1, tag=u("z")) w.add_document(id="d", price=30, rating=2) w.add_document(id="e", price=50, rating=3, tag=u("x")) w.add_document(id="f", price=20, rating=1, tag=u("y")) w.add_document(id="g", price=50, rating=2, tag=u("z")) w.add_document(id="h", price=90, rating=5) w.add_document(id="i", price=50, rating=5, tag=u("x")) w.add_document(id="j", price=40, rating=1, tag=u("y")) w.add_document(id="k", price=50, rating=4, tag=u("z")) w.add_document(id="l", price=70, rating=2) with ix.searcher() as s: def check(kwargs, target): r = s.search(query.Every(), limit=None, **kwargs) assert " ".join(hit["id"] for hit in r) == target price = sorting.FieldFacet("price", reverse=True) rating = sorting.FieldFacet("rating", reverse=True) tag = sorting.FieldFacet("tag") check(dict(sortedby=price), "h b l c e g i k j d f a") check(dict(sortedby=price, collapse=tag), "h b l c e d") check(dict(sortedby=price, collapse=tag, collapse_order=rating), "h b l i k d") def test_coord(): from whoosh.matching import CoordMatcher schema = fields.Schema(id=fields.STORED, hits=fields.STORED, tags=fields.KEYWORD) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(id=0, hits=0, tags=u("blah blah blah blah")) w.add_document(id=1, hits=0, tags=u("echo echo blah blah")) w.add_document(id=2, hits=1, tags=u("bravo charlie delta 
echo")) w.add_document(id=3, hits=2, tags=u("charlie delta echo foxtrot")) w.add_document(id=4, hits=3, tags=u("delta echo foxtrot golf")) w.add_document(id=5, hits=3, tags=u("echo foxtrot golf hotel")) w.add_document(id=6, hits=2, tags=u("foxtrot golf hotel india")) w.add_document(id=7, hits=1, tags=u("golf hotel india juliet")) w.add_document(id=8, hits=0, tags=u("foxtrot foxtrot foo foo")) w.add_document(id=9, hits=0, tags=u("foo foo foo foo")) og = qparser.OrGroup.factory(0.99) qp = qparser.QueryParser("tags", schema, group=og) q = qp.parse("golf foxtrot echo") assert q.__class__ == query.Or assert q.scale == 0.99 with ix.searcher() as s: m = q.matcher(s) assert type(m) == CoordMatcher r = s.search(q, optimize=False) assert [hit["id"] for hit in r] == [4, 5, 3, 6, 1, 8, 2, 7] def test_keyword_search(): schema = fields.Schema(tags=fields.KEYWORD) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(tags=u("keyword1 keyword2 keyword3 keyword4 keyword5")) with ix.searcher() as s: r = s.search_page(query.Term("tags", "keyword3"), 1) assert r def test_groupedby_with_terms(): schema = fields.Schema(content=fields.TEXT, organism=fields.ID) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(organism=u("mus"), content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study00:00:00")) w.add_document(organism=u("mus"), content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study")) w.add_document(organism=u("hs"), content=u("This is the first document we've added!")) with ix.searcher() as s: q = qparser.QueryParser("content", schema=ix.schema).parse(u("IPFSTD1")) r = s.search(q, groupedby=["organism"], terms=True) assert len(r) == 2 assert r.groups("organism") == {"mus": [1, 0]} assert r.has_matched_terms() assert r.matched_terms() == set([('content', b('ipfstd1'))]) def test_score_length(): schema = fields.Schema(a=fields.TEXT, b=fields.TEXT) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(a=u("alfa bravo charlie")) w.add_document(b=u("delta echo foxtrot")) w.add_document(a=u("golf hotel india")) with ix.writer() as w: w.merge = False w.add_document(b=u("juliet kilo lima")) # In the second segment, there is an "a" field here, but in the # corresponding document in the first segment, the field doesn't exist, # so if the scorer is getting segment offsets wrong, scoring this # document will error w.add_document(a=u("mike november oskar")) w.add_document(b=u("papa quebec romeo")) with ix.searcher() as s: assert not s.is_atomic() p = s.postings("a", "mike") while p.is_active(): docnum = p.id() score = p.score() p.next() def test_terms_with_filter(): schema = fields.Schema(text=fields.TEXT) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(text=u("alfa bravo charlie delta")) w.add_document(text=u("bravo charlie delta echo")) w.add_document(text=u("charlie delta echo foxtrot")) w.add_document(text=u("delta echo foxtrot golf")) w.add_document(text=u("echo foxtrot golf hotel")) w.add_document(text=u("foxtrot golf hotel alfa")) w.add_document(text=u("golf hotel alfa bravo")) w.add_document(text=u("hotel alfa bravo charlie")) with ix.searcher() as s: workingset = set([1, 2, 3]) q = query.Term("text", u("foxtrot")) r = s.search_page(q, pagenum=1, pagelen=5, terms=True, filter=workingset) assert r.scored_length() == 2 assert [hit.docnum for hit in r] == [2, 3] def test_terms_to_bytes(): schema = fields.Schema(a=fields.TEXT, b=fields.NUMERIC, id=fields.STORED) ix = RamStorage().create_index(schema) with ix.writer() as w: 
        w.add_document(id=0, a=u("alfa bravo"), b=100)
        w.add_document(id=1, a=u("bravo charlie"), b=200)
        w.add_document(id=2, a=u("charlie delta"), b=100)
        w.add_document(id=3, a=u("delta echo"), b=200)

    with ix.searcher() as s:
        t1 = query.Term("b", 200)
        t2 = query.Term("a", "bravo")
        q = query.And([t1, t2])
        r = s.search(q)
        assert [hit["id"] for hit in r] == [1]


def test_issue_334():
    schema = fields.Schema(
        kind=fields.ID(stored=True),
        name=fields.ID(stored=True),
        returns=fields.ID(stored=True),
    )
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        with w.group():
            w.add_document(kind=u('class'), name=u('Index'))
            w.add_document(kind=u('method'), name=u('add document'), returns=u('void'))
            w.add_document(kind=u('method'), name=u('add reader'), returns=u('void'))
            w.add_document(kind=u('method'), name=u('close'), returns=u('void'))
        with w.group():
            w.add_document(kind=u('class'), name=u('Accumulator'))
            w.add_document(kind=u('method'), name=u('add'), returns=u('void'))
            w.add_document(kind=u('method'), name=u('get result'), returns=u('number'))
        with w.group():
            w.add_document(kind=u('class'), name=u('Calculator'))
            w.add_document(kind=u('method'), name=u('add'), returns=u('number'))
            w.add_document(kind=u('method'), name=u('add all'), returns=u('number'))
            w.add_document(kind=u('method'), name=u('add some'), returns=u('number'))
            w.add_document(kind=u('method'), name=u('multiply'), returns=u('number'))
            w.add_document(kind=u('method'), name=u('close'), returns=u('void'))
        with w.group():
            w.add_document(kind=u('class'), name=u('Deleter'))
            w.add_document(kind=u('method'), name=u('add'), returns=u('void'))
            w.add_document(kind=u('method'), name=u('delete'), returns=u('void'))

    with ix.searcher() as s:
        pq = query.Term('kind', 'class')
        cq = query.Term('name', 'Calculator')
        q = query.NestedChildren(pq, cq) & query.Term('returns', 'void')
        r = s.search(q)
        assert len(r) == 1
        assert r[0]["name"] == u("close")


def test_find_decimals():
    from decimal import Decimal

    schema = fields.Schema(name=fields.KEYWORD(stored=True),
                           num=fields.NUMERIC(Decimal, decimal_places=5))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(name=u("alfa"), num=Decimal("1.5"))
        w.add_document(name=u("bravo"), num=Decimal("2.1"))
        w.add_document(name=u("charlie"), num=Decimal("5.3"))
        w.add_document(name=u("delta"), num=Decimal(3))
        w.add_document(name=u("echo"), num=Decimal("3.00001"))
        w.add_document(name=u("foxtrot"), num=Decimal("3"))

    qp = qparser.QueryParser("name", ix.schema)
    q = qp.parse("num:3.0")
    assert isinstance(q, query.Term)

    with ix.searcher() as s:
        r = s.search(q)
        names = " ".join(sorted(hit["name"] for hit in r))
        assert names == "delta foxtrot"

Whoosh-2.5.7/tests/test_sorting.py

from __future__ import with_statement
from datetime import datetime, timedelta
import random
import gc

from whoosh import fields, query, sorting
from whoosh.compat import b, u
from whoosh.compat import permutations, xrange
from whoosh.filedb.filestore import RamStorage
from whoosh.util.testing import TempIndex

try:
    import multiprocessing
except ImportError:
    pass
else:
    class MPFCTask(multiprocessing.Process):
        def __init__(self, storage, indexname):
            multiprocessing.Process.__init__(self)
            self.storage = storage
            self.indexname = indexname

        def run(self):
            ix = self.storage.open_index(self.indexname)
            with ix.searcher() as s:
                r = s.search(query.Every(), sortedby="key", limit=None)
                result = "".join([h["key"] for h in r])
                assert result ==
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" docs = ({"id": u("zulu"), "num": 100, "tag": u("one"), "frac": 0.75}, {"id": u("xray"), "num": -5, "tag": u("three"), "frac": 2.0}, {"id": u("yankee"), "num": 3, "tag": u("two"), "frac": 5.5}, {"id": u("alfa"), "num": 7, "tag": u("three"), "frac": 2.25}, {"id": u("tango"), "num": 2, "tag": u("two"), "frac": 1.75}, {"id": u("foxtrot"), "num": -800, "tag": u("two"), "frac": 3.25}, {"id": u("sierra"), "num": 1, "tag": u("one"), "frac": 4.75}, {"id": u("whiskey"), "num": 0, "tag": u("three"), "frac": 5.25}, {"id": u("bravo"), "num": 582045, "tag": u("three"), "frac": 1.25}, ) def get_schema(): return fields.Schema(id=fields.ID(stored=True), num=fields.NUMERIC(stored=True), frac=fields.NUMERIC(float, stored=True), tag=fields.ID(stored=True), ev=fields.ID, ) def make_single_index(ix): w = ix.writer() for doc in docs: w.add_document(ev=u("a"), **doc) w.commit() def make_multi_index(ix): for i in xrange(0, len(docs), 3): w = ix.writer() for doc in docs[i:i + 3]: w.add_document(ev=u("a"), **doc) w.commit(merge=False) def try_sort(sortedby, key, q=None, limit=None, reverse=False): if q is None: q = query.Term("ev", u("a")) correct = [d["id"] for d in sorted(docs, key=key, reverse=reverse)][:limit] schema = get_schema() for fn in (make_single_index, make_multi_index): ix = RamStorage().create_index(schema) fn(ix) with ix.searcher() as s: r = s.search(q, sortedby=sortedby, limit=limit, reverse=reverse) rids = [d["id"] for d in r] assert rids == correct def test_sortedby(): try_sort("id", lambda d: d["id"]) try_sort("id", lambda d: d["id"], limit=5) try_sort("id", lambda d: d["id"], reverse=True) try_sort("id", lambda d: d["id"], limit=5, reverse=True) def test_multisort(): mf = sorting.MultiFacet(["tag", "id"]) try_sort(mf, lambda d: (d["tag"], d["id"])) try_sort(mf, lambda d: (d["tag"], d["id"]), reverse=True) try_sort(mf, lambda d: (d["tag"], d["id"]), limit=5) try_sort(mf, lambda d: (d["tag"], d["id"]), reverse=True, limit=5) def test_numeric(): try_sort("num", lambda d: d["num"]) try_sort("num", lambda d: d["num"], reverse=True) try_sort("num", lambda d: d["num"], limit=5) try_sort("frac", lambda d: d["frac"]) def test_empty_field(): schema = fields.Schema(id=fields.STORED, key=fields.KEYWORD) with TempIndex(schema, "emptysort") as ix: w = ix.writer() w.add_document(id=1) w.add_document(id=2) w.add_document(id=3) w.commit() with ix.searcher() as s: r = s.search(query.Every(), sortedby="key") assert [h["id"] for h in r] == [1, 2, 3] def test_page_sorted(): schema = fields.Schema(key=fields.ID(stored=True)) with TempIndex(schema, "pagesorted") as ix: domain = list(u("abcdefghijklmnopqrstuvwxyz")) random.shuffle(domain) w = ix.writer() for char in domain: w.add_document(key=char) w.commit() with ix.searcher() as s: r = s.search(query.Every(), sortedby="key", limit=5) assert r.scored_length() == 5 assert len(r) == s.doc_count_all() rp = s.search_page(query.Every(), 1, pagelen=5, sortedby="key") assert "".join([h["key"] for h in rp]) == "abcde" assert rp[10:] == [] rp = s.search_page(query.Term("key", "glonk"), 1, pagelen=5, sortedby="key") assert len(rp) == 0 assert rp.is_last_page() def test_score_facet(): schema = fields.Schema(id=fields.STORED, a=fields.TEXT, b=fields.TEXT, c=fields.ID) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, a=u("alfa alfa bravo"), b=u("bottle"), c=u("c")) w.add_document(id=2, a=u("alfa alfa alfa"), b=u("bottle"), c=u("c")) w.commit() w = ix.writer() w.add_document(id=3, a=u("alfa bravo 
bravo"), b=u("bottle"), c=u("c")) w.add_document(id=4, a=u("alfa bravo alfa"), b=u("apple"), c=u("c")) w.commit(merge=False) w = ix.writer() w.add_document(id=5, a=u("alfa bravo bravo"), b=u("apple"), c=u("c")) w.add_document(id=6, a=u("alfa alfa alfa"), b=u("apple"), c=u("c")) w.commit(merge=False) with ix.searcher() as s: facet = sorting.MultiFacet(["b", sorting.ScoreFacet()]) r = s.search(q=query.Term("a", u("alfa")), sortedby=facet) assert [h["id"] for h in r] == [6, 4, 5, 2, 1, 3] def test_function_facet(): schema = fields.Schema(id=fields.STORED, text=fields.TEXT(stored=True, vector=True)) ix = RamStorage().create_index(schema) w = ix.writer() domain = ("alfa", "bravo", "charlie") count = 1 for w1 in domain: for w2 in domain: for w3 in domain: for w4 in domain: w.add_document(id=count, text=u(" ").join((w1, w2, w3, w4))) count += 1 w.commit() def fn(searcher, docnum): v = dict(searcher.vector_as("frequency", docnum, "text")) # Give high score to documents that have equal number of "alfa" # and "bravo". Negate value so higher values sort first return 0 - (1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0)) with ix.searcher() as s: q = query.And([query.Term("text", u("alfa")), query.Term("text", u("bravo"))]) fnfacet = sorting.FunctionFacet(fn) r = s.search(q, sortedby=fnfacet) texts = [hit["text"] for hit in r] for t in texts[:10]: tks = t.split() assert tks.count("alfa") == tks.count("bravo") def test_numeric_field_facet(): schema = fields.Schema(id=fields.STORED, v1=fields.NUMERIC, v2=fields.NUMERIC) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=1, v1=2, v2=100) w.add_document(id=2, v1=1, v2=50) w.commit() w = ix.writer() w.add_document(id=3, v1=2, v2=200) w.add_document(id=4, v1=1, v2=100) w.commit() w = ix.writer(merge=False) w.add_document(id=5, v1=2, v2=50) w.add_document(id=6, v1=1, v2=200) w.commit() with ix.searcher() as s: mf = sorting.MultiFacet().add_field("v1").add_field("v2", reverse=True) r = s.search(query.Every(), sortedby=mf) assert [hit["id"] for hit in r] == [6, 4, 2, 3, 1, 5] def test_query_facet(): schema = fields.Schema(id=fields.STORED, v=fields.ID) ix = RamStorage().create_index(schema) for i, ltr in enumerate(u("iacgbehdf")): w = ix.writer() w.add_document(id=i, v=ltr) w.commit(merge=False) with ix.searcher() as s: q1 = query.TermRange("v", "a", "c") q2 = query.TermRange("v", "d", "f") q3 = query.TermRange("v", "g", "i") assert [hit["id"] for hit in s.search(q1)] == [1, 2, 4] assert [hit["id"] for hit in s.search(q2)] == [5, 7, 8] assert [hit["id"] for hit in s.search(q3)] == [0, 3, 6] facet = sorting.QueryFacet({"a-c": q1, "d-f": q2, "g-i": q3}) r = s.search(query.Every(), groupedby=facet) # If you specify a facet without a name, it's automatically called # "facet" assert r.groups("facet") == {"a-c": [1, 2, 4], "d-f": [5, 7, 8], "g-i": [0, 3, 6]} def test_query_facet_overlap(): domain = u("abcdefghi") schema = fields.Schema(v=fields.KEYWORD(stored=True), num=fields.NUMERIC(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: for i, ltr in enumerate(domain): v = "%s %s" % (ltr, domain[8 - i]) w.add_document(num=i, v=v) with ix.searcher() as s: q1 = query.TermRange("v", "a", "c") q2 = query.TermRange("v", "d", "f") q3 = query.TermRange("v", "g", "i") facets = sorting.Facets() facets.add_query("myfacet", {"a-c": q1, "d-f": q2, "g-i": q3}, allow_overlap=True) r = s.search(query.Every(), groupedby=facets) gr = r.groups("myfacet") assert r.groups("myfacet") == {'a-c': [0, 1, 2, 6, 7, 8], 'd-f': [3, 4, 5], 
'g-i': [0, 1, 2, 6, 7, 8]} def test_missing_field_facet(): schema = fields.Schema(id=fields.STORED, tag=fields.ID) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=0, tag=u("alfa")) w.add_document(id=1, tag=u("alfa")) w.add_document(id=2) w.add_document(id=3, tag=u("bravo")) w.add_document(id=4) w.commit() with ix.searcher() as s: r = s.search(query.Every(), groupedby="tag") assert r.groups("tag") == {None: [2, 4], 'bravo': [3], 'alfa': [0, 1]} def test_missing_numeric_facet(): schema = fields.Schema(id=fields.STORED, tag=fields.NUMERIC) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=0, tag=1) w.add_document(id=1, tag=1) w.add_document(id=2) w.add_document(id=3, tag=0) w.add_document(id=4) w.commit() with ix.searcher() as s: r = s.search(query.Every(), groupedby="tag") assert r.groups("tag") == {None: [2, 4], 0: [3], 1: [0, 1]} def test_missing_overlap(): schema = fields.Schema(a=fields.NUMERIC(stored=True), b=fields.KEYWORD(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(a=0, b=u("one two")) w.add_document(a=1) w.add_document(a=2, b=u("two three")) w.add_document(a=3) w.add_document(a=4, b=u("three four")) with ix.searcher() as s: facet = sorting.FieldFacet("b", allow_overlap=True) r = s.search(query.Every(), groupedby=facet) target = {"one": [0], "two": [0, 2], "three": [2, 4],"four": [4], None: [1, 3]} assert r.groups() == target def test_date_facet(): from whoosh import columns schema = fields.Schema(id=fields.STORED, date=fields.DATETIME) dc = schema["date"].default_column() assert isinstance(dc, columns.NumericColumn) ix = RamStorage().create_index(schema) w = ix.writer() d1 = datetime(2011, 7, 13) d2 = datetime(1984, 3, 29) w.add_document(id=0, date=d1) w.add_document(id=1, date=d1) w.add_document(id=2) w.add_document(id=3, date=d2) w.add_document(id=4) w.commit() with ix.searcher() as s: r = s.search(query.Every(), groupedby="date") assert r.groups() assert r.groups() == {d1: [0, 1], d2: [3], None: [2, 4]} def test_range_facet(): schema = fields.Schema(id=fields.STORED, price=fields.NUMERIC) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=0, price=200) w.add_document(id=1, price=100) w.add_document(id=2) w.add_document(id=3, price=50) w.add_document(id=4, price=500) w.add_document(id=5, price=125) w.commit() with ix.searcher() as s: rf = sorting.RangeFacet("price", 0, 1000, 100) r = s.search(query.Every(), groupedby={"price": rf}) assert r.groups("price") == {(0, 100): [3], (100, 200): [1, 5], (200, 300): [0], (500, 600): [4], None: [2]} def test_range_gaps(): schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC) ix = RamStorage().create_index(schema) w = ix.writer() for i in range(10): w.add_document(id=i, num=i) w.commit() with ix.searcher() as s: rf = sorting.RangeFacet("num", 0, 1000, [1, 2, 3]) r = s.search(query.Every(), groupedby={"num": rf}) assert r.groups("num") == {(0, 1): [0], (1, 3): [1, 2], (3, 6): [3, 4, 5], (6, 9): [6, 7, 8], (9, 12): [9]} def test_daterange_facet(): schema = fields.Schema(id=fields.STORED, date=fields.DATETIME) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(id=0, date=datetime(2001, 1, 15)) w.add_document(id=1, date=datetime(2001, 1, 10)) w.add_document(id=2) w.add_document(id=3, date=datetime(2001, 1, 3)) w.add_document(id=4, date=datetime(2001, 1, 8)) w.add_document(id=5, date=datetime(2001, 1, 6)) w.commit() with ix.searcher() as s: rf = sorting.DateRangeFacet("date", datetime(2001, 1, 1), 
datetime(2001, 1, 20), timedelta(days=5)) r = s.search(query.Every(), groupedby={"date": rf}) dt = datetime assert r.groups("date") == {(dt(2001, 1, 1, 0, 0), dt(2001, 1, 6, 0, 0)): [3], (dt(2001, 1, 6, 0, 0), dt(2001, 1, 11, 0, 0)): [1, 4, 5], (dt(2001, 1, 11, 0, 0), dt(2001, 1, 16, 0, 0)): [0], None: [2]} def test_relative_daterange(): from whoosh.support.relativedelta import relativedelta dt = datetime schema = fields.Schema(id=fields.STORED, date=fields.DATETIME) ix = RamStorage().create_index(schema) basedate = datetime(2001, 1, 1) count = 0 with ix.writer() as w: while basedate < datetime(2001, 12, 1): w.add_document(id=count, date=basedate) basedate += timedelta(days=14, hours=16) count += 1 with ix.searcher() as s: gap = relativedelta(months=1) rf = sorting.DateRangeFacet("date", dt(2001, 1, 1), dt(2001, 12, 31), gap) r = s.search(query.Every(), groupedby={"date": rf}) assert r.groups("date") == {(dt(2001, 1, 1), dt(2001, 2, 1)): [0, 1, 2], (dt(2001, 2, 1), dt(2001, 3, 1)): [3, 4], (dt(2001, 3, 1), dt(2001, 4, 1)): [5, 6], (dt(2001, 4, 1), dt(2001, 5, 1)): [7, 8], (dt(2001, 5, 1), dt(2001, 6, 1)): [9, 10], (dt(2001, 6, 1), dt(2001, 7, 1)): [11, 12], (dt(2001, 7, 1), dt(2001, 8, 1)): [13, 14], (dt(2001, 8, 1), dt(2001, 9, 1)): [15, 16], (dt(2001, 9, 1), dt(2001, 10, 1)): [17, 18], (dt(2001, 10, 1), dt(2001, 11, 1)): [19, 20], (dt(2001, 11, 1), dt(2001, 12, 1)): [21, 22], } def test_overlapping_vector(): schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD(vector=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(id=0, tags=u("alfa bravo charlie")) w.add_document(id=1, tags=u("bravo charlie delta")) w.add_document(id=2, tags=u("charlie delta echo")) w.add_document(id=3, tags=u("delta echo alfa")) w.add_document(id=4, tags=u("echo alfa bravo")) with ix.searcher() as s: of = sorting.FieldFacet("tags", allow_overlap=True) cat = of.categorizer(s) assert cat._use_vectors r = s.search(query.Every(), groupedby={"tags": of}) assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4], 'charlie': [0, 1, 2], 'delta': [1, 2, 3], 'echo': [2, 3, 4]} fcts = sorting.Facets() fcts.add_field("tags", allow_overlap=True) r = s.search(query.Every(), groupedby=fcts) assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4], 'charlie': [0, 1, 2], 'delta': [1, 2, 3], 'echo': [2, 3, 4]} def test_overlapping_lists(): schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(id=0, tags=u("alfa bravo charlie")) w.add_document(id=1, tags=u("bravo charlie delta")) w.add_document(id=2, tags=u("charlie delta echo")) w.add_document(id=3, tags=u("delta echo alfa")) w.add_document(id=4, tags=u("echo alfa bravo")) with ix.searcher() as s: of = sorting.FieldFacet("tags", allow_overlap=True) cat = of.categorizer(s) assert not cat._use_vectors r = s.search(query.Every(), groupedby={"tags": of}) assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4], 'charlie': [0, 1, 2], 'delta': [1, 2, 3], 'echo': [2, 3, 4]} fcts = sorting.Facets() fcts.add_field("tags", allow_overlap=True) r = s.search(query.Every(), groupedby=fcts) assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4], 'charlie': [0, 1, 2], 'delta': [1, 2, 3], 'echo': [2, 3, 4]} def test_field_facets(): def check(method): with TempIndex(get_schema()) as ix: method(ix) with ix.searcher() as s: results = s.search(query.Every(), groupedby="tag") groups = results.groups() assert sorted(groups.items()) == [(u('one'), 
[0, 6]), (u('three'), [1, 3, 7, 8]), (u('two'), [2, 4, 5])] check(make_single_index) check(make_multi_index) def test_multifacet(): schema = fields.Schema(tag=fields.ID(stored=True), size=fields.ID(stored=True)) with TempIndex(schema, "multifacet") as ix: w = ix.writer() w.add_document(tag=u("alfa"), size=u("small")) w.add_document(tag=u("bravo"), size=u("medium")) w.add_document(tag=u("alfa"), size=u("large")) w.add_document(tag=u("bravo"), size=u("small")) w.add_document(tag=u("alfa"), size=u("medium")) w.add_document(tag=u("bravo"), size=u("medium")) w.commit() correct = {(u('bravo'), u('medium')): [1, 5], (u('alfa'), u('large')): [2], (u('alfa'), u('medium')): [4], (u('alfa'), u('small')): [0], (u('bravo'), u('small')): [3]} with ix.searcher() as s: facet = sorting.MultiFacet(["tag", "size"]) r = s.search(query.Every(), groupedby={"tag/size": facet}) cats = r.groups(("tag/size")) assert cats == correct def test_sort_filter(): schema = fields.Schema(group=fields.ID(stored=True), key=fields.ID(stored=True)) groups = u("alfa bravo charlie").split() keys = u("abcdefghijklmnopqrstuvwxyz") source = [] for i in xrange(100): key = keys[i % len(keys)] group = groups[i % len(groups)] source.append({"key": key, "group": group}) source.sort(key=lambda x: (x["key"], x["group"])) sample = list(source) random.shuffle(sample) with TempIndex(schema, "sortfilter") as ix: w = ix.writer() for i, fs in enumerate(sample): w.add_document(**fs) i += 1 if not i % 26: w.commit(merge=False) w = ix.writer() w.commit() fq = query.Term("group", u("bravo")) with ix.searcher() as s: r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, limit=20) assert [h.fields() for h in r] == [d for d in source if d["group"] == "bravo"][:20] fq = query.Term("group", u("bravo")) r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, limit=None) assert [h.fields() for h in r] == [d for d in source if d["group"] == "bravo"] ix.optimize() with ix.searcher() as s: r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, limit=20) assert [h.fields() for h in r] == [d for d in source if d["group"] == "bravo"][:20] fq = query.Term("group", u("bravo")) r = s.search(query.Every(), sortedby=("key", "group"), filter=fq, limit=None) assert [h.fields() for h in r] == [d for d in source if d["group"] == "bravo"] def test_sorting_function(): schema = fields.Schema(id=fields.STORED, text=fields.TEXT(stored=True, vector=True)) ix = RamStorage().create_index(schema) w = ix.writer() domain = ("alfa", "bravo", "charlie") count = 1 for w1 in domain: for w2 in domain: for w3 in domain: for w4 in domain: w.add_document(id=count, text=u(" ").join((w1, w2, w3, w4))) count += 1 w.commit() def fn(searcher, docnum): v = dict(searcher.vector_as("frequency", docnum, "text")) # Sort documents that have equal number of "alfa" # and "bravo" first return 0 - 1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0) fnfacet = sorting.FunctionFacet(fn) with ix.searcher() as s: q = query.And([query.Term("text", u("alfa")), query.Term("text", u("bravo"))]) results = s.search(q, sortedby=fnfacet) r = [hit["text"] for hit in results] for t in r[:10]: tks = t.split() assert tks.count("alfa") == tks.count("bravo") class test_translate(): domain = [("alfa", 100, 50), ("bravo", 20, 80), ("charlie", 10, 10), ("delta", 82, 39), ("echo", 20, 73), ("foxtrot", 81, 59), ("golf", 39, 93), ("hotel", 57, 48), ("india", 84, 75), ] schema = fields.Schema(name=fields.TEXT(sortable=True), a=fields.NUMERIC(sortable=True), b=fields.NUMERIC(sortable=True)) ix = 
RamStorage().create_index(schema) with ix.writer() as w: for name, a, b in domain: w.add_document(name=u(name), a=a, b=b) with ix.searcher() as s: q = query.Every() # Baseline: just sort by a field r = s.search(q, sortedby="a") assert " ".join([hit["name"] for hit in r]) == "charlie bravo echo golf hotel foxtrot delta india alfa" # Sort by reversed name target = [x[0] for x in sorted(domain, key=lambda x: x[0][::-1])] tf = sorting.TranslateFacet(lambda name: name[::-1], sorting.FieldFacet("name")) r = s.search(q, sortedby=tf) assert [hit["name"] for hit in r] == target # Sort by average of a and b def avg(a, b): return (a + b) / 2 target = [x[0] for x in sorted(domain, key=lambda x: (x[1] + x[2]) / 2)] af = sorting.FieldFacet("a") bf = sorting.FieldFacet("b") tf = sorting.TranslateFacet(avg, af, bf) r = s.search(q, sortedby=tf) assert [hit["name"] for hit in r] == target def test_sorted_groups(): schema = fields.Schema(a=fields.STORED, b=fields.TEXT, c=fields.ID) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(a=0, b=u("blah"), c=u("apple")) w.add_document(a=1, b=u("blah blah"), c=u("bear")) w.add_document(a=2, b=u("blah blah blah"), c=u("apple")) w.add_document(a=3, b=u("blah blah blah blah"), c=u("bear")) w.add_document(a=4, b=u("blah blah blah blah blah"), c=u("apple")) w.add_document(a=5, b=u("blah blah blah blah blah blah"), c=u("bear")) with ix.searcher() as s: q = query.Term("b", "blah") r = s.search(q, groupedby="c") gs = r.groups("c") assert gs["apple"] == [4, 2, 0] assert gs["bear"] == [5, 3, 1] def test_group_types(): schema = fields.Schema(a=fields.STORED, b=fields.TEXT, c=fields.ID) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(a=0, b=u("blah"), c=u("apple")) w.add_document(a=1, b=u("blah blah"), c=u("bear")) w.add_document(a=2, b=u("blah blah blah"), c=u("apple")) w.add_document(a=3, b=u("blah blah blah blah"), c=u("bear")) w.add_document(a=4, b=u("blah blah blah blah blah"), c=u("apple")) w.add_document(a=5, b=u("blah blah blah blah blah blah"), c=u("bear")) w.add_document(a=6, b=u("blah blah blah blah blah blah blah"), c=u("apple")) with ix.searcher() as s: q = query.Term("b", "blah") f = sorting.FieldFacet("c", maptype=sorting.UnorderedList) r = s.search(q, groupedby=f) gs = r.groups() assert gs["apple"] == [0, 2, 4, 6] assert gs["bear"] == [1, 3, 5] f = sorting.FieldFacet("c", maptype=sorting.Count) r = s.search(q, groupedby=f) gs = r.groups() assert gs["apple"] == 4 assert gs["bear"] == 3 r = s.search(q, groupedby="c", maptype=sorting.Count) gs = r.groups() assert gs["apple"] == 4 assert gs["bear"] == 3 f = sorting.FieldFacet("c", maptype=sorting.Best) r = s.search(q, groupedby=f) gs = r.groups() assert gs["apple"] == 6 assert gs["bear"] == 5 def test_nocachefield_segments(): schema = fields.Schema(a=fields.ID(stored=True)) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(a=u("bravo")) w.add_document(a=u("echo")) w.add_document(a=u("juliet")) w.commit() w = ix.writer() w.add_document(a=u("kilo")) w.add_document(a=u("foxtrot")) w.add_document(a=u("charlie")) w.commit(merge=False) w = ix.writer() w.delete_by_term("a", u("echo")) w.add_document(a=u("alfa")) w.add_document(a=u("india")) w.add_document(a=u("delta")) w.commit(merge=False) with ix.searcher() as s: q = query.TermRange("a", u("bravo"), u("k")) facet = sorting.FieldFacet("a", reverse=True) r = s.search(q, sortedby=facet) assert [hit["a"] for hit in r] == ["juliet", "india", "foxtrot", "delta", "charlie", "bravo"] mq = 
query.Or([query.Term("a", u("bravo")), query.Term("a", u("delta"))]) anq = query.AndNot(q, mq) r = s.search(anq, sortedby=facet) assert [hit["a"] for hit in r] == ["juliet", "india", "foxtrot", "charlie"] mq = query.Or([query.Term("a", u("bravo")), query.Term("a", u("delta"))]) r = s.search(q, mask=mq, sortedby=facet) assert [hit["a"] for hit in r] == ["juliet", "india", "foxtrot", "charlie"] fq = query.Or([query.Term("a", u("alfa")), query.Term("a", u("charlie")), query.Term("a", u("echo")), query.Term("a", u("india")), ]) r = s.search(query.Every(), filter=fq, sortedby=facet) assert [hit["a"] for hit in r] == ["india", "charlie", "alfa"] nq = query.Not(query.Or([query.Term("a", u("alfa")), query.Term("a", u("india"))])) r = s.search(query.Every(), filter=nq, sortedby=facet) assert [hit["a"] for hit in r] == ["kilo", "juliet", "foxtrot", "delta", "charlie", "bravo"] def test_groupby_phrase(): domain = {"Alan Ball": "Tel Aviv", "Alan Charles": "San Francisco", "Alan Darwin": "London", "Alan Eames": "Paris"} schema = fields.Schema(name=fields.TEXT(stored=True), city=fields.TEXT(stored=True), city_g=fields.ID(stored=True)) ix = RamStorage().create_index(schema) with ix.writer() as w: for name, city in domain.items(): w.add_document(name=u(name), city=u(city), city_g=u(city)) with ix.searcher() as s: q = query.Term("name", "alan") r = s.search(q, groupedby="city_g") keys = sorted(r.groups().keys()) assert keys == ["London", "Paris", "San Francisco", "Tel Aviv"] sff = sorting.StoredFieldFacet("city") r = s.search(q, groupedby=sff) keys = sorted(r.groups().keys()) assert keys == ["London", "Paris", "San Francisco", "Tel Aviv"] def test_sort_text_field(): domain = (("Visual Display of Quantitative Information, The", 10), ("Envisioning Information", 10), ("Visual Explanations", 10), ("Beautiful Evidence", -10), ("Visual and Statistical Thinking", -10), ("Cognitive Style of Powerpoint", -10)) sorted_titles = sorted(d[0] for d in domain) schema = fields.Schema(title=fields.TEXT(stored=True, sortable=True), num=fields.NUMERIC(sortable=True)) def test(ix): with ix.searcher() as s: # Sort by title r = s.search(query.Every(), sortedby="title") assert [hit["title"] for hit in r] == sorted_titles # Sort by reverse title facet = sorting.FieldFacet("title", reverse=True) r = s.search(query.Every(), sortedby=facet) assert [hit["title"] for hit in r] == list(reversed(sorted_titles)) # Sort by num (-10 to 10) first, and within that, by reverse title facet = sorting.MultiFacet() facet.add_field("num") facet.add_field("title", reverse=True) r = s.search(query.Every(), sortedby=facet) target = ["Visual and Statistical Thinking", "Cognitive Style of Powerpoint", "Beautiful Evidence", "Visual Explanations", "Visual Display of Quantitative Information, The", "Envisioning Information", ] assert [hit["title"] for hit in r] == target # Single segment ix = RamStorage().create_index(schema) with ix.writer() as w: for title, num in domain: w.add_document(title=u(title), num=num) test(ix) # Multisegment ix = RamStorage().create_index(schema) # Segment 1 with ix.writer() as w: for title, num in domain[:3]: w.add_document(title=u(title), num=num) # Segment 2 with ix.writer() as w: for title, num in domain[3:]: w.add_document(title=u(title), num=num) w.merge = False test(ix) def test_filtered_grouped(): schema = fields.Schema(tag=fields.ID, text=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) domain = u("alfa bravo charlie delta echo foxtrot").split() with ix.writer() as w: for i, ls in 
enumerate(permutations(domain, 3)): tag = u(str(i % 3)) w.add_document(tag=tag, text=u(" ").join(ls)) with ix.searcher() as s: f = query.And([query.Term("text", "charlie"), query.Term("text", "delta")]) r = s.search(query.Every(), filter=f, groupedby="tag", limit=None) assert len(r) == 24 def test_add_sortable(): st = RamStorage() schema = fields.Schema(chapter=fields.ID(stored=True), price=fields.NUMERIC) ix = st.create_index(schema) with ix.writer() as w: w.add_document(chapter=u("alfa"), price=100) w.add_document(chapter=u("bravo"), price=200) w.add_document(chapter=u("charlie"), price=300) w.add_document(chapter=u("delta"), price=400) with ix.writer() as w: w.add_document(chapter=u("bravo"), price=500) w.add_document(chapter=u("alfa"), price=600) w.add_document(chapter=u("delta"), price=100) w.add_document(chapter=u("charlie"), price=200) w.merge = False with ix.reader() as r: assert not r.has_column("chapter") assert not r.has_column("price") with ix.writer() as w: sorting.add_sortable(w, "chapter", sorting.StoredFieldFacet("chapter")) sorting.add_sortable(w, "price", sorting.FieldFacet("price")) w.schema.test = 100 with ix.reader() as r: assert r.has_column("chapter") assert r.has_column("price") chapr = r.column_reader("chapter") pricer = r.column_reader("price") assert chapr[0] == "alfa" assert pricer[0] == 100 def test_missing_column(): from whoosh import collectors schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD) ix = RamStorage().create_index(schema) with ix.writer() as w: w.add_document(id=0, tags=u("alfa bravo charlie")) w.add_document(id=1, tags=u("bravo charlie delta")) w.add_document(id=2, tags=u("charlie delta echo")) w.merge = False with ix.writer() as w: w.add_field("age", fields.NUMERIC(sortable=True)) w.add_document(id=3, tags=u("delta echo foxtrot"), age=10) w.add_document(id=4, tags=u("echo foxtrot golf"), age=5) w.add_document(id=5, tags=u("foxtrot golf alfa"), age=20) w.merge = False with ix.writer() as w: w.add_document(id=6, tags=u("golf alfa bravo"), age=2) w.add_document(id=7, tags=u("alfa hotel india"), age=50) w.add_document(id=8, tags=u("hotel india bravo"), age=15) w.merge = False with ix.searcher() as s: assert not s.is_atomic() q = query.Term("tags", u("alfa")) # Have to use yucky low-level collector API to make sure we used a # ColumnCategorizer to do the sorting c = s.collector(sortedby="age") assert isinstance(c, collectors.SortingCollector) s.search_with_collector(q, c) assert isinstance(c.categorizer, sorting.ColumnCategorizer) r = c.results() assert [hit["id"] for hit in r] == [6, 5, 7, 0] r = s.search(q, sortedby="age", reverse=True) assert [hit["id"] for hit in r] == [0, 7, 5, 6] def test_compound_sort(): fspec = fields.KEYWORD(stored=True, sortable=True) schema = fields.Schema(a=fspec, b=fspec, c=fspec) ix = RamStorage().create_index(schema) alist = u("alfa bravo alfa bravo alfa bravo alfa bravo alfa bravo").split() blist = u("alfa bravo charlie alfa bravo charlie alfa bravo charlie alfa").split() clist = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet").split() assert all(len(ls) == 10 for ls in (alist, blist, clist)) with ix.writer() as w: for i in xrange(10): w.add_document(a=alist[i], b=blist[i], c=clist[i]) with ix.searcher() as s: q = query.Every() sortedby = [sorting.FieldFacet("a"), sorting.FieldFacet("b", reverse=True), sorting.FieldFacet("c")] r = s.search(q, sortedby=sortedby) output = [] for hit in r: output.append(" ".join((hit["a"], hit["b"], hit["c"]))) assert output == [ "alfa charlie charlie", "alfa 
charlie india", "alfa bravo echo", "alfa alfa alfa", "alfa alfa golf", "bravo charlie foxtrot", "bravo bravo bravo", "bravo bravo hotel", "bravo alfa delta", "bravo alfa juliet", ] Whoosh-2.5.7/tests/test_spans.py0000644000076500000240000002557512254366350017007 0ustar mattstaff00000000000000from __future__ import with_statement from whoosh import analysis, fields, formats from whoosh.compat import u, xrange, permutations from whoosh.filedb.filestore import RamStorage from whoosh.query import spans from whoosh.query import And, Or, Term, Phrase domain = ("alfa", "bravo", "bravo", "charlie", "delta", "echo") _ix = None def get_index(): global _ix if _ix is not None: return _ix charfield = fields.FieldType(formats.Characters(), analysis.SimpleAnalyzer(), scorable=True, stored=True) schema = fields.Schema(text=charfield) st = RamStorage() _ix = st.create_index(schema) w = _ix.writer() for ls in permutations(domain, 4): w.add_document(text=u(" ").join(ls), _stored_text=ls) w.commit() return _ix def test_multimatcher(): schema = fields.Schema(content=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) domain = ("alfa", "bravo", "charlie", "delta") for _ in xrange(3): w = ix.writer() for ls in permutations(domain): w.add_document(content=u(" ").join(ls)) w.commit(merge=False) q = Term("content", "bravo") with ix.searcher() as s: m = q.matcher(s) while m.is_active(): content = s.stored_fields(m.id())["content"].split() spans = m.spans() for span in spans: assert content[span.start] == "bravo" m.next() def test_excludematcher(): schema = fields.Schema(content=fields.TEXT(stored=True)) ix = RamStorage().create_index(schema) domain = ("alfa", "bravo", "charlie", "delta") for _ in xrange(3): w = ix.writer() for ls in permutations(domain): w.add_document(content=u(" ").join(ls)) w.commit(merge=False) w = ix.writer() w.delete_document(5) w.delete_document(10) w.delete_document(28) w.commit(merge=False) q = Term("content", "bravo") with ix.searcher() as s: m = q.matcher(s) while m.is_active(): content = s.stored_fields(m.id())["content"].split() spans = m.spans() for span in spans: assert content[span.start] == "bravo" m.next() def test_span_term(): ix = get_index() with ix.searcher() as s: alllists = [d["text"] for d in s.all_stored_fields()] for word in domain: q = Term("text", word) m = q.matcher(s) ids = set() while m.is_active(): id = m.id() sps = m.spans() ids.add(id) original = list(s.stored_fields(id)["text"]) assert word in original if word != "bravo": assert len(sps) == 1 assert original.index(word) == sps[0].start assert original.index(word) == sps[0].end m.next() for i, ls in enumerate(alllists): if word in ls: assert i in ids else: assert i not in ids def test_span_first(): ix = get_index() with ix.searcher() as s: for word in domain: q = spans.SpanFirst(Term("text", word)) m = q.matcher(s) while m.is_active(): sps = m.spans() original = s.stored_fields(m.id())["text"] assert original[0] == word assert len(sps) == 1 assert sps[0].start == 0 assert sps[0].end == 0 m.next() q = spans.SpanFirst(Term("text", "bravo"), limit=1) m = q.matcher(s) while m.is_active(): orig = s.stored_fields(m.id())["text"] for sp in m.spans(): assert orig[sp.start] == "bravo" m.next() def test_span_near(): ix = get_index() with ix.searcher() as s: def test(q): m = q.matcher(s) while m.is_active(): yield s.stored_fields(m.id())["text"], m.spans() m.next() for orig, sps in test(spans.SpanNear(Term("text", "alfa"), Term("text", "bravo"), ordered=True)): assert orig[sps[0].start] == "alfa" assert 
orig[sps[0].end] == "bravo" for orig, sps in test(spans.SpanNear(Term("text", "alfa"), Term("text", "bravo"), ordered=False)): first = orig[sps[0].start] second = orig[sps[0].end] assert ((first == "alfa" and second == "bravo") or (first == "bravo" and second == "alfa")) for orig, sps in test(spans.SpanNear(Term("text", "bravo"), Term("text", "bravo"), ordered=True)): text = " ".join(orig) assert text.find("bravo bravo") > -1 q = spans.SpanNear(spans.SpanNear(Term("text", "alfa"), Term("text", "charlie")), Term("text", "echo")) for orig, sps in test(q): text = " ".join(orig) assert text.find("alfa charlie echo") > -1 q = spans.SpanNear(Or([Term("text", "alfa"), Term("text", "charlie")]), Term("text", "echo"), ordered=True) for orig, sps in test(q): text = " ".join(orig) assert (text.find("alfa echo") > -1 or text.find("charlie echo") > -1) def test_near_unordered(): schema = fields.Schema(text=fields.TEXT(stored=True)) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(text=u("alfa bravo charlie delta echo")) w.add_document(text=u("alfa bravo delta echo charlie")) w.add_document(text=u("alfa charlie bravo delta echo")) w.add_document(text=u("echo delta alfa foxtrot")) w.commit() with ix.searcher() as s: q = spans.SpanNear(Term("text", "bravo"), Term("text", "charlie"), ordered=False) r = sorted(d["text"] for d in s.search(q)) assert r == [u('alfa bravo charlie delta echo'), u('alfa charlie bravo delta echo')] def test_span_near2(): ana = analysis.SimpleAnalyzer() schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True)) st = RamStorage() ix = st.create_index(schema) w = ix.writer() w.add_document(text=u("The Lucene library is by Doug Cutting and Whoosh " + "was made by Matt Chaput")) w.commit() nq1 = spans.SpanNear(Term("text", "lucene"), Term("text", "doug"), slop=5) nq2 = spans.SpanNear(nq1, Term("text", "whoosh"), slop=4) with ix.searcher() as s: m = nq2.matcher(s) assert m.spans() == [spans.Span(1, 8)] def test_span_not(): ix = get_index() with ix.searcher() as s: nq = spans.SpanNear(Term("text", "alfa"), Term("text", "charlie"), slop=2) bq = Term("text", "bravo") q = spans.SpanNot(nq, bq) m = q.matcher(s) while m.is_active(): orig = list(s.stored_fields(m.id())["text"]) i1 = orig.index("alfa") i2 = orig.index("charlie") dist = i2 - i1 assert 0 < dist < 3 if "bravo" in orig: assert orig.index("bravo") != i1 + 1 m.next() def test_span_or(): ix = get_index() with ix.searcher() as s: nq = spans.SpanNear(Term("text", "alfa"), Term("text", "charlie"), slop=2) bq = Term("text", "bravo") q = spans.SpanOr([nq, bq]) m = q.matcher(s) while m.is_active(): orig = s.stored_fields(m.id())["text"] assert ("alfa" in orig and "charlie" in orig) or "bravo" in orig m.next() def test_span_contains(): ix = get_index() with ix.searcher() as s: nq = spans.SpanNear(Term("text", "alfa"), Term("text", "charlie"), slop=3) cq = spans.SpanContains(nq, Term("text", "echo")) m = cq.matcher(s) ls = [] while m.is_active(): orig = s.stored_fields(m.id())["text"] ls.append(" ".join(orig)) m.next() ls.sort() assert ls == ['alfa bravo echo charlie', 'alfa bravo echo charlie', 'alfa delta echo charlie', 'alfa echo bravo charlie', 'alfa echo bravo charlie', 'alfa echo charlie bravo', 'alfa echo charlie bravo', 'alfa echo charlie delta', 'alfa echo delta charlie', 'bravo alfa echo charlie', 'bravo alfa echo charlie', 'delta alfa echo charlie', ] def test_span_before(): ix = get_index() with ix.searcher() as s: bq = spans.SpanBefore(Term("text", "alfa"), Term("text", "charlie")) m = 
bq.matcher(s)
        while m.is_active():
            orig = list(s.stored_fields(m.id())["text"])
            assert "alfa" in orig
            assert "charlie" in orig
            assert orig.index("alfa") < orig.index("charlie")
            m.next()


def test_span_condition():
    ix = get_index()
    with ix.searcher() as s:
        sc = spans.SpanCondition(Term("text", "alfa"), Term("text", "charlie"))
        m = sc.matcher(s)
        while m.is_active():
            orig = list(s.stored_fields(m.id())["text"])
            assert "alfa" in orig
            assert "charlie" in orig
            for span in m.spans():
                assert orig[span.start] == "alfa"
            m.next()


def test_regular_or():
    ix = get_index()
    with ix.searcher() as s:
        oq = Or([Term("text", "bravo"), Term("text", "alfa")])
        m = oq.matcher(s)
        while m.is_active():
            orig = s.stored_fields(m.id())["text"]
            for span in m.spans():
                v = orig[span.start]
                assert v == "bravo" or v == "alfa"
            m.next()


def test_regular_and():
    ix = get_index()
    with ix.searcher() as s:
        aq = And([Term("text", "bravo"), Term("text", "alfa")])
        m = aq.matcher(s)
        while m.is_active():
            orig = s.stored_fields(m.id())["text"]
            for span in m.spans():
                v = orig[span.start]
                assert v == "bravo" or v == "alfa"
            m.next()


def test_span_characters():
    ix = get_index()
    with ix.searcher() as s:
        pq = Phrase("text", ["bravo", "echo"])
        m = pq.matcher(s)
        while m.is_active():
            orig = " ".join(s.stored_fields(m.id())["text"])
            for span in m.spans():
                startchar, endchar = span.startchar, span.endchar
                assert orig[startchar:endchar] == "bravo echo"
            m.next()

Whoosh-2.5.7/tests/test_spelling.py

from __future__ import with_statement

import gzip

from whoosh import analysis, fields, highlight, spelling
from whoosh.automata import fst
from whoosh.compat import b, u, permutations
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser
from whoosh.util.testing import TempIndex


def words_to_corrector(words):
    st = RamStorage()
    f = st.create_file("test")
    spelling.wordlist_to_graph_file(words, f)
    f = st.open_file("test")
    return spelling.GraphCorrector(fst.GraphReader(f))


def test_graph_corrector():
    wordlist = sorted(["render", "animation", "animate", "shader", "shading",
                       "zebra", "koala", "lamppost", "ready", "kismet",
                       "reaction", "page", "delete", "quick", "brown", "fox",
                       "jumped", "over", "lazy", "dog", "wicked", "erase",
                       "red", "team", "yellow", "under", "interest", "open",
                       "print", "acrid", "sear", "deaf", "feed", "grow",
                       "heal", "jolly", "kilt", "low", "zone", "xylophone",
                       "crown", "vale", "brown", "neat", "meat", "reduction",
                       "blunder", "preaction"])

    sp = words_to_corrector(wordlist)
    sugs = sp.suggest("reoction", maxdist=2)
    assert sugs == ["reaction", "preaction", "reduction"]


def test_reader_corrector_nograph():
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("render zorro kaori postal"))
    w.add_document(text=u("reader zebra koala pastry"))
    w.add_document(text=u("leader libra ooala paster"))
    w.add_document(text=u("feeder lorry zoala baster"))
    w.commit()

    with ix.reader() as r:
        sp = spelling.ReaderCorrector(r, "text")
        assert sp.suggest(u("kaola"), maxdist=1) == ['koala']
        assert sp.suggest(u("kaola"), maxdist=2) == ['koala', 'kaori', 'ooala', 'zoala']


def test_reader_corrector():
    schema = fields.Schema(text=fields.TEXT(spelling=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("render zorro kaori postal"))
    w.add_document(text=u("reader zebra koala pastry"))
    w.add_document(text=u("leader libra ooala paster"))
    w.add_document(text=u("feeder lorry
zoala baster")) w.commit() with ix.reader() as r: assert r.has_word_graph("text") sp = spelling.ReaderCorrector(r, "text") assert sp.suggest(u("kaola"), maxdist=1) == [u('koala')] assert sp.suggest(u("kaola"), maxdist=2) == [u('koala'), u('kaori'), u('ooala'), u('zoala')] def test_simple_spelling(): schema = fields.Schema(text=fields.TEXT(spelling=True)) domain = [u("alfa"), u("bravo"), u("charlie")] ix = RamStorage().create_index(schema) with ix.writer() as w: for word in domain: w.add_document(text=word) with ix.searcher() as s: r = ix.reader() assert r.has_word_graph("text") c = r._get_graph().cursor("text") assert list(r.word_graph("text").flatten_strings()) == domain def test_unicode_spelling(): schema = fields.Schema(text=fields.ID(spelling=True)) domain = [u("\u0924\u092a\u093e\u0907\u0939\u0930\u0941"), u("\u65e5\u672c"), u("\uc774\uc124\ud76c"), ] ix = RamStorage().create_index(schema) with ix.writer() as w: for word in domain: w.add_document(text=word) with ix.reader() as r: assert r.has_word_graph("text") c = r._get_graph().cursor("text") assert list(c.flatten_strings()) == domain assert list(r.word_graph("text").flatten_strings()) == domain rc = spelling.ReaderCorrector(r, "text") assert rc.suggest(u("\u65e5\u672e\u672c")) == [u("\u65e5\u672c")] def test_add_spelling(): schema = fields.Schema(text1=fields.TEXT, text2=fields.TEXT) ix = RamStorage().create_index(schema) w = ix.writer() w.add_document(text1=u("render zorro kaori postal"), text2=u("alfa")) w.add_document(text1=u("reader zebra koala pastry"), text2=u("alpa")) w.add_document(text1=u("leader libra ooala paster"), text2=u("alpha")) w.add_document(text1=u("feeder lorry zoala baster"), text2=u("olfo")) w.commit() with ix.reader() as r: assert not r.has_word_graph("text1") assert not r.has_word_graph("text2") from whoosh.writing import add_spelling add_spelling(ix, ["text1", "text2"]) with ix.reader() as r: assert r.has_word_graph("text1") assert r.has_word_graph("text2") sp = spelling.ReaderCorrector(r, "text1") assert sp.suggest(u("kaola"), maxdist=1) == [u('koala')] assert sp.suggest(u("kaola"), maxdist=2) == [u('koala'), u('kaori'), u('ooala'), u('zoala')] sp = spelling.ReaderCorrector(r, "text2") assert sp.suggest(u("alfo"), maxdist=1) == [u("alfa"), u("olfo")] def test_multisegment(): schema = fields.Schema(text=fields.TEXT(spelling=True)) ix = RamStorage().create_index(schema) domain = u("special specious spectacular spongy spring specials").split() for word in domain: w = ix.writer() w.add_document(text=word) w.commit(merge=False) with ix.reader() as r: assert not r.is_atomic() assert r.has_word_graph("text") words = list(r.word_graph("text").flatten_strings()) assert words == sorted(domain) corr = r.corrector("text") assert corr.suggest("specail", maxdist=2) == ["special", "specials"] ix.optimize() with ix.reader() as r: assert r.is_atomic() fieldobj = schema["text"] assert [fieldobj.from_bytes(t) for t in r.lexicon("text")] == sorted(domain) assert r.has_word_graph("text") words = list(r.word_graph("text").flatten_strings()) assert words == sorted(domain) corr = r.corrector("text") assert corr.suggest("specail", maxdist=2) == ["special", "specials"] def test_multicorrector(): schema = fields.Schema(text=fields.TEXT(spelling=True)) ix = RamStorage().create_index(schema) domain = u("special specious spectacular spongy spring specials").split() for word in domain: w = ix.writer() w.add_document(text=word) w.commit(merge=False) c1 = ix.reader().corrector("text") wordlist = sorted(u("bear bare beer sprung").split()) 
    c2 = words_to_corrector(wordlist)

    mc = spelling.MultiCorrector([c1, c2])
    assert mc.suggest("specail") == ["special", "specials"]
    assert mc.suggest("beur") == ["bear", "beer"]
    assert mc.suggest("sprang") == ["sprung", "spring"]


def test_wordlist():
    domain = "special specious spectacular spongy spring specials".split()
    domain.sort()

    cor = words_to_corrector(domain)
    assert cor.suggest("specail", maxdist=1) == ["special"]


def test_wordfile():
    import os.path

    files = os.listdir(".")
    testdir = "tests"
    fname = "english-words.10.gz"
    if testdir in files:
        path = os.path.join(testdir, fname)
    elif fname in files:
        path = fname
    else:
        return

    if not os.path.exists(path):
        return

    wordfile = gzip.open(path, "rb")
    cor = words_to_corrector(wordfile)
    wordfile.close()

    assert cor.suggest("specail") == ["special"]


def test_query_highlight():
    qp = QueryParser("a", None)
    hf = highlight.HtmlFormatter()

    def do(text, terms):
        q = qp.parse(text)
        tks = [tk for tk in q.all_tokens() if tk.text in terms]
        for tk in tks:
            if tk.startchar is None or tk.endchar is None:
                assert False, tk
        fragment = highlight.Fragment(text, tks)
        return hf.format_fragment(fragment)

    assert do("a b c d", ["b"]) == 'a b c d'
    assert do('a (x:b OR y:"c d") e', ("b", "c")) == 'a (x:b OR y:"c d") e'


def test_query_terms():
    qp = QueryParser("a", None)

    q = qp.parse("alfa b:(bravo OR c:charlie) delta")
    assert sorted(q.iter_all_terms()) == [("a", "alfa"), ("a", "delta"),
                                          ("b", "bravo"), ("c", "charlie")]

    q = qp.parse("alfa brav*")
    assert sorted(q.iter_all_terms()) == [("a", "alfa")]

    q = qp.parse('a b:("b c" d)^2 e')
    tokens = [(t.fieldname, t.text, t.boost) for t in q.all_tokens()]
    assert tokens == [('a', 'a', 1.0), ('b', 'b', 2.0), ('b', 'c', 2.0),
                      ('b', 'd', 2.0), ('a', 'e', 1.0)]


def test_correct_query():
    schema = fields.Schema(a=fields.TEXT(spelling=True), b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u("alfa bravo charlie delta"))
    w.add_document(a=u("delta echo foxtrot golf"))
    w.add_document(a=u("golf hotel india juliet"))
    w.add_document(a=u("juliet kilo lima mike"))
    w.commit()

    s = ix.searcher()
    qp = QueryParser("a", ix.schema)
    qtext = u('alpha ("brovo november" OR b:dolta) detail')
    q = qp.parse(qtext, ix.schema)

    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND (a:"bravo november" OR b:dolta) AND a:detail)'
    assert c.string == 'alfa ("bravo november" OR b:dolta) detail'

    qtext = u('alpha b:("brovo november" a:delta) detail')
    q = qp.parse(qtext, ix.schema)
    c = s.correct_query(q, qtext)
    assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
    assert c.string == 'alfa b:("brovo november" a:delta) detail'

    hf = highlight.HtmlFormatter(classname="c")
    assert c.format_string(hf) == 'alfa b:("brovo november" a:delta) detail'


def test_bypass_stemming():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(text=u("rendering shading modeling reactions"))
    w.commit()

    with ix.reader() as r:
        fieldobj = schema["text"]
        assert [fieldobj.from_bytes(t) for t in r.lexicon("text")] == ["model", "reaction", "render", "shade"]
        assert list(r.word_graph("text").flatten_strings()) == ["modeling", "reactions", "rendering", "shading"]


def test_bypass_stemming2():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(content=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study00:00:00"))
        w.add_document(content=u("IPFSTD1 IPFSTD_kdwq134 Kaminski-all Study"))
        w.add_document(content=u("This is the first document we've added!"))


def test_spelling_field_order():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT, b=fields.TEXT(analyzer=ana),
                           c=fields.TEXT, d=fields.TEXT(analyzer=ana),
                           e=fields.TEXT(analyzer=ana), f=fields.TEXT)
    ix = RamStorage().create_index(schema)

    domain = u("alfa bravo charlie delta").split()
    w = ix.writer()
    for ls in permutations(domain):
        value = " ".join(ls)
        w.add_document(a=value, b=value, c=value, d=value, e=value, f=value)
    w.commit()


def test_find_self():
    wordlist = sorted(u("book bake bike bone").split())

    st = RamStorage()
    f = st.create_file("test")
    spelling.wordlist_to_graph_file(wordlist, f)

    gr = fst.GraphReader(st.open_file("test"))
    gc = spelling.GraphCorrector(gr)
    assert gc.suggest("book")[0] != "book"
    assert gc.suggest("bake")[0] != "bake"
    assert gc.suggest("bike")[0] != "bike"
    assert gc.suggest("bone")[0] != "bone"


def test_suggest_prefix():
    domain = ("Shoot To Kill",
              "Bloom, Split and Deviate",
              "Rankle the Seas and the Skies",
              "Lightning Flash Flame Shell",
              "Flower Wind Rage and Flower God Roar, Heavenly Wind Rage and "
              "Heavenly Demon Sneer",
              "All Waves, Rise now and Become my Shield, Lightning, Strike "
              "now and Become my Blade",
              "Cry, Raise Your Head, Rain Without end",
              "Sting All Enemies To Death",
              "Reduce All Creation to Ash",
              "Sit Upon the Frozen Heavens",
              "Call forth the Twilight")

    schema = fields.Schema(content=fields.TEXT(stored=True, spelling=True),
                           quick=fields.NGRAM(maxsize=10, stored=True))

    with TempIndex(schema, "sugprefix") as ix:
        with ix.writer() as w:
            for item in domain:
                content = u(item)
                w.add_document(content=content, quick=content)

        with ix.searcher() as s:
            sugs = s.suggest("content", u("ra"), maxdist=2, prefix=2)
            assert sugs == ['rage', 'rain']

            sugs = s.suggest("content", "ra", maxdist=2, prefix=1)
            assert sugs == ["rage", "rain", "roar"]


def test_prefix_address():
    fieldtype = fields.TEXT(spelling=True)
    schema = fields.Schema(f1=fieldtype, f2=fieldtype)
    with TempIndex(schema, "prefixaddr") as ix:
        with ix.writer() as w:
            w.add_document(f1=u("aabc aawx aaqr aade"),
                           f2=u("aa12 aa34 aa56 aa78"))

        with ix.searcher() as s:
            sugs = s.suggest("f1", u("aa"), maxdist=2, prefix=2)
            assert sorted(sugs) == ["aabc", "aade", "aaqr", "aawx"]

            sugs = s.suggest("f2", u("aa"), maxdist=2, prefix=2)
            assert sorted(sugs) == ["aa12", "aa34", "aa56", "aa78"]


def test_missing_suggestion():
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(content=fields.TEXT(analyzer=ana, spelling=True),
                           organism=fields.ID)
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        w.add_document(organism=u("hs"), content=u("cells"))
        w.add_document(organism=u("hs"), content=u("cell"))

    with ix.searcher() as s:
        r = s.reader()
        assert r.has_word_graph("content")
        gr = r.word_graph("content")
        assert list(gr.flatten()) == [b("cell"), b("cells")]

        c = s.corrector("content")
        # Note that corrector won't suggest the word you submit even though
        # it's in the index
        assert c.suggest("cell") == ["cells"]


def test_correct_correct():
    from whoosh import qparser

    schema = fields.Schema(a=fields.TEXT(spelling=True))
    ix = RamStorage().create_index(schema)
    ix_writer = ix.writer()

    ix_writer.add_document(a=u('dworska'))
    ix_writer.add_document(a=u('swojska'))
    ix_writer.commit()

    s = ix.searcher()
    qtext = u('dworska')

    qp = qparser.QueryParser('a', ix.schema)
    q = qp.parse(qtext, ix.schema)
    c = s.correct_query(q, qtext)

    assert c.string == "dworska"
    assert c.format_string(highlight.UppercaseFormatter()) == "dworska"


def test_very_long_words():
    import sys
    length = int(sys.getrecursionlimit() * 1.5)

    strings1 = [u(chr(i) * length) for i in range(65, 70)]
    strings2 = [u(chr(i) * length) for i in range(71, 75)]

    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for string in strings1:
            w.add_document(text=string)

    with ix.writer() as w:
        for string in strings2:
            w.add_document(text=string)
        w.optimize = True


Whoosh-2.5.7/tests/test_tables.py

# encoding: utf-8
from __future__ import with_statement
import random

from whoosh.compat import b, xrange, iteritems
from whoosh.filedb.filestore import RamStorage
from whoosh.filedb.filetables import HashReader, HashWriter
from whoosh.filedb.filetables import OrderedHashWriter, OrderedHashReader
from whoosh.util.testing import TempStorage


def test_hash_single():
    st = RamStorage()
    hw = HashWriter(st.create_file("test.hsh"))
    hw.add(b("alfa"), b("bravo"))
    hw.close()

    hr = HashReader.open(st, "test.hsh")
    assert hr.get(b("alfa")) == b("bravo")
    assert hr.get(b("foo")) is None


def test_hash():
    with TempStorage("hash") as st:
        hwf = st.create_file("test.hsh")
        hw = HashWriter(hwf)
        hw.add(b("foo"), b("bar"))
        hw.add(b("glonk"), b("baz"))
        hw.close()

        hr = HashReader.open(st, "test.hsh")
        assert hr.get(b("foo")) == b("bar")
        assert hr.get(b("baz")) is None
        hr.close()


def test_hash_extras():
    st = RamStorage()
    hw = HashWriter(st.create_file("test.hsh"))
    hw.extras["test"] = 100
    hw.add(b("foo"), b("bar"))
    hw.add(b("glonk"), b("baz"))
    hw.close()

    hr = HashReader.open(st, "test.hsh")
    assert hr.extras["test"] == 100
    assert hr.get(b("foo")) == b("bar")
    assert hr.get(b("baz")) is None
    hr.close()


def test_hash_contents():
    samp = [('alfa', 'bravo'), ('charlie', 'delta'), ('echo', 'foxtrot'),
            ('golf', 'hotel'), ('india', 'juliet'), ('kilo', 'lima'),
            ('mike', 'november'), ('oskar', 'papa'), ('quebec', 'romeo'),
            ('sierra', 'tango'), ('ultra', 'victor'), ('whiskey', 'xray'),
            ]
    # Convert to bytes
    samp = set((b(k), b(v)) for k, v in samp)

    with TempStorage("hashcontents") as st:
        hw = HashWriter(st.create_file("test.hsh"))
        hw.add_all(samp)
        hw.close()

        hr = HashReader.open(st, "test.hsh")

        probes = list(samp)
        random.shuffle(probes)
        for key, value in probes:
            assert hr[key] == value

        assert set(hr.keys()) == set([k for k, v in samp])
        assert set(hr.values()) == set([v for k, v in samp])
        assert set(hr.items()) == samp
        hr.close()


def test_random_hash():
    from string import ascii_letters as domain

    times = 1000
    minlen = 1
    maxlen = len(domain)

    def randstring():
        s = "".join(random.sample(domain, random.randint(minlen, maxlen)))
        return b(s)

    with TempStorage("randomhash") as st:
        samp = dict((randstring(), randstring()) for _ in xrange(times))

        hw = HashWriter(st.create_file("test.hsh"))
        for k, v in iteritems(samp):
            hw.add(k, v)
        hw.close()

        keys = list(samp.keys())
        random.shuffle(keys)
        hr = HashReader.open(st, "test.hsh")
        for k in keys:
            assert hr[k] == samp[k]
        hr.close()


def test_random_access():
    times = 1000
    with TempStorage("orderedhash") as st:
        hw = HashWriter(st.create_file("test.hsh"))
        hw.add_all((b("%08x" % x), b(str(x))) for x in xrange(times))
        hw.close()

        keys = list(range(times))
        random.shuffle(keys)
        hr = HashReader.open(st, "test.hsh")
        for x in keys:
            assert hr[b("%08x" % x)] == b(str(x))
        hr.close()


def test_ordered_closest():
    keys = ['alfa', 'bravo', 'charlie', 'delta', 'echo', 'foxtrot', 'golf',
            'hotel', 'india', 'juliet', 'kilo', 'lima', 'mike', 'november']
    # Make into bytes for Python 3
    keys = [b(k) for k in keys]
    values = [str(len(k)).encode("ascii") for k in keys]

    with TempStorage("orderedclosest") as st:
        hw = OrderedHashWriter(st.create_file("test.hsh"))
        hw.add_all(zip(keys, values))
        hw.close()

        hr = OrderedHashReader.open(st, "test.hsh")
        ck = hr.closest_key
        assert ck(b('')) == b('alfa')
        assert ck(b(' ')) == b('alfa')
        assert ck(b('alfa')) == b('alfa')
        assert ck(b('bravot')) == b('charlie')
        assert ck(b('charlie')) == b('charlie')
        assert ck(b('kiloton')) == b('lima')
        assert ck(b('oskar')) is None
        assert list(hr.keys()) == keys
        assert list(hr.values()) == values
        assert list(hr.keys_from(b('f'))) == keys[5:]
        hr.close()


def test_extras():
    st = RamStorage()
    hw = HashWriter(st.create_file("test"))
    hw.extras["test"] = 100
    hw.extras["blah"] = "foo"
    hw.close()

    hr = HashReader(st.open_file("test"), st.file_length("test"))
    assert hr.extras["test"] == 100
    assert hr.extras["blah"] == "foo"
    hr.close()

    hw = OrderedHashWriter(st.create_file("test"))
    hw.extras["test"] = 100
    hw.extras["blah"] = "foo"
    hw.close()

    hr = HashReader(st.open_file("test"), st.file_length("test"))
    assert hr.extras["test"] == 100
    assert hr.extras["blah"] == "foo"
    hr.close()

    hr = OrderedHashReader(st.open_file("test"), st.file_length("test"))
    assert hr.extras["test"] == 100
    assert hr.extras["blah"] == "foo"
    hr.close()


def test_checksum_file():
    from whoosh.filedb.structfile import ChecksumFile
    from zlib import crc32

    def wr(f):
        f.write(b("Testing"))
        f.write_int(-100)
        f.write_varint(10395)
        f.write_string(b("Hello"))
        f.write_ushort(32959)

    st = RamStorage()

    # Write a file normally
    f = st.create_file("control")
    wr(f)
    f.close()

    # Checksum the contents
    f = st.open_file("control")
    target = crc32(f.read()) & 0xffffffff
    f.close()

    # Write a file with checksumming
    f = st.create_file("test")
    cf = ChecksumFile(f)
    wr(cf)
    assert cf.checksum() == target
    f.close()

    # Read the file with checksumming
    f = st.open_file("test")
    cf = ChecksumFile(f)
    assert cf.read(7) == b("Testing")
    assert cf.read_int() == -100
    assert cf.read_varint() == 10395
    assert cf.read_string() == b("Hello")
    assert cf.read_ushort() == 32959
    assert cf.checksum() == target
    cf.close()


Whoosh-2.5.7/tests/test_vectors.py

from __future__ import with_statement

from whoosh import fields, formats
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage
from whoosh.util.testing import TempIndex


def test_single_term():
    schema = fields.Schema(text=fields.TEXT(vector=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("TEST TEST TEST"))

    with ix.searcher() as s:
        v = s.vector(0, "text")
        assert v.is_active()


def test_vector_reading():
    schema = fields.Schema(title=fields.TEXT,
                           content=fields.TEXT(vector=formats.Frequency()))

    with TempIndex(schema, "vectorreading") as ix:
        writer = ix.writer()
        writer.add_document(title=u("one"),
                            content=u("This is the story of the black " +
                                      "hole story"))
        writer.commit()

        with ix.reader() as r:
            assert list(r.vector_as("frequency", 0, "content")) == [(u('black'), 1), (u('hole'), 1), (u('story'), 2)]


def test_vector_merge():
    schema = fields.Schema(title=fields.TEXT,
                           content=fields.TEXT(vector=formats.Frequency()))

    with TempIndex(schema, "vectormerge") as ix:
        writer = ix.writer()
        writer.add_document(title=u("one"),
                            content=u("This is the story of the black hole " +
                                      "story"))
        writer.commit()

        writer = ix.writer()
        writer.add_document(title=u("two"),
                            content=u("You can read along in your book"))
        writer.commit()

        with ix.searcher() as s:
            r = s.reader()

            docnum = s.document_number(title=u("one"))
            vec = list(r.vector_as("frequency", docnum, "content"))
            assert vec == [(u('black'), 1), (u('hole'), 1), (u('story'), 2)]

            docnum = s.document_number(title=u("two"))
            vec = list(r.vector_as("frequency", docnum, "content"))
            assert vec == [(u('along'), 1), (u('book'), 1), (u('read'), 1)]


def test_vector_unicode():
    cf = fields.TEXT(vector=True)
    schema = fields.Schema(content=cf)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(content=u("\u13a0\u13a1\u13a2 \u13a3\u13a4\u13a5"))
    writer.add_document(content=u("\u13a6\u13a7\u13a8 \u13a9\u13aa\u13ab"))
    writer.commit()

    writer = ix.writer()
    writer.add_document(content=u("\u13ac\u13ad\u13ae \u13af\u13b0\u13b1"))
    writer.add_document(content=u("\u13b2\u13b3\u13b4 \u13b5\u13b6\u13b7"))
    writer.commit()

    with ix.reader() as r:
        vec = list(r.vector_as("frequency", 0, "content"))
        assert vec == [(u('\u13ac\u13ad\u13ae'), 1), (u('\u13af\u13b0\u13b1'), 1)]


def test_add_vectored_field():
    schema = fields.Schema(id=fields.ID(stored=True), f1=fields.TEXT)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=u("a"), f1=u("Testing one two three"))

    with ix.writer() as w:
        w.add_field("f2", fields.TEXT(vector=True))
        w.add_document(id=u("b"), f2=u("Frosting four five six"))

    with ix.searcher() as s:
        docnum1 = s.document_number(id="a")
        assert not s.has_vector(docnum1, "f1")

        docnum2 = s.document_number(id="b")
        assert not s.has_vector(docnum2, "f1")
        assert s.has_vector(docnum2, "f2")


Whoosh-2.5.7/tests/test_weightings.py

from __future__ import with_statement
import inspect
from random import choice, randint
import sys

from whoosh import fields, query, scoring
from whoosh.compat import u, xrange, permutations
from whoosh.filedb.filestore import RamStorage


def _weighting_classes(ignore):
    # Get all the subclasses of Weighting in whoosh.scoring
    return [c for _, c in inspect.getmembers(scoring, inspect.isclass)
            if scoring.Weighting in c.__bases__ and c not in ignore]


def test_all():
    domain = [u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot")]
    schema = fields.Schema(text=fields.TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    w = ix.writer()
    for _ in xrange(100):
        w.add_document(text=u(" ").join(choice(domain)
                                        for _ in xrange(randint(10, 20))))
    w.commit()

    # List ABCs that should not be tested
    abcs = ()
    # provide initializer arguments for any weighting classes that require
    # them
    init_args = {"MultiWeighting": ([scoring.BM25F()],
                                    {"text": scoring.Frequency()}),
                 "ReverseWeighting": ([scoring.BM25F()], {})}

    for wclass in _weighting_classes(abcs):
        try:
            if wclass.__name__ in init_args:
                args, kwargs = init_args[wclass.__name__]
                weighting = wclass(*args, **kwargs)
            else:
                weighting = wclass()
        except TypeError:
            e = sys.exc_info()[1]
            raise TypeError("Error instantiating %r: %s" % (wclass, e))

        with ix.searcher(weighting=weighting) as s:
            try:
                for word in domain:
                    s.search(query.Term("text", word))
            except Exception:
                e = sys.exc_info()[1]
                e.msg = "Error searching with %r: %s" % (wclass, e)
                raise


def test_compatibility():
    from whoosh.scoring import Weighting

    # This is the old way of doing a custom weighting model, check that
    # it's still supported...
    class LegacyWeighting(Weighting):
        use_final = True

        def score(self, searcher, fieldname, text, docnum, weight):
            return weight + 0.5

        def final(self, searcher, docnum, score):
            return score * 1.5

    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    domain = "alfa bravo charlie delta".split()
    for ls in permutations(domain, 3):
        w.add_document(text=u(" ").join(ls))
    w.commit()

    s = ix.searcher(weighting=LegacyWeighting())
    r = s.search(query.Term("text", u("bravo")))
    assert r.score(0) == 2.25


Whoosh-2.5.7/tests/test_writing.py

from __future__ import with_statement
import random, time, threading

import pytest

from whoosh import analysis, fields, query, writing
from whoosh.compat import b, u, xrange, text_type
from whoosh.filedb.filestore import RamStorage
from whoosh.util.testing import TempIndex


def test_no_stored():
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "nostored") as ix:
        domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
                  u("foxtrot"), u("golf"), u("hotel"), u("india"))

        w = ix.writer()
        for i in xrange(20):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.commit()

        with ix.reader() as r:
            assert sorted([int(id) for id in r.lexicon("id")]) == list(range(20))


def test_asyncwriter():
    schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(schema, "asyncwriter") as ix:
        domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
                  u("foxtrot"), u("golf"), u("hotel"), u("india"))

        writers = []
        # Simulate doing 20 (near-)simultaneous commits. If we weren't using
        # AsyncWriter, at least some of these would fail because the first
        # writer wouldn't be finished yet.
        for i in xrange(20):
            w = writing.AsyncWriter(ix)
            writers.append(w)
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
            w.commit()

        # Wait for all writers to finish before checking the results
        for w in writers:
            if w.running:
                w.join()

        # Check whether all documents made it into the index.
        with ix.reader() as r:
            assert sorted([int(id) for id in r.lexicon("id")]) == list(range(20))


def test_asyncwriter_no_stored():
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "asyncnostored") as ix:
        domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
                  u("foxtrot"), u("golf"), u("hotel"), u("india"))

        writers = []
        # Simulate doing 20 (near-)simultaneous commits. If we weren't using
        # AsyncWriter, at least some of these would fail because the first
        # writer wouldn't be finished yet.
        for i in xrange(20):
            w = writing.AsyncWriter(ix)
            writers.append(w)
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
            w.commit()

        # Wait for all writers to finish before checking the results
        for w in writers:
            if w.running:
                w.join()

        # Check whether all documents made it into the index.
        with ix.reader() as r:
            assert sorted([int(id) for id in r.lexicon("id")]) == list(range(20))


def test_updates():
    schema = fields.Schema(id=fields.ID(unique=True, stored=True))
    ix = RamStorage().create_index(schema)
    for _ in xrange(10):
        with ix.writer() as w:
            w.update_document(id=u("a"))
    assert ix.doc_count() == 1


def test_buffered():
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "buffered") as ix:
        domain = u("alfa bravo charlie delta echo foxtrot golf hotel india")
        domain = domain.split()

        w = writing.BufferedWriter(ix, period=None, limit=10,
                                   commitargs={"merge": False})
        for i in xrange(20):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        time.sleep(0.1)
        w.close()

        assert len(ix._segments()) == 2


def test_buffered_search():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    with TempIndex(schema, "bufferedsearch") as ix:
        w = writing.BufferedWriter(ix, period=None, limit=5)
        w.add_document(id=1, text=u("alfa bravo charlie"))
        w.add_document(id=2, text=u("bravo tango delta"))
        w.add_document(id=3, text=u("tango delta echo"))
        w.add_document(id=4, text=u("charlie delta echo"))

        with w.searcher() as s:
            r = s.search(query.Term("text", u("tango")))
            assert sorted([d["id"] for d in r]) == [2, 3]

        w.add_document(id=5, text=u("foxtrot golf hotel"))
        w.add_document(id=6, text=u("india tango juliet"))
        w.add_document(id=7, text=u("tango kilo lima"))
        w.add_document(id=8, text=u("mike november echo"))

        with w.searcher() as s:
            r = s.search(query.Term("text", u("tango")))
            assert sorted([d["id"] for d in r]) == [2, 3, 6, 7]

        w.close()


def test_buffered_update():
    schema = fields.Schema(id=fields.ID(stored=True, unique=True),
                           payload=fields.STORED)
    with TempIndex(schema, "bufferedupdate") as ix:
        w = writing.BufferedWriter(ix, period=None, limit=5)
        for i in xrange(10):
            for char in u("abc"):
                fs = dict(id=char, payload=text_type(i) + char)
                w.update_document(**fs)

        with w.reader() as r:
            sfs = [sf for _, sf in r.iter_docs()]
            sfs = sorted(sfs, key=lambda x: x["id"])
            assert sfs == [{'id': u('a'), 'payload': u('9a')},
                           {'id': u('b'), 'payload': u('9b')},
                           {'id': u('c'), 'payload': u('9c')}]
            assert r.doc_count() == 3

        w.close()


def test_buffered_threads():
    domain = u("alfa bravo charlie delta").split()
    schema = fields.Schema(name=fields.ID(unique=True, stored=True))
    with TempIndex(schema, "buffthreads") as ix:
        class SimWriter(threading.Thread):
            def run(self):
                for _ in xrange(5):
                    w.update_document(name=random.choice(domain))
                    time.sleep(random.uniform(0.01, 0.1))

        w = writing.BufferedWriter(ix, limit=10)
        threads = [SimWriter() for _ in xrange(5)]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        w.close()

        with ix.reader() as r:
            assert r.doc_count() == 4
            assert sorted([d["name"] for d in r.all_stored_fields()]) == domain


def test_fractional_weights():
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()

    # With Positions format
    schema = fields.Schema(f=fields.TEXT(analyzer=ana))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
    w.commit()

    with ix.searcher() as s:
        wts = []
        for word in s.lexicon("f"):
            p = s.postings("f", word)
            wts.append(p.weight())
        assert wts == [0.5, 1.5, 2.0, 1.5]

    # Try again with Frequency format
    schema = fields.Schema(f=fields.TEXT(analyzer=ana, phrase=False))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
    w.commit()

    with ix.searcher() as s:
        wts = []
        for word in s.lexicon("f"):
            p = s.postings("f", word)
            wts.append(p.weight())
        assert wts == [0.5, 1.5, 2.0, 1.5]


def test_cancel_delete():
    schema = fields.Schema(id=fields.ID(stored=True))
    # Single segment
    with TempIndex(schema, "canceldelete1") as ix:
        w = ix.writer()
        for char in u("ABCD"):
            w.add_document(id=char)
        w.commit()

        with ix.reader() as r:
            assert not r.has_deletions()

        w = ix.writer()
        w.delete_document(2)
        w.delete_document(3)
        w.cancel()

        with ix.reader() as r:
            assert not r.has_deletions()
            assert not r.is_deleted(2)
            assert not r.is_deleted(3)

    # Multiple segments
    with TempIndex(schema, "canceldelete2") as ix:
        for char in u("ABCD"):
            w = ix.writer()
            w.add_document(id=char)
            w.commit(merge=False)

        with ix.reader() as r:
            assert not r.has_deletions()

        w = ix.writer()
        w.delete_document(2)
        w.delete_document(3)
        w.cancel()

        with ix.reader() as r:
            assert not r.has_deletions()
            assert not r.is_deleted(2)
            assert not r.is_deleted(3)


def test_delete_nonexistant():
    from whoosh.writing import IndexingError

    schema = fields.Schema(id=fields.ID(stored=True))
    # Single segment
    with TempIndex(schema, "deletenon1") as ix:
        w = ix.writer()
        for char in u("ABC"):
            w.add_document(id=char)
        w.commit()

        try:
            w = ix.writer()
            with pytest.raises(IndexingError):
                w.delete_document(5)
        finally:
            w.cancel()

    # Multiple segments
    with TempIndex(schema, "deletenon1") as ix:
        for char in u("ABC"):
            w = ix.writer()
            w.add_document(id=char)
            w.commit(merge=False)

        try:
            w = ix.writer()
            with pytest.raises(IndexingError):
                w.delete_document(5)
        finally:
            w.cancel()


def test_add_field():
    schema = fields.Schema(a=fields.TEXT)
    with TempIndex(schema, "addfield") as ix:
        with ix.writer() as w:
            w.add_document(a=u("alfa bravo charlie"))

        with ix.writer() as w:
            w.add_field("b", fields.ID(stored=True))
            w.add_field("c*", fields.ID(stored=True), glob=True)
            w.add_document(a=u("delta echo foxtrot"), b=u("india"), cat=u("juliet"))

        with ix.searcher() as s:
            fs = s.document(b=u("india"))
            assert fs == {"b": "india", "cat": "juliet"}


def test_add_reader():
    schema = fields.Schema(i=fields.ID(stored=True, unique=True),
                           a=fields.TEXT(stored=True, spelling=True),
                           b=fields.TEXT(vector=True))
    with TempIndex(schema, "addreader") as ix:
        with ix.writer() as w:
            w.add_document(i=u("0"), a=u("alfa bravo charlie delta"),
                           b=u("able baker coxwell dog"))
            w.add_document(i=u("1"), a=u("bravo charlie delta echo"),
                           b=u("elf fabio gong hiker"))
            w.add_document(i=u("2"), a=u("charlie delta echo foxtrot"),
                           b=u("india joker king loopy"))
            w.add_document(i=u("3"), a=u("delta echo foxtrot golf"),
                           b=u("mister noogie oompah pancake"))

        with ix.writer() as w:
            w.delete_by_term("i", "1")
            w.delete_by_term("i", "3")

        with ix.writer() as w:
            w.add_document(i=u("4"), a=u("hotel india juliet kilo"),
                           b=u("quick rhubarb soggy trap"))
            w.add_document(i=u("5"), a=u("india juliet kilo lima"),
                           b=u("umber violet weird xray"))

        with ix.reader() as r:
            assert r.doc_count_all() == 4

            sfs = list(r.all_stored_fields())
            assert sfs == [{"i": u("4"), "a": u("hotel india juliet kilo")},
                           {"i": u("5"), "a": u("india juliet kilo lima")},
                           {"i": u("0"), "a": u("alfa bravo charlie delta")},
                           {"i": u("2"), "a": u("charlie delta echo foxtrot")},
                           ]

            assert " ".join(r.field_terms("a")) == "alfa bravo charlie delta echo foxtrot hotel india juliet kilo lima"

            vs = []
            for docnum in r.all_doc_ids():
                v = r.vector(docnum, "b")
                vs.append(list(v.all_ids()))
            assert vs == [["quick", "rhubarb", "soggy", "trap"],
                          ["umber", "violet", "weird", "xray"],
                          ["able", "baker", "coxwell", "dog"],
                          ["india", "joker", "king", "loopy"]
                          ]

            gr = r.word_graph("a")
            assert list(gr.flatten_strings()) == ["alfa", "bravo", "charlie",
                                                  "delta", "echo", "foxtrot",
                                                  "hotel", "india", "juliet",
                                                  "kilo", "lima"]


def test_add_reader_spelling():
    # Test whether add_spell_word() items get copied over in a merge
    # Because b is stemming and spelled, it will use add_spell_word()
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(a=fields.TEXT(spelling=True),
                           b=fields.TEXT(analyzer=ana, spelling=True))

    with TempIndex(schema, "addreadersp") as ix:
        with ix.writer() as w:
            w.add_document(a=u("rendering modeling compositing enabling"),
                           b=u("rendering modeling compositing enabling"))
            w.add_document(a=u("flying rolling tying quitting polling"),
                           b=u("flying rolling tying quitting polling"))

        with ix.writer() as w:
            w.add_document(a=u("writing eyeing ripping timing yelling"),
                           b=u("writing eyeing ripping timing yelling"))
            w.add_document(a=u("undoing indicating opening pressing"),
                           b=u("undoing indicating opening pressing"))

        with ix.searcher() as s:
            gr = s.reader().word_graph("a")
            assert " ".join(gr.flatten_strings()) == ("compositing enabling eyeing flying indicating "
                                                      "modeling opening polling pressing quitting "
                                                      "rendering ripping rolling timing tying undoing "
                                                      "writing yelling")

            gr = s.reader().word_graph("b")
            assert " ".join(gr.flatten_strings()) == ("compositing enabling eyeing flying indicating "
                                                      "modeling opening polling pressing quitting "
                                                      "rendering ripping rolling timing tying undoing "
                                                      "writing yelling")


def test_clear():
    schema = fields.Schema(a=fields.KEYWORD)
    ix = RamStorage().create_index(schema)

    # Add some segments
    with ix.writer() as w:
        w.add_document(a=u("one two three"))
        w.merge = False
    with ix.writer() as w:
        w.add_document(a=u("two three four"))
        w.merge = False
    with ix.writer() as w:
        w.add_document(a=u("three four five"))
        w.merge = False

    # Clear
    with ix.writer() as w:
        w.add_document(a=u("foo bar baz"))
        w.mergetype = writing.CLEAR

    with ix.searcher() as s:
        assert s.doc_count_all() == 1
        assert list(s.reader().lexicon("a")) == [b("bar"), b("baz"), b("foo")]


def test_spellable_list():
    # Make sure a spellable field works with a list of pre-analyzed tokens
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(Location=fields.STORED, Lang=fields.STORED,
                           Title=fields.TEXT(spelling=True, analyzer=ana))
    ix = RamStorage().create_index(schema)

    doc = {'Location': '1000/123', 'Lang': 'E',
           'Title': ['Introduction', 'Numerical', 'Analysis']}

    with ix.writer() as w:
        w.add_document(**doc)


def test_zero_procs():
    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)
    with ix.writer(procs=0) as w:
        assert isinstance(w, writing.IndexWriter)

    with ix.writer(procs=1) as w:
        assert isinstance(w, writing.IndexWriter)
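# --- Editor's note: not part of the Whoosh 2.5.7 test suite ---
# The tests above repeatedly exercise the same minimal workflow: define a
# Schema, create an index on a RamStorage, add documents with a writer, and
# query with a searcher. The sketch below condenses that workflow into one
# self-contained example, using only calls that appear verbatim in the tests
# (fields.Schema, RamStorage().create_index, writer.add_document,
# searcher.search, query.Term). It is illustrative only; the function name
# example_minimal_roundtrip is made up for this sketch.

from whoosh import fields, query
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage


def example_minimal_roundtrip():
    # Define a schema with a stored id and a full-text field
    schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    ix = RamStorage().create_index(schema)

    # Write a couple of documents; the writer context manager commits on exit
    with ix.writer() as w:
        w.add_document(id=u("1"), text=u("alfa bravo charlie"))
        w.add_document(id=u("2"), text=u("bravo delta echo"))

    # Search for a single term and collect the stored ids of the hits
    with ix.searcher() as s:
        results = s.search(query.Term("text", u("bravo")))
        return sorted(hit["id"] for hit in results)


if __name__ == "__main__":
    # Expected to print ['1', '2'] under the assumptions above
    print(example_minimal_roundtrip())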