pax_global_header00006660000000000000000000000064130555255240014520gustar00rootroot0000000000000052 comment=f67036e5cdaf2f88c19ab344b1a75788faac1dc0 xapian-haystack-2.1.0/000077500000000000000000000000001305552552400146055ustar00rootroot00000000000000xapian-haystack-2.1.0/.coveragerc000066400000000000000000000003431305552552400167260ustar00rootroot00000000000000[report] exclude_lines = def __repr__ raise NotImplementedError raise MissingDependency except xapian.DatabaseModifiedError [run] source = haystack/backends/xapian_backend.py test_haystack/xapian_tests xapian-haystack-2.1.0/.gitignore000066400000000000000000000000561305552552400165760ustar00rootroot00000000000000*.pyc *.tmproj *.DS_Store dist NOTES MANIFEST xapian-haystack-2.1.0/.travis.yml000066400000000000000000000042271305552552400167230ustar00rootroot00000000000000sudo: false language: python matrix: include: - python: 3.4 env: DJANGO_VERSION=1.10 XAPIAN_VERSION=1.4.1 - python: 3.4 env: DJANGO_VERSION=1.9 XAPIAN_VERSION=1.4.1 - python: 3.4 env: DJANGO_VERSION=1.8 XAPIAN_VERSION=1.4.1 - python: 3.4 env: DJANGO_VERSION=1.10 XAPIAN_VERSION=1.3.3 - python: 3.4 env: DJANGO_VERSION=1.9 XAPIAN_VERSION=1.3.3 - python: 3.4 env: DJANGO_VERSION=1.8 XAPIAN_VERSION=1.3.3 - python: 2.7 env: DJANGO_VERSION=1.10 XAPIAN_VERSION=1.4.1 - python: 2.7 env: DJANGO_VERSION=1.9 XAPIAN_VERSION=1.4.1 - python: 2.7 env: DJANGO_VERSION=1.8 XAPIAN_VERSION=1.4.1 - python: 2.7 env: DJANGO_VERSION=1.10 XAPIAN_VERSION=1.3.3 - python: 2.7 env: DJANGO_VERSION=1.9 XAPIAN_VERSION=1.3.3 - python: 2.7 env: DJANGO_VERSION=1.8 XAPIAN_VERSION=1.3.3 - python: 2.7 env: DJANGO_VERSION=1.10 XAPIAN_VERSION=1.2.24 - python: 2.7 env: DJANGO_VERSION=1.9 XAPIAN_VERSION=1.2.24 - python: 2.7 env: DJANGO_VERSION=1.8 XAPIAN_VERSION=1.2.24 addons: apt: sources: - ubuntu-toolchain-r-test packages: # Xapian requires uuid-dev, Xapian ==1.3.3 requires compilers with c++11. 
- uuid-dev - gcc-4.8 - g++-4.8 install: # install Xapian - CXX=g++-4.8 ./install_xapian.sh $XAPIAN_VERSION - pip install Django==$DJANGO_VERSION - pip install coveralls # install Django haystack - cd .. # move from xapian-haystack - git clone https://github.com/toastdriven/django-haystack.git # cp xapian-haystack to django-haystack - cp xapian-haystack/xapian_backend.py django-haystack/haystack/backends - cp -r xapian-haystack/tests/* django-haystack/test_haystack/ - cp xapian-haystack/tests/xapian_tests/__init__.py django-haystack/test_haystack/ - cp xapian-haystack/.coveragerc django-haystack/ script: - cd django-haystack/ - PYTHONPATH=`pwd` coverage run `which django-admin.py` test test_haystack.xapian_tests --settings=test_haystack.xapian_settings after_success: coveralls xapian-haystack-2.1.0/AUTHORS000066400000000000000000000044421305552552400156610ustar00rootroot00000000000000Primary Authors: ---------------- * David Sauve Thanks to: ---------- * Daniel Lindsley for the awesome Haystack API and putting up with all of my questions. * Trapeze Media for providing time and resources to complete this project as well as Q&A. * Richard Boulton for answering questions regarding the Xapian python bindings and API. * The Xapian team for creating and releasing under the GPL, such a great search engine. * Supreet Sethi for suggestions regarding morphologic date comparisons and for fixing NOT query expressions. * Joshua Jonah for changes to highlighting logic to avoid reserved words. * J00bar for a fix with `get_identifier`, fixing query_filter reference in SearchQuery, and a better clear method. * Jannis Leidel for setting up the code base for pip, easy_install and PyPI. * Erik Aigner for the initial patch to get_identifier changes. * Travis Cline for the initial patch to support SQ objects in Haystack. * wshallum for a patch that makes date facets compatible with Python 2.4 * askfor for reporting issues with narrow_queries and float fields. 
* Brandon Konkle for a patch that corrected the behaviour of weights on multiple term boosts. * Adam Endicott for the initial patch that corrected an oversight with stemming not always being done during a search. * Sym Roe for a patch that improved performance in "more-like-this" and suggestion the removal of FLAG_PARTIAL. * liranz for pointing out a potential conflict with arguments pass into `SearchResult` * Jacob Kaplan-Moss for pointing out that write permission shouldn't be required for searching. * glassresistor for assistance troubleshooting an issue with boosting a phrase query & a patch to make weighting schemes overridable. * James Addison for helping to debug an intermittent issue with `order_by` and `build_schema`. * Michael Opitz for a patch that enables support for `inmemorydb`. * Evgeniy Kirov for a patch that adds `HAYSTACK_XAPIAN_LANGUAGE` used for setting the stemming language. * domasx2 for a patch that explicitly closes the database when not in use. * naktinis for a patch that fixed changes the behaviour of the `narrow_queries` argument of `search` so that queries are ANDed together rather than ORed. xapian-haystack-2.1.0/LICENSE000066400000000000000000000355641305552552400156270ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. 
(Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. 
The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. 
You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. 
Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. 
However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. 
If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. 
If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS xapian-haystack-2.1.0/MANIFEST.in000066400000000000000000000000631305552552400163420ustar00rootroot00000000000000include AUTHORS include LICENSE include README.rst xapian-haystack-2.1.0/README.rst000066400000000000000000000110711305552552400162740ustar00rootroot00000000000000Xapian backend for Django-Haystack ================================== .. _Travis: https://travis-ci.org/notanumber/xapian-haystack .. image:: https://travis-ci.org/notanumber/xapian-haystack.svg?branch=master :target: https://travis-ci.org/notanumber/xapian-haystack .. image:: https://coveralls.io/repos/notanumber/xapian-haystack/badge.svg?branch=master&service=github :target: https://coveralls.io/github/notanumber/xapian-haystack?branch=master Xapian-haystack is a backend of `Django-Haystack `__ for the `Xapian `__ search engine. Thanks for checking it out. 
You can find more information about Xapian `here `__. Features -------- Xapian-Haystack provides all the standard features of Haystack: - Weighting - Faceted search (date, query, etc.) - Sorting - Spelling suggestions - EdgeNGram and Ngram (for autocomplete) Limitations ----------- The `endswith` search operation is not supported by Xapian-Haystack. Requirements ------------ - Python 2.7 or 3+ - Django 1.8+ - Django-Haystack 2.5.1 - Xapian 1.2.19+ In particular, we build-test this backend in `Travis`_ using: - Python 2.7 and 3.4 - Django 1.8, 1.9 and 1.10 - Django-Haystack (master) - Xapian 1.2.19 (Python 2 only), 1.3.3 (both), and 1.4.1 (both) Installation ------------ First, install Xapian in your machine e.g. with the script provided, `install_xapian.sh`. Call it after activating the virtual environment to install:: source /bin/activate ./install_xapian.sh `` must be >=1.3.0 for Python 3 envs. This takes around 10 minutes. Finally, install Xapian-Haystack by running:: pip install git+https://github.com/notanumber/xapian-haystack.git Configuration ------------- Xapian is configured as other backends of Haystack. You have to define the connection to the database, which is done to a path to a directory, e.g:: HAYSTACK_CONNECTIONS = { 'default': { 'ENGINE': 'xapian_backend.XapianEngine', 'PATH': os.path.join(os.path.dirname(__file__), 'xapian_index') }, } The backend has the following optional settings: - ``HAYSTACK_XAPIAN_LANGUAGE``: the stemming language; the default is `english` and the list of available languages can be found `here `__. - ``HAYSTACK_XAPIAN_WEIGHTING_SCHEME``: a tuple with parameters to be passed to the weighting scheme `BM25 `__. By default, it uses the same parameters as Xapian recommends; this setting allows you to change them. - ``HAYSTACK_XAPIAN_FLAGS``: the options used to parse `AutoQueries`; the default is ``FLAG_PHRASE | FLAG_BOOLEAN | FLAG_LOVEHATE | FLAG_WILDCARD | FLAG_PURE_NOT`` See `here `__ for more information on what they mean. 
- ``HAYSTACK_XAPIAN_STEMMING_STRATEGY``: This option lets you chose the stemming strategy used by Xapian. Possible values are ``STEM_NONE``, ``STEM_SOME``, ``STEM_ALL``, ``STEM_ALL_Z``, where ``STEM_SOME`` is the default. See `here `__ for more information about the different strategies. Testing ------- Xapian-Haystack has a test suite in continuous deployment in `Travis`_. The script ``.travis.yml`` contains the steps required to run the test suite. Source ------ The source code can be found in `github `_. Credits ------- Xapian-Haystack is maintained by `Jorge C. Leitão `__; `David Sauve `__ was the main contributor of Xapian-Haystack and Xapian-Haystack was originally funded by `Trapeze `__. `Claudep `__ is a frequent contributor. `ANtlord `__ implemented support for EdgeNgram and Ngram. License ------- Xapian-haystack is free software licenced under GNU General Public Licence v2 and Copyright (c) 2009, 2010, 2011, 2012 David Sauve, 2009, 2010 Trapeze, 2014 Jorge C. Leitão. It may be redistributed under the terms specified in the LICENSE file. Questions, Comments, Concerns: ------------------------------ Feel free to open an issue `here `__ or pull request your work. You can ask questions on the django-haystack `mailing list `_: or in the irc ``#haystack``. xapian-haystack-2.1.0/install_xapian.sh000077500000000000000000000024401305552552400201520ustar00rootroot00000000000000#!/usr/bin/env bash # first argument of the script is Xapian version (e.g. 1.2.19) VERSION=$1 # prepare mkdir $VIRTUAL_ENV/packages && cd $VIRTUAL_ENV/packages CORE=xapian-core-$VERSION BINDINGS=xapian-bindings-$VERSION # download echo "Downloading source..." curl -O https://oligarchy.co.uk/xapian/$VERSION/${CORE}.tar.xz curl -O https://oligarchy.co.uk/xapian/$VERSION/${BINDINGS}.tar.xz # extract echo "Extracting source..." tar xf ${CORE}.tar.xz tar xf ${BINDINGS}.tar.xz # install echo "Installing Xapian-core..." 
cd $VIRTUAL_ENV/packages/${CORE} ./configure --prefix=$VIRTUAL_ENV && make && make install PYV=`python -c "import sys;t='{v[0]}'.format(v=list(sys.version_info[:1]));sys.stdout.write(t)";` if [ $PYV = "2" ]; then PYTHON_FLAG=--with-python else PYTHON_FLAG=--with-python3 fi if [ $VERSION = "1.3.3" ]; then XAPIAN_CONFIG=$VIRTUAL_ENV/bin/xapian-config-1.3 else XAPIAN_CONFIG= fi # The bindings for Python require python-sphinx echo "Installing Python-Sphinx..." pip install sphinx echo "Installing Xapian-bindings..." cd $VIRTUAL_ENV/packages/${BINDINGS} ./configure --prefix=$VIRTUAL_ENV $PYTHON_FLAG XAPIAN_CONFIG=$XAPIAN_CONFIG && make && make install # clean cd $VIRTUAL_ENV rm -rf $VIRTUAL_ENV/packages # test echo "Testing Xapian..." python -c "import xapian" xapian-haystack-2.1.0/requirements.txt000066400000000000000000000000431305552552400200660ustar00rootroot00000000000000Django>=1.8 Django-Haystack>=2.5.1 xapian-haystack-2.1.0/setup.py000066400000000000000000000017301305552552400163200ustar00rootroot00000000000000# -*- coding: utf-8 -*- from __future__ import unicode_literals import os from distutils.core import setup def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() setup( name='xapian-haystack', version='2.1.0', description='A Xapian backend for Haystack', long_description=read('README.rst'), classifiers=[ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'License :: OSI Approved :: GNU General Public License (GPL)', 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', 'Framework :: Django', ], author='Jorge C. 
Leitão', author_email='jorgecarleitao@gmail.com', url='http://github.com/notanumber/xapian-haystack', download_url='http://github.com/notanumber/xapian-haystack/tarball/2.1.0', license='GPL2', py_modules=['xapian_backend'], install_requires=[ 'django>=1.8', 'django-haystack>=2.5.1', ] ) xapian-haystack-2.1.0/tests/000077500000000000000000000000001305552552400157475ustar00rootroot00000000000000xapian-haystack-2.1.0/tests/xapian_settings.py000077500000000000000000000005511305552552400215250ustar00rootroot00000000000000import os from .settings import * INSTALLED_APPS = [ 'django.contrib.contenttypes', 'test_haystack.core', 'test_haystack.xapian_tests', ] HAYSTACK_CONNECTIONS = { 'default': { 'ENGINE': 'haystack.backends.xapian_backend.XapianEngine', 'PATH': os.path.join('tmp', 'test_xapian_query'), 'INCLUDE_SPELLING': True, } } xapian-haystack-2.1.0/tests/xapian_tests/000077500000000000000000000000001305552552400204515ustar00rootroot00000000000000xapian-haystack-2.1.0/tests/xapian_tests/__init__.py000066400000000000000000000000001305552552400225500ustar00rootroot00000000000000xapian-haystack-2.1.0/tests/xapian_tests/models.py000066400000000000000000000022261305552552400223100ustar00rootroot00000000000000from django.db import models from django.contrib.contenttypes.models import ContentType from ..core.models import MockTag, AnotherMockModel, MockModel, AFourthMockModel class Document(models.Model): type_name = models.CharField(max_length=50) number = models.IntegerField() name = models.CharField(max_length=200) date = models.DateField() summary = models.TextField() text = models.TextField() class BlogEntry(models.Model): """ Same as tests.core.MockModel with a few extra fields for testing various sorting and ordering criteria. 
""" datetime = models.DateTimeField() date = models.DateField() tags = models.ManyToManyField(MockTag) author = models.CharField(max_length=255) text = models.TextField() funny_text = models.TextField() non_ascii = models.TextField() url = models.URLField() boolean = models.BooleanField() number = models.IntegerField() float_number = models.FloatField() decimal_number = models.DecimalField(max_digits=4, decimal_places=2) class DjangoContentType(models.Model): content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE) xapian-haystack-2.1.0/tests/xapian_tests/search_indexes.py000066400000000000000000000116341305552552400240140ustar00rootroot00000000000000from __future__ import unicode_literals from haystack import indexes from . import models class DocumentIndex(indexes.SearchIndex, indexes.Indexable): text = indexes.CharField(document=True) summary = indexes.CharField(model_attr='summary') type_name = indexes.CharField(model_attr='type_name') number = indexes.IntegerField(model_attr='number') name = indexes.CharField(model_attr='name') date = indexes.DateField(model_attr='date') tags = indexes.MultiValueField() def get_model(self): return models.Document def prepare_tags(self, obj): l = [['tag', 'tag-test', 'tag-test-test'], ['tag', 'tag-test'], ['tag']] return l[obj.id % 3] class BlogSearchIndex(indexes.SearchIndex): text = indexes.CharField( document=True, use_template=True, template_name='search/indexes/core/mockmodel_text.txt' ) name = indexes.CharField(model_attr='author', faceted=True) date = indexes.DateField(model_attr='date') datetime = indexes.DateField(model_attr='datetime') number = indexes.IntegerField(model_attr='number') boolean = indexes.BooleanField(model_attr='boolean') #slug = indexes.CharField(indexed=False, model_attr='slug') float_number = indexes.FloatField(model_attr='float_number') month = indexes.CharField(indexed=False) url = indexes.CharField(model_attr='url') empty = indexes.CharField() # Various MultiValueFields sites = 
indexes.MultiValueField() tags = indexes.MultiValueField() keys = indexes.MultiValueField() titles = indexes.MultiValueField() def get_model(self): return models.BlogEntry def prepare_sites(self, obj): return ['%d' % (i * obj.id) for i in range(1, 4)] def prepare_tags(self, obj): if obj.id == 1: return ['a', 'b', 'c'] elif obj.id == 2: return ['ab', 'bc', 'cd'] else: return ['an', 'to', 'or'] def prepare_keys(self, obj): return [i * obj.id for i in range(1, 4)] def prepare_titles(self, obj): if obj.id == 1: return ['object one title one', 'object one title two'] elif obj.id == 2: return ['object two title one', 'object two title two'] else: return ['object three title one', 'object three title two'] def prepare_month(self, obj): return '%02d' % obj.date.month def prepare_empty(self, obj): return '' class CompleteBlogEntryIndex(indexes.SearchIndex): text = indexes.CharField(model_attr='text', document=True) author = indexes.CharField(model_attr='author') url = indexes.CharField(model_attr='url') non_ascii = indexes.CharField(model_attr='non_ascii') funny_text = indexes.CharField(model_attr='funny_text') datetime = indexes.DateTimeField(model_attr='datetime') date = indexes.DateField(model_attr='date') boolean = indexes.BooleanField(model_attr='boolean') number = indexes.IntegerField(model_attr='number') float_number = indexes.FloatField(model_attr='float_number') decimal_number = indexes.DecimalField(model_attr='decimal_number') multi_value = indexes.MultiValueField() def get_model(self): return models.BlogEntry def prepare_multi_value(self, obj): return [tag.name for tag in obj.tags.all()] class XapianNGramIndex(indexes.SearchIndex): text = indexes.CharField(model_attr='author', document=True) ngram = indexes.NgramField(model_attr='author') def get_model(self): return models.BlogEntry class XapianEdgeNGramIndex(indexes.SearchIndex): text = indexes.CharField(model_attr='author', document=True) edge_ngram = indexes.EdgeNgramField(model_attr='author') def 
get_model(self): return models.BlogEntry class DjangoContentTypeIndex(indexes.SearchIndex): text = indexes.CharField(document=True) def get_model(self): return models.DjangoContentType class MockSearchIndex(indexes.SearchIndex): text = indexes.CharField(document=True, use_template=True) name = indexes.CharField(model_attr='author', faceted=True) pub_date = indexes.DateTimeField(model_attr='pub_date') title = indexes.CharField() def get_model(self): return models.MockModel class BoostMockSearchIndex(indexes.SearchIndex): text = indexes.CharField( document=True, use_template=True, template_name='search/indexes/core/mockmodel_template.txt' ) author = indexes.CharField(model_attr='author', weight=2.0) editor = indexes.CharField(model_attr='editor') pub_date = indexes.DateField(model_attr='pub_date') def get_model(self): return models.AFourthMockModel class MockQueryIndex(indexes.SearchIndex): text = indexes.CharField(document=True) pub_date = indexes.DateTimeField() title = indexes.CharField() foo = indexes.CharField() def get_model(self): return models.MockModel xapian-haystack-2.1.0/tests/xapian_tests/tests/000077500000000000000000000000001305552552400216135ustar00rootroot00000000000000xapian-haystack-2.1.0/tests/xapian_tests/tests/__init__.py000066400000000000000000000000001305552552400237120ustar00rootroot00000000000000xapian-haystack-2.1.0/tests/xapian_tests/tests/test_backend.py000066400000000000000000000766321305552552400246310ustar00rootroot00000000000000from __future__ import unicode_literals from decimal import Decimal import datetime import sys import xapian import subprocess import os from django.apps import apps from django.contrib.contenttypes.models import ContentType from django.test import TestCase from django.utils.encoding import force_text from haystack import connections from haystack.backends.xapian_backend import InvalidIndexError, _term_to_xapian_value from haystack.models import SearchResult from haystack.utils.loading import UnifiedIndex from 
..search_indexes import XapianNGramIndex, XapianEdgeNGramIndex, \ CompleteBlogEntryIndex, BlogSearchIndex, DjangoContentTypeIndex from ..models import BlogEntry, AnotherMockModel, MockTag, DjangoContentType XAPIAN_VERSION = [int(x) for x in xapian.__version__.split('.')] class XapianSearchResult(SearchResult): def __init__(self, app_label, model_name, pk, score, **kwargs): super(XapianSearchResult, self).__init__(app_label, model_name, pk, score, **kwargs) self._model = apps.get_model('xapian_tests', model_name) def get_terms(backend, *args): if XAPIAN_VERSION[1] <= 2: # old versions use "delve". executable = 'delve' else: # new versions use 'xapian-delve' executable = 'xapian-delve' # dev versions (odd minor) use a suffix if XAPIAN_VERSION[1] % 2 != 0: executable = executable+'-%d.%d' % tuple(XAPIAN_VERSION[0:2]) result = subprocess.check_output([executable] + list(args) + [backend.path], env=os.environ.copy()).decode('utf-8') result = result.split(": ") if len(result) > 1: return result[1].strip().split(" ") return [] def pks(results): return [result.pk for result in results] class HaystackBackendTestCase(object): """ Abstract TestCase that implements an hack to ensure `connections` has the right index It has a method get_index() that returns a SearchIndex that must be overwritten. 
""" def get_index(self): raise NotImplementedError def setUp(self): self.old_ui = connections['default'].get_unified_index() self.ui = UnifiedIndex() self.index = self.get_index() self.ui.build(indexes=[self.index]) self.backend = connections['default'].get_backend() connections['default']._index = self.ui def tearDown(self): self.backend.clear() connections['default']._index = self.old_ui def assertExpectedQuery(self, query, string_or_list, xapian12string=''): if isinstance(string_or_list, list): strings = string_or_list else: strings = [string_or_list] expected = ['Query(%s)' % string for string in strings] if XAPIAN_VERSION[1] <= 2: if xapian12string: expected = ['Xapian::Query(%s)' % xapian12string] else: expected = ['Xapian::Query(%s)' % string for string in strings] self.assertIn(str(query), expected) class BackendIndexationTestCase(HaystackBackendTestCase, TestCase): """ Tests indexation behavior. Tests related to how the backend indexes terms, values, and others go here. """ def get_index(self): return CompleteBlogEntryIndex() def setUp(self): super(BackendIndexationTestCase, self).setUp() tag1 = MockTag.objects.create(name='tag') tag2 = MockTag.objects.create(name='tag-tag') tag3 = MockTag.objects.create(name='tag-tag-tag') entry = BlogEntry() entry.id = 1 entry.text = 'this_is_a_word inside a big text' entry.author = 'david' entry.url = 'http://example.com/1/' entry.boolean = True entry.number = 123456789 entry.float_number = 123.123456789 entry.decimal_number = Decimal('22.34') entry.funny_text = 'this-text das das' entry.non_ascii = 'thsi sdas das corrup\xe7\xe3o das' entry.datetime = datetime.datetime(2009, 2, 25, 1, 1, 1) entry.date = datetime.date(2008, 8, 8) entry.save() entry.tags.add(tag1, tag2, tag3) self.backend.update(self.index, [entry]) self.entry = entry def test_app_is_not_split(self): """ Tests that the app path is not split and added as independent terms. 
""" terms = get_terms(self.backend, '-a') self.assertFalse('tests' in terms) self.assertFalse('Ztest' in terms) def test_app_is_not_indexed(self): """ Tests that the app path is not indexed. """ terms = get_terms(self.backend, '-a') self.assertFalse('tests.xapianmockmodel.1' in terms) self.assertFalse('xapianmockmodel' in terms) self.assertFalse('tests' in terms) def test_fields_exist(self): """ Tests that all fields are in the database """ terms = get_terms(self.backend, '-a') for field in ['author', 'datetime', 'text', 'url']: is_inside = False for term in terms: if term.startswith("X%s" % field.upper()): is_inside = True break self.assertTrue(is_inside, field) def test_text_field(self): terms = get_terms(self.backend, '-a') self.assertTrue('this_is_a_word' in terms) self.assertTrue('Zthis_is_a_word' in terms) self.assertTrue('ZXTEXTthis_is_a_word' in terms) self.assertTrue('XTEXTthis_is_a_word' in terms) self.assertFalse('^this_is_a_word inside a big text$' in terms) def test_text_posting(self): """ Tests that text is correctly positioned in the document """ expected_order = ['^', 'this_is_a_word', 'inside', 'a', 'big', 'text', '$'] def get_positions(term): """ Uses delve to get the positions of the term in the first document. 
""" return sorted([int(pos) for pos in get_terms(self.backend, '-r1', '-tXTEXT%s' % term)]) # confirms expected_order previous_position = get_positions(expected_order[0]) for term in expected_order[1:]: pos = get_positions(term) # only two positions per term # (one from term_generator, one from literal text) self.assertEqual(len(pos), 2) self.assertEqual(pos[0] - 1, previous_position[0]) self.assertEqual(pos[1] - 1, previous_position[1]) previous_position[0] += 1 previous_position[1] += 1 def test_author_field(self): terms = get_terms(self.backend, '-a') self.assertTrue('XAUTHORdavid' in terms) self.assertTrue('ZXAUTHORdavid' in terms) self.assertTrue('Zdavid' in terms) self.assertTrue('david' in terms) def test_funny_text_field(self): terms = get_terms(self.backend, '-r1') self.assertTrue('this-text' in terms) def test_datetime_field(self): terms = get_terms(self.backend, '-a') self.assertFalse('XDATETIME20090225000000' in terms) self.assertFalse('ZXDATETIME20090225000000' in terms) self.assertFalse('20090225000000' in terms) self.assertTrue('XDATETIME2009-02-25' in terms) self.assertTrue('2009-02-25' in terms) self.assertTrue('01:01:01' in terms) self.assertTrue('XDATETIME01:01:01' in terms) def test_date_field(self): terms = get_terms(self.backend, '-a') self.assertTrue('XDATE2008-08-08' in terms) self.assertTrue('2008-08-08' in terms) self.assertFalse('XDATE00:00:00' in terms) self.assertFalse('00:00:00' in terms) def test_url_field(self): terms = get_terms(self.backend, '-a') self.assertTrue('http://example.com/1/' in terms) def test_bool_field(self): terms = get_terms(self.backend, '-a') self.assertTrue('XBOOLEANtrue' in terms) self.assertFalse('ZXBOOLEANtrue' in terms) def test_integer_field(self): terms = get_terms(self.backend, '-a') self.assertTrue('123456789' in terms) self.assertTrue('XNUMBER123456789' in terms) self.assertFalse('ZXNUMBER123456789' in terms) def test_float_field(self): terms = get_terms(self.backend, '-a') 
self.assertTrue('123.123456789' in terms) self.assertTrue('XFLOAT_NUMBER123.123456789' in terms) self.assertFalse('ZXFLOAT_NUMBER123.123456789' in terms) def test_decimal_field(self): terms = get_terms(self.backend, '-a') self.assertTrue('22.34' in terms) self.assertTrue('XDECIMAL_NUMBER22.34' in terms) self.assertFalse('ZXDECIMAL_NUMBER22.34' in terms) def test_multivalue_field(self): """ Regression test for #103 """ terms = get_terms(self.backend, '-a') self.assertTrue('tag' in terms) self.assertTrue('tag-tag' in terms) self.assertTrue('tag-tag-tag' in terms) self.assertTrue('XMULTI_VALUEtag' in terms) self.assertTrue('XMULTI_VALUEtag-tag' in terms) self.assertTrue('XMULTI_VALUEtag-tag-tag' in terms) def test_non_ascii_chars(self): terms = get_terms(self.backend, '-a') self.assertIn('corrup\xe7\xe3o', terms) class BackendFeaturesTestCase(HaystackBackendTestCase, TestCase): """ Tests supported features on the backend side. Tests to features implemented on the backend go here. """ def get_index(self): return BlogSearchIndex() @staticmethod def get_entry(i): entry = BlogEntry() entry.id = i entry.author = 'david%s' % i entry.url = 'http://example.com/%d/' % i entry.boolean = bool(i % 2) entry.number = i*5 entry.float_number = i*5.0 entry.decimal_number = Decimal('22.34') entry.datetime = datetime.datetime(2009, 2, 25, 1, 1, 1) - datetime.timedelta(seconds=i) entry.date = datetime.date(2009, 2, 23) + datetime.timedelta(days=i) return entry def setUp(self): super(BackendFeaturesTestCase, self).setUp() self.sample_objs = [] for i in range(1, 4): entry = self.get_entry(i) self.sample_objs.append(entry) self.sample_objs[0].float_number = 834.0 self.sample_objs[1].float_number = 35.5 self.sample_objs[2].float_number = 972.0 for obj in self.sample_objs: obj.save() self.backend.update(self.index, BlogEntry.objects.all()) def test_update(self): self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']), [1, 2, 3]) def test_duplicate_update(self): """ Regression 
test for #6. """ self.backend.update(self.index, self.sample_objs) self.assertEqual(self.backend.document_count(), 3) def test_remove(self): self.backend.remove(self.sample_objs[0]) self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']), [2, 3]) def test_clear(self): self.backend.clear() self.assertEqual(self.backend.document_count(), 0) self.backend.update(self.index, self.sample_objs) self.assertEqual(self.backend.document_count(), 3) self.backend.clear([AnotherMockModel]) self.assertEqual(self.backend.document_count(), 3) self.backend.clear([BlogEntry]) self.assertEqual(self.backend.document_count(), 0) self.backend.update(self.index, self.sample_objs) self.assertEqual(self.backend.document_count(), 3) self.backend.clear([AnotherMockModel, BlogEntry]) self.assertEqual(self.backend.document_count(), 0) def test_search(self): # no match query self.assertEqual(self.backend.search(xapian.Query()), {'hits': 0, 'results': []}) # all match query self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']), [1, 2, 3]) # Other `result_class` self.assertTrue( isinstance(self.backend.search(xapian.Query('indexed'), result_class=XapianSearchResult)['results'][0], XapianSearchResult)) def test_search_field_with_punctuation(self): self.assertEqual(pks(self.backend.search(xapian.Query('http://example.com/1/'))['results']), [1]) def test_search_by_mvf(self): self.assertEqual(self.backend.search(xapian.Query('ab'))['hits'], 1) self.assertEqual(self.backend.search(xapian.Query('b'))['hits'], 1) self.assertEqual(self.backend.search(xapian.Query('to'))['hits'], 1) self.assertEqual(self.backend.search(xapian.Query('one'))['hits'], 3) def test_field_facets(self): self.assertEqual(self.backend.search(xapian.Query(), facets=['name']), {'hits': 0, 'results': []}) results = self.backend.search(xapian.Query('indexed'), facets=['name']) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['fields']['name'], [('david1', 1), ('david2', 1), 
('david3', 1)]) results = self.backend.search(xapian.Query('indexed'), facets=['boolean']) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['fields']['boolean'], [(False, 1), (True, 2)]) results = self.backend.search(xapian.Query('indexed'), facets=['sites']) self.assertEqual(results['hits'], 3) self.assertEqual(set(results['facets']['fields']['sites']), set([('1', 1), ('3', 2), ('2', 2), ('4', 1), ('6', 2), ('9', 1)])) results = self.backend.search(xapian.Query('indexed'), facets=['number']) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['fields']['number'], [(5, 1), (10, 1), (15, 1)]) results = self.backend.search(xapian.Query('indexed'), facets=['float_number']) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['fields']['float_number'], [(35.5, 1), (834.0, 1), (972.0, 1)]) def test_raise_index_error_on_wrong_field(self): """ Regression test for #109. """ self.assertRaises(InvalidIndexError, self.backend.search, xapian.Query(''), facets=['dsdas']) def test_date_facets_month(self): facets = {'datetime': {'start_date': datetime.datetime(2008, 10, 26), 'end_date': datetime.datetime(2009, 3, 26), 'gap_by': 'month'}} self.assertEqual(self.backend.search(xapian.Query(), date_facets=facets), {'hits': 0, 'results': []}) results = self.backend.search(xapian.Query('indexed'), date_facets=facets) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['dates']['datetime'], [ (datetime.datetime(2009, 2, 26, 0, 0), 0), (datetime.datetime(2009, 1, 26, 0, 0), 3), (datetime.datetime(2008, 12, 26, 0, 0), 0), (datetime.datetime(2008, 11, 26, 0, 0), 0), (datetime.datetime(2008, 10, 26, 0, 0), 0), ]) def test_date_facets_seconds(self): facets = {'datetime': {'start_date': datetime.datetime(2009, 2, 25, 1, 0, 57), 'end_date': datetime.datetime(2009, 2, 25, 1, 1, 1), 'gap_by': 'second'}} self.assertEqual(self.backend.search(xapian.Query(), date_facets=facets), {'hits': 0, 'results': []}) results = 
self.backend.search(xapian.Query('indexed'), date_facets=facets) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['dates']['datetime'], [ (datetime.datetime(2009, 2, 25, 1, 1, 0), 0), (datetime.datetime(2009, 2, 25, 1, 0, 59), 1), (datetime.datetime(2009, 2, 25, 1, 0, 58), 1), (datetime.datetime(2009, 2, 25, 1, 0, 57), 1), ]) def test_date_facets_days(self): facets = {'date': {'start_date': datetime.datetime(2009, 2, 1), 'end_date': datetime.datetime(2009, 3, 15), 'gap_by': 'day', 'gap_amount': 15}} results = self.backend.search(xapian.Query('indexed'), date_facets=facets) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['dates']['date'], [ (datetime.datetime(2009, 3, 3, 0, 0), 0), (datetime.datetime(2009, 2, 16, 0, 0), 3), (datetime.datetime(2009, 2, 1, 0, 0), 0) ]) def test_query_facets(self): self.assertEqual(self.backend.search(xapian.Query(), query_facets={'name': 'da*'}), {'hits': 0, 'results': []}) results = self.backend.search(xapian.Query('indexed'), query_facets={'name': 'da*'}) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['queries']['name'], ('da*', 3)) def test_narrow_queries(self): self.assertEqual(self.backend.search(xapian.Query(), narrow_queries={'name:david1'}), {'hits': 0, 'results': []}) results = self.backend.search(xapian.Query('indexed'), narrow_queries={'name:david1'}) self.assertEqual(results['hits'], 1) def test_highlight(self): self.assertEqual(self.backend.search(xapian.Query(), highlight=True), {'hits': 0, 'results': []}) self.assertEqual(self.backend.search(xapian.Query('indexed'), highlight=True)['hits'], 3) results = self.backend.search(xapian.Query('indexed'), highlight=True)['results'] self.assertEqual([result.highlighted['text'] for result in results], ['indexed!\n1', 'indexed!\n2', 'indexed!\n3']) def test_spelling_suggestion(self): self.assertEqual(self.backend.search(xapian.Query('indxe'))['hits'], 0) 
self.assertEqual(self.backend.search(xapian.Query('indxe'))['spelling_suggestion'], 'indexed') self.assertEqual(self.backend.search(xapian.Query('indxed'))['hits'], 0) self.assertEqual(self.backend.search(xapian.Query('indxed'))['spelling_suggestion'], 'indexed') self.assertEqual(self.backend.search(xapian.Query('foo'))['hits'], 0) self.assertEqual(self.backend.search(xapian.Query('foo'), spelling_query='indexy')['spelling_suggestion'], 'indexed') self.assertEqual(self.backend.search(xapian.Query('XNAMEdavid'))['hits'], 0) self.assertEqual(self.backend.search(xapian.Query('XNAMEdavid'))['spelling_suggestion'], 'david1') def test_more_like_this(self): results = self.backend.more_like_this(self.sample_objs[0]) self.assertEqual(pks(results['results']), [3, 2]) results = self.backend.more_like_this(self.sample_objs[0], additional_query=xapian.Query('david3')) self.assertEqual(pks(results['results']), [3]) results = self.backend.more_like_this(self.sample_objs[0], limit_to_registered_models=True) self.assertEqual(pks(results['results']), [3, 2]) # Other `result_class` result = self.backend.more_like_this(self.sample_objs[0], result_class=XapianSearchResult) self.assertTrue(isinstance(result['results'][0], XapianSearchResult)) def test_order_by(self): #results = self.backend.search(xapian.Query(''), sort_by=['datetime']) #print([d.datetime for d in results['results']]) #self.assertEqual(pks(results['results']), [3, 2, 1]) #results = self.backend.search(xapian.Query(''), sort_by=['-datetime']) #self.assertEqual(pks(results['results']), [1, 2, 3]) #results = self.backend.search(xapian.Query(''), sort_by=['date']) #self.assertEqual(pks(results['results']), [1, 2, 3]) #results = self.backend.search(xapian.Query(''), sort_by=['-date']) #self.assertEqual(pks(results['results']), [3, 2, 1]) results = self.backend.search(xapian.Query(''), sort_by=['id']) self.assertEqual(pks(results['results']), [1, 2, 3]) results = self.backend.search(xapian.Query(''), sort_by=['-id']) 
self.assertEqual(pks(results['results']), [3, 2, 1]) results = self.backend.search(xapian.Query(''), sort_by=['number']) self.assertEqual(pks(results['results']), [1, 2, 3]) results = self.backend.search(xapian.Query(''), sort_by=['-number']) self.assertEqual(pks(results['results']), [3, 2, 1]) results = self.backend.search(xapian.Query(''), sort_by=['float_number']) self.assertEqual(pks(results['results']), [2, 1, 3]) results = self.backend.search(xapian.Query(''), sort_by=['-float_number']) self.assertEqual(pks(results['results']), [3, 1, 2]) results = self.backend.search(xapian.Query(''), sort_by=['boolean', 'id']) self.assertEqual(pks(results['results']), [2, 1, 3]) results = self.backend.search(xapian.Query(''), sort_by=['boolean', '-id']) self.assertEqual(pks(results['results']), [2, 3, 1]) def test_verify_type(self): self.assertEqual([result.month for result in self.backend.search(xapian.Query(''))['results']], ['02', '02', '02']) def test_term_to_xapian_value(self): self.assertEqual(_term_to_xapian_value('abc', 'text'), 'abc') self.assertEqual(_term_to_xapian_value(1, 'integer'), '000000000001') self.assertEqual(_term_to_xapian_value(2653, 'integer'), '000000002653') self.assertEqual(_term_to_xapian_value(25.5, 'float'), b'\xb2`') self.assertEqual(_term_to_xapian_value([1, 2, 3], 'text'), '[1, 2, 3]') self.assertEqual(_term_to_xapian_value((1, 2, 3), 'text'), '(1, 2, 3)') self.assertEqual(_term_to_xapian_value({'a': 1, 'c': 3, 'b': 2}, 'text'), force_text({'a': 1, 'c': 3, 'b': 2})) self.assertEqual(_term_to_xapian_value(datetime.datetime(2009, 5, 9, 16, 14), 'datetime'), '20090509161400') self.assertEqual(_term_to_xapian_value(datetime.datetime(2009, 5, 9, 0, 0), 'date'), '20090509000000') self.assertEqual(_term_to_xapian_value(datetime.datetime(1899, 5, 18, 0, 0), 'date'), '18990518000000') def test_build_schema(self): search_fields = connections['default'].get_unified_index().all_searchfields() (content_field_name, fields) = 
self.backend.build_schema(search_fields) self.assertEqual(content_field_name, 'text') self.assertEqual(len(fields), 14 + 3) self.assertEqual(fields, [ {'column': 0, 'type': 'text', 'field_name': 'id', 'multi_valued': 'false'}, {'column': 1, 'type': 'integer', 'field_name': 'django_id', 'multi_valued': 'false'}, {'column': 2, 'type': 'text', 'field_name': 'django_ct', 'multi_valued': 'false'}, {'column': 3, 'type': 'boolean', 'field_name': 'boolean', 'multi_valued': 'false'}, {'column': 4, 'type': 'date', 'field_name': 'date', 'multi_valued': 'false'}, {'column': 5, 'type': 'date', 'field_name': 'datetime', 'multi_valued': 'false'}, {'column': 6, 'type': 'text', 'field_name': 'empty', 'multi_valued': 'false'}, {'column': 7, 'type': 'float', 'field_name': 'float_number', 'multi_valued': 'false'}, {'column': 8, 'type': 'text', 'field_name': 'keys', 'multi_valued': 'true'}, {'column': 9, 'type': 'text', 'field_name': 'name', 'multi_valued': 'false'}, {'column': 10, 'type': 'text', 'field_name': 'name_exact', 'multi_valued': 'false'}, {'column': 11, 'type': 'integer', 'field_name': 'number', 'multi_valued': 'false'}, {'column': 12, 'type': 'text', 'field_name': 'sites', 'multi_valued': 'true'}, {'column': 13, 'type': 'text', 'field_name': 'tags', 'multi_valued': 'true'}, {'column': 14, 'type': 'text', 'field_name': 'text', 'multi_valued': 'false'}, {'column': 15, 'type': 'text', 'field_name': 'titles', 'multi_valued': 'true'}, {'column': 16, 'type': 'text', 'field_name': 'url', 'multi_valued': 'false'}, ]) def test_parse_query(self): self.assertExpectedQuery(self.backend.parse_query('indexed'), 'Zindex@1', xapian12string='Zindex:(pos=1)') self.assertExpectedQuery(self.backend.parse_query('name:david'), 'ZXNAMEdavid@1', xapian12string='ZXNAMEdavid:(pos=1)') if xapian.minor_version() >= 2: # todo: why `SYNONYM WILDCARD OR XNAMEda`? 
self.assertExpectedQuery( self.backend.parse_query('name:da*'), '(SYNONYM WILDCARD OR XNAMEda)', xapian12string='(XNAMEdavid1:(pos=1) SYNONYM ' 'XNAMEdavid2:(pos=1) SYNONYM ' 'XNAMEdavid3:(pos=1))') else: self.assertEqual(str(self.backend.parse_query('name:da*')), 'Xapian::Query((' 'XNAMEdavid1:(pos=1) OR ' 'XNAMEdavid2:(pos=1) OR ' 'XNAMEdavid3:(pos=1)))') def test_parse_query_range(self): self.assertExpectedQuery(self.backend.parse_query('name:david1..david2'), '0 * VALUE_RANGE 9 david1 david2', xapian12string='VALUE_RANGE 9 david1 david2') self.assertExpectedQuery(self.backend.parse_query('number:0..10'), '0 * VALUE_RANGE 11 000000000000 000000000010', xapian12string='VALUE_RANGE 11 000000000000 000000000010') self.assertExpectedQuery(self.backend.parse_query('number:..10'), '0 * VALUE_RANGE 11 %012d 000000000010' % (-sys.maxsize - 1), xapian12string='VALUE_RANGE 11 %012d 000000000010' % (-sys.maxsize - 1)) self.assertExpectedQuery(self.backend.parse_query('number:10..*'), '0 * VALUE_RANGE 11 000000000010 %012d' % sys.maxsize, xapian12string='VALUE_RANGE 11 000000000010 %012d' % sys.maxsize) def test_order_by_django_id(self): """ We need this test because ordering on more than 10 entries was not correct at some point. """ self.sample_objs = [] number_list = list(range(1, 101)) for i in number_list: entry = self.get_entry(i) self.sample_objs.append(entry) for obj in self.sample_objs: obj.save() self.backend.clear() self.backend.update(self.index, self.sample_objs) results = self.backend.search(xapian.Query(''), sort_by=['-django_id']) self.assertEqual(pks(results['results']), list(reversed(number_list))) def test_more_like_this_with_unindexed_model(self): """ Tests that more_like_this raises an error when it is called with an unindexed model and if silently_fail is True. Also tests the other way around. 
""" mock = BlogEntry() mock.id = 10 mock.author = 'david10' try: self.assertEqual(self.backend.more_like_this(mock)['results'], []) except InvalidIndexError: self.fail("InvalidIndexError raised when silently_fail is True") self.backend.silently_fail = False self.assertRaises(InvalidIndexError, self.backend.more_like_this, mock) class IndexationNGramTestCase(HaystackBackendTestCase, TestCase): def get_index(self): return XapianNGramIndex() def setUp(self): super(IndexationNGramTestCase, self).setUp() mock = BlogEntry() mock.id = 1 mock.author = 'david' mock1 = BlogEntry() mock1.id = 2 mock1.author = 'da1id' self.backend.update(self.index, [mock, mock1]) def test_field(self): terms = get_terms(self.backend, '-a') self.assertTrue('da' in terms) self.assertTrue('XNGRAMda' in terms) self.assertTrue('dav' in terms) self.assertTrue('XNGRAMdav' in terms) self.assertTrue('davi' in terms) self.assertTrue('XNGRAMdavi' in terms) self.assertTrue('david' in terms) self.assertTrue('XNGRAMdavid' in terms) self.assertTrue('vid' in terms) self.assertTrue('XNGRAMvid' in terms) self.assertTrue('id' in terms) self.assertTrue('XNGRAMid' in terms) self.assertTrue('av' in terms) self.assertTrue('XNGRAMav' in terms) def test_search(self): """Tests edge ngram search with different parts of words""" # Minimun length of query string must be equal to EDGE_NGRAM_MIN_LENGTH. 
self.assertEqual(pks(self.backend.search(xapian.Query('da'))['results']), [1, 2]) self.assertEqual(pks(self.backend.search(xapian.Query('dav'))['results']), [1]) self.assertEqual(pks(self.backend.search(xapian.Query('da1'))['results']), [2]) class IndexationEdgeNGramTestCase(HaystackBackendTestCase, TestCase): def get_index(self): return XapianEdgeNGramIndex() def setUp(self): super(IndexationEdgeNGramTestCase, self).setUp() mock = BlogEntry() mock.id = 1 mock.author = 'david' mock1 = BlogEntry() mock1.id = 2 mock1.author = 'da1id' self.backend.update(self.index, [mock, mock1]) def test_field(self): terms = get_terms(self.backend, '-a') self.assertTrue('da' in terms) self.assertTrue('XEDGE_NGRAMda' in terms) self.assertTrue('dav' in terms) self.assertTrue('XEDGE_NGRAMdav' in terms) self.assertTrue('davi' in terms) self.assertTrue('XEDGE_NGRAMdavi' in terms) self.assertTrue('david' in terms) self.assertTrue('XEDGE_NGRAMdavid' in terms) self.assertTrue('vid' not in terms) self.assertTrue('XEDGE_NGRAMvid' not in terms) self.assertTrue('id' not in terms) self.assertTrue('XEDGE_NGRAMid' not in terms) self.assertTrue('av' not in terms) self.assertTrue('XEDGE_NGRAMav' not in terms) def test_search(self): """Tests edge ngram search with different parts of words""" # Minimun length of query string must be equal to NGRAM_MIN_LENGTH. 
self.assertEqual(pks(self.backend.search(xapian.Query('da'))['results']), [1, 2]) self.assertEqual(pks(self.backend.search(xapian.Query('dav'))['results']), [1]) self.assertEqual(pks(self.backend.search(xapian.Query('da1'))['results']), [2]) class IndexationDjangoContentTypeTestCase(HaystackBackendTestCase, TestCase): def get_index(self): return DjangoContentTypeIndex() def setUp(self): super(IndexationDjangoContentTypeTestCase, self).setUp() entry1 = ContentType(model='DjangoContentType') entry1.save() entry2 = DjangoContentType(content_type=entry1) entry2.save() self.backend.update(self.index, [entry2]) def test_basic(self): terms = get_terms(self.backend, '-a') self.assertTrue('CONTENTTYPExapian_tests.djangocontenttype' in terms) xapian-haystack-2.1.0/tests/xapian_tests/tests/test_interface.py000066400000000000000000000205361305552552400251720ustar00rootroot00000000000000from __future__ import unicode_literals import datetime from django.db.models import Q from django.test import TestCase from haystack import connections from haystack.inputs import AutoQuery from haystack.query import SearchQuerySet from ..models import Document from ..search_indexes import DocumentIndex from ..tests.test_backend import pks class InterfaceTestCase(TestCase): """ Tests the interface of Xapian-Haystack. Tests related to usability and expected behavior go here. 
""" def setUp(self): super(InterfaceTestCase, self).setUp() types_names = ['book', 'magazine', 'article'] texts = ['This is a huge text', 'This is a medium text', 'This is a small text'] dates = [datetime.date(year=2010, month=1, day=1), datetime.date(year=2010, month=2, day=1), datetime.date(year=2010, month=3, day=1)] summaries = ['This is a huge corrup\xe7\xe3o summary', 'This is a medium summary', 'This is a small summary'] for i in range(1, 13): doc = Document() doc.type_name = types_names[i % 3] doc.number = i * 2 doc.name = "%s %d" % (doc.type_name, doc.number) doc.date = dates[i % 3] doc.summary = summaries[i % 3] doc.text = texts[i % 3] doc.save() self.index = DocumentIndex() self.ui = connections['default'].get_unified_index() self.ui.build(indexes=[self.index]) self.backend = connections['default'].get_backend() self.backend.update(self.index, Document.objects.all()) self.queryset = SearchQuerySet() def tearDown(self): Document.objects.all().delete() #self.backend.clear() super(InterfaceTestCase, self).tearDown() def test_count(self): self.assertEqual(self.queryset.count(), Document.objects.count()) def test_content_search(self): result = self.queryset.filter(content='medium this') self.assertEqual(sorted(pks(result)), pks(Document.objects.all())) # documents with "medium" AND "this" have higher score self.assertEqual(pks(result)[:4], [1, 4, 7, 10]) def test_field_search(self): self.assertEqual(pks(self.queryset.filter(name__contains='8')), [4]) self.assertEqual(pks(self.queryset.filter(type_name='book')), pks(Document.objects.filter(type_name='book'))) self.assertEqual(pks(self.queryset.filter(text__contains='text huge')), pks(Document.objects.filter(text__contains='text huge'))) def test_field_contains(self): self.assertEqual(pks(self.queryset.filter(summary__contains='huge')), pks(Document.objects.filter(summary__contains='huge'))) result = self.queryset.filter(summary__contains='huge summary') self.assertEqual(sorted(pks(result)), 
pks(Document.objects.all())) # documents with "huge" AND "summary" have higher score self.assertEqual(pks(result)[:4], [3, 6, 9, 12]) def test_field_exact(self): self.assertEqual(pks(self.queryset.filter(name__exact='8')), []) self.assertEqual(pks(self.queryset.filter(name__exact='magazine 2')), [1]) def test_content_exact(self): self.assertEqual(pks(self.queryset.filter(content__exact='huge')), []) def test_content_and(self): self.assertEqual(pks(self.queryset.filter(content='huge').filter(summary='medium')), []) self.assertEqual(len(self.queryset.filter(content='huge this')), 12) self.assertEqual(len(self.queryset.filter(content='huge this').filter(summary__contains='huge')), 4) def test_content_or(self): self.assertEqual(len(self.queryset.filter(content='huge medium')), 8) self.assertEqual(len(self.queryset.filter(content='huge medium small')), 12) def test_field_and(self): self.assertEqual(pks(self.queryset.filter(name='8').filter(name='4')), []) def test_field_or(self): self.assertEqual(pks(self.queryset.filter(name__contains='8 4')), [2, 4]) def test_field_in(self): self.assertEqual(set(pks(self.queryset.filter(name__in=['magazine 2', 'article 4']))), set(pks(Document.objects.filter(name__in=['magazine 2', 'article 4'])))) self.assertEqual(pks(self.queryset.filter(number__in=[4])), pks(Document.objects.filter(number__in=[4]))) self.assertEqual(pks(self.queryset.filter(number__in=[4, 8])), pks(Document.objects.filter(number__in=[4, 8]))) def test_private_fields(self): self.assertEqual(pks(self.queryset.filter(django_id=4)), pks(Document.objects.filter(id__in=[4]))) self.assertEqual(pks(self.queryset.filter(django_id__in=[2, 4])), pks(Document.objects.filter(id__in=[2, 4]))) self.assertEqual(set(pks(self.queryset.models(Document))), set(pks(Document.objects.all()))) def test_field_startswith(self): self.assertEqual(len(self.queryset.filter(name__startswith='magaz')), 4) self.assertEqual(set(pks(self.queryset.filter(summary__startswith='This is a huge'))), 
set(pks(Document.objects.filter(summary__startswith='This is a huge')))) def test_auto_query(self): # todo: improve to query text only. self.assertEqual(set(pks(self.queryset.auto_query("huge OR medium"))), set(pks(Document.objects.filter(Q(text__contains="huge") | Q(text__contains="medium"))))) self.assertEqual(set(pks(self.queryset.auto_query("huge AND medium"))), set(pks(Document.objects.filter(Q(text__contains="huge") & Q(text__contains="medium"))))) self.assertEqual(set(pks(self.queryset.auto_query("text:huge text:-this"))), set(pks(Document.objects.filter(Q(text__contains="huge") & ~Q(text__contains="this"))))) self.assertEqual(len(self.queryset.filter(name=AutoQuery("8 OR 4"))), 2) self.assertEqual(len(self.queryset.filter(name=AutoQuery("8 AND 4"))), 0) def test_value_range(self): self.assertEqual(set(pks(self.queryset.filter(number__lt=3))), set(pks(Document.objects.filter(number__lt=3)))) self.assertEqual(set(pks(self.queryset.filter(django_id__gte=6))), set(pks(Document.objects.filter(id__gte=6)))) def test_date_range(self): date = datetime.date(year=2010, month=2, day=1) self.assertEqual(set(pks(self.queryset.filter(date__gte=date))), set(pks(Document.objects.filter(date__gte=date)))) date = datetime.date(year=2010, month=3, day=1) self.assertEqual(set(pks(self.queryset.filter(date__lte=date))), set(pks(Document.objects.filter(date__lte=date)))) def test_order_by(self): # private order self.assertEqual(pks(self.queryset.order_by("-django_id")), pks(Document.objects.order_by("-id"))) # value order self.assertEqual(pks(self.queryset.order_by("number")), pks(Document.objects.order_by("number"))) # text order self.assertEqual(pks(self.queryset.order_by("summary")), pks(Document.objects.order_by("summary"))) # date order self.assertEqual(pks(self.queryset.order_by("-date")), pks(Document.objects.order_by("-date"))) def test_non_ascii_search(self): """ Regression test for #119. 
""" self.assertEqual(pks(self.queryset.filter(content='corrup\xe7\xe3o')), pks(Document.objects.filter(summary__contains='corrup\xe7\xe3o'))) def test_multi_values_exact_search(self): """ Regression test for #103 """ self.assertEqual(len(self.queryset.filter(tags__exact='tag')), 12) self.assertEqual(len(self.queryset.filter(tags__exact='tag-test')), 8) self.assertEqual(len(self.queryset.filter(tags__exact='tag-test-test')), 4) xapian-haystack-2.1.0/tests/xapian_tests/tests/test_query.py000066400000000000000000000466731305552552400244110ustar00rootroot00000000000000from __future__ import unicode_literals import datetime from django.conf import settings from django.test import TestCase from haystack import connections, reset_search_queries from haystack.models import SearchResult from haystack.query import SearchQuerySet, SQ from ...mocks import MockSearchResult from ..models import MockModel, AnotherMockModel, AFourthMockModel from ..search_indexes import MockQueryIndex, MockSearchIndex, BoostMockSearchIndex from ..tests.test_backend import HaystackBackendTestCase class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): """ Tests the XapianSearchQuery, the class that converts SearchQuerySet queries using the `__` notation to XapianQueries. 
""" fixtures = ['base_data.json'] def get_index(self): return MockQueryIndex() def setUp(self): super(XapianSearchQueryTestCase, self).setUp() self.sq = connections['default'].get_query() def test_all(self): self.assertExpectedQuery(self.sq.build_query(), '') def test_single_word(self): self.sq.add_filter(SQ(content='hello')) self.assertExpectedQuery(self.sq.build_query(), '(Zhello OR hello)') def test_single_word_not(self): self.sq.add_filter(~SQ(content='hello')) self.assertExpectedQuery(self.sq.build_query(), '( AND_NOT (Zhello OR hello))') def test_single_word_field_exact(self): self.sq.add_filter(SQ(foo__exact='hello')) self.assertExpectedQuery(self.sq.build_query(), '(XFOO^ PHRASE 3 XFOOhello PHRASE 3 XFOO$)') def test_single_word_field_exact_not(self): self.sq.add_filter(~SQ(foo='hello')) self.assertExpectedQuery(self.sq.build_query(), '( AND_NOT ' '(XFOO^ PHRASE 3 XFOOhello PHRASE 3 XFOO$))') def test_boolean(self): self.sq.add_filter(SQ(content=True)) self.assertExpectedQuery(self.sq.build_query(), '(Ztrue OR true)') def test_date(self): self.sq.add_filter(SQ(content=datetime.date(2009, 5, 8))) self.assertExpectedQuery(self.sq.build_query(), '(Z2009-05-08 OR 2009-05-08)') def test_date_not(self): self.sq.add_filter(~SQ(content=datetime.date(2009, 5, 8))) self.assertExpectedQuery(self.sq.build_query(), '( AND_NOT ' '(Z2009-05-08 OR 2009-05-08))') def test_datetime(self): self.sq.add_filter(SQ(content=datetime.datetime(2009, 5, 8, 11, 28))) self.assertExpectedQuery(self.sq.build_query(), '((Z2009-05-08 OR 2009-05-08) OR' ' (Z11:28:00 OR 11:28:00))', xapian12string='(Z2009-05-08 OR 2009-05-08 OR' ' Z11:28:00 OR 11:28:00)') def test_datetime_not(self): self.sq.add_filter(~SQ(content=datetime.datetime(2009, 5, 8, 11, 28))) self.assertExpectedQuery(self.sq.build_query(), '( AND_NOT ((Z2009-05-08 OR 2009-05-08) OR (Z11:28:00 OR 11:28:00)))', xapian12string='( AND_NOT ' '(Z2009-05-08 OR 2009-05-08 OR' ' Z11:28:00 OR 11:28:00))') def test_float(self): 
self.sq.add_filter(SQ(content=25.52)) self.assertExpectedQuery(self.sq.build_query(), '(Z25.52 OR 25.52)') def test_multiple_words_and(self): self.sq.add_filter(SQ(content='hello')) self.sq.add_filter(SQ(content='world')) self.assertExpectedQuery(self.sq.build_query(), '((Zhello OR hello) AND (Zworld OR world))') def test_multiple_words_not(self): self.sq.add_filter(~SQ(content='hello')) self.sq.add_filter(~SQ(content='world')) self.assertExpectedQuery(self.sq.build_query(), '(( AND_NOT (Zhello OR hello)) AND' ' ( AND_NOT (Zworld OR world)))') def test_multiple_words_or(self): self.sq.add_filter(SQ(content='hello') | SQ(content='world')) self.assertExpectedQuery( self.sq.build_query(), '((Zhello OR hello) OR (Zworld OR world))', xapian12string='(Zhello OR hello OR Zworld OR world)') def test_multiple_words_or_not(self): self.sq.add_filter(~SQ(content='hello') | ~SQ(content='world')) self.assertExpectedQuery(self.sq.build_query(), '(( AND_NOT (Zhello OR hello)) OR' ' ( AND_NOT (Zworld OR world)))') def test_multiple_words_mixed(self): self.sq.add_filter(SQ(content='why') | SQ(content='hello')) self.sq.add_filter(~SQ(content='world')) self.assertExpectedQuery( self.sq.build_query(), '(((Zwhi OR why) OR (Zhello OR hello)) AND ' '( AND_NOT (Zworld OR world)))', xapian12string='((Zwhi OR why OR Zhello OR hello) AND' ' ( AND_NOT (Zworld OR world)))',) def test_multiple_word_field_exact(self): self.sq.add_filter(SQ(foo='hello')) self.sq.add_filter(SQ(title='world')) self.assertExpectedQuery(self.sq.build_query(), '((XFOO^ PHRASE 3 XFOOhello PHRASE 3 XFOO$) AND' ' (XTITLE^ PHRASE 3 XTITLEworld PHRASE 3 XTITLE$))') def test_multiple_word_field_exact_not(self): self.sq.add_filter(~SQ(foo='hello')) self.sq.add_filter(~SQ(title='world')) self.assertExpectedQuery(self.sq.build_query(), '(( AND_NOT (XFOO^ PHRASE 3 XFOOhello PHRASE 3 XFOO$)) AND' ' ( AND_NOT (XTITLE^ PHRASE 3 XTITLEworld PHRASE 3 XTITLE$)))') def test_or(self): self.sq.add_filter(SQ(content='hello world')) 
self.assertExpectedQuery( self.sq.build_query(), '((Zhello OR hello) OR (Zworld OR world))', xapian12string='(Zhello OR hello OR Zworld OR world)') def test_not_or(self): self.sq.add_filter(~SQ(content='hello world')) self.assertExpectedQuery( self.sq.build_query(), '( AND_NOT ((Zhello OR hello) OR (Zworld OR world)))', xapian12string='( AND_NOT (Zhello OR hello OR Zworld OR world))') def test_boost(self): self.sq.add_filter(SQ(content='hello')) self.sq.add_boost('world', 5) self.assertExpectedQuery(self.sq.build_query(), '((Zhello OR hello) AND_MAYBE' ' 5 * (Zworld OR world))') def test_not_in_filter_single_words(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(~SQ(title__in=["Dune", "Jaws"])) self.assertExpectedQuery(self.sq.build_query(), '((Zwhi OR why) AND ' '( AND_NOT (' '(XTITLE^ PHRASE 3 XTITLEdune PHRASE 3 XTITLE$) OR ' '(XTITLE^ PHRASE 3 XTITLEjaws PHRASE 3 XTITLE$))))') def test_in_filter_multiple_words(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(SQ(title__in=["A Famous Paper", "An Infamous Article"])) self.assertExpectedQuery(self.sq.build_query(), '((Zwhi OR why) AND ((XTITLE^ PHRASE 5 XTITLEa PHRASE 5 ' 'XTITLEfamous PHRASE 5 XTITLEpaper PHRASE 5 XTITLE$) OR ' '(XTITLE^ PHRASE 5 XTITLEan PHRASE 5 XTITLEinfamous PHRASE 5 ' 'XTITLEarticle PHRASE 5 XTITLE$)))') def test_in_filter_multiple_words_with_punctuation(self): self.sq.add_filter(SQ(title__in=["A Famous Paper", "An Infamous Article", "My Store Inc."])) self.assertExpectedQuery(self.sq.build_query(), '((XTITLE^ PHRASE 5 XTITLEa PHRASE 5 XTITLEfamous PHRASE 5' ' XTITLEpaper PHRASE 5 XTITLE$) OR ' '(XTITLE^ PHRASE 5 XTITLEan PHRASE 5 XTITLEinfamous PHRASE 5' ' XTITLEarticle PHRASE 5 XTITLE$) OR ' '(XTITLE^ PHRASE 5 XTITLEmy PHRASE 5 XTITLEstore PHRASE 5' ' XTITLEinc. 
PHRASE 5 XTITLE$))') def test_not_in_filter_multiple_words(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(~SQ(title__in=["A Famous Paper", "An Infamous Article"])) self.assertExpectedQuery(self.sq.build_query(), '((Zwhi OR why) AND ( AND_NOT ' '((XTITLE^ PHRASE 5 XTITLEa PHRASE 5 XTITLEfamous PHRASE 5 ' 'XTITLEpaper PHRASE 5 XTITLE$) OR (XTITLE^ PHRASE 5 ' 'XTITLEan PHRASE 5 XTITLEinfamous PHRASE 5 ' 'XTITLEarticle PHRASE 5 XTITLE$))))') def test_in_filter_datetime(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(SQ(pub_date__in=[datetime.datetime(2009, 7, 6, 1, 56, 21)])) self.assertExpectedQuery(self.sq.build_query(), '((Zwhi OR why) AND ' '(XPUB_DATE2009-07-06 AND_MAYBE XPUB_DATE01:56:21))') def test_clean(self): self.assertEqual(self.sq.clean('hello world'), 'hello world') self.assertEqual(self.sq.clean('hello AND world'), 'hello AND world') self.assertEqual(self.sq.clean('hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? : \ world'), 'hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? 
: \ world') self.assertEqual(self.sq.clean('so please NOTe i am in a bAND and bORed'), 'so please NOTe i am in a bAND and bORed') def test_with_models(self): self.sq.add_filter(SQ(content='hello')) self.sq.add_model(MockModel) self.assertExpectedQuery(self.sq.build_query(), '((Zhello OR hello) AND ' '0 * CONTENTTYPEcore.mockmodel)') self.sq.add_model(AnotherMockModel) self.assertExpectedQuery(self.sq.build_query(), ['((Zhello OR hello) AND ' '(0 * CONTENTTYPEcore.mockmodel OR' ' 0 * CONTENTTYPEcore.anothermockmodel))', '((Zhello OR hello) AND ' '(0 * CONTENTTYPEcore.anothermockmodel OR' ' 0 * CONTENTTYPEcore.mockmodel))']) def test_with_punctuation(self): self.sq.add_filter(SQ(content='http://www.example.com')) self.assertExpectedQuery(self.sq.build_query(), '(Zhttp://www.example.com OR' ' http://www.example.com)') def test_in_filter_values_list(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(SQ(title__in=MockModel.objects.values_list('id', flat=True))) self.assertExpectedQuery(self.sq.build_query(), '((Zwhi OR why) AND (' '(XTITLE^ PHRASE 3 XTITLE1 PHRASE 3 XTITLE$) OR ' '(XTITLE^ PHRASE 3 XTITLE2 PHRASE 3 XTITLE$) OR ' '(XTITLE^ PHRASE 3 XTITLE3 PHRASE 3 XTITLE$)))') def test_content_type(self): self.sq.add_filter(SQ(django_ct='time')) self.assertExpectedQuery(self.sq.build_query(), 'CONTENTTYPEtime') class SearchQueryTestCase(HaystackBackendTestCase, TestCase): """ Tests expected behavior of SearchQuery. 
""" fixtures = ['base_data.json'] def get_index(self): return MockSearchIndex() def setUp(self): super(SearchQueryTestCase, self).setUp() self.backend.update(self.index, MockModel.objects.all()) self.sq = connections['default'].get_query() def test_get_spelling(self): self.sq.add_filter(SQ(content='indxd')) self.assertEqual(self.sq.get_spelling_suggestion(), 'indexed') self.assertEqual(self.sq.get_spelling_suggestion('indxd'), 'indexed') def test_contains(self): self.sq.add_filter(SQ(content='circular')) self.sq.add_filter(SQ(title__contains='haystack')) self.assertExpectedQuery(self.sq.build_query(), '((Zcircular OR circular) AND ' '(ZXTITLEhaystack OR XTITLEhaystack))') def test_startswith(self): self.sq.add_filter(SQ(name__startswith='da')) self.assertEqual([result.pk for result in self.sq.get_results()], [1, 2, 3]) def test_endswith(self): with self.assertRaises(NotImplementedError): self.sq.add_filter(SQ(name__endswith='el2')) self.sq.get_results() def test_gt(self): self.sq.add_filter(SQ(name__gt='m')) self.assertExpectedQuery(self.sq.build_query(), '( AND_NOT VALUE_RANGE 3 a m)') def test_gte(self): self.sq.add_filter(SQ(name__gte='m')) self.assertExpectedQuery(self.sq.build_query(), 'VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz' 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' 'zzzzzzzzzzzzzzzzzzzzzzzzzzzz') def test_lt(self): self.sq.add_filter(SQ(name__lt='m')) self.assertExpectedQuery(self.sq.build_query(), '( AND_NOT VALUE_RANGE 3 m ' 'zzzzzzzzzzzzzzzzzzzzzzzzzzzz' 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' 'zzzzzzzzzzzzzzzzzzzzzzzzzzzz)') def test_lte(self): self.sq.add_filter(SQ(name__lte='m')) self.assertExpectedQuery(self.sq.build_query(), 'VALUE_RANGE 3 a m') def test_range(self): self.sq.add_filter(SQ(django_id__range=[2, 4])) self.assertExpectedQuery(self.sq.build_query(), 'VALUE_RANGE 1 000000000002 000000000004') self.sq.add_filter(~SQ(django_id__range=[0, 2])) self.assertExpectedQuery(self.sq.build_query(), '(VALUE_RANGE 1 000000000002 
000000000004 AND ' '( AND_NOT VALUE_RANGE 1 000000000000 000000000002))') self.assertEqual([result.pk for result in self.sq.get_results()], [3]) def test_multiple_filter_types(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(SQ(pub_date__lte=datetime.datetime(2009, 2, 10, 1, 59, 0))) self.sq.add_filter(SQ(name__gt='david')) self.sq.add_filter(SQ(title__gte='B')) self.sq.add_filter(SQ(django_id__in=[1, 2, 3])) self.assertExpectedQuery(self.sq.build_query(), '((Zwhi OR why) AND' ' VALUE_RANGE 5 00010101000000 20090210015900 AND' ' ( AND_NOT VALUE_RANGE 3 a david)' ' AND VALUE_RANGE 7 b zzzzzzzzzzzzzzzzzzzzzzzzzzz' 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' 'zzzzzzzzzzzzzzzzzzzzzzzzz AND' ' (QQ000000000001 OR QQ000000000002 OR QQ000000000003))') def test_log_query(self): reset_search_queries() self.assertEqual(len(connections['default'].queries), 0) # Stow. old_debug = settings.DEBUG settings.DEBUG = False len(self.sq.get_results()) self.assertEqual(len(connections['default'].queries), 0) settings.DEBUG = True # Redefine it to clear out the cached results. self.sq = connections['default'].get_query() self.sq.add_filter(SQ(name='bar')) len(self.sq.get_results()) self.assertEqual(len(connections['default'].queries), 1) self.assertExpectedQuery(connections['default'].queries[0]['query_string'], '(XNAME^ PHRASE 3 XNAMEbar PHRASE 3 XNAME$)') # And again, for good measure. self.sq = connections['default'].get_query() self.sq.add_filter(SQ(name='bar')) self.sq.add_filter(SQ(text='moof')) len(self.sq.get_results()) self.assertEqual(len(connections['default'].queries), 2) self.assertExpectedQuery(connections['default'].queries[0]['query_string'], '(XNAME^ PHRASE 3 XNAMEbar PHRASE 3 XNAME$)') self.assertExpectedQuery(connections['default'].queries[1]['query_string'], '((XNAME^ PHRASE 3 XNAMEbar PHRASE 3 XNAME$) AND' ' (XTEXT^ PHRASE 3 XTEXTmoof PHRASE 3 XTEXT$))') # Restore. 
settings.DEBUG = old_debug class LiveSearchQuerySetTestCase(HaystackBackendTestCase, TestCase): """ SearchQuerySet specific tests """ fixtures = ['base_data.json'] def get_index(self): return MockSearchIndex() def setUp(self): super(LiveSearchQuerySetTestCase, self).setUp() self.backend.update(self.index, MockModel.objects.all()) self.sq = connections['default'].get_query() self.sqs = SearchQuerySet() def test_result_class(self): # Assert that we're defaulting to ``SearchResult``. sqs = self.sqs.all() self.assertTrue(isinstance(sqs[0], SearchResult)) # Custom class. sqs = self.sqs.result_class(MockSearchResult).all() self.assertTrue(isinstance(sqs[0], MockSearchResult)) # Reset to default. sqs = self.sqs.result_class(None).all() self.assertTrue(isinstance(sqs[0], SearchResult)) def test_facet(self): self.assertEqual(len(self.sqs.facet('name').facet_counts()['fields']['name']), 3) class BoostFieldTestCase(HaystackBackendTestCase, TestCase): """ Tests boosted fields. """ def get_index(self): return BoostMockSearchIndex() def setUp(self): super(BoostFieldTestCase, self).setUp() self.sample_objs = [] for i in range(1, 5): mock = AFourthMockModel() mock.id = i if i % 2: mock.author = 'daniel' mock.editor = 'david' else: mock.author = 'david' mock.editor = 'daniel' mock.pub_date = datetime.date(2009, 2, 25) - datetime.timedelta(days=i) self.sample_objs.append(mock) self.backend.update(self.index, self.sample_objs) def test_boost(self): sqs = SearchQuerySet() self.assertEqual(len(sqs.all()), 4) results = sqs.filter(SQ(author='daniel') | SQ(editor='daniel')) self.assertEqual([result.id for result in results], [ 'core.afourthmockmodel.1', 'core.afourthmockmodel.3', 'core.afourthmockmodel.2', 'core.afourthmockmodel.4' ]) xapian-haystack-2.1.0/xapian_backend.py000077500000000000000000002001501305552552400201070ustar00rootroot00000000000000from __future__ import unicode_literals import datetime import pickle import os import re import shutil import sys from django.utils import 
six from django.conf import settings from django.core.exceptions import ImproperlyConfigured from django.utils.encoding import force_text from haystack import connections from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, SearchNode, log_query from haystack.constants import ID, DJANGO_ID, DJANGO_CT, DEFAULT_OPERATOR from haystack.exceptions import HaystackError, MissingDependency from haystack.inputs import AutoQuery from haystack.models import SearchResult from haystack.utils import get_identifier, get_model_ct NGRAM_MIN_LENGTH = 2 NGRAM_MAX_LENGTH = 15 try: import xapian except ImportError: raise MissingDependency("The 'xapian' backend requires the installation of 'Xapian'. " "Please refer to the documentation.") class NotSupportedError(Exception): """ When the installed version of Xapian doesn't support something and we have the old implementation. """ pass # this maps the different reserved fields to prefixes used to # create the database: # id str: unique document id. # django_id int: id of the django model instance. # django_ct str: of the content type of the django model. # field str: name of the field of the index. TERM_PREFIXES = { ID: 'Q', DJANGO_ID: 'QQ', DJANGO_CT: 'CONTENTTYPE', 'field': 'X' } MEMORY_DB_NAME = ':memory:' DEFAULT_XAPIAN_FLAGS = ( xapian.QueryParser.FLAG_PHRASE | xapian.QueryParser.FLAG_BOOLEAN | xapian.QueryParser.FLAG_LOVEHATE | xapian.QueryParser.FLAG_WILDCARD | xapian.QueryParser.FLAG_PURE_NOT ) # Mapping from `HAYSTACK_DEFAULT_OPERATOR` to Xapian operators XAPIAN_OPTS = {'AND': xapian.Query.OP_AND, 'OR': xapian.Query.OP_OR, 'PHRASE': xapian.Query.OP_PHRASE, 'NEAR': xapian.Query.OP_NEAR } # number of documents checked by default when building facets # this must be improved to be relative to the total number of docs. 
DEFAULT_CHECK_AT_LEAST = 1000 # field types accepted to be serialized as values in Xapian FIELD_TYPES = {'text', 'integer', 'date', 'datetime', 'float', 'boolean', 'edge_ngram', 'ngram'} # defines the format used to store types in Xapian # this format ensures datetimes are sorted correctly DATETIME_FORMAT = '%Y%m%d%H%M%S' INTEGER_FORMAT = '%012d' # defines the distance given between # texts with positional information TERMPOS_DISTANCE = 100 class InvalidIndexError(HaystackError): """Raised when an index can not be opened.""" pass class XHValueRangeProcessor(xapian.ValueRangeProcessor): """ A Processor to construct ranges of values """ def __init__(self, backend): self.backend = backend xapian.ValueRangeProcessor.__init__(self) def __call__(self, begin, end): """ Construct a tuple for value range processing. `begin` -- a string in the format ':[low_range]' If 'low_range' is omitted, assume the smallest possible value. `end` -- a string in the the format '[high_range|*]'. If '*', assume the highest possible value. Return a tuple of three strings: (column, low, high) """ colon = begin.find(':') field_name = begin[:colon] begin = begin[colon + 1:len(begin)] for field_dict in self.backend.schema: if field_dict['field_name'] == field_name: field_type = field_dict['type'] if not begin: if field_type == 'text': begin = 'a' # TODO: A better way of getting a min text value? elif field_type == 'integer': begin = -sys.maxsize - 1 elif field_type == 'float': begin = float('-inf') elif field_type == 'date' or field_type == 'datetime': begin = '00010101000000' elif end == '*': if field_type == 'text': end = 'z' * 100 # TODO: A better way of getting a max text value? 
elif field_type == 'integer': end = sys.maxsize elif field_type == 'float': end = float('inf') elif field_type == 'date' or field_type == 'datetime': end = '99990101000000' if field_type == 'float': begin = _term_to_xapian_value(float(begin), field_type) end = _term_to_xapian_value(float(end), field_type) elif field_type == 'integer': begin = _term_to_xapian_value(int(begin), field_type) end = _term_to_xapian_value(int(end), field_type) return field_dict['column'], str(begin), str(end) class XHExpandDecider(xapian.ExpandDecider): def __call__(self, term): """ Return True if the term should be used for expanding the search query, False otherwise. Ignore terms related with the content type of objects. """ if term.decode('utf-8').startswith(TERM_PREFIXES[DJANGO_CT]): return False return True class XapianSearchBackend(BaseSearchBackend): """ `SearchBackend` defines the Xapian search backend for use with the Haystack API for Django search. It uses the Xapian Python bindings to interface with Xapian, and as such is subject to this bug: when Django is running with mod_python or mod_wsgi under Apache. Until this issue has been fixed by Xapian, it is neccessary to set `WSGIApplicationGroup to %{GLOBAL}` when using mod_wsgi, or `PythonInterpreter main_interpreter` when using mod_python. In order to use this backend, `PATH` must be included in the `connection_options`. This should point to a location where you would your indexes to reside. """ inmemory_db = None def __init__(self, connection_alias, **connection_options): """ Instantiates an instance of `SearchBackend`. Optional arguments: `connection_alias` -- The name of the connection `language` -- The stemming language (default = 'english') `**connection_options` -- The various options needed to setup the backend. Also sets the stemming language to be used to `language`. 
""" super(XapianSearchBackend, self).__init__(connection_alias, **connection_options) if not 'PATH' in connection_options: raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias) self.path = connection_options.get('PATH') if self.path != MEMORY_DB_NAME and not os.path.exists(self.path): os.makedirs(self.path) self.flags = connection_options.get('FLAGS', DEFAULT_XAPIAN_FLAGS) self.language = getattr(settings, 'HAYSTACK_XAPIAN_LANGUAGE', 'english') stemming_strategy_string = getattr(settings, 'HAYSTACK_XAPIAN_STEMMING_STRATEGY', 'STEM_SOME') self.stemming_strategy = getattr(xapian.QueryParser, stemming_strategy_string, xapian.QueryParser.STEM_SOME) # these 4 attributes are caches populated in `build_schema` # they are checked in `_update_cache` # use property to retrieve them self._fields = {} self._schema = [] self._content_field_name = None self._columns = {} def _update_cache(self): """ To avoid build_schema every time, we cache some values: they only change when a SearchIndex changes, which typically restarts the Python. """ fields = connections[self.connection_alias].get_unified_index().all_searchfields() if self._fields != fields: self._fields = fields self._content_field_name, self._schema = self.build_schema(self._fields) @property def schema(self): self._update_cache() return self._schema @property def content_field_name(self): self._update_cache() return self._content_field_name @property def column(self): """ Returns the column in the database of a given field name. """ self._update_cache() return self._columns def update(self, index, iterable): """ Updates the `index` with any objects in `iterable` by adding/updating the database as needed. 
Required arguments: `index` -- The `SearchIndex` to process `iterable` -- An iterable of model instances to index For each object in `iterable`, a document is created containing all of the terms extracted from `index.full_prepare(obj)` with field prefixes, and 'as-is' as needed. Also, if the field type is 'text' it will be stemmed and stored with the 'Z' prefix as well. eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest, XCONTENTtest` Each document also contains an extra term in the format: `XCONTENTTYPE.` As well as a unique identifier in the the format: `Q..` eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar` This is useful for querying for a specific document corresponding to a model instance. The document also contains a pickled version of the object itself and the document ID in the document data field. Finally, we also store field values to be used for sorting data. We store these in the document value slots (position zero is reserver for the document ID). All values are stored as unicode strings with conversion of float, int, double, values being done by Xapian itself through the use of the :method:xapian.sortable_serialise method. """ database = self._database(writable=True) try: term_generator = xapian.TermGenerator() term_generator.set_database(database) term_generator.set_stemmer(xapian.Stem(self.language)) try: term_generator.set_stemming_strategy(self.stemming_strategy) except AttributeError: # Versions before Xapian 1.2.11 do not support stemming strategies for TermGenerator pass if self.include_spelling is True: term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING) def _add_text(termpos, text, weight, prefix=''): """ indexes text appending 2 extra terms to identify beginning and ending of the text. 
""" term_generator.set_termpos(termpos) start_term = '%s^' % prefix end_term = '%s$' % prefix # add begin document.add_posting(start_term, termpos, weight) # add text term_generator.index_text(text, weight, prefix) termpos = term_generator.get_termpos() # add ending termpos += 1 document.add_posting(end_term, termpos, weight) # increase termpos term_generator.set_termpos(termpos) term_generator.increase_termpos(TERMPOS_DISTANCE) return term_generator.get_termpos() def _add_literal_text(termpos, text, weight, prefix=''): """ Adds sentence to the document with positional information but without processing. The sentence is bounded by "^" "$" to allow exact matches. """ text = '^ %s $' % text for word in text.split(): term = '%s%s' % (prefix, word) document.add_posting(term, termpos, weight) termpos += 1 termpos += TERMPOS_DISTANCE return termpos def add_text(termpos, prefix, text, weight): """ Adds text to the document with positional information and processing (e.g. stemming). """ termpos = _add_text(termpos, text, weight, prefix=prefix) termpos = _add_text(termpos, text, weight, prefix='') termpos = _add_literal_text(termpos, text, weight, prefix=prefix) termpos = _add_literal_text(termpos, text, weight, prefix='') return termpos def _get_ngram_lengths(value): values = value.split() for item in values: for ngram_length in six.moves.range(NGRAM_MIN_LENGTH, NGRAM_MAX_LENGTH + 1): yield item, ngram_length for obj in iterable: document = xapian.Document() term_generator.set_document(document) def ngram_terms(value): for item, length in _get_ngram_lengths(value): item_length = len(item) for start in six.moves.range(0, item_length - length + 1): for size in six.moves.range(length, length + 1): end = start + size if end > item_length: continue yield _to_xapian_term(item[start:end]) def edge_ngram_terms(value): for item, length in _get_ngram_lengths(value): yield _to_xapian_term(item[0:length]) def add_edge_ngram_to_document(prefix, value, weight): """ Splits the term in 
            ngrams and adds each ngram to the index. The minimum and maximum
            size of the ngram is respectively NGRAM_MIN_LENGTH and NGRAM_MAX_LENGTH.
            """
            for term in edge_ngram_terms(value):
                document.add_term(term, weight)
                document.add_term(prefix + term, weight)

            def add_ngram_to_document(prefix, value, weight):
                """
                Splits the term in ngrams and adds each ngram to the index.
                The minimum and maximum size of the ngram is respectively
                NGRAM_MIN_LENGTH and NGRAM_MAX_LENGTH.
                """
                for term in ngram_terms(value):
                    document.add_term(term, weight)
                    document.add_term(prefix + term, weight)

            def add_non_text_to_document(prefix, term, weight):
                """
                Adds term to the document without positional information
                and without processing.

                If the term is alone, also adds it as "^<term>$"
                to allow exact matches on single terms.
                """
                document.add_term(term, weight)
                document.add_term(prefix + term, weight)

            def add_datetime_to_document(termpos, prefix, term, weight):
                """
                Adds a datetime to document with positional order
                to allow exact matches on it.

                Returns the next free term position.
                """
                # a datetime term is "YYYY-MM-DD HH:MM:SS"-like; index the
                # date part and the time part as adjacent postings so a
                # phrase search can match the full value exactly.
                date, time = term.split()
                document.add_posting(date, termpos, weight)
                termpos += 1
                document.add_posting(time, termpos, weight)
                termpos += 1
                document.add_posting(prefix + date, termpos, weight)
                termpos += 1
                document.add_posting(prefix + time, termpos, weight)
                # leave a gap so phrases cannot span two different fields.
                termpos += TERMPOS_DISTANCE + 1
                return termpos

            data = index.full_prepare(obj)
            weights = index.get_field_weights()

            termpos = term_generator.get_termpos()  # identifies the current position in the document.
            for field in self.schema:
                if field['field_name'] not in list(data.keys()):
                    # not supported fields are ignored.
                    continue

                if field['field_name'] in weights:
                    weight = int(weights[field['field_name']])
                else:
                    weight = 1

                value = data[field['field_name']]

                if field['field_name'] in (ID, DJANGO_ID, DJANGO_CT):
                    # Private fields are indexed in a different way:
                    # `django_id` is an int and `django_ct` is text;
                    # besides, they are indexed by their (unstemmed) value.
                    if field['field_name'] == DJANGO_ID:
                        value = int(value)
                    value = _term_to_xapian_value(value, field['type'])

                    document.add_term(TERM_PREFIXES[field['field_name']] + value, weight)
                    document.add_value(field['column'], value)
                    continue
                else:
                    prefix = TERM_PREFIXES['field'] + field['field_name'].upper()

                    # if not multi_valued, we add as a document value
                    # for sorting and facets
                    if field['multi_valued'] == 'false':
                        document.add_value(field['column'], _term_to_xapian_value(value, field['type']))
                    else:
                        for t in value:
                            # add the exact match of each value
                            # NOTE: `add_text` is defined earlier in this
                            # method, outside this excerpt.
                            term = _to_xapian_term(t)
                            termpos = add_text(termpos, prefix, term, weight)
                        continue

                    term = _to_xapian_term(value)
                    if term == '':
                        continue
                    # from here on the term is a string;
                    # we now decide how it is indexed

                    if field['type'] == 'text':
                        # text is indexed with positional information
                        termpos = add_text(termpos, prefix, term, weight)
                    elif field['type'] == 'datetime':
                        termpos = add_datetime_to_document(termpos, prefix, term, weight)
                    elif field['type'] == 'ngram':
                        add_ngram_to_document(prefix, value, weight)
                    elif field['type'] == 'edge_ngram':
                        add_edge_ngram_to_document(prefix, value, weight)
                    else:
                        # all other terms are added without positional information
                        add_non_text_to_document(prefix, term, weight)

            # store data without indexing it
            document.set_data(pickle.dumps(
                (obj._meta.app_label, obj._meta.model_name, obj.pk, data),
                pickle.HIGHEST_PROTOCOL
            ))

            # add the id of the document
            document_id = TERM_PREFIXES[ID] + get_identifier(obj)
            document.add_term(document_id)

            # finally, replace or add the document to the database
            database.replace_document(document_id, document)
        except UnicodeDecodeError:
            # best-effort: skip the chunk that failed to decode rather than
            # aborting the whole indexing run.
            sys.stderr.write('Chunk failed.\n')
            pass
        finally:
            database.close()

    def remove(self, obj):
        """
        Remove indexes for `obj` from the database.

        We delete all instances of `Q..` which should be unique to
        this object.
        """
        database = self._database(writable=True)
        database.delete_document(TERM_PREFIXES[ID] + get_identifier(obj))
        database.close()

    def clear(self, models=(), commit=True):
        """
        Clear all instances of `models` from the database or all models, if
        not specified.

        Optional Arguments:
            `models` -- Models to clear from the database (default = [])

        If `models` is empty, an empty query is executed which matches all
        documents in the database.  Afterwards, each match is deleted.

        Otherwise, for each model, a `delete_document` call is issued with
        the term `XCONTENTTYPE<app_name>.<model_name>`.  This will delete
        all documents with the specified model type.
        """
        if not models:
            # Because there does not appear to be a "clear all" method,
            # it's much quicker to remove the contents of the `self.path`
            # folder than it is to remove each document one at a time.
            if os.path.exists(self.path):
                shutil.rmtree(self.path)
        else:
            database = self._database(writable=True)
            for model in models:
                database.delete_document(TERM_PREFIXES[DJANGO_CT] + get_model_ct(model))
            database.close()

    def document_count(self):
        # Number of documents in the index; 0 when the index does not exist.
        try:
            return self._database().get_doccount()
        except InvalidIndexError:
            return 0

    def _build_models_query(self, query):
        """
        Builds a query from `query` that filters to documents only from registered models.
        """
        registered_models_ct = self.build_models_list()
        if registered_models_ct:
            restrictions = [xapian.Query('%s%s' % (TERM_PREFIXES[DJANGO_CT], model_ct))
                            for model_ct in registered_models_ct]
            limit_query = xapian.Query(xapian.Query.OP_OR, restrictions)

            query = xapian.Query(xapian.Query.OP_AND, query, limit_query)

        return query

    def _check_field_names(self, field_names):
        """
        Raises InvalidIndexError if any of a field_name in field_names is
        not indexed.
""" if field_names: for field_name in field_names: try: self.column[field_name] except KeyError: raise InvalidIndexError('Trying to use non indexed field "%s"' % field_name) @log_query def search(self, query, sort_by=None, start_offset=0, end_offset=None, fields='', highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None, spelling_query=None, limit_to_registered_models=None, result_class=None, **kwargs): """ Executes the Xapian::query as defined in `query`. Required arguments: `query` -- Search query to execute Optional arguments: `sort_by` -- Sort results by specified field (default = None) `start_offset` -- Slice results from `start_offset` (default = 0) `end_offset` -- Slice results at `end_offset` (default = None), if None, then all documents `fields` -- Filter results on `fields` (default = '') `highlight` -- Highlight terms in results (default = False) `facets` -- Facet results on fields (default = None) `date_facets` -- Facet results on date ranges (default = None) `query_facets` -- Facet results on queries (default = None) `narrow_queries` -- Narrow queries (default = None) `spelling_query` -- An optional query to execute spelling suggestion on `limit_to_registered_models` -- Limit returned results to models registered in the current `SearchSite` (default = True) Returns: A dictionary with the following keys: `results` -- A list of `SearchResult` `hits` -- The total available results `facets` - A dictionary of facets with the following keys: `fields` -- A list of field facets `dates` -- A list of date facets `queries` -- A list of query facets If faceting was not used, the `facets` key will not be present If `query` is None, returns no results. If `INCLUDE_SPELLING` was enabled in the connection options, the extra flag `FLAG_SPELLING_CORRECTION` will be passed to the query parser and any suggestions for spell correction will be returned as well as the results. 
""" if xapian.Query.empty(query): return { 'results': [], 'hits': 0, } self._check_field_names(facets) self._check_field_names(date_facets) self._check_field_names(query_facets) database = self._database() if limit_to_registered_models is None: limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) if result_class is None: result_class = SearchResult if self.include_spelling is True: spelling_suggestion = self._do_spelling_suggestion(database, query, spelling_query) else: spelling_suggestion = '' if narrow_queries is not None: query = xapian.Query( xapian.Query.OP_AND, query, xapian.Query( xapian.Query.OP_AND, [self.parse_query(narrow_query) for narrow_query in narrow_queries] ) ) if limit_to_registered_models: query = self._build_models_query(query) enquire = xapian.Enquire(database) if hasattr(settings, 'HAYSTACK_XAPIAN_WEIGHTING_SCHEME'): enquire.set_weighting_scheme(xapian.BM25Weight(*settings.HAYSTACK_XAPIAN_WEIGHTING_SCHEME)) enquire.set_query(query) if sort_by: try: _xapian_sort(enquire, sort_by, self.column) except NotSupportedError: _old_xapian_sort(enquire, sort_by, self.column) results = [] facets_dict = { 'fields': {}, 'dates': {}, 'queries': {}, } if not end_offset: end_offset = database.get_doccount() - start_offset ## prepare spies in case of facets if facets: facets_spies = self._prepare_facet_field_spies(facets) for spy in facets_spies: enquire.add_matchspy(spy) # print enquire.get_query() matches = self._get_enquire_mset(database, enquire, start_offset, end_offset) for match in matches: app_label, model_name, pk, model_data = pickle.loads(self._get_document_data(database, match.document)) if highlight: model_data['highlighted'] = { self.content_field_name: self._do_highlight( model_data.get(self.content_field_name), query ) } results.append( result_class(app_label, model_name, pk, match.percent, **model_data) ) if facets: # pick single valued facets from spies single_facets_dict = 
self._process_facet_field_spies(facets_spies) # pick multivalued valued facets from results multi_facets_dict = self._do_multivalued_field_facets(results, facets) # merge both results (http://stackoverflow.com/a/38990/931303) facets_dict['fields'] = dict(list(single_facets_dict.items()) + list(multi_facets_dict.items())) if date_facets: facets_dict['dates'] = self._do_date_facets(results, date_facets) if query_facets: facets_dict['queries'] = self._do_query_facets(results, query_facets) return { 'results': results, 'hits': self._get_hit_count(database, enquire), 'facets': facets_dict, 'spelling_suggestion': spelling_suggestion, } def more_like_this(self, model_instance, additional_query=None, start_offset=0, end_offset=None, limit_to_registered_models=True, result_class=None, **kwargs): """ Given a model instance, returns a result set of similar documents. Required arguments: `model_instance` -- The model instance to use as a basis for retrieving similar documents. Optional arguments: `additional_query` -- An additional query to narrow results `start_offset` -- The starting offset (default=0) `end_offset` -- The ending offset (default=None), if None, then all documents `limit_to_registered_models` -- Limit returned results to models registered in the search (default = True) Returns: A dictionary with the following keys: `results` -- A list of `SearchResult` `hits` -- The total available results Opens a database connection, then builds a simple query using the `model_instance` to build the unique identifier. For each document retrieved(should always be one), adds an entry into an RSet (relevance set) with the document id, then, uses the RSet to query for an ESet (A set of terms that can be used to suggest expansions to the original query), omitting any document that was in the original query. Finally, processes the resulting matches and returns. 
""" database = self._database() if result_class is None: result_class = SearchResult query = xapian.Query(TERM_PREFIXES[ID] + get_identifier(model_instance)) enquire = xapian.Enquire(database) enquire.set_query(query) rset = xapian.RSet() if not end_offset: end_offset = database.get_doccount() match = None for match in self._get_enquire_mset(database, enquire, 0, end_offset): rset.add_document(match.docid) if match is None: if not self.silently_fail: raise InvalidIndexError('Instance %s with id "%d" not indexed' % (get_identifier(model_instance), model_instance.id)) else: return {'results': [], 'hits': 0} query = xapian.Query( xapian.Query.OP_ELITE_SET, [expand.term for expand in enquire.get_eset(match.document.termlist_count(), rset, XHExpandDecider())], match.document.termlist_count() ) query = xapian.Query( xapian.Query.OP_AND_NOT, [query, TERM_PREFIXES[ID] + get_identifier(model_instance)] ) if limit_to_registered_models: query = self._build_models_query(query) if additional_query: query = xapian.Query( xapian.Query.OP_AND, query, additional_query ) enquire.set_query(query) results = [] matches = self._get_enquire_mset(database, enquire, start_offset, end_offset) for match in matches: app_label, model_name, pk, model_data = pickle.loads(self._get_document_data(database, match.document)) results.append( result_class(app_label, model_name, pk, match.percent, **model_data) ) return { 'results': results, 'hits': self._get_hit_count(database, enquire), 'facets': { 'fields': {}, 'dates': {}, 'queries': {}, }, 'spelling_suggestion': None, } def parse_query(self, query_string): """ Given a `query_string`, will attempt to return a xapian.Query Required arguments: ``query_string`` -- A query string to parse Returns a xapian.Query """ if query_string == '*': return xapian.Query('') # Match everything elif query_string == '': return xapian.Query() # Match nothing qp = xapian.QueryParser() qp.set_database(self._database()) qp.set_stemmer(xapian.Stem(self.language)) 
qp.set_stemming_strategy(self.stemming_strategy) qp.set_default_op(XAPIAN_OPTS[DEFAULT_OPERATOR]) qp.add_boolean_prefix(DJANGO_CT, TERM_PREFIXES[DJANGO_CT]) for field_dict in self.schema: # since 'django_ct' has a boolean_prefix, # we ignore it here. if field_dict['field_name'] == DJANGO_CT: continue qp.add_prefix( field_dict['field_name'], TERM_PREFIXES['field'] + field_dict['field_name'].upper() ) vrp = XHValueRangeProcessor(self) qp.add_valuerangeprocessor(vrp) return qp.parse_query(query_string, self.flags) def build_schema(self, fields): """ Build the schema from fields. :param fields: A list of fields in the index :returns: list of dictionaries Each dictionary has the keys field_name: The name of the field index type: what type of value it is 'multi_valued': if it allows more than one value 'column': a number identifying it 'type': the type of the field 'multi_valued': 'false', 'column': 0} """ content_field_name = '' schema_fields = [ {'field_name': ID, 'type': 'text', 'multi_valued': 'false', 'column': 0}, {'field_name': DJANGO_ID, 'type': 'integer', 'multi_valued': 'false', 'column': 1}, {'field_name': DJANGO_CT, 'type': 'text', 'multi_valued': 'false', 'column': 2}, ] self._columns[ID] = 0 self._columns[DJANGO_ID] = 1 self._columns[DJANGO_CT] = 2 column = len(schema_fields) for field_name, field_class in sorted(list(fields.items()), key=lambda n: n[0]): if field_class.document is True: content_field_name = field_class.index_fieldname if field_class.indexed is True: field_data = { 'field_name': field_class.index_fieldname, 'type': 'text', 'multi_valued': 'false', 'column': column, } if field_class.field_type == 'date': field_data['type'] = 'date' elif field_class.field_type == 'datetime': field_data['type'] = 'datetime' elif field_class.field_type == 'integer': field_data['type'] = 'integer' elif field_class.field_type == 'float': field_data['type'] = 'float' elif field_class.field_type == 'boolean': field_data['type'] = 'boolean' elif 
field_class.field_type == 'ngram': field_data['type'] = 'ngram' elif field_class.field_type == 'edge_ngram': field_data['type'] = 'edge_ngram' if field_class.is_multivalued: field_data['multi_valued'] = 'true' schema_fields.append(field_data) self._columns[field_data['field_name']] = column column += 1 return content_field_name, schema_fields @staticmethod def _do_highlight(content, query, tag='em'): """ Highlight `query` terms in `content` with html `tag`. This method assumes that the input text (`content`) does not contain any special formatting. That is, it does not contain any html tags or similar markup that could be screwed up by the highlighting. Required arguments: `content` -- Content to search for instances of `text` `text` -- The text to be highlighted """ for term in query: term = term.decode('utf-8') for match in re.findall('[^A-Z]+', term): # Ignore field identifiers match_re = re.compile(match, re.I) content = match_re.sub('<%s>%s' % (tag, term, tag), content) return content def _prepare_facet_field_spies(self, facets): """ Returns a list of spies based on the facets used to count frequencies. """ spies = [] for facet in facets: slot = self.column[facet] spy = xapian.ValueCountMatchSpy(slot) # add attribute "slot" to know which column this spy is targeting. spy.slot = slot spies.append(spy) return spies def _process_facet_field_spies(self, spies): """ Returns a dict of facet names with lists of tuples of the form (term, term_frequency) from a list of spies that observed the enquire. """ facet_dict = {} for spy in spies: field = self.schema[spy.slot] field_name, field_type = field['field_name'], field['type'] facet_dict[field_name] = [] for facet in list(spy.values()): if field_type == 'float': # the float term is a Xapian serialized object, which is # in bytes. 
                    term = facet.term
                else:
                    term = facet.term.decode('utf-8')
                facet_dict[field_name].append((_from_xapian_value(term, field_type), facet.termfreq))
        return facet_dict

    def _do_multivalued_field_facets(self, results, field_facets):
        """
        Implements a multivalued field facet on the results.

        This is implemented using brute force - O(N^2) -
        because Xapian does not have it implemented yet
        (see http://trac.xapian.org/ticket/199)
        """
        facet_dict = {}

        for field in field_facets:
            facet_list = {}
            if not self._multi_value_field(field):
                continue

            for result in results:
                field_value = getattr(result, field)
                for item in field_value:  # Facet each item in a MultiValueField
                    facet_list[item] = facet_list.get(item, 0) + 1

            facet_dict[field] = list(facet_list.items())
        return facet_dict

    @staticmethod
    def _do_date_facets(results, date_facets):
        """
        Private method that facets a document by date ranges

        Required arguments:
            `results` -- A list SearchResults to facet
            `date_facets` -- A dictionary containing facet parameters:
                {'field': {'start_date': ..., 'end_date': ...: 'gap_by': '...', 'gap_amount': n}}
                nb., gap must be one of the following:
                    year|month|day|hour|minute|second

        For each date facet field in `date_facets`, generates a list
        of date ranges (from `start_date` to `end_date` by `gap_by`) then
        iterates through `results` and tallies the count for each date_facet.

        Returns a dictionary of date facets (fields) containing a list with
        entries for each range and a count of documents matching the range.

        eg. {
                'pub_date': [
            (datetime.datetime(2009, 1, 1, 0, 0), 5),
            (datetime.datetime(2009, 2, 1, 0, 0), 0),
            (datetime.datetime(2009, 3, 1, 0, 0), 0),
            (datetime.datetime(2008, 4, 1, 0, 0), 1),
            (datetime.datetime(2008, 5, 1, 0, 0), 2),
        ],
        }
        """
        def next_datetime(previous, gap_value, gap_type):
            # Returns `previous` advanced by `gap_value` units of `gap_type`.
            year = previous.year
            month = previous.month

            if gap_type == 'year':
                next = previous.replace(year=year + gap_value)
            elif gap_type == 'month':
                if month + gap_value <= 12:
                    next = previous.replace(month=month + gap_value)
                else:
                    # wrap the month and carry into the year.
                    next = previous.replace(month=((month + gap_value) % 12),
                                            year=(year + (month + gap_value) // 12))
            elif gap_type == 'day':
                next = previous + datetime.timedelta(days=gap_value)
            elif gap_type == 'hour':
                # NOTE(review): this branch returns directly instead of
                # assigning `next` like its siblings — behavior is the same.
                return previous + datetime.timedelta(hours=gap_value)
            elif gap_type == 'minute':
                next = previous + datetime.timedelta(minutes=gap_value)
            elif gap_type == 'second':
                next = previous + datetime.timedelta(seconds=gap_value)
            else:
                # NOTE(review): the message omits 'hour', which IS handled
                # above — confirm and update the message if intended.
                raise TypeError('\'gap_by\' must be '
                                '{second, minute, day, month, year}')
            return next

        facet_dict = {}

        for date_facet, facet_params in list(date_facets.items()):
            gap_type = facet_params.get('gap_by')
            gap_value = facet_params.get('gap_amount', 1)
            date_range = facet_params['start_date']

            # construct the bins of the histogram
            facet_list = []
            while date_range < facet_params['end_date']:
                facet_list.append((date_range, 0))
                date_range = next_datetime(date_range, gap_value, gap_type)

            facet_list = sorted(facet_list, key=lambda x: x[0], reverse=True)

            for result in results:
                result_date = getattr(result, date_facet)

                # convert date to datetime
                if not isinstance(result_date, datetime.datetime):
                    result_date = datetime.datetime(result_date.year, result_date.month, result_date.day)

                # ignore results outside the boundaries.
                # NOTE(review): facet_list is sorted descending, so
                # facet_list[0][0] >= facet_list[-1][0] and this condition can
                # never be true — looks like dead code; presumably it was meant
                # to skip dates outside [start_date, end_date). Verify intent.
                if facet_list[0][0] < result_date < facet_list[-1][0]:
                    continue

                # populate the histogram by putting the result on the right bin.
                for n, facet_date in enumerate(facet_list):
                    if result_date > facet_date[0]:
                        # equal to facet_list[n][1] += 1, but for a tuple
                        facet_list[n] = (facet_list[n][0], (facet_list[n][1] + 1))
                        break  # bin found; go to next result

            facet_dict[date_facet] = facet_list

        return facet_dict

    def _do_query_facets(self, results, query_facets):
        """
        Private method that facets a document by query

        Required arguments:
            `results` -- A list SearchResults to facet
            `query_facets` -- A dictionary containing facet parameters:
                {'field': 'query', [...]}

        For each query in `query_facets`, generates a dictionary entry with
        the field name as the key and a tuple with the query and result count
        as the value.

        eg. {'name': ('a*', 5)}
        """
        facet_dict = {}
        for field, query in list(dict(query_facets).items()):
            facet_dict[field] = (query, self.search(self.parse_query(query))['hits'])

        return facet_dict

    @staticmethod
    def _do_spelling_suggestion(database, query, spelling_query):
        """
        Private method that returns a single spelling suggestion based on
        `spelling_query` or `query`.

        Required arguments:
            `database` -- The database to check spelling against
            `query` -- The query to check
            `spelling_query` -- If not None, this will be checked instead of `query`

        Returns a string with a suggested spelling
        """
        if spelling_query:
            if ' ' in spelling_query:
                return ' '.join([database.get_spelling_suggestion(term).decode('utf-8')
                                 for term in spelling_query.split()])
            else:
                return database.get_spelling_suggestion(spelling_query).decode('utf-8')

        term_set = set()
        for term in query:
            for match in re.findall('[^A-Z]+', term.decode('utf-8')):  # Ignore field identifiers
                term_set.add(database.get_spelling_suggestion(match).decode('utf-8'))

        return ' '.join(term_set)

    def _database(self, writable=False):
        """
        Private method that returns a xapian.Database for use.
        Optional arguments:
            ``writable`` -- Open the database in read/write mode (default=False)

        Returns an instance of a xapian.Database or xapian.WritableDatabase
        """
        if self.path == MEMORY_DB_NAME:
            # in-memory databases cannot be reopened; cache a single instance.
            if not self.inmemory_db:
                self.inmemory_db = xapian.inmemory_open()
            return self.inmemory_db
        if writable:
            database = xapian.WritableDatabase(self.path, xapian.DB_CREATE_OR_OPEN)
        else:
            try:
                database = xapian.Database(self.path)
            except xapian.DatabaseOpeningError:
                raise InvalidIndexError('Unable to open index at %s' % self.path)

        return database

    @staticmethod
    def _get_enquire_mset(database, enquire, start_offset, end_offset, checkatleast=DEFAULT_CHECK_AT_LEAST):
        """
        A safer version of Xapian.enquire.get_mset

        Simply wraps the Xapian version and catches any `Xapian.DatabaseModifiedError`,
        attempting a `database.reopen` as needed.

        Required arguments:
            `database` -- The database to be read
            `enquire` -- An instance of an Xapian.enquire object
            `start_offset` -- The start offset to pass to `enquire.get_mset`
            `end_offset` -- The end offset to pass to `enquire.get_mset`
        """
        try:
            return enquire.get_mset(start_offset, end_offset, checkatleast)
        except xapian.DatabaseModifiedError:
            database.reopen()
            return enquire.get_mset(start_offset, end_offset, checkatleast)

    @staticmethod
    def _get_document_data(database, document):
        """
        A safer version of Xapian.document.get_data

        Simply wraps the Xapian version and catches any `Xapian.DatabaseModifiedError`,
        attempting a `database.reopen` as needed.

        Required arguments:
            `database` -- The database to be read
            `document` -- An instance of an Xapian.document object
        """
        try:
            return document.get_data()
        except xapian.DatabaseModifiedError:
            database.reopen()
            return document.get_data()

    def _get_hit_count(self, database, enquire):
        """
        Given a database and enquire instance, returns the estimated number
        of matches.

        Required arguments:
            `database` -- The database to be queried
            `enquire` -- The enquire instance
        """
        return self._get_enquire_mset(
            database, enquire, 0, database.get_doccount()
        ).size()

    def _multi_value_field(self, field):
        """
        Private method that returns `True` if a field is multi-valued, else
        `False`.

        Required arguemnts:
            `field` -- The field to lookup

        Returns a boolean value indicating whether the field is multi-valued.
        """
        for field_dict in self.schema:
            if field_dict['field_name'] == field:
                return field_dict['multi_valued'] == 'true'
        return False


class XapianSearchQuery(BaseSearchQuery):
    """
    This class is the Xapian specific version of the SearchQuery class.
    It acts as an intermediary between the ``SearchQuerySet`` and the
    ``SearchBackend`` itself.
    """
    def build_params(self, *args, **kwargs):
        # normalize end_offset so the backend receives a slice length
        # relative to start_offset.
        kwargs = super(XapianSearchQuery, self).build_params(*args, **kwargs)

        if self.end_offset is not None:
            kwargs['end_offset'] = self.end_offset - self.start_offset

        return kwargs

    def build_query(self):
        # Builds the complete xapian.Query from the query filter tree,
        # model restrictions, and boosts.
        if not self.query_filter:
            query = xapian.Query('')
        else:
            query = self._query_from_search_node(self.query_filter)

        if self.models:
            subqueries = [
                xapian.Query(
                    xapian.Query.OP_SCALE_WEIGHT,
                    xapian.Query('%s%s' % (TERM_PREFIXES[DJANGO_CT], get_model_ct(model))),
                    0  # Pure boolean sub-query
                ) for model in self.models
            ]
            query = xapian.Query(
                xapian.Query.OP_AND, query,
                xapian.Query(xapian.Query.OP_OR, subqueries)
            )

        if self.boost:
            subqueries = [
                xapian.Query(
                    xapian.Query.OP_SCALE_WEIGHT,
                    self._term_query(term, None, None), value
                ) for term, value in list(self.boost.items())
            ]
            query = xapian.Query(
                xapian.Query.OP_AND_MAYBE, query,
                xapian.Query(xapian.Query.OP_OR, subqueries)
            )

        return query

    def _query_from_search_node(self, search_node, is_not=False):
        # Recursively converts a haystack SearchNode tree into xapian queries.
        query_list = []

        for child in search_node.children:
            if isinstance(child, SearchNode):
                query_list.append(
                    self._query_from_search_node(child, child.negated)
                )
            else:
                expression, term = child
                field_name, filter_type = search_node.split_expression(expression)

                constructed_query_list = self._query_from_term(term, field_name, filter_type, is_not)
                query_list.extend(constructed_query_list)

        if search_node.connector == 'OR':
            return xapian.Query(xapian.Query.OP_OR, query_list)
        else:
            return xapian.Query(xapian.Query.OP_AND, query_list)

    def _query_from_term(self, term, field_name, filter_type, is_not):
        """
        Uses arguments to construct a list of xapian.Query's.
        """
        if field_name != 'content' and field_name not in self.backend.column:
            raise InvalidIndexError('field "%s" not indexed' % field_name)

        # It it is an AutoQuery, it has no filters
        # or others, thus we short-circuit the procedure.
        if isinstance(term, AutoQuery):
            if field_name != 'content':
                query = '%s:%s' % (field_name, term.prepare(self))
            else:
                query = term.prepare(self)
            return [self.backend.parse_query(query)]
        query_list = []

        # Handle `ValuesListQuerySet`.
        if hasattr(term, 'values_list'):
            term = list(term)

        if field_name == 'content':
            # content is the generic search:
            # force no field_name search
            # and the field_type to be 'text'.
            field_name = None
            field_type = 'text'

            # we don't know what is the type(term), so we parse it.
            # Ideally this would not be required, but
            # some filters currently depend on the term to make decisions.
            term = _to_xapian_term(term)

            query_list.append(self._filter_contains(term, field_name, field_type, is_not))
            # when filter has no filter_type, haystack uses
            # filter_type = 'content'. Here we remove it
            # since the above query is already doing this
            if filter_type == 'content':
                filter_type = None
        else:
            # get the field_type from the backend
            field_type = self.backend.schema[self.backend.column[field_name]]['type']

        # private fields don't accept 'contains' or 'startswith'
        # since they have no meaning.
        if filter_type in ('contains', 'startswith') and field_name in (ID, DJANGO_ID, DJANGO_CT):
            filter_type = 'exact'

        if field_type == 'text':
            # we don't know what type "term" is, but we know we are searching as text
            # so we parse it like that.
            # Ideally this would not be required since _term_query does it, but
            # some filters currently depend on the term to make decisions.
            if isinstance(term, list):
                term = [_to_xapian_term(term) for term in term]
            else:
                term = _to_xapian_term(term)

        # todo: we should check that the filter is valid for this field_type or raise InvalidIndexError
        if filter_type == 'contains':
            query_list.append(self._filter_contains(term, field_name, field_type, is_not))
        elif filter_type in ('content', 'exact'):
            query_list.append(self._filter_exact(term, field_name, field_type, is_not))
        elif filter_type == 'in':
            query_list.append(self._filter_in(term, field_name, field_type, is_not))
        elif filter_type == 'startswith':
            query_list.append(self._filter_startswith(term, field_name, field_type, is_not))
        elif filter_type == 'endswith':
            raise NotImplementedError("The Xapian search backend doesn't support endswith queries.")
        elif filter_type == 'gt':
            query_list.append(self._filter_gt(term, field_name, field_type, is_not))
        elif filter_type == 'gte':
            query_list.append(self._filter_gte(term, field_name, field_type, is_not))
        elif filter_type == 'lt':
            query_list.append(self._filter_lt(term, field_name, field_type, is_not))
        elif filter_type == 'lte':
            query_list.append(self._filter_lte(term, field_name, field_type, is_not))
        elif filter_type == 'range':
            query_list.append(self._filter_range(term, field_name, field_type, is_not))
        return query_list

    def _all_query(self):
        """
        Returns a match all query.
        """
        return xapian.Query('')

    def _filter_contains(self, term, field_name, field_type, is_not):
        """
        Splits the sentence in terms and join them with OR,
        using stemmed and un-stemmed.

        Assumes term is not a list.
        """
        if field_type == 'text':
            term_list = term.split()
        else:
            term_list = [term]

        query = self._or_query(term_list, field_name, field_type)
        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
        else:
            return query

    def _filter_in(self, term_list, field_name, field_type, is_not):
        """
        Returns a query that matches exactly ANY term in term_list.

        Notice that:
         A in {B,C} <=> (A = B or A = C)
         ~(A in {B,C}) <=> ~(A = B or A = C)
        Because OP_AND_NOT(C, D) <=> (C and ~D), then D=(A in {B,C}) requires `is_not=False`.

        Assumes term is a list.
        """
        query_list = [self._filter_exact(term, field_name, field_type, is_not=False)
                      for term in term_list]

        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(),
                                xapian.Query(xapian.Query.OP_OR, query_list))
        else:
            return xapian.Query(xapian.Query.OP_OR, query_list)

    def _filter_exact(self, term, field_name, field_type, is_not):
        """
        Returns a query that matches exactly the un-stemmed term
        with positional order.

        Assumes term is not a list.
        """
        if field_type == 'text' and field_name not in (DJANGO_CT,):
            # "^" and "$" are the start/end markers indexed with text fields,
            # so wrapping the phrase in them forces a whole-value match.
            term = '^ %s $' % term
            query = self._phrase_query(term.split(), field_name, field_type)
        else:
            query = self._term_query(term, field_name, field_type, stemmed=False)

        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
        else:
            return query

    def _filter_startswith(self, term, field_name, field_type, is_not):
        """
        Returns a startswith query on the un-stemmed term.

        Assumes term is not a list.
        """
        if field_type == 'text':
            if len(term.split()) == 1:
                term = '^ %s*' % term
                query = self.backend.parse_query(term)
            else:
                term = '^ %s' % term
                query = self._phrase_query(term.split(), field_name, field_type)
        else:
            term = '^%s*' % term
            query = self.backend.parse_query(term)

        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
        return query

    def _or_query(self, term_list, field, field_type):
        """
        Joins each item of term_list decorated by _term_query with an OR.
        """
        term_list = [self._term_query(term, field, field_type) for term in term_list]
        return xapian.Query(xapian.Query.OP_OR, term_list)

    def _phrase_query(self, term_list, field_name, field_type):
        """
        Returns a query that matches exact terms with
        positional order (i.e. ["this", "thing"] != ["thing", "this"])
        and no stem.

        If `field_name` is not `None`, restrict to the field.
        """
        term_list = [self._term_query(term, field_name, field_type,
                                      stemmed=False) for term in term_list]

        query = xapian.Query(xapian.Query.OP_PHRASE, term_list)
        return query

    def _term_query(self, term, field_name, field_type, stemmed=True):
        """
        Constructs a query of a single term.

        If `field_name` is not `None`, the term is search on that field only.
        If exact is `True`, the search is restricted to boolean matches.
        """
        constructor = '{prefix}{term}'

        # construct the prefix to be used.
        prefix = ''
        if field_name:
            prefix = TERM_PREFIXES['field'] + field_name.upper()
            term = _to_xapian_term(term)

        if field_name in (ID, DJANGO_ID, DJANGO_CT):
            # to ensure the value is serialized correctly.
            if field_name == DJANGO_ID:
                term = int(term)
            term = _term_to_xapian_value(term, field_type)
            return xapian.Query('%s%s' % (TERM_PREFIXES[field_name], term))

        # we construct the query dates in a slightly different way
        if field_type == 'datetime':
            date, time = term.split()
            return xapian.Query(xapian.Query.OP_AND_MAYBE,
                                constructor.format(prefix=prefix, term=date),
                                constructor.format(prefix=prefix, term=time)
                                )

        # only use stem if field is text or "None"
        if field_type not in ('text', None):
            stemmed = False

        unstemmed_term = constructor.format(prefix=prefix, term=term)
        if stemmed:
            stem = xapian.Stem(self.backend.language)
            stemmed_term = 'Z' + constructor.format(prefix=prefix, term=stem(term).decode('utf-8'))

            return xapian.Query(xapian.Query.OP_OR,
                                xapian.Query(stemmed_term),
                                xapian.Query(unstemmed_term)
                                )
        else:
            return xapian.Query(unstemmed_term)

    def _filter_gt(self, term, field_name, field_type, is_not):
        # A > B  <=>  not (A <= B)
        return self._filter_lte(term, field_name, field_type, is_not=not is_not)

    def _filter_lt(self, term, field_name, field_type, is_not):
        # A < B  <=>  not (A >= B)
        return self._filter_gte(term, field_name, field_type, is_not=not is_not)

    def _filter_gte(self, term, field_name, field_type, is_not):
        """
        Private method that returns a xapian.Query that searches for any term
        that is greater than `term` in a specified `field`.
        """
        vrp = XHValueRangeProcessor(self.backend)
        pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term, field_type)), '*')
        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT,
                                self._all_query(),
                                xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
                                )
        return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)

    def _filter_lte(self, term, field_name, field_type, is_not):
        """
        Private method that returns a xapian.Query that searches for any term
        that is less than `term` in a specified `field`.
        """
        vrp = XHValueRangeProcessor(self.backend)
        pos, begin, end = vrp('%s:' % field_name, '%s' % _term_to_xapian_value(term, field_type))
        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT,
                                self._all_query(),
                                xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
                                )
        return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)

    def _filter_range(self, term, field_name, field_type, is_not):
        """
        Private method that returns a xapian.Query that searches for any term
        that is between the values from the `term` list.
        """
        vrp = XHValueRangeProcessor(self.backend)
        pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term[0], field_type)),
                              '%s' % _term_to_xapian_value(term[1], field_type))
        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(),
                                xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
                                )
        return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)


def _term_to_xapian_value(term, field_type):
    """
    Converts a term to a serialized
    Xapian value based on the field_type.
    """
    assert field_type in FIELD_TYPES

    def strf(dt):
        """
        Equivalent to datetime.datetime.strptime(dt, DATETIME_FORMAT)
        but accepts years below 1900
        (see http://stackoverflow.com/q/10263956/931303)
        """
        return '%04d%02d%02d%02d%02d%02d' % (
            dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)

    if field_type == 'boolean':
        assert isinstance(term, bool)
        if term:
            value = 't'
        else:
            value = 'f'

    elif field_type == 'integer':
        value = INTEGER_FORMAT % term
    elif field_type == 'float':
        value = xapian.sortable_serialise(term)
    elif field_type == 'date' or field_type == 'datetime':
        if field_type == 'date':
            # http://stackoverflow.com/a/1937636/931303 and comments
            term = datetime.datetime.combine(term, datetime.time())
        value = strf(term)
    else:  # field_type == 'text'
        value = _to_xapian_term(term)

    return value


def _to_xapian_term(term):
    """
    Converts a Python type to a
    Xapian term that can be indexed.
""" return force_text(term).lower() def _from_xapian_value(value, field_type): """ Converts a serialized Xapian value to Python equivalent based on the field_type. Doesn't accept multivalued fields. """ assert field_type in FIELD_TYPES if field_type == 'boolean': if value == 't': return True elif value == 'f': return False else: InvalidIndexError('Field type "%d" does not accept value "%s"' % (field_type, value)) elif field_type == 'integer': return int(value) elif field_type == 'float': return xapian.sortable_unserialise(value) elif field_type == 'date' or field_type == 'datetime': datetime_value = datetime.datetime.strptime(value, DATETIME_FORMAT) if field_type == 'datetime': return datetime_value else: return datetime_value.date() else: # field_type == 'text' return value def _old_xapian_sort(enquire, sort_by, column): sorter = xapian.MultiValueSorter() for sort_field in sort_by: if sort_field.startswith('-'): reverse = True sort_field = sort_field[1:] # Strip the '-' else: reverse = False # Reverse is inverted in Xapian -- http://trac.xapian.org/ticket/311 sorter.add(column[sort_field], reverse) enquire.set_sort_by_key_then_relevance(sorter, True) def _xapian_sort(enquire, sort_by, column): try: sorter = xapian.MultiValueKeyMaker() except AttributeError: raise NotSupportedError for sort_field in sort_by: if sort_field.startswith('-'): reverse = False sort_field = sort_field[1:] # Strip the '-' else: reverse = True sorter.add_value(column[sort_field], reverse) enquire.set_sort_by_key_then_relevance(sorter, True) class XapianEngine(BaseEngine): backend = XapianSearchBackend query = XapianSearchQuery