pax_global_header 0000666 0000000 0000000 00000000064 14405573772 0014530 g ustar 00root root 0000000 0000000 52 comment=30c727e653d5289f30cfe20afda0fa79796a3be1
xapian-haystack-3.1.0/ 0000775 0000000 0000000 00000000000 14405573772 0014616 5 ustar 00root root 0000000 0000000 xapian-haystack-3.1.0/.coveragerc 0000664 0000000 0000000 00000000752 14405573772 0016743 0 ustar 00root root 0000000 0000000 [report]
exclude_lines =
def __repr__
raise NotImplementedError
raise MissingDependency
except xapian.DatabaseModifiedError
[run]
source =
haystack.backends.xapian_backend
test_haystack/xapian_tests
[paths]
# Merge coverage data from running tests in a django-haystack
# checkout with our own paths for coverage reporting.
backend =
./
*/django-haystack/haystack/backends/
tests =
tests/xapian_tests/
*/django-haystack/test_haystack/xapian_tests/
xapian-haystack-3.1.0/.github/ 0000775 0000000 0000000 00000000000 14405573772 0016156 5 ustar 00root root 0000000 0000000 xapian-haystack-3.1.0/.github/workflows/ 0000775 0000000 0000000 00000000000 14405573772 0020213 5 ustar 00root root 0000000 0000000 xapian-haystack-3.1.0/.github/workflows/test.yml 0000664 0000000 0000000 00000010004 14405573772 0021710 0 ustar 00root root 0000000 0000000 name: Test
on: [push, pull_request]
env:
FORCE_COLOR: 1
jobs:
prebuild_xapian_wheel:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
xapian-version: ['1.4.19']
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Checkout xapian-haystack
uses: actions/checkout@v2
- name: Check for cached xapian wheel
# https://github.com/actions/cache#cache-limits
# says this cached wheel will be evicted after a week unused.
id: xapian-cache
uses: actions/cache@v2
with:
path: xapian*.whl
key: xapian-${{ matrix.xapian-version }}-${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('xapian_wheel_builder.sh') }}
- name: Build xapian wheel
if: steps.xapian-cache.outputs.cache-hit != 'true'
run: |
./xapian_wheel_builder.sh ${{ matrix.xapian-version }}
test:
needs: prebuild_xapian_wheel
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
django-version: ['3.2', '4.0', '4.1']
xapian-version: ['1.4.19']
filelock-version: ['3.4.2']
exclude:
# Django dropped python 3.7 support in 4.0
- python-version: '3.7'
django-version: '4.0'
- python-version: '3.7'
django-version: '4.1'
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Checkout xapian-haystack
uses: actions/checkout@v2
- name: Check for cached xapian wheel
# This will always succeed since the previous job just ran.
id: xapian-cache
uses: actions/cache@v2
with:
path: xapian*.whl
key: xapian-${{ matrix.xapian-version }}-${{ runner.os }}-${{ env.pythonLocation }}-${{ hashFiles('xapian_wheel_builder.sh') }}
- name: Install Django and other Python dependencies
run: |
python -m pip install --upgrade pip
pip install django~=${{ matrix.django-version }} filelock~=${{ matrix.filelock-version }} coveralls xapian*.whl
- name: Checkout django-haystack
uses: actions/checkout@v2
with:
repository: 'django-haystack/django-haystack'
path: django-haystack
- name: Copy some test files to django-haystack
run: |
cp xapian_backend.py django-haystack/haystack/backends/
cp -r tests/* django-haystack/test_haystack/
cp tests/xapian_tests/__init__.py django-haystack/test_haystack/
cp .coveragerc django-haystack/
- name: Set PYTHONPATH
run: |
echo "PYTHONPATH=/usr/lib/python3/dist-packages:." >> $GITHUB_ENV
- name: Ensure all apps have migrations
run: |
cd django-haystack
django-admin makemigrations --settings=test_haystack.xapian_settings
- name: Running tests
run: |
cd django-haystack
coverage run $(command -v django-admin) test test_haystack.xapian_tests --settings=test_haystack.xapian_settings
env:
PYTHONPATH: "/usr/lib/python3/dist-packages:."
- name: Coveralls
run: |
coverage combine django-haystack/.coverage
coverage report
coveralls --service=github
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COVERALLS_FLAG_NAME: python-${{ matrix.python-version }}-django-${{ matrix.django-version }}-xapian-${{ matrix.xapian-version }}
COVERALLS_PARALLEL: true
coveralls:
needs: test
runs-on: ubuntu-latest
steps:
- name: Inform Coveralls of Completion
run: |
pip3 install --upgrade coveralls
coveralls --service=github --finish
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
xapian-haystack-3.1.0/.gitignore 0000664 0000000 0000000 00000000056 14405573772 0016607 0 ustar 00root root 0000000 0000000 *.pyc
*.tmproj
*.DS_Store
dist
NOTES
MANIFEST
xapian-haystack-3.1.0/AUTHORS 0000664 0000000 0000000 00000004442 14405573772 0015672 0 ustar 00root root 0000000 0000000 Primary Authors:
----------------
* David Sauve
Thanks to:
----------
* Daniel Lindsley for the awesome Haystack API and putting up with all of my questions.
* Trapeze Media for providing time and resources to complete this project as well as Q&A.
* Richard Boulton for answering questions regarding the Xapian python bindings and API.
* The Xapian team for creating and releasing under the GPL, such a great search engine.
* Supreet Sethi for suggestions regarding morphologic date comparisons and for fixing NOT query expressions.
* Joshua Jonah for changes to highlighting logic to avoid reserved words.
* J00bar for a fix with `get_identifier`, fixing query_filter reference in SearchQuery, and a better clear method.
* Jannis Leidel for setting up the code base for pip, easy_install and PyPI.
* Erik Aigner for the initial patch to get_identifier changes.
* Travis Cline for the initial patch to support SQ objects in Haystack.
* wshallum for a patch that makes date facets compatible with Python 2.4
* askfor for reporting issues with narrow_queries and float fields.
* Brandon Konkle for a patch that corrected the behaviour of weights on multiple term boosts.
* Adam Endicott for the initial patch that corrected an oversight with stemming not always being done during a search.
* Sym Roe for a patch that improved performance in "more-like-this" and suggestion the removal of FLAG_PARTIAL.
* liranz for pointing out a potential conflict with arguments pass into `SearchResult`
* Jacob Kaplan-Moss for pointing out that write permission shouldn't be required for searching.
* glassresistor for assistance troubleshooting an issue with boosting a phrase query & a patch to make weighting schemes overridable.
* James Addison for helping to debug an intermittent issue with `order_by` and `build_schema`.
* Michael Opitz for a patch that enables support for `inmemorydb`.
* Evgeniy Kirov for a patch that adds `HAYSTACK_XAPIAN_LANGUAGE` used for setting the stemming language.
* domasx2 for a patch that explicitly closes the database when not in use.
* naktinis for a patch that fixed changes the behaviour of the `narrow_queries` argument of `search` so that queries are ANDed together rather than ORed.
xapian-haystack-3.1.0/CHANGELOG.rst 0000664 0000000 0000000 00000002303 14405573772 0016635 0 ustar 00root root 0000000 0000000 =========================
xapian-haystack Changelog
=========================
v3.1.0 (2023-03-19)
-------------------
- Add DJANGO_CT, DJANGO_ID, ID to be used with '__exact' internally.
- Ability to configure ngram min and max lengths.
- Supported Django versions: 3.2, 4.0, 4.1
- Dropped support for Python 3.6.
- Fixed DatabaseLocked errors when running management commands with
multiple workers.
v3.0.1 (2021-11-12)
-------------------
- Removed deprecated ``force_text`` usage, which will stop emitting
RemovedInDjango40Warning's.
- Test files are now included in release tarball.
v3.0.0 (2021-10-26)
-------------------
- Dropped Python 2 support.
- Supported Django versions: 2.2, 3.0, 3.1, 3.2
- Dropped support for xapian < 1.4
- Added new ``xapian_wheel_builder.sh`` script.
- Fixed ``os.path.exists`` race situation.
- Fixed setup.py on non-UTF-8 systems.
v2.1.1 (2017-05-18)
-------------------
- Django 1.8 as minimal version, added support for Django 1.9/1.10.
- Adapted default Haystack query from ``contains`` to ``content``.
- Raise ``NotImplementedError`` for endswith queries.
- Supported range search filter (#161).
- Configure ``limit_to_registered_models`` according to haystack docs.
xapian-haystack-3.1.0/LICENSE 0000664 0000000 0000000 00000035564 14405573772 0015640 0 ustar 00root root 0000000 0000000 GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Lesser General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
xapian-haystack-3.1.0/MANIFEST.in 0000664 0000000 0000000 00000000146 14405573772 0016355 0 ustar 00root root 0000000 0000000 include AUTHORS
include CHANGELOG.rst
include LICENSE
include README.rst
recursive-include tests *.py
xapian-haystack-3.1.0/README.rst 0000664 0000000 0000000 00000011340 14405573772 0016304 0 ustar 00root root 0000000 0000000 Xapian backend for Django-Haystack
==================================
.. image:: https://github.com/notanumber/xapian-haystack/actions/workflows/test.yml/badge.svg
:target: https://github.com/notanumber/xapian-haystack/actions
:alt: GitHub Actions
.. image:: https://coveralls.io/repos/notanumber/xapian-haystack/badge.svg?branch=master&service=github
:target: https://coveralls.io/github/notanumber/xapian-haystack?branch=master
.. image:: https://img.shields.io/pypi/v/xapian-haystack.svg
:target: https://pypi.org/project/xapian-haystack/
:alt: PyPI version
Xapian-haystack is a backend of `Django-Haystack `__
for the `Xapian `__ search engine.
Thanks for checking it out.
You can find more information about Xapian `here `__.
Features
--------
Xapian-Haystack provides all the standard features of Haystack:
- Weighting
- Faceted search (date, query, etc.)
- Sorting
- Spelling suggestions
- EdgeNGram and Ngram (for autocomplete)
Limitations
-----------
The `endswith` search operation is not supported by Xapian-Haystack.
Requirements
------------
- Python 3+
- Django 2.2+
- Django-Haystack 2.8.0
- Xapian 1.4+
Installation
------------
First, install Xapian in your machine e.g. with the script provided,
`install_xapian.sh`. Call it after activating the virtual environment to install::
source /bin/activate
./install_xapian.sh
`` must be >=1.4.0. This takes around 10 minutes.
Finally, install Xapian-Haystack by running::
pip install xapian-haystack
Configuration
-------------
Xapian is configured as other backends of Haystack.
You have to define the connection to the database,
which is done to a path to a directory, e.g::
HAYSTACK_CONNECTIONS = {
'default': {
'ENGINE': 'xapian_backend.XapianEngine',
'PATH': os.path.join(os.path.dirname(__file__), 'xapian_index')
},
}
The backend has the following optional settings:
- ``HAYSTACK_XAPIAN_LANGUAGE``: the stemming language; the default is `english` and the list of available languages
can be found `here `__.
- ``HAYSTACK_XAPIAN_WEIGHTING_SCHEME``: a tuple with parameters to be passed to the weighting scheme
`BM25 `__.
By default, it uses the same parameters as Xapian recommends; this setting allows you to change them.
- ``HAYSTACK_XAPIAN_FLAGS``: the options used to parse `AutoQueries`;
the default is ``FLAG_PHRASE | FLAG_BOOLEAN | FLAG_LOVEHATE | FLAG_WILDCARD | FLAG_PURE_NOT``
See `here `__ for more information
on what they mean.
- ``HAYSTACK_XAPIAN_STEMMING_STRATEGY``: This option lets you chose the stemming strategy used by Xapian. Possible
values are ``STEM_NONE``, ``STEM_SOME``, ``STEM_ALL``, ``STEM_ALL_Z``, where ``STEM_SOME`` is the default.
See `here `__ for
more information about the different strategies.
- ``XAPIAN_NGRAM_MIN_LENGTH``, ``XAPIAN_NGRAM_MAX_LENGTH``: options for custom configuration of ngrams (phrases) length.
- ``HAYSTACK_XAPIAN_USE_LOCKFILE``: Use a lockfile to prevent database locking errors when running management commands with multiple workers.
Defaults to `True`.
Testing
-------
Xapian-Haystack has a test suite in continuous deployment with GitHub Actions. The file
``.github/workflows/test.yml`` contains the steps required to run the test suite.
Source
------
The source code can be found in `github `_.
Credits
-------
Xapian-Haystack is maintained by `Jorge C. Leitão `__;
`David Sauve `__ was the main contributor of Xapian-Haystack and
Xapian-Haystack was originally funded by `Trapeze `__.
`Claude Paroz `__ is a frequent contributor.
`ANtlord `__ implemented support for EdgeNgram and Ngram.
License
-------
Xapian-haystack is free software licenced under GNU General Public Licence v2 and
Copyright (c) 2009, 2010, 2011, 2012 David Sauve, 2009, 2010 Trapeze, 2014 Jorge C. Leitão.
It may be redistributed under the terms specified in the LICENSE file.
Questions, Comments, Concerns:
------------------------------
Feel free to open an issue `here `__
or pull request your work.
You can ask questions on the django-haystack `mailing list `_:
or in the irc ``#haystack``.
xapian-haystack-3.1.0/install_xapian.sh 0000775 0000000 0000000 00000002370 14405573772 0020165 0 ustar 00root root 0000000 0000000 #!/usr/bin/env bash
# first argument of the script is Xapian version (e.g. 1.4.19)
VERSION=$1
if [ -z "$VERSION" ]; then
echo "usage: $0 version_number" 1>&2
exit 1
fi
# prepare
mkdir -p $VIRTUAL_ENV/packages && cd $VIRTUAL_ENV/packages
CORE=xapian-core-$VERSION
BINDINGS=xapian-bindings-$VERSION
# download
echo "Downloading source..."
curl -O https://oligarchy.co.uk/xapian/$VERSION/${CORE}.tar.xz
curl -O https://oligarchy.co.uk/xapian/$VERSION/${BINDINGS}.tar.xz
# extract
echo "Extracting source..."
tar xf ${CORE}.tar.xz
tar xf ${BINDINGS}.tar.xz
# install
echo "Installing Xapian-core..."
cd $VIRTUAL_ENV/packages/${CORE}
./configure --prefix=$VIRTUAL_ENV && make && make install
PYTHON_FLAG=--with-python3
# The bindings for Python require python-sphinx
echo "Installing Python-Sphinx..."
SPHINX2_FIXED_VERSION=1.4.12
if [ $(printf "${VERSION}\n${SPHINX2_FIXED_VERSION}" | sort -V | head -n1) = "${SPHINX2_FIXED_VERSION}" ]; then
pip install sphinx
else
pip install "sphinx<2"
fi
echo "Installing Xapian-bindings..."
cd $VIRTUAL_ENV/packages/${BINDINGS}
./configure --prefix=$VIRTUAL_ENV $PYTHON_FLAG && make && make install
# clean
cd $VIRTUAL_ENV
rm -rf $VIRTUAL_ENV/packages
# test
echo "Testing Xapian..."
python -c "import xapian"
xapian-haystack-3.1.0/requirements.txt 0000664 0000000 0000000 00000000057 14405573772 0020104 0 ustar 00root root 0000000 0000000 Django>=2.2
Django-Haystack>=3.0
filelock>=3.4
xapian-haystack-3.1.0/setup.py 0000664 0000000 0000000 00000001731 14405573772 0016332 0 ustar 00root root 0000000 0000000 from distutils.core import setup
from pathlib import Path
def read(fname):
return (Path(__file__).parent / fname).read_text(encoding='utf-8')
setup(
name='xapian-haystack',
version='3.1.0',
description='A Xapian backend for Haystack',
long_description=read('README.rst'),
long_description_content_type='text/x-rst',
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
'License :: OSI Approved :: GNU General Public License (GPL)',
'Topic :: Internet :: WWW/HTTP :: Indexing/Search',
'Framework :: Django',
'Programming Language :: Python :: 3 :: Only',
],
author='Jorge C. Leitão',
author_email='jorgecarleitao@gmail.com',
url='https://github.com/notanumber/xapian-haystack',
license='GPL2',
py_modules=['xapian_backend'],
install_requires=[
'django>=3.2',
'django-haystack>=2.8.0',
'filelock>=3.4',
]
)
xapian-haystack-3.1.0/tests/ 0000775 0000000 0000000 00000000000 14405573772 0015760 5 ustar 00root root 0000000 0000000 xapian-haystack-3.1.0/tests/xapian_settings.py 0000775 0000000 0000000 00000000717 14405573772 0021542 0 ustar 00root root 0000000 0000000 import os
from .settings import *
INSTALLED_APPS = [
'django.contrib.auth',
'django.contrib.admin',
'django.contrib.contenttypes',
'django.contrib.messages',
'haystack',
'test_haystack.core',
'test_haystack.xapian_tests',
]
HAYSTACK_CONNECTIONS = {
'default': {
'ENGINE': 'haystack.backends.xapian_backend.XapianEngine',
'PATH': os.path.join('tmp', 'test_xapian_query'),
'INCLUDE_SPELLING': True,
}
}
xapian-haystack-3.1.0/tests/xapian_tests/ 0000775 0000000 0000000 00000000000 14405573772 0020462 5 ustar 00root root 0000000 0000000 xapian-haystack-3.1.0/tests/xapian_tests/__init__.py 0000664 0000000 0000000 00000000000 14405573772 0022561 0 ustar 00root root 0000000 0000000 xapian-haystack-3.1.0/tests/xapian_tests/migrations/ 0000775 0000000 0000000 00000000000 14405573772 0022636 5 ustar 00root root 0000000 0000000 xapian-haystack-3.1.0/tests/xapian_tests/migrations/0001_initial.py 0000664 0000000 0000000 00000004006 14405573772 0025301 0 ustar 00root root 0000000 0000000 from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """
    Initial schema for the ``xapian_tests`` app: the ``Document``,
    ``DjangoContentType`` and ``BlogEntry`` models used by the backend tests.
    """

    initial = True

    dependencies = [
        # Needs the core test app's models (MockTag for BlogEntry.tags)
        # and django.contrib.contenttypes (DjangoContentType FK target).
        ('core', '__first__'),
        ('contenttypes', '__first__'),
    ]

    operations = [
        migrations.CreateModel(
            name='Document',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('type_name', models.CharField(max_length=50)),
                ('number', models.IntegerField()),
                ('name', models.CharField(max_length=200)),
                ('date', models.DateField()),
                ('summary', models.TextField()),
                ('text', models.TextField()),
            ],
        ),
        migrations.CreateModel(
            name='DjangoContentType',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('content_type', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='contenttypes.contenttype')),
            ],
        ),
        migrations.CreateModel(
            name='BlogEntry',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('datetime', models.DateTimeField()),
                ('date', models.DateField()),
                ('author', models.CharField(max_length=255)),
                ('text', models.TextField()),
                ('funny_text', models.TextField()),
                ('non_ascii', models.TextField()),
                ('url', models.URLField()),
                ('boolean', models.BooleanField()),
                ('number', models.IntegerField()),
                ('float_number', models.FloatField()),
                ('decimal_number', models.DecimalField(decimal_places=2, max_digits=4)),
                ('tags', models.ManyToManyField(to='core.MockTag')),
            ],
        ),
    ]
xapian-haystack-3.1.0/tests/xapian_tests/migrations/__init__.py 0000664 0000000 0000000 00000000000 14405573772 0024735 0 ustar 00root root 0000000 0000000 xapian-haystack-3.1.0/tests/xapian_tests/models.py 0000664 0000000 0000000 00000002226 14405573772 0022321 0 ustar 00root root 0000000 0000000 from django.db import models
from django.contrib.contenttypes.models import ContentType
from ..core.models import MockTag, AnotherMockModel, MockModel, AFourthMockModel
class Document(models.Model):
    """A generic document model indexed by ``DocumentIndex`` in the tests."""
    type_name = models.CharField(max_length=50)
    number = models.IntegerField()
    name = models.CharField(max_length=200)
    date = models.DateField()
    summary = models.TextField()
    text = models.TextField()
class BlogEntry(models.Model):
    """
    Same as tests.core.MockModel with a few extra fields for testing various
    sorting and ordering criteria.

    Covers one field of each type the backend must index (text, URL,
    boolean, integer, float, decimal, date, datetime, many-to-many).
    """
    datetime = models.DateTimeField()
    date = models.DateField()
    tags = models.ManyToManyField(MockTag)
    author = models.CharField(max_length=255)
    text = models.TextField()
    funny_text = models.TextField()
    non_ascii = models.TextField()
    url = models.URLField()
    boolean = models.BooleanField()
    number = models.IntegerField()
    float_number = models.FloatField()
    decimal_number = models.DecimalField(max_digits=4, decimal_places=2)
class DjangoContentType(models.Model):
    """
    Model pointing at django.contrib.contenttypes' ContentType.
    NOTE(review): its name mirrors Django's own ContentType — presumably
    to exercise content-type term generation for such names; confirm.
    """
    content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
xapian-haystack-3.1.0/tests/xapian_tests/search_indexes.py 0000664 0000000 0000000 00000011563 14405573772 0024026 0 ustar 00root root 0000000 0000000 from haystack import indexes
from . import models
class DocumentIndex(indexes.SearchIndex, indexes.Indexable):
    """Search index for ``models.Document``."""

    text = indexes.CharField(document=True)
    summary = indexes.CharField(model_attr='summary')
    type_name = indexes.CharField(model_attr='type_name')
    number = indexes.IntegerField(model_attr='number')
    name = indexes.CharField(model_attr='name')
    date = indexes.DateField(model_attr='date')
    tags = indexes.MultiValueField()

    def get_model(self):
        return models.Document

    def prepare_tags(self, obj):
        # Cycle through three fixed tag sets, selected by the object's pk.
        tag_sets = (
            ['tag', 'tag-test', 'tag-test-test'],
            ['tag', 'tag-test'],
            ['tag'],
        )
        return tag_sets[obj.id % 3]
class BlogSearchIndex(indexes.SearchIndex):
    """
    Index over ``models.BlogEntry`` exercising faceting, sorting and
    multi-value fields in the backend feature tests.
    """

    text = indexes.CharField(
        document=True, use_template=True,
        template_name='search/indexes/core/mockmodel_text.txt'
    )
    name = indexes.CharField(model_attr='author', faceted=True)
    date = indexes.DateField(model_attr='date')
    # NOTE(review): DateField (not DateTimeField) for a datetime attribute —
    # presumably intentional so the schema column is typed as a date; confirm.
    datetime = indexes.DateField(model_attr='datetime')
    number = indexes.IntegerField(model_attr='number')
    boolean = indexes.BooleanField(model_attr='boolean')
    #slug = indexes.CharField(indexed=False, model_attr='slug')
    float_number = indexes.FloatField(model_attr='float_number')
    month = indexes.CharField(indexed=False)
    url = indexes.CharField(model_attr='url')
    empty = indexes.CharField()

    # Various MultiValueFields
    sites = indexes.MultiValueField()
    tags = indexes.MultiValueField()
    keys = indexes.MultiValueField()
    titles = indexes.MultiValueField()

    def get_model(self):
        return models.BlogEntry

    def prepare_sites(self, obj):
        # Three numeric strings proportional to the object's pk.
        return [str(multiplier * obj.id) for multiplier in (1, 2, 3)]

    def prepare_tags(self, obj):
        fixed = {
            1: ['a', 'b', 'c'],
            2: ['ab', 'bc', 'cd'],
        }
        return fixed.get(obj.id, ['an', 'to', 'or'])

    def prepare_keys(self, obj):
        return [obj.id * multiplier for multiplier in (1, 2, 3)]

    def prepare_titles(self, obj):
        fixed = {
            1: ['object one title one', 'object one title two'],
            2: ['object two title one', 'object two title two'],
        }
        return fixed.get(obj.id, ['object three title one', 'object three title two'])

    def prepare_month(self, obj):
        # Zero-padded month number, e.g. '02'.
        return '{:02d}'.format(obj.date.month)

    def prepare_empty(self, obj):
        return ''
class CompleteBlogEntryIndex(indexes.SearchIndex):
    """Index covering every field type on ``models.BlogEntry``."""

    text = indexes.CharField(model_attr='text', document=True)
    author = indexes.CharField(model_attr='author')
    url = indexes.CharField(model_attr='url')
    non_ascii = indexes.CharField(model_attr='non_ascii')
    funny_text = indexes.CharField(model_attr='funny_text')
    datetime = indexes.DateTimeField(model_attr='datetime')
    date = indexes.DateField(model_attr='date')
    boolean = indexes.BooleanField(model_attr='boolean')
    number = indexes.IntegerField(model_attr='number')
    float_number = indexes.FloatField(model_attr='float_number')
    decimal_number = indexes.DecimalField(model_attr='decimal_number')
    multi_value = indexes.MultiValueField()

    def get_model(self):
        return models.BlogEntry

    def prepare_multi_value(self, obj):
        # One value per related tag name.
        names = []
        for tag in obj.tags.all():
            names.append(tag.name)
        return names
class XapianNGramIndex(indexes.SearchIndex):
    """Minimal index with an NgramField, for ngram indexing tests."""
    text = indexes.CharField(model_attr='author', document=True)
    ngram = indexes.NgramField(model_attr='author')

    def get_model(self):
        return models.BlogEntry
class XapianEdgeNGramIndex(indexes.SearchIndex):
    """Minimal index with an EdgeNgramField, for edge-ngram indexing tests."""
    text = indexes.CharField(model_attr='author', document=True)
    edge_ngram = indexes.EdgeNgramField(model_attr='author')

    def get_model(self):
        return models.BlogEntry
class DjangoContentTypeIndex(indexes.SearchIndex):
    """Index over the test's DjangoContentType wrapper model."""
    text = indexes.CharField(document=True)

    def get_model(self):
        return models.DjangoContentType
class MockSearchIndex(indexes.SearchIndex):
    """Template-based index over core's MockModel (re-exported via models)."""
    text = indexes.CharField(document=True, use_template=True)
    name = indexes.CharField(model_attr='author', faceted=True)
    pub_date = indexes.DateTimeField(model_attr='pub_date')
    title = indexes.CharField()

    def get_model(self):
        return models.MockModel
class BoostMockSearchIndex(indexes.SearchIndex):
    """Index with a boosted field (``weight=2.0``) for term-weight tests."""
    text = indexes.CharField(
        document=True, use_template=True,
        template_name='search/indexes/core/mockmodel_template.txt'
    )
    # `author` carries double weight relative to `editor`.
    author = indexes.CharField(model_attr='author', weight=2.0)
    editor = indexes.CharField(model_attr='editor')
    pub_date = indexes.DateField(model_attr='pub_date')

    def get_model(self):
        return models.AFourthMockModel
class MockQueryIndex(indexes.SearchIndex):
    """Index used by query-construction tests; no model_attr bindings."""
    text = indexes.CharField(document=True)
    pub_date = indexes.DateTimeField()
    title = indexes.CharField()
    foo = indexes.CharField()

    def get_model(self):
        return models.MockModel
xapian-haystack-3.1.0/tests/xapian_tests/tests/ 0000775 0000000 0000000 00000000000 14405573772 0021624 5 ustar 00root root 0000000 0000000 xapian-haystack-3.1.0/tests/xapian_tests/tests/__init__.py 0000664 0000000 0000000 00000000000 14405573772 0023723 0 ustar 00root root 0000000 0000000 xapian-haystack-3.1.0/tests/xapian_tests/tests/test_backend.py 0000664 0000000 0000000 00000076020 14405573772 0024631 0 ustar 00root root 0000000 0000000 from decimal import Decimal
import datetime
import inspect
import sys
import xapian
import subprocess
import os
from django.apps import apps
from django.contrib.contenttypes.models import ContentType
from django.test import TestCase
from haystack import connections
from haystack.backends.xapian_backend import InvalidIndexError, _term_to_xapian_value
from haystack.models import SearchResult
from haystack.utils.loading import UnifiedIndex
from ..search_indexes import XapianNGramIndex, XapianEdgeNGramIndex, \
CompleteBlogEntryIndex, BlogSearchIndex, DjangoContentTypeIndex
from ..models import BlogEntry, AnotherMockModel, MockTag, DjangoContentType
# xapian.__version__ is a dotted string (e.g. "1.4.19"); keep the parts
# as ints so major/minor can be compared numerically below.
XAPIAN_VERSION = [int(x) for x in xapian.__version__.split('.')]
class XapianSearchResult(SearchResult):
    """SearchResult that resolves its model inside the ``xapian_tests`` app."""
    def __init__(self, app_label, model_name, pk, score, **kwargs):
        super().__init__(app_label, model_name, pk, score, **kwargs)
        # Override whatever model SearchResult resolved; always look the
        # model up in this test app.
        self._model = apps.get_model('xapian_tests', model_name)
def get_terms(backend, *args):
    """
    Run ``xapian-delve`` against the backend's database and return the
    reported terms as a list of strings.

    ``args`` are passed through to the delve executable (e.g. '-a' for
    all terms, '-r1' for document 1).  Returns an empty list when delve
    prints no term list.
    """
    executable = 'xapian-delve'
    # dev versions (odd minor) use a suffix
    if XAPIAN_VERSION[1] % 2 != 0:
        executable = executable + '-%d.%d' % tuple(XAPIAN_VERSION[0:2])
    # look for a xapian-delve built by `xapian_wheel_builder`
    wheel_delve = os.path.join(os.path.dirname(inspect.getfile(xapian)), executable)
    if os.path.exists(wheel_delve):
        executable = wheel_delve
    result = subprocess.check_output([executable] + list(args) + [backend.path],
                                     env=os.environ.copy()).decode('utf-8')
    # Split only on the first ": " (the header separator).  Splitting on
    # every occurrence would silently truncate the term list if any term
    # itself contained ": ".
    result = result.split(": ", 1)
    if len(result) > 1:
        return result[1].strip().split(" ")
    return []
def pks(results):
    """Return the primary key of each search result, preserving order."""
    extracted = []
    for hit in results:
        extracted.append(hit.pk)
    return extracted
class HaystackBackendTestCase:
    """
    Abstract TestCase that implements an hack to ensure `connections`
    has the right index

    It has a method get_index() that returns a SearchIndex
    that must be overwritten.
    """
    def get_index(self):
        # Subclasses must return the SearchIndex instance under test.
        raise NotImplementedError

    def setUp(self):
        # Save the global unified index and swap in one containing only
        # this test's index; tearDown restores the original.
        self.old_ui = connections['default'].get_unified_index()
        self.ui = UnifiedIndex()
        self.index = self.get_index()
        self.ui.build(indexes=[self.index])
        self.backend = connections['default'].get_backend()
        connections['default']._index = self.ui

    def tearDown(self):
        # Wipe the index and restore the previously active UnifiedIndex.
        self.backend.clear()
        connections['default']._index = self.old_ui

    def assertExpectedQuery(self, query, string_or_list):
        # Accept one expected repr or several alternatives, since
        # xapian's Query repr differs between versions.
        if isinstance(string_or_list, list):
            strings = string_or_list
        else:
            strings = [string_or_list]
        expected = ['Query(%s)' % string for string in strings]
        self.assertIn(str(query), expected)
class BackendIndexationTestCase(HaystackBackendTestCase, TestCase):
    """
    Tests indexation behavior.

    Tests related to how the backend indexes terms,
    values, and others go here.

    Each test inspects raw terms in the Xapian database (via
    ``get_terms``/xapian-delve) produced from one fully populated entry.
    """
    def get_index(self):
        return CompleteBlogEntryIndex()

    def setUp(self):
        # Index exactly one BlogEntry with a known value for every field.
        super().setUp()
        tag1 = MockTag.objects.create(name='tag')
        tag2 = MockTag.objects.create(name='tag-tag')
        tag3 = MockTag.objects.create(name='tag-tag-tag')
        entry = BlogEntry()
        entry.id = 1
        entry.text = 'this_is_a_word inside a big text'
        entry.author = 'david'
        entry.url = 'http://example.com/1/'
        entry.boolean = True
        entry.number = 123456789
        entry.float_number = 123.123456789
        entry.decimal_number = Decimal('22.34')
        entry.funny_text = 'this-text das das'
        entry.non_ascii = 'thsi sdas das corrup\xe7\xe3o das'
        entry.datetime = datetime.datetime(2009, 2, 25, 1, 1, 1)
        entry.date = datetime.date(2008, 8, 8)
        entry.save()
        entry.tags.add(tag1, tag2, tag3)
        self.backend.update(self.index, [entry])
        self.entry = entry

    def test_app_is_not_split(self):
        """
        Tests that the app path is not split
        and added as independent terms.
        """
        terms = get_terms(self.backend, '-a')
        self.assertFalse('tests' in terms)
        self.assertFalse('Ztest' in terms)

    def test_app_is_not_indexed(self):
        """
        Tests that the app path is not indexed.
        """
        terms = get_terms(self.backend, '-a')
        self.assertFalse('tests.xapianmockmodel.1' in terms)
        self.assertFalse('xapianmockmodel' in terms)
        self.assertFalse('tests' in terms)

    def test_fields_exist(self):
        """
        Tests that all fields are in the database
        """
        terms = get_terms(self.backend, '-a')
        # Field terms are prefixed with 'X' + upper-cased field name.
        for field in ['author', 'datetime', 'text', 'url']:
            is_inside = False
            for term in terms:
                if term.startswith("X%s" % field.upper()):
                    is_inside = True
                    break
            self.assertTrue(is_inside, field)

    def test_text_field(self):
        # Both stemmed ('Z' prefix) and unstemmed forms must be present,
        # with and without the field prefix.
        terms = get_terms(self.backend, '-a')
        self.assertTrue('this_is_a_word' in terms)
        self.assertTrue('Zthis_is_a_word' in terms)
        self.assertTrue('ZXTEXTthis_is_a_word' in terms)
        self.assertTrue('XTEXTthis_is_a_word' in terms)
        self.assertFalse('^this_is_a_word inside a big text$' in terms)

    def test_text_posting(self):
        """
        Tests that text is correctly positioned in the document
        """
        expected_order = ['^', 'this_is_a_word', 'inside', 'a', 'big', 'text', '$']

        def get_positions(term):
            """
            Uses delve to get
            the positions of the term in the first document.
            """
            return sorted([int(pos) for pos in get_terms(self.backend, '-r1', '-tXTEXT%s' % term)])

        # confirms expected_order
        previous_position = get_positions(expected_order[0])
        for term in expected_order[1:]:
            pos = get_positions(term)
            # only two positions per term
            # (one from term_generator, one from literal text)
            self.assertEqual(len(pos), 2)
            self.assertEqual(pos[0] - 1, previous_position[0])
            self.assertEqual(pos[1] - 1, previous_position[1])
            previous_position[0] += 1
            previous_position[1] += 1

    def test_author_field(self):
        terms = get_terms(self.backend, '-a')
        self.assertTrue('XAUTHORdavid' in terms)
        self.assertTrue('ZXAUTHORdavid' in terms)
        self.assertTrue('Zdavid' in terms)
        self.assertTrue('david' in terms)

    def test_funny_text_field(self):
        # Hyphenated words must survive as a single term.
        terms = get_terms(self.backend, '-r1')
        self.assertTrue('this-text' in terms)

    def test_datetime_field(self):
        # Datetimes are stored as separate date and time terms,
        # not as a packed YYYYMMDDHHMMSS term.
        terms = get_terms(self.backend, '-a')
        self.assertFalse('XDATETIME20090225000000' in terms)
        self.assertFalse('ZXDATETIME20090225000000' in terms)
        self.assertFalse('20090225000000' in terms)
        self.assertTrue('XDATETIME2009-02-25' in terms)
        self.assertTrue('2009-02-25' in terms)
        self.assertTrue('01:01:01' in terms)
        self.assertTrue('XDATETIME01:01:01' in terms)

    def test_date_field(self):
        # A date must not produce a bogus midnight time term.
        terms = get_terms(self.backend, '-a')
        self.assertTrue('XDATE2008-08-08' in terms)
        self.assertTrue('2008-08-08' in terms)
        self.assertFalse('XDATE00:00:00' in terms)
        self.assertFalse('00:00:00' in terms)

    def test_url_field(self):
        terms = get_terms(self.backend, '-a')
        self.assertTrue('http://example.com/1/' in terms)

    def test_bool_field(self):
        # Booleans are indexed unstemmed only.
        terms = get_terms(self.backend, '-a')
        self.assertTrue('XBOOLEANtrue' in terms)
        self.assertFalse('ZXBOOLEANtrue' in terms)

    def test_integer_field(self):
        terms = get_terms(self.backend, '-a')
        self.assertTrue('123456789' in terms)
        self.assertTrue('XNUMBER123456789' in terms)
        self.assertFalse('ZXNUMBER123456789' in terms)

    def test_float_field(self):
        terms = get_terms(self.backend, '-a')
        self.assertTrue('123.123456789' in terms)
        self.assertTrue('XFLOAT_NUMBER123.123456789' in terms)
        self.assertFalse('ZXFLOAT_NUMBER123.123456789' in terms)

    def test_decimal_field(self):
        terms = get_terms(self.backend, '-a')
        self.assertTrue('22.34' in terms)
        self.assertTrue('XDECIMAL_NUMBER22.34' in terms)
        self.assertFalse('ZXDECIMAL_NUMBER22.34' in terms)

    def test_multivalue_field(self):
        """
        Regression test for #103
        """
        terms = get_terms(self.backend, '-a')
        self.assertTrue('tag' in terms)
        self.assertTrue('tag-tag' in terms)
        self.assertTrue('tag-tag-tag' in terms)
        self.assertTrue('XMULTI_VALUEtag' in terms)
        self.assertTrue('XMULTI_VALUEtag-tag' in terms)
        self.assertTrue('XMULTI_VALUEtag-tag-tag' in terms)

    def test_non_ascii_chars(self):
        terms = get_terms(self.backend, '-a')
        self.assertIn('corrup\xe7\xe3o', terms)
class BackendFeaturesTestCase(HaystackBackendTestCase, TestCase):
    """
    Tests supported features on the backend side.

    Tests to features implemented on the backend
    go here.

    Uses three BlogEntry fixtures (pks 1..3) with deterministic field
    values derived from the pk (see ``get_entry``).
    """
    def get_index(self):
        return BlogSearchIndex()

    @staticmethod
    def get_entry(i):
        # Build a BlogEntry whose every field is a function of `i`,
        # so sort/facet expectations below are predictable.
        entry = BlogEntry()
        entry.id = i
        entry.author = 'david%s' % i
        entry.url = 'http://example.com/%d/' % i
        entry.boolean = bool(i % 2)
        entry.number = i*5
        entry.float_number = i*5.0
        entry.decimal_number = Decimal('22.34')
        entry.datetime = datetime.datetime(2009, 2, 25, 1, 1, 1) - datetime.timedelta(seconds=i)
        entry.date = datetime.date(2009, 2, 23) + datetime.timedelta(days=i)
        return entry

    def setUp(self):
        super().setUp()

        self.sample_objs = []
        for i in range(1, 4):
            entry = self.get_entry(i)
            self.sample_objs.append(entry)
        # Override float_number so the float sort order (2, 1, 3) differs
        # from the pk order.
        self.sample_objs[0].float_number = 834.0
        self.sample_objs[1].float_number = 35.5
        self.sample_objs[2].float_number = 972.0
        for obj in self.sample_objs:
            obj.save()

        self.backend.update(self.index, BlogEntry.objects.all())

    def test_update(self):
        self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']),
                         [1, 2, 3])

    def test_duplicate_update(self):
        """
        Regression test for #6.
        """
        # Re-indexing the same objects must not create duplicate documents.
        self.backend.update(self.index, self.sample_objs)
        self.assertEqual(self.backend.document_count(), 3)

    def test_remove(self):
        self.backend.remove(self.sample_objs[0])
        self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']),
                         [2, 3])

    def test_clear(self):
        # clear() with no models wipes everything; with a model list it
        # only removes documents of those models.
        self.backend.clear()
        self.assertEqual(self.backend.document_count(), 0)

        self.backend.update(self.index, self.sample_objs)
        self.assertEqual(self.backend.document_count(), 3)

        self.backend.clear([AnotherMockModel])
        self.assertEqual(self.backend.document_count(), 3)

        self.backend.clear([BlogEntry])
        self.assertEqual(self.backend.document_count(), 0)

        self.backend.update(self.index, self.sample_objs)
        self.assertEqual(self.backend.document_count(), 3)

        self.backend.clear([AnotherMockModel, BlogEntry])
        self.assertEqual(self.backend.document_count(), 0)

    def test_search(self):
        # no match query
        self.assertEqual(self.backend.search(xapian.Query()), {'hits': 0, 'results': []})
        # all match query
        self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']),
                         [1, 2, 3])

        # Other `result_class`
        self.assertTrue(
            isinstance(self.backend.search(xapian.Query('indexed'), result_class=XapianSearchResult)['results'][0],
                       XapianSearchResult))

    def test_search_field_with_punctuation(self):
        self.assertEqual(pks(self.backend.search(xapian.Query('http://example.com/1/'))['results']),
                         [1])

    def test_search_by_mvf(self):
        # Multi-value field entries must be individually searchable.
        self.assertEqual(self.backend.search(xapian.Query('ab'))['hits'], 1)
        self.assertEqual(self.backend.search(xapian.Query('b'))['hits'], 1)
        self.assertEqual(self.backend.search(xapian.Query('to'))['hits'], 1)
        self.assertEqual(self.backend.search(xapian.Query('one'))['hits'], 3)

    def test_field_facets(self):
        self.assertEqual(self.backend.search(xapian.Query(), facets=['name']),
                         {'hits': 0, 'results': []})

        results = self.backend.search(xapian.Query('indexed'), facets=['name'])
        self.assertEqual(results['hits'], 3)
        self.assertEqual(results['facets']['fields']['name'],
                         [('david1', 1), ('david2', 1), ('david3', 1)])

        results = self.backend.search(xapian.Query('indexed'), facets=['boolean'])
        self.assertEqual(results['hits'], 3)
        self.assertEqual(results['facets']['fields']['boolean'],
                         [(False, 1), (True, 2)])

        results = self.backend.search(xapian.Query('indexed'), facets=['sites'])
        self.assertEqual(results['hits'], 3)
        self.assertEqual(set(results['facets']['fields']['sites']),
                         set([('1', 1), ('3', 2), ('2', 2), ('4', 1), ('6', 2), ('9', 1)]))

        results = self.backend.search(xapian.Query('indexed'),
                                      facets=['number'])
        self.assertEqual(results['hits'], 3)
        self.assertEqual(results['facets']['fields']['number'],
                         [(5, 1), (10, 1), (15, 1)])

        results = self.backend.search(xapian.Query('indexed'),
                                      facets=['float_number'])
        self.assertEqual(results['hits'], 3)
        self.assertEqual(results['facets']['fields']['float_number'],
                         [(35.5, 1), (834.0, 1), (972.0, 1)])

    def test_raise_index_error_on_wrong_field(self):
        """
        Regression test for #109.
        """
        self.assertRaises(InvalidIndexError, self.backend.search, xapian.Query(''), facets=['dsdas'])

    def test_date_facets_month(self):
        facets = {'datetime': {'start_date': datetime.datetime(2008, 10, 26),
                               'end_date': datetime.datetime(2009, 3, 26),
                               'gap_by': 'month'}}

        self.assertEqual(self.backend.search(xapian.Query(), date_facets=facets),
                         {'hits': 0, 'results': []})

        results = self.backend.search(xapian.Query('indexed'), date_facets=facets)
        self.assertEqual(results['hits'], 3)
        # Buckets are returned newest-first; all three entries fall in the
        # 2009-01-26 bucket.
        self.assertEqual(results['facets']['dates']['datetime'], [
            (datetime.datetime(2009, 2, 26, 0, 0), 0),
            (datetime.datetime(2009, 1, 26, 0, 0), 3),
            (datetime.datetime(2008, 12, 26, 0, 0), 0),
            (datetime.datetime(2008, 11, 26, 0, 0), 0),
            (datetime.datetime(2008, 10, 26, 0, 0), 0),
        ])

    def test_date_facets_seconds(self):
        facets = {'datetime': {'start_date': datetime.datetime(2009, 2, 25, 1, 0, 57),
                               'end_date': datetime.datetime(2009, 2, 25, 1, 1, 1),
                               'gap_by': 'second'}}

        self.assertEqual(self.backend.search(xapian.Query(), date_facets=facets),
                         {'hits': 0, 'results': []})

        results = self.backend.search(xapian.Query('indexed'), date_facets=facets)
        self.assertEqual(results['hits'], 3)
        self.assertEqual(results['facets']['dates']['datetime'], [
            (datetime.datetime(2009, 2, 25, 1, 1, 0), 0),
            (datetime.datetime(2009, 2, 25, 1, 0, 59), 1),
            (datetime.datetime(2009, 2, 25, 1, 0, 58), 1),
            (datetime.datetime(2009, 2, 25, 1, 0, 57), 1),
        ])

    def test_date_facets_days(self):
        facets = {'date': {'start_date': datetime.datetime(2009, 2, 1),
                           'end_date': datetime.datetime(2009, 3, 15),
                           'gap_by': 'day',
                           'gap_amount': 15}}

        results = self.backend.search(xapian.Query('indexed'), date_facets=facets)
        self.assertEqual(results['hits'], 3)
        self.assertEqual(results['facets']['dates']['date'], [
            (datetime.datetime(2009, 3, 3, 0, 0), 0),
            (datetime.datetime(2009, 2, 16, 0, 0), 3),
            (datetime.datetime(2009, 2, 1, 0, 0), 0)
        ])

    def test_query_facets(self):
        self.assertEqual(self.backend.search(xapian.Query(), query_facets={'name': 'da*'}),
                         {'hits': 0, 'results': []})

        results = self.backend.search(xapian.Query('indexed'), query_facets={'name': 'da*'})
        self.assertEqual(results['hits'], 3)
        self.assertEqual(results['facets']['queries']['name'], ('da*', 3))

    def test_narrow_queries(self):
        self.assertEqual(self.backend.search(xapian.Query(), narrow_queries={'name:david1'}),
                         {'hits': 0, 'results': []})
        results = self.backend.search(xapian.Query('indexed'), narrow_queries={'name:david1'})
        self.assertEqual(results['hits'], 1)

    def test_highlight(self):
        self.assertEqual(self.backend.search(xapian.Query(), highlight=True),
                         {'hits': 0, 'results': []})
        self.assertEqual(self.backend.search(xapian.Query('indexed'), highlight=True)['hits'], 3)

        results = self.backend.search(xapian.Query('indexed'), highlight=True)['results']
        self.assertEqual([result.highlighted['text'] for result in results],
                         ['indexed!\n1\n', 'indexed!\n2\n', 'indexed!\n3\n'])

    def test_spelling_suggestion(self):
        # Misspellings of indexed terms must yield a suggestion even when
        # the query itself has zero hits.
        self.assertEqual(self.backend.search(xapian.Query('indxe'))['hits'], 0)
        self.assertEqual(self.backend.search(xapian.Query('indxe'))['spelling_suggestion'],
                         'indexed')

        self.assertEqual(self.backend.search(xapian.Query('indxed'))['hits'], 0)
        self.assertEqual(self.backend.search(xapian.Query('indxed'))['spelling_suggestion'],
                         'indexed')

        self.assertEqual(self.backend.search(xapian.Query('foo'))['hits'], 0)
        self.assertEqual(self.backend.search(xapian.Query('foo'), spelling_query='indexy')['spelling_suggestion'],
                         'indexed')

        self.assertEqual(self.backend.search(xapian.Query('XNAMEdavid'))['hits'], 0)
        self.assertEqual(self.backend.search(xapian.Query('XNAMEdavid'))['spelling_suggestion'],
                         'david1')

    def test_more_like_this(self):
        results = self.backend.more_like_this(self.sample_objs[0])

        self.assertEqual(pks(results['results']), [3, 2])

        results = self.backend.more_like_this(self.sample_objs[0],
                                              additional_query=xapian.Query('david3'))
        self.assertEqual(pks(results['results']), [3])

        results = self.backend.more_like_this(self.sample_objs[0],
                                              limit_to_registered_models=True)
        self.assertEqual(pks(results['results']), [3, 2])

        # Other `result_class`
        result = self.backend.more_like_this(self.sample_objs[0],
                                             result_class=XapianSearchResult)
        self.assertTrue(isinstance(result['results'][0], XapianSearchResult))

    def test_order_by(self):
        # NOTE(review): datetime/date sort assertions are disabled —
        # presumably flaky or unsupported; confirm before re-enabling.
        #results = self.backend.search(xapian.Query(''), sort_by=['datetime'])
        #print([d.datetime for d in results['results']])
        #self.assertEqual(pks(results['results']), [3, 2, 1])
        #results = self.backend.search(xapian.Query(''), sort_by=['-datetime'])
        #self.assertEqual(pks(results['results']), [1, 2, 3])

        #results = self.backend.search(xapian.Query(''), sort_by=['date'])
        #self.assertEqual(pks(results['results']), [1, 2, 3])
        #results = self.backend.search(xapian.Query(''), sort_by=['-date'])
        #self.assertEqual(pks(results['results']), [3, 2, 1])

        results = self.backend.search(xapian.Query(''), sort_by=['id'])
        self.assertEqual(pks(results['results']), [1, 2, 3])
        results = self.backend.search(xapian.Query(''), sort_by=['-id'])
        self.assertEqual(pks(results['results']), [3, 2, 1])

        results = self.backend.search(xapian.Query(''), sort_by=['number'])
        self.assertEqual(pks(results['results']), [1, 2, 3])
        results = self.backend.search(xapian.Query(''), sort_by=['-number'])
        self.assertEqual(pks(results['results']), [3, 2, 1])

        results = self.backend.search(xapian.Query(''), sort_by=['float_number'])
        self.assertEqual(pks(results['results']), [2, 1, 3])
        results = self.backend.search(xapian.Query(''), sort_by=['-float_number'])
        self.assertEqual(pks(results['results']), [3, 1, 2])

        results = self.backend.search(xapian.Query(''), sort_by=['boolean', 'id'])
        self.assertEqual(pks(results['results']), [2, 1, 3])

        results = self.backend.search(xapian.Query(''), sort_by=['boolean', '-id'])
        self.assertEqual(pks(results['results']), [2, 3, 1])

    def test_verify_type(self):
        self.assertEqual([result.month for result in self.backend.search(xapian.Query(''))['results']],
                         ['02', '02', '02'])

    def test_term_to_xapian_value(self):
        # Integers are zero-padded to 12 digits; floats use xapian's
        # sortable serialisation; datetimes collapse to YYYYMMDDHHMMSS.
        self.assertEqual(_term_to_xapian_value('abc', 'text'), 'abc')
        self.assertEqual(_term_to_xapian_value(1, 'integer'), '000000000001')
        self.assertEqual(_term_to_xapian_value(2653, 'integer'), '000000002653')
        self.assertEqual(_term_to_xapian_value(25.5, 'float'), b'\xb2`')
        self.assertEqual(_term_to_xapian_value([1, 2, 3], 'text'), '[1, 2, 3]')
        self.assertEqual(_term_to_xapian_value((1, 2, 3), 'text'), '(1, 2, 3)')
        self.assertEqual(_term_to_xapian_value({'a': 1, 'c': 3, 'b': 2}, 'text'),
                         str({'a': 1, 'c': 3, 'b': 2}))
        self.assertEqual(_term_to_xapian_value(datetime.datetime(2009, 5, 9, 16, 14), 'datetime'),
                         '20090509161400')
        self.assertEqual(_term_to_xapian_value(datetime.datetime(2009, 5, 9, 0, 0), 'date'),
                         '20090509000000')
        self.assertEqual(_term_to_xapian_value(datetime.datetime(1899, 5, 18, 0, 0), 'date'),
                         '18990518000000')

    def test_build_schema(self):
        search_fields = connections['default'].get_unified_index().all_searchfields()
        # 14 index fields + 3 implicit ones (id, django_id, django_ct).
        (content_field_name, fields) = self.backend.build_schema(search_fields)
        self.assertEqual(content_field_name, 'text')
        self.assertEqual(len(fields), 14 + 3)
        self.assertEqual(fields, [
            {'column': 0, 'type': 'text', 'field_name': 'id', 'multi_valued': 'false'},
            {'column': 1, 'type': 'integer', 'field_name': 'django_id', 'multi_valued': 'false'},
            {'column': 2, 'type': 'text', 'field_name': 'django_ct', 'multi_valued': 'false'},
            {'column': 3, 'type': 'boolean', 'field_name': 'boolean', 'multi_valued': 'false'},
            {'column': 4, 'type': 'date', 'field_name': 'date', 'multi_valued': 'false'},
            {'column': 5, 'type': 'date', 'field_name': 'datetime', 'multi_valued': 'false'},
            {'column': 6, 'type': 'text', 'field_name': 'empty', 'multi_valued': 'false'},
            {'column': 7, 'type': 'float', 'field_name': 'float_number', 'multi_valued': 'false'},
            {'column': 8, 'type': 'text', 'field_name': 'keys', 'multi_valued': 'true'},
            {'column': 9, 'type': 'text', 'field_name': 'name', 'multi_valued': 'false'},
            {'column': 10, 'type': 'text', 'field_name': 'name_exact', 'multi_valued': 'false'},
            {'column': 11, 'type': 'integer', 'field_name': 'number', 'multi_valued': 'false'},
            {'column': 12, 'type': 'text', 'field_name': 'sites', 'multi_valued': 'true'},
            {'column': 13, 'type': 'text', 'field_name': 'tags', 'multi_valued': 'true'},
            {'column': 14, 'type': 'text', 'field_name': 'text', 'multi_valued': 'false'},
            {'column': 15, 'type': 'text', 'field_name': 'titles', 'multi_valued': 'true'},
            {'column': 16, 'type': 'text', 'field_name': 'url', 'multi_valued': 'false'},
        ])

    def test_parse_query(self):
        self.assertExpectedQuery(self.backend.parse_query('indexed'), 'Zindex@1')
        self.assertExpectedQuery(self.backend.parse_query('name:david'), 'ZXNAMEdavid@1')
        if xapian.minor_version() >= 2:
            # todo: why `SYNONYM WILDCARD OR XNAMEda`?
            self.assertExpectedQuery(
                self.backend.parse_query('name:da*'),
                [
                    '(SYNONYM WILDCARD OR XNAMEda)',
                    'WILDCARD SYNONYM XNAMEda',
                ])
        else:
            # Xapian < 1.2 expands the wildcard against indexed terms.
            self.assertEqual(str(self.backend.parse_query('name:da*')),
                             'Xapian::Query(('
                             'XNAMEdavid1:(pos=1) OR '
                             'XNAMEdavid2:(pos=1) OR '
                             'XNAMEdavid3:(pos=1)))')

    def test_parse_query_range(self):
        # Open-ended ranges are bounded by +-sys.maxsize.
        self.assertExpectedQuery(self.backend.parse_query('name:david1..david2'),
                                 [
                                     '0 * VALUE_RANGE 9 david1 david2',
                                     'VALUE_RANGE 9 david1 david2',
                                 ])
        self.assertExpectedQuery(self.backend.parse_query('number:0..10'),
                                 [
                                     '0 * VALUE_RANGE 11 000000000000 000000000010',
                                     'VALUE_RANGE 11 000000000000 000000000010',
                                 ])
        self.assertExpectedQuery(self.backend.parse_query('number:..10'),
                                 [
                                     '0 * VALUE_RANGE 11 %012d 000000000010' % (-sys.maxsize - 1),
                                     'VALUE_RANGE 11 %012d 000000000010' % (-sys.maxsize - 1),
                                 ])
        self.assertExpectedQuery(self.backend.parse_query('number:10..*'),
                                 [
                                     '0 * VALUE_RANGE 11 000000000010 %012d' % sys.maxsize,
                                     'VALUE_RANGE 11 000000000010 %012d' % sys.maxsize,
                                 ])

    def test_order_by_django_id(self):
        """
        We need this test because ordering on more than
        10 entries was not correct at some point.
        """
        self.sample_objs = []
        number_list = list(range(1, 101))
        for i in number_list:
            entry = self.get_entry(i)
            self.sample_objs.append(entry)
        for obj in self.sample_objs:
            obj.save()

        self.backend.clear()
        self.backend.update(self.index, self.sample_objs)

        results = self.backend.search(xapian.Query(''), sort_by=['-django_id'])
        self.assertEqual(pks(results['results']), list(reversed(number_list)))

    def test_more_like_this_with_unindexed_model(self):
        """
        Tests that more_like_this raises an error when it is called
        with an unindexed model and if silently_fail is True.
        Also tests the other way around.
        """
        mock = BlogEntry()
        mock.id = 10
        mock.author = 'david10'

        try:
            self.assertEqual(self.backend.more_like_this(mock)['results'], [])
        except InvalidIndexError:
            self.fail("InvalidIndexError raised when silently_fail is True")

        self.backend.silently_fail = False

        self.assertRaises(InvalidIndexError, self.backend.more_like_this, mock)
class IndexationNGramTestCase(HaystackBackendTestCase, TestCase):
    """Tests ngram indexing and search over ``XapianNGramIndex``."""
    def get_index(self):
        return XapianNGramIndex()

    def setUp(self):
        # Index two similar authors so grams distinguish them.
        super().setUp()
        mock = BlogEntry()
        mock.id = 1
        mock.author = 'david'
        mock1 = BlogEntry()
        mock1.id = 2
        mock1.author = 'da1id'
        self.backend.update(self.index, [mock, mock1])

    def test_field(self):
        # Plain ngrams include interior grams ('av', 'vid', 'id'),
        # unlike edge ngrams.
        terms = get_terms(self.backend, '-a')
        self.assertTrue('da' in terms)
        self.assertTrue('XNGRAMda' in terms)
        self.assertTrue('dav' in terms)
        self.assertTrue('XNGRAMdav' in terms)
        self.assertTrue('davi' in terms)
        self.assertTrue('XNGRAMdavi' in terms)
        self.assertTrue('david' in terms)
        self.assertTrue('XNGRAMdavid' in terms)
        self.assertTrue('vid' in terms)
        self.assertTrue('XNGRAMvid' in terms)
        self.assertTrue('id' in terms)
        self.assertTrue('XNGRAMid' in terms)
        self.assertTrue('av' in terms)
        self.assertTrue('XNGRAMav' in terms)

    def test_search(self):
        """Tests ngram search with different parts of words"""
        # Minimum length of the query string must be equal to
        # NGRAM_MIN_LENGTH.
        self.assertEqual(pks(self.backend.search(xapian.Query('da'))['results']),
                         [1, 2])
        self.assertEqual(pks(self.backend.search(xapian.Query('dav'))['results']),
                         [1])
        self.assertEqual(pks(self.backend.search(xapian.Query('da1'))['results']),
                         [2])
class IndexationEdgeNGramTestCase(HaystackBackendTestCase, TestCase):
    """Tests edge-ngram indexing and search over ``XapianEdgeNGramIndex``."""
    def get_index(self):
        return XapianEdgeNGramIndex()

    def setUp(self):
        # Index two similar authors so grams distinguish them.
        super().setUp()
        mock = BlogEntry()
        mock.id = 1
        mock.author = 'david'
        mock1 = BlogEntry()
        mock1.id = 2
        mock1.author = 'da1id'
        self.backend.update(self.index, [mock, mock1])

    def test_field(self):
        # Edge ngrams only include prefixes of the word; interior grams
        # ('av', 'vid', 'id') must be absent.
        terms = get_terms(self.backend, '-a')
        self.assertTrue('da' in terms)
        self.assertTrue('XEDGE_NGRAMda' in terms)
        self.assertTrue('dav' in terms)
        self.assertTrue('XEDGE_NGRAMdav' in terms)
        self.assertTrue('davi' in terms)
        self.assertTrue('XEDGE_NGRAMdavi' in terms)
        self.assertTrue('david' in terms)
        self.assertTrue('XEDGE_NGRAMdavid' in terms)
        self.assertTrue('vid' not in terms)
        self.assertTrue('XEDGE_NGRAMvid' not in terms)
        self.assertTrue('id' not in terms)
        self.assertTrue('XEDGE_NGRAMid' not in terms)
        self.assertTrue('av' not in terms)
        self.assertTrue('XEDGE_NGRAMav' not in terms)

    def test_search(self):
        """Tests edge ngram search with different parts of words"""
        # Minimum length of the query string must be equal to
        # EDGE_NGRAM_MIN_LENGTH.
        self.assertEqual(pks(self.backend.search(xapian.Query('da'))['results']),
                         [1, 2])
        self.assertEqual(pks(self.backend.search(xapian.Query('dav'))['results']),
                         [1])
        self.assertEqual(pks(self.backend.search(xapian.Query('da1'))['results']),
                         [2])
class IndexationDjangoContentTypeTestCase(HaystackBackendTestCase, TestCase):
    """Checks indexing of a model that references Django's ContentType."""

    def get_index(self):
        return DjangoContentTypeIndex()

    def setUp(self):
        super().setUp()
        content_type = ContentType(model='DjangoContentType')
        content_type.save()
        instance = DjangoContentType(content_type=content_type)
        instance.save()
        self.backend.update(self.index, [instance])

    def test_basic(self):
        terms = get_terms(self.backend, '-a')
        self.assertIn('CONTENTTYPExapian_tests.djangocontenttype', terms)
xapian-haystack-3.1.0/tests/xapian_tests/tests/test_interface.py 0000664 0000000 0000000 00000020407 14405573772 0025200 0 ustar 00root root 0000000 0000000 import datetime
from django.db.models import Q
from django.test import TestCase
from haystack import connections
from haystack.inputs import AutoQuery
from haystack.query import SearchQuerySet
from ..models import Document
from ..search_indexes import DocumentIndex
from ..tests.test_backend import pks
class InterfaceTestCase(TestCase):
    """
    Tests the interface of Xapian-Haystack.

    Tests related to usability and expected behavior
    go here.
    """
    def setUp(self):
        # Creates 12 documents; `i % 3` cycles them through the three
        # types/texts/dates/summaries and `number` is always even (i * 2).
        # Many assertions below depend on this exact construction order.
        super().setUp()
        types_names = ['book', 'magazine', 'article']
        texts = ['This is a huge text',
                 'This is a medium text',
                 'This is a small text']
        dates = [datetime.date(year=2010, month=1, day=1),
                 datetime.date(year=2010, month=2, day=1),
                 datetime.date(year=2010, month=3, day=1)]
        # the first summary deliberately contains non-ASCII characters,
        # exercised by `test_non_ascii_search`
        summaries = ['This is a huge corrup\xe7\xe3o summary',
                     'This is a medium summary',
                     'This is a small summary']
        for i in range(1, 13):
            doc = Document()
            doc.type_name = types_names[i % 3]
            doc.number = i * 2
            doc.name = "%s %d" % (doc.type_name, doc.number)
            doc.date = dates[i % 3]
            doc.summary = summaries[i % 3]
            doc.text = texts[i % 3]
            doc.save()
        self.index = DocumentIndex()
        self.ui = connections['default'].get_unified_index()
        self.ui.build(indexes=[self.index])
        self.backend = connections['default'].get_backend()
        self.backend.update(self.index, Document.objects.all())
        self.queryset = SearchQuerySet()

    def tearDown(self):
        Document.objects.all().delete()
        #self.backend.clear()
        super().tearDown()

    def test_count(self):
        # index and database must agree on the number of documents
        self.assertEqual(self.queryset.count(), Document.objects.count())

    def test_content_search(self):
        result = self.queryset.filter(content='medium this')
        self.assertEqual(sorted(pks(result)),
                         pks(Document.objects.all()))
        # documents with "medium" AND "this" have higher score
        self.assertEqual(pks(result)[:4], [1, 4, 7, 10])

    def test_field_search(self):
        self.assertEqual(pks(self.queryset.filter(name__contains='8')), [4])
        self.assertEqual(pks(self.queryset.filter(type_name='book')),
                         pks(Document.objects.filter(type_name='book')))
        self.assertEqual(pks(self.queryset.filter(text__contains='text huge')),
                         pks(Document.objects.filter(text__contains='text huge')))

    def test_field_contains(self):
        self.assertEqual(pks(self.queryset.filter(summary__contains='huge')),
                         pks(Document.objects.filter(summary__contains='huge')))
        result = self.queryset.filter(summary__contains='huge summary')
        self.assertEqual(sorted(pks(result)),
                         pks(Document.objects.all()))
        # documents with "huge" AND "summary" have higher score
        self.assertEqual(pks(result)[:4], [3, 6, 9, 12])

    def test_field_exact(self):
        # exact match requires the whole field value, not a substring
        self.assertEqual(pks(self.queryset.filter(name__exact='8')), [])
        self.assertEqual(pks(self.queryset.filter(name__exact='magazine 2')), [1])

    def test_content_exact(self):
        self.assertEqual(pks(self.queryset.filter(content__exact='huge')), [])

    def test_content_and(self):
        self.assertEqual(pks(self.queryset.filter(content='huge').filter(summary='medium')), [])
        self.assertEqual(len(self.queryset.filter(content='huge this')), 12)
        self.assertEqual(len(self.queryset.filter(content='huge this').filter(summary__contains='huge')), 4)

    def test_content_or(self):
        self.assertEqual(len(self.queryset.filter(content='huge medium')), 8)
        self.assertEqual(len(self.queryset.filter(content='huge medium small')), 12)

    def test_field_and(self):
        self.assertEqual(pks(self.queryset.filter(name='8').filter(name='4')), [])

    def test_field_or(self):
        self.assertEqual(pks(self.queryset.filter(name__contains='8 4')), [2, 4])

    def test_field_in(self):
        self.assertEqual(set(pks(self.queryset.filter(name__in=['magazine 2', 'article 4']))),
                         set(pks(Document.objects.filter(name__in=['magazine 2', 'article 4']))))
        self.assertEqual(pks(self.queryset.filter(number__in=[4])),
                         pks(Document.objects.filter(number__in=[4])))
        self.assertEqual(pks(self.queryset.filter(number__in=[4, 8])),
                         pks(Document.objects.filter(number__in=[4, 8])))

    def test_private_fields(self):
        # `django_id` is Haystack's internal copy of the model primary key
        self.assertEqual(pks(self.queryset.filter(django_id=4)),
                         pks(Document.objects.filter(id__in=[4])))
        self.assertEqual(pks(self.queryset.filter(django_id__in=[2, 4])),
                         pks(Document.objects.filter(id__in=[2, 4])))
        self.assertEqual(set(pks(self.queryset.models(Document))),
                         set(pks(Document.objects.all())))

    def test_field_startswith(self):
        self.assertEqual(len(self.queryset.filter(name__startswith='magaz')), 4)
        self.assertEqual(set(pks(self.queryset.filter(summary__startswith='This is a huge'))),
                         set(pks(Document.objects.filter(summary__startswith='This is a huge'))))

    def test_auto_query(self):
        # todo: improve to query text only.
        self.assertEqual(set(pks(self.queryset.auto_query("huge OR medium"))),
                         set(pks(Document.objects.filter(Q(text__contains="huge") |
                                                         Q(text__contains="medium")))))
        self.assertEqual(set(pks(self.queryset.auto_query("huge AND medium"))),
                         set(pks(Document.objects.filter(Q(text__contains="huge") &
                                                         Q(text__contains="medium")))))
        self.assertEqual(set(pks(self.queryset.auto_query("text:huge text:-this"))),
                         set(pks(Document.objects.filter(Q(text__contains="huge") &
                                                         ~Q(text__contains="this")))))
        self.assertEqual(len(self.queryset.filter(name=AutoQuery("8 OR 4"))), 2)
        self.assertEqual(len(self.queryset.filter(name=AutoQuery("8 AND 4"))), 0)

    def test_value_range(self):
        self.assertEqual(set(pks(self.queryset.filter(number__lt=3))),
                         set(pks(Document.objects.filter(number__lt=3))))
        self.assertEqual(set(pks(self.queryset.filter(django_id__gte=6))),
                         set(pks(Document.objects.filter(id__gte=6))))

    def test_date_range(self):
        date = datetime.date(year=2010, month=2, day=1)
        self.assertEqual(set(pks(self.queryset.filter(date__gte=date))),
                         set(pks(Document.objects.filter(date__gte=date))))
        date = datetime.date(year=2010, month=3, day=1)
        self.assertEqual(set(pks(self.queryset.filter(date__lte=date))),
                         set(pks(Document.objects.filter(date__lte=date))))

    def test_order_by(self):
        # private order
        self.assertEqual(pks(self.queryset.order_by("-django_id")),
                         pks(Document.objects.order_by("-id")))
        # value order
        self.assertEqual(pks(self.queryset.order_by("number")),
                         pks(Document.objects.order_by("number")))
        # text order
        self.assertEqual(pks(self.queryset.order_by("summary")),
                         pks(Document.objects.order_by("summary")))
        # date order
        self.assertEqual(pks(self.queryset.order_by("-date")),
                         pks(Document.objects.order_by("-date")))

    def test_non_ascii_search(self):
        """
        Regression test for #119.
        """
        self.assertEqual(pks(self.queryset.filter(content='corrup\xe7\xe3o')),
                         pks(Document.objects.filter(summary__contains='corrup\xe7\xe3o')))

    def test_multi_values_exact_search(self):
        """
        Regression test for #103
        """
        # counts presumably follow from how DocumentIndex prepares `tags`
        # (not visible in this file) — confirm against search_indexes.py
        self.assertEqual(len(self.queryset.filter(tags__exact='tag')), 12)
        self.assertEqual(len(self.queryset.filter(tags__exact='tag-test')), 8)
        self.assertEqual(len(self.queryset.filter(tags__exact='tag-test-test')), 4)
xapian-haystack-3.1.0/tests/xapian_tests/tests/test_management_commands.py 0000664 0000000 0000000 00000006176 14405573772 0027244 0 ustar 00root root 0000000 0000000 import sys
from io import StringIO
from unittest import TestCase
from django.core.management import call_command
from ..models import BlogEntry
from ..search_indexes import BlogSearchIndex
from .test_backend import BackendFeaturesTestCase, HaystackBackendTestCase
class ManagementCommandTestCase(HaystackBackendTestCase, TestCase):
    """
    Tests that Haystack's management commands (clear_index, update_index,
    rebuild_index) behave correctly against the Xapian backend.
    """
    NUM_BLOG_ENTRIES = 20

    def get_index(self):
        return BlogSearchIndex()

    def setUp(self):
        super().setUp()
        self.sample_objs = []
        for i in range(1, self.NUM_BLOG_ENTRIES + 1):
            entry = BackendFeaturesTestCase.get_entry(i)
            self.sample_objs.append(entry)
            entry.save()

    def verify_indexed_document_count(self, expected):
        """Assert the backend reports exactly `expected` indexed documents."""
        count = self.backend.document_count()
        self.assertEqual(count, expected)

    def verify_indexed_documents(self):
        """Confirm that the documents in the search index match the database"""
        count = self.backend.document_count()
        self.assertEqual(count, self.NUM_BLOG_ENTRIES)
        pks = set(BlogEntry.objects.values_list("pk", flat=True))
        doc_ids = set()
        database = self.backend._database()
        for pk in pks:
            # model pks double as Xapian docids here
            xapian_doc = database.get_document(pk)
            doc_id = xapian_doc.get_docid()
            doc_ids.add(doc_id)
        database.close()
        self.assertSetEqual(pks, doc_ids)

    def test_clear(self):
        self.backend.update(self.index, BlogEntry.objects.all())
        self.verify_indexed_documents()
        call_command("clear_index", interactive=False, verbosity=0)
        self.verify_indexed_document_count(0)

    def test_update(self):
        self.verify_indexed_document_count(0)
        call_command("update_index", verbosity=0)
        self.verify_indexed_documents()

    def test_rebuild(self):
        self.verify_indexed_document_count(0)
        call_command("rebuild_index", interactive=False, verbosity=0)
        self.verify_indexed_documents()

    def test_remove(self):
        self.verify_indexed_document_count(0)
        call_command("update_index", verbosity=0)
        self.verify_indexed_documents()
        # Remove three instances.
        three_pks = BlogEntry.objects.all()[:3].values_list("pk", flat=True)
        BlogEntry.objects.filter(pk__in=three_pks).delete()
        self.verify_indexed_document_count(self.NUM_BLOG_ENTRIES)
        # Plain ``update_index`` doesn't fix it.
        call_command("update_index", verbosity=0)
        self.verify_indexed_document_count(self.NUM_BLOG_ENTRIES)
        # … but remove does:
        call_command("update_index", remove=True, verbosity=0)
        self.verify_indexed_document_count(self.NUM_BLOG_ENTRIES - 3)

    def test_multiprocessing(self):
        self.verify_indexed_document_count(0)
        # Capture stderr so we can assert the worker processes reported no
        # database-lock errors.  Restore it in a ``finally`` block so a
        # failing/raising command cannot leave stderr redirected for the
        # rest of the test run (the original code leaked the redirection).
        old_stderr = sys.stderr
        sys.stderr = StringIO()
        try:
            call_command(
                "update_index",
                verbosity=2,
                workers=10,
                batchsize=2,
            )
            err = sys.stderr.getvalue()
        finally:
            sys.stderr = old_stderr
        print(err)
        self.assertNotIn("xapian.DatabaseLockError", err)
        self.verify_indexed_documents()
xapian-haystack-3.1.0/tests/xapian_tests/tests/test_query.py 0000664 0000000 0000000 00000045562 14405573772 0024416 0 ustar 00root root 0000000 0000000 import datetime
from django.conf import settings
from django.test import TestCase
from haystack import connections, reset_search_queries
from haystack.models import SearchResult
from haystack.query import SearchQuerySet, SQ
from ...mocks import MockSearchResult
from ..models import MockModel, AnotherMockModel, AFourthMockModel
from ..search_indexes import MockQueryIndex, MockSearchIndex, BoostMockSearchIndex
from ..tests.test_backend import HaystackBackendTestCase
class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase):
    """
    Tests the XapianSearchQuery, the class that converts SearchQuerySet queries
    using the `__` notation to XapianQueries.

    Each test builds a query with `add_filter`/`add_boost` and compares the
    resulting Xapian query description against an expected string.
    """
    fixtures = ['base_data.json']

    def get_index(self):
        return MockQueryIndex()

    def setUp(self):
        super().setUp()
        self.sq = connections['default'].get_query()

    def test_all(self):
        self.assertExpectedQuery(self.sq.build_query(), '')

    def test_single_word(self):
        self.sq.add_filter(SQ(content='hello'))
        self.assertExpectedQuery(self.sq.build_query(), '(Zhello OR hello)')

    def test_single_word_not(self):
        self.sq.add_filter(~SQ(content='hello'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '( AND_NOT (Zhello OR hello))')

    def test_single_word_field_exact(self):
        self.sq.add_filter(SQ(foo__exact='hello'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '(XFOO^ PHRASE 3 XFOOhello PHRASE 3 XFOO$)')

    def test_single_word_field_exact_not(self):
        self.sq.add_filter(~SQ(foo='hello'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '( AND_NOT '
                                 '(XFOO^ PHRASE 3 XFOOhello PHRASE 3 XFOO$))')

    def test_boolean(self):
        self.sq.add_filter(SQ(content=True))
        self.assertExpectedQuery(self.sq.build_query(), '(Ztrue OR true)')

    def test_date(self):
        self.sq.add_filter(SQ(content=datetime.date(2009, 5, 8)))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '(Z2009-05-08 OR 2009-05-08)')

    def test_date_not(self):
        self.sq.add_filter(~SQ(content=datetime.date(2009, 5, 8)))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '( AND_NOT '
                                 '(Z2009-05-08 OR 2009-05-08))')

    def test_datetime(self):
        self.sq.add_filter(SQ(content=datetime.datetime(2009, 5, 8, 11, 28)))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((Z2009-05-08 OR 2009-05-08) OR'
                                 ' (Z11:28:00 OR 11:28:00))')

    def test_datetime_not(self):
        self.sq.add_filter(~SQ(content=datetime.datetime(2009, 5, 8, 11, 28)))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '( AND_NOT ((Z2009-05-08 OR 2009-05-08) OR (Z11:28:00 OR 11:28:00)))')

    def test_float(self):
        self.sq.add_filter(SQ(content=25.52))
        self.assertExpectedQuery(self.sq.build_query(), '(Z25.52 OR 25.52)')

    def test_multiple_words_and(self):
        self.sq.add_filter(SQ(content='hello'))
        self.sq.add_filter(SQ(content='world'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((Zhello OR hello) AND (Zworld OR world))')

    def test_multiple_words_not(self):
        self.sq.add_filter(~SQ(content='hello'))
        self.sq.add_filter(~SQ(content='world'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '(( AND_NOT (Zhello OR hello)) AND'
                                 ' ( AND_NOT (Zworld OR world)))')

    def test_multiple_words_or(self):
        self.sq.add_filter(SQ(content='hello') | SQ(content='world'))
        self.assertExpectedQuery(
            self.sq.build_query(),
            '((Zhello OR hello) OR (Zworld OR world))')

    def test_multiple_words_or_not(self):
        self.sq.add_filter(~SQ(content='hello') | ~SQ(content='world'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '(( AND_NOT (Zhello OR hello)) OR'
                                 ' ( AND_NOT (Zworld OR world)))')

    def test_multiple_words_mixed(self):
        self.sq.add_filter(SQ(content='why') | SQ(content='hello'))
        self.sq.add_filter(~SQ(content='world'))
        self.assertExpectedQuery(
            self.sq.build_query(),
            '(((Zwhi OR why) OR (Zhello OR hello)) AND '
            '( AND_NOT (Zworld OR world)))')

    def test_multiple_word_field_exact(self):
        self.sq.add_filter(SQ(foo='hello'))
        self.sq.add_filter(SQ(title='world'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((XFOO^ PHRASE 3 XFOOhello PHRASE 3 XFOO$) AND'
                                 ' (XTITLE^ PHRASE 3 XTITLEworld PHRASE 3 XTITLE$))')

    def test_multiple_word_field_exact_not(self):
        self.sq.add_filter(~SQ(foo='hello'))
        self.sq.add_filter(~SQ(title='world'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '(( AND_NOT (XFOO^ PHRASE 3 XFOOhello PHRASE 3 XFOO$)) AND'
                                 ' ( AND_NOT (XTITLE^ PHRASE 3 XTITLEworld PHRASE 3 XTITLE$)))')

    def test_or(self):
        self.sq.add_filter(SQ(content='hello world'))
        self.assertExpectedQuery(
            self.sq.build_query(), '((Zhello OR hello) OR (Zworld OR world))')

    def test_not_or(self):
        self.sq.add_filter(~SQ(content='hello world'))
        self.assertExpectedQuery(
            self.sq.build_query(),
            '( AND_NOT ((Zhello OR hello) OR (Zworld OR world)))')

    def test_boost(self):
        self.sq.add_filter(SQ(content='hello'))
        self.sq.add_boost('world', 5)
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((Zhello OR hello) AND_MAYBE'
                                 ' 5 * (Zworld OR world))')

    def test_not_in_filter_single_words(self):
        self.sq.add_filter(SQ(content='why'))
        self.sq.add_filter(~SQ(title__in=["Dune", "Jaws"]))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((Zwhi OR why) AND '
                                 '( AND_NOT ('
                                 '(XTITLE^ PHRASE 3 XTITLEdune PHRASE 3 XTITLE$) OR '
                                 '(XTITLE^ PHRASE 3 XTITLEjaws PHRASE 3 XTITLE$))))')

    def test_in_filter_multiple_words(self):
        self.sq.add_filter(SQ(content='why'))
        self.sq.add_filter(SQ(title__in=["A Famous Paper", "An Infamous Article"]))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((Zwhi OR why) AND ((XTITLE^ PHRASE 5 XTITLEa PHRASE 5 '
                                 'XTITLEfamous PHRASE 5 XTITLEpaper PHRASE 5 XTITLE$) OR '
                                 '(XTITLE^ PHRASE 5 XTITLEan PHRASE 5 XTITLEinfamous PHRASE 5 '
                                 'XTITLEarticle PHRASE 5 XTITLE$)))')

    def test_in_filter_multiple_words_with_punctuation(self):
        self.sq.add_filter(SQ(title__in=["A Famous Paper", "An Infamous Article", "My Store Inc."]))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((XTITLE^ PHRASE 5 XTITLEa PHRASE 5 XTITLEfamous PHRASE 5'
                                 ' XTITLEpaper PHRASE 5 XTITLE$) OR '
                                 '(XTITLE^ PHRASE 5 XTITLEan PHRASE 5 XTITLEinfamous PHRASE 5'
                                 ' XTITLEarticle PHRASE 5 XTITLE$) OR '
                                 '(XTITLE^ PHRASE 5 XTITLEmy PHRASE 5 XTITLEstore PHRASE 5'
                                 ' XTITLEinc. PHRASE 5 XTITLE$))')

    def test_not_in_filter_multiple_words(self):
        self.sq.add_filter(SQ(content='why'))
        self.sq.add_filter(~SQ(title__in=["A Famous Paper", "An Infamous Article"]))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((Zwhi OR why) AND ( AND_NOT '
                                 '((XTITLE^ PHRASE 5 XTITLEa PHRASE 5 XTITLEfamous PHRASE 5 '
                                 'XTITLEpaper PHRASE 5 XTITLE$) OR (XTITLE^ PHRASE 5 '
                                 'XTITLEan PHRASE 5 XTITLEinfamous PHRASE 5 '
                                 'XTITLEarticle PHRASE 5 XTITLE$))))')

    def test_in_filter_datetime(self):
        self.sq.add_filter(SQ(content='why'))
        self.sq.add_filter(SQ(pub_date__in=[datetime.datetime(2009, 7, 6, 1, 56, 21)]))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((Zwhi OR why) AND '
                                 '(XPUB_DATE2009-07-06 AND_MAYBE XPUB_DATE01:56:21))')

    def test_clean(self):
        # `clean` must be a no-op for this backend: operators and punctuation
        # are left for Xapian's query parser to handle.
        self.assertEqual(self.sq.clean('hello world'), 'hello world')
        self.assertEqual(self.sq.clean('hello AND world'), 'hello AND world')
        # NOTE: the backslash is now explicitly escaped (`\\`); the previous
        # bare `'\ '` relied on the deprecated invalid-escape fallback
        # (SyntaxWarning from Python 3.12).  The string value is unchanged.
        self.assertEqual(self.sq.clean('hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? : \\ world'),
                         'hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? : \\ world')
        self.assertEqual(self.sq.clean('so please NOTe i am in a bAND and bORed'),
                         'so please NOTe i am in a bAND and bORed')

    def test_with_models(self):
        self.sq.add_filter(SQ(content='hello'))
        self.sq.add_model(MockModel)
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((Zhello OR hello) AND '
                                 '0 * CONTENTTYPEcore.mockmodel)')
        self.sq.add_model(AnotherMockModel)
        # model sets are unordered, so both renderings are acceptable
        self.assertExpectedQuery(self.sq.build_query(),
                                 ['((Zhello OR hello) AND '
                                  '(0 * CONTENTTYPEcore.mockmodel OR'
                                  ' 0 * CONTENTTYPEcore.anothermockmodel))',
                                  '((Zhello OR hello) AND '
                                  '(0 * CONTENTTYPEcore.anothermockmodel OR'
                                  ' 0 * CONTENTTYPEcore.mockmodel))'])

    def test_with_punctuation(self):
        self.sq.add_filter(SQ(content='http://www.example.com'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '(Zhttp://www.example.com OR'
                                 ' http://www.example.com)')

    def test_in_filter_values_list(self):
        self.sq.add_filter(SQ(content='why'))
        self.sq.add_filter(SQ(title__in=MockModel.objects.values_list('id',
                                                                      flat=True)))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((Zwhi OR why) AND ('
                                 '(XTITLE^ PHRASE 3 XTITLE1 PHRASE 3 XTITLE$) OR '
                                 '(XTITLE^ PHRASE 3 XTITLE2 PHRASE 3 XTITLE$) OR '
                                 '(XTITLE^ PHRASE 3 XTITLE3 PHRASE 3 XTITLE$)))')

    def test_content_type(self):
        self.sq.add_filter(SQ(django_ct='time'))
        self.assertExpectedQuery(self.sq.build_query(), 'CONTENTTYPEtime')

    def test_unphrased_id(self):
        'An internal ID should NOT be phrased so one can exclude IDs.'
        self.sq.add_filter(SQ(id__in=['testing123', 'testing456']))
        expected = '(Qtesting123 OR Qtesting456)'
        self.assertExpectedQuery(
            query=self.sq.build_query(), string_or_list=expected)
class SearchQueryTestCase(HaystackBackendTestCase, TestCase):
    """
    Tests expected behavior of
    SearchQuery.
    """
    fixtures = ['base_data.json']

    def get_index(self):
        return MockSearchIndex()

    def setUp(self):
        super().setUp()
        self.backend.update(self.index, MockModel.objects.all())
        self.sq = connections['default'].get_query()

    def test_get_spelling(self):
        self.sq.add_filter(SQ(content='indxd'))
        self.assertEqual(self.sq.get_spelling_suggestion(), 'indexed')
        self.assertEqual(self.sq.get_spelling_suggestion('indxd'), 'indexed')

    def test_contains(self):
        self.sq.add_filter(SQ(content='circular'))
        self.sq.add_filter(SQ(title__contains='haystack'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((Zcircular OR circular) AND '
                                 '(ZXTITLEhaystack OR XTITLEhaystack))')

    def test_startswith(self):
        self.sq.add_filter(SQ(name__startswith='da'))
        self.assertEqual([result.pk for result in self.sq.get_results()], [1, 2, 3])

    def test_endswith(self):
        # `endswith` is not supported by the Xapian backend
        with self.assertRaises(NotImplementedError):
            self.sq.add_filter(SQ(name__endswith='el2'))
            self.sq.get_results()

    def test_gt(self):
        # `gt` is rendered as NOT(range from minimum to the bound)
        self.sq.add_filter(SQ(name__gt='m'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '( AND_NOT VALUE_RANGE 3 a m)')

    def test_gte(self):
        self.sq.add_filter(SQ(name__gte='m'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 'VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz'
                                 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
                                 'zzzzzzzzzzzzzzzzzzzzzzzzzzzz')

    def test_lt(self):
        self.sq.add_filter(SQ(name__lt='m'))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '( AND_NOT VALUE_RANGE 3 m '
                                 'zzzzzzzzzzzzzzzzzzzzzzzzzzzz'
                                 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
                                 'zzzzzzzzzzzzzzzzzzzzzzzzzzzz)')

    def test_lte(self):
        self.sq.add_filter(SQ(name__lte='m'))
        self.assertExpectedQuery(self.sq.build_query(), 'VALUE_RANGE 3 a m')

    def test_range(self):
        self.sq.add_filter(SQ(django_id__range=[2, 4]))
        self.assertExpectedQuery(self.sq.build_query(), 'VALUE_RANGE 1 000000000002 000000000004')
        self.sq.add_filter(~SQ(django_id__range=[0, 2]))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '(VALUE_RANGE 1 000000000002 000000000004 AND '
                                 '( AND_NOT VALUE_RANGE 1 000000000000 000000000002))')
        self.assertEqual([result.pk for result in self.sq.get_results()], [3])

    def test_multiple_filter_types(self):
        self.sq.add_filter(SQ(content='why'))
        self.sq.add_filter(SQ(pub_date__lte=datetime.datetime(2009, 2, 10, 1, 59, 0)))
        self.sq.add_filter(SQ(name__gt='david'))
        self.sq.add_filter(SQ(title__gte='B'))
        self.sq.add_filter(SQ(django_id__in=[1, 2, 3]))
        self.assertExpectedQuery(self.sq.build_query(),
                                 '((Zwhi OR why) AND'
                                 ' VALUE_RANGE 5 00010101000000 20090210015900 AND'
                                 ' ( AND_NOT VALUE_RANGE 3 a david)'
                                 ' AND VALUE_RANGE 7 b zzzzzzzzzzzzzzzzzzzzzzzzzzz'
                                 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
                                 'zzzzzzzzzzzzzzzzzzzzzzzzz AND'
                                 ' (QQ000000000001 OR QQ000000000002 OR QQ000000000003))')

    def test_log_query(self):
        reset_search_queries()
        self.assertEqual(len(connections['default'].queries), 0)
        # Stow the global DEBUG flag; restore it in ``finally`` so a failing
        # assertion cannot leak a modified DEBUG into the rest of the suite
        # (the original restore was skipped when any assert above it failed).
        old_debug = settings.DEBUG
        try:
            settings.DEBUG = False
            len(self.sq.get_results())
            self.assertEqual(len(connections['default'].queries), 0)
            settings.DEBUG = True
            # Redefine it to clear out the cached results.
            self.sq = connections['default'].get_query()
            self.sq.add_filter(SQ(name='bar'))
            len(self.sq.get_results())
            self.assertEqual(len(connections['default'].queries), 1)
            self.assertExpectedQuery(connections['default'].queries[0]['query_string'],
                                     '(XNAME^ PHRASE 3 XNAMEbar PHRASE 3 XNAME$)')
            # And again, for good measure.
            self.sq = connections['default'].get_query()
            self.sq.add_filter(SQ(name='bar'))
            self.sq.add_filter(SQ(text='moof'))
            len(self.sq.get_results())
            self.assertEqual(len(connections['default'].queries), 2)
            self.assertExpectedQuery(connections['default'].queries[0]['query_string'],
                                     '(XNAME^ PHRASE 3 XNAMEbar PHRASE 3 XNAME$)')
            self.assertExpectedQuery(connections['default'].queries[1]['query_string'],
                                     '((XNAME^ PHRASE 3 XNAMEbar PHRASE 3 XNAME$) AND'
                                     ' (XTEXT^ PHRASE 3 XTEXTmoof PHRASE 3 XTEXT$))')
        finally:
            # Restore.
            settings.DEBUG = old_debug
class LiveSearchQuerySetTestCase(HaystackBackendTestCase, TestCase):
    """
    SearchQuerySet specific tests
    """
    fixtures = ['base_data.json']

    def get_index(self):
        return MockSearchIndex()

    def setUp(self):
        super().setUp()
        self.backend.update(self.index, MockModel.objects.all())
        self.sq = connections['default'].get_query()
        self.sqs = SearchQuerySet()

    def test_result_class(self):
        # The default result class is ``SearchResult``…
        self.assertIsInstance(self.sqs.all()[0], SearchResult)
        # …a custom class can be plugged in…
        self.assertIsInstance(self.sqs.result_class(MockSearchResult).all()[0],
                              MockSearchResult)
        # …and passing None restores the default.
        self.assertIsInstance(self.sqs.result_class(None).all()[0], SearchResult)

    def test_facet(self):
        counts = self.sqs.facet('name').facet_counts()
        self.assertEqual(len(counts['fields']['name']), 3)
class BoostFieldTestCase(HaystackBackendTestCase, TestCase):
    """
    Tests boosted fields.
    """

    def get_index(self):
        return BoostMockSearchIndex()

    def setUp(self):
        super().setUp()
        self.sample_objs = []
        for i in range(1, 5):
            entry = AFourthMockModel()
            entry.id = i
            # odd ids: daniel is the author; even ids: daniel is the editor
            if i % 2:
                entry.author, entry.editor = 'daniel', 'david'
            else:
                entry.author, entry.editor = 'david', 'daniel'
            entry.pub_date = datetime.date(2009, 2, 25) - datetime.timedelta(days=i)
            self.sample_objs.append(entry)
        self.backend.update(self.index, self.sample_objs)

    def test_boost(self):
        sqs = SearchQuerySet()
        self.assertEqual(len(sqs.all()), 4)
        results = sqs.filter(SQ(author='daniel') | SQ(editor='daniel'))
        # author matches are boosted above editor matches
        expected = [
            'core.afourthmockmodel.1',
            'core.afourthmockmodel.3',
            'core.afourthmockmodel.2',
            'core.afourthmockmodel.4',
        ]
        self.assertEqual([hit.id for hit in results], expected)
xapian-haystack-3.1.0/xapian_backend.py 0000775 0000000 0000000 00000177365 14405573772 0020145 0 ustar 00root root 0000000 0000000 import datetime
import pickle
from pathlib import Path
import os
import re
import shutil
import sys
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from filelock import FileLock
from haystack import connections
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, SearchNode, log_query
from haystack.constants import ID, DJANGO_ID, DJANGO_CT, DEFAULT_OPERATOR
from haystack.exceptions import HaystackError, MissingDependency
from haystack.inputs import AutoQuery
from haystack.models import SearchResult
from haystack.utils import get_identifier, get_model_ct
# ngram sizes used when indexing `ngram`/`edge_ngram` fields;
# both can be overridden in the Django settings
NGRAM_MIN_LENGTH = getattr(settings, 'XAPIAN_NGRAM_MIN_LENGTH', 2)
NGRAM_MAX_LENGTH = getattr(settings, 'XAPIAN_NGRAM_MAX_LENGTH', 15)

# the xapian Python bindings are a system-level dependency that cannot be
# pip-installed; fail loudly with a Haystack-specific error if missing
try:
    import xapian
except ImportError:
    raise MissingDependency("The 'xapian' backend requires the installation of 'Xapian'. "
                            "Please refer to the documentation.")

# this maps the different reserved fields to prefixes used to
# create the database:
# id str: unique document id.
# django_id int: id of the django model instance.
# django_ct str: of the content type of the django model.
# field str: name of the field of the index.
TERM_PREFIXES = {
    ID: 'Q',
    DJANGO_ID: 'QQ',
    DJANGO_CT: 'CONTENTTYPE',
    'field': 'X'
}

# reserved fields treated as exact-match — NOTE(review): presumably consumed
# by the query-building code further down this file; confirm usage there
_EXACT_SEARCHFIELDS = frozenset((DJANGO_CT, DJANGO_ID, ID))

# sentinel PATH value selecting an in-memory (non-persistent) database
MEMORY_DB_NAME = ':memory:'

# default QueryParser feature flags; can be overridden per-connection
# via the 'FLAGS' connection option
DEFAULT_XAPIAN_FLAGS = (
    xapian.QueryParser.FLAG_PHRASE |
    xapian.QueryParser.FLAG_BOOLEAN |
    xapian.QueryParser.FLAG_LOVEHATE |
    xapian.QueryParser.FLAG_WILDCARD |
    xapian.QueryParser.FLAG_PURE_NOT
)

# Mapping from `HAYSTACK_DEFAULT_OPERATOR` to Xapian operators
XAPIAN_OPTS = {'AND': xapian.Query.OP_AND,
               'OR': xapian.Query.OP_OR,
               'PHRASE': xapian.Query.OP_PHRASE,
               'NEAR': xapian.Query.OP_NEAR
               }

# number of documents checked by default when building facets
# this must be improved to be relative to the total number of docs.
DEFAULT_CHECK_AT_LEAST = 1000

# field types accepted to be serialized as values in Xapian
FIELD_TYPES = {'text', 'integer', 'date', 'datetime', 'float', 'boolean',
               'edge_ngram', 'ngram'}

# defines the format used to store types in Xapian
# this format ensures datetimes are sorted correctly
DATETIME_FORMAT = '%Y%m%d%H%M%S'
INTEGER_FORMAT = '%012d'

# defines the distance given between
# texts with positional information
TERMPOS_DISTANCE = 100
def filelocked(func):
    """Decorator to wrap a XapianSearchBackend method in a filelock.

    Serializes concurrent writers via ``self.filelock``.  Locking is skipped
    for in-memory databases and when ``HAYSTACK_XAPIAN_USE_LOCKFILE`` is
    disabled on the backend.

    Fixes over the previous version: the wrapped function's return value is
    now propagated (it used to be silently discarded), and the wrapper
    preserves the original function's metadata via ``functools.wraps``.
    """
    import functools

    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        """Run the function inside a lock."""
        # (operand order of the short-circuit is irrelevant: both conditions
        # select the same unlocked branch and neither has side effects)
        if not self.use_lockfile or self.path == MEMORY_DB_NAME:
            return func(self, *args, **kwargs)
        # make sure the lock file (and its directory) exists before locking
        lockfile = Path(self.filelock.lock_file)
        lockfile.parent.mkdir(parents=True, exist_ok=True)
        lockfile.touch()
        with self.filelock:
            return func(self, *args, **kwargs)

    return wrapper
class InvalidIndexError(HaystackError):
    """Raised when an index can not be opened."""
class XHValueRangeProcessor(xapian.ValueRangeProcessor):
    """
    A Processor to construct ranges of values
    """
    def __init__(self, backend):
        # backend gives access to the schema (field name -> type/column)
        self.backend = backend
        xapian.ValueRangeProcessor.__init__(self)

    def __call__(self, begin, end):
        """
        Construct a tuple for value range processing.
        `begin` -- a string in the format ':[low_range]'
        If 'low_range' is omitted, assume the smallest possible value.
        `end` -- a string in the the format '[high_range|*]'. If '*', assume
        the highest possible value.
        Return a tuple of three strings: (column, low, high)
        """
        colon = begin.find(':')
        field_name = begin[:colon]
        begin = begin[colon + 1:len(begin)]
        for field_dict in self.backend.schema:
            # only the schema entry matching the field name is processed
            if field_dict['field_name'] == field_name:
                field_type = field_dict['type']

                # open-ended lower bound: substitute the type's minimum
                if not begin:
                    if field_type == 'text':
                        begin = 'a'  # TODO: A better way of getting a min text value?
                    elif field_type == 'integer':
                        begin = -sys.maxsize - 1
                    elif field_type == 'float':
                        begin = float('-inf')
                    elif field_type in ['date', 'datetime']:
                        begin = '00010101000000'
                # open-ended upper bound ('*'): substitute the type's maximum
                elif end == '*':
                    if field_type == 'text':
                        end = 'z' * 100  # TODO: A better way of getting a max text value?
                    elif field_type == 'integer':
                        end = sys.maxsize
                    elif field_type == 'float':
                        end = float('inf')
                    elif field_type in ['date', 'datetime']:
                        end = '99990101000000'

                # numeric bounds are serialized so Xapian sorts them correctly
                if field_type == 'float':
                    begin = _term_to_xapian_value(float(begin), field_type)
                    end = _term_to_xapian_value(float(end), field_type)
                elif field_type == 'integer':
                    begin = _term_to_xapian_value(int(begin), field_type)
                    end = _term_to_xapian_value(int(end), field_type)
                # NOTE(review): returns on the first matching schema entry;
                # an unknown field name implicitly returns None — confirm
                # callers handle that case
                return field_dict['column'], str(begin), str(end)
class XHExpandDecider(xapian.ExpandDecider):
    def __call__(self, term):
        """
        Return True if the term should be used for expanding the search
        query, False otherwise.

        Terms carrying the content-type prefix are rejected; all other
        terms are accepted.
        """
        content_type_prefix = TERM_PREFIXES[DJANGO_CT]
        return not term.decode('utf-8').startswith(content_type_prefix)
class XapianSearchBackend(BaseSearchBackend):
"""
`SearchBackend` defines the Xapian search backend for use with the Haystack
API for Django search.
It uses the Xapian Python bindings to interface with Xapian.
In order to use this backend, `PATH` must be included in the
`connection_options`. This should point to a location where you would your
indexes to reside.
"""
inmemory_db = None
def __init__(self, connection_alias, **connection_options):
"""
Instantiates an instance of `SearchBackend`.
Optional arguments:
`connection_alias` -- The name of the connection
`language` -- The stemming language (default = 'english')
`**connection_options` -- The various options needed to setup
the backend.
Also sets the stemming language to be used to `language`.
"""
self.use_lockfile = bool(
getattr(settings, 'HAYSTACK_XAPIAN_USE_LOCKFILE', True)
)
super().__init__(connection_alias, **connection_options)
if not 'PATH' in connection_options:
raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'."
% connection_alias)
self.path = connection_options.get('PATH')
if self.path != MEMORY_DB_NAME:
try:
os.makedirs(self.path)
except FileExistsError:
pass
if self.use_lockfile:
lockfile = Path(self.path) / "lockfile"
self.filelock = FileLock(lockfile)
self.flags = connection_options.get('FLAGS', DEFAULT_XAPIAN_FLAGS)
self.language = getattr(settings, 'HAYSTACK_XAPIAN_LANGUAGE', 'english')
stemming_strategy_string = getattr(settings, 'HAYSTACK_XAPIAN_STEMMING_STRATEGY', 'STEM_SOME')
self.stemming_strategy = getattr(xapian.QueryParser, stemming_strategy_string, xapian.QueryParser.STEM_SOME)
# these 4 attributes are caches populated in `build_schema`
# they are checked in `_update_cache`
# use property to retrieve them
self._fields = {}
self._schema = []
self._content_field_name = None
self._columns = {}
def _update_cache(self):
    """
    Refresh the cached schema-derived values.

    `build_schema` is expensive, so its outputs are cached and only
    recomputed when the set of SearchIndex fields actually changes
    (which typically requires a Python restart).
    """
    current_fields = connections[self.connection_alias].get_unified_index().all_searchfields()
    if self._fields == current_fields:
        return
    self._fields = current_fields
    self._content_field_name, self._schema = self.build_schema(self._fields)
@property
def schema(self):
    # List of field dicts describing the index layout; lazily refreshed
    # (see `build_schema` for the dict structure).
    self._update_cache()
    return self._schema
@property
def content_field_name(self):
    # Name of the document (primary content) field; lazily refreshed.
    self._update_cache()
    return self._content_field_name
@property
def column(self):
    """
    Returns the mapping of field name -> value-slot (column) number
    used when storing sortable values in documents.
    """
    self._update_cache()
    return self._columns
@filelocked
def update(self, index, iterable, commit=True):
    """
    Updates the `index` with any objects in `iterable` by adding/updating
    the database as needed.

    Required arguments:
        `index` -- The `SearchIndex` to process
        `iterable` -- An iterable of model instances to index

    Optional arguments:
        `commit` -- ignored

    For each object in `iterable`, a document is created containing all
    of the terms extracted from `index.full_prepare(obj)` with field prefixes,
    and 'as-is' as needed. Also, if the field type is 'text' it will be
    stemmed and stored with the 'Z' prefix as well.

    eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest, XCONTENTtest`

    Each document also contains an extra term in the format:

        `XCONTENTTYPE<app_name>.<model_name>`

    As well as a unique identifier in the format:

        `Q<app_name>.<model_name>.<pk>`

    eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar`

    This is useful for querying for a specific document corresponding to
    a model instance.

    The document also contains a pickled version of the object itself and
    the document ID in the document data field.

    Finally, we also store field values to be used for sorting data. We
    store these in the document value slots (position zero is reserved
    for the document ID). All values are stored as unicode strings with
    conversion of float, int, double, values being done by Xapian itself
    through the use of the :method:xapian.sortable_serialise method.
    """
    database = self._database(writable=True)
    try:
        term_generator = xapian.TermGenerator()
        term_generator.set_database(database)
        term_generator.set_stemmer(xapian.Stem(self.language))
        term_generator.set_stemming_strategy(self.stemming_strategy)
        if self.include_spelling is True:
            term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)

        def _add_text(termpos, text, weight, prefix=''):
            """
            indexes text appending 2 extra terms
            to identify beginning and ending of the text.
            """
            term_generator.set_termpos(termpos)

            start_term = '%s^' % prefix
            end_term = '%s$' % prefix
            # add begin
            document.add_posting(start_term, termpos, weight)
            # add text
            term_generator.index_text(text, weight, prefix)
            termpos = term_generator.get_termpos()
            # add ending
            termpos += 1
            document.add_posting(end_term, termpos, weight)

            # increase termpos
            term_generator.set_termpos(termpos)
            term_generator.increase_termpos(TERMPOS_DISTANCE)

            return term_generator.get_termpos()

        def _add_literal_text(termpos, text, weight, prefix=''):
            """
            Adds sentence to the document with positional information
            but without processing.

            The sentence is bounded by "^" "$" to allow exact matches.
            """
            text = '^ %s $' % text
            for word in text.split():
                term = '%s%s' % (prefix, word)
                document.add_posting(term, termpos, weight)
                termpos += 1
            termpos += TERMPOS_DISTANCE
            return termpos

        def add_text(termpos, prefix, text, weight):
            """
            Adds text to the document with positional information
            and processing (e.g. stemming).
            """
            # each variant (prefixed/unprefixed, stemmed/literal) is
            # indexed separately so both generic and field-scoped
            # queries, stemmed or exact, can match.
            termpos = _add_text(termpos, text, weight, prefix=prefix)
            termpos = _add_text(termpos, text, weight, prefix='')
            termpos = _add_literal_text(termpos, text, weight, prefix=prefix)
            termpos = _add_literal_text(termpos, text, weight, prefix='')
            return termpos

        def _get_ngram_lengths(value):
            # yields (word, n) for every word and every ngram size
            values = value.split()
            for item in values:
                for ngram_length in range(NGRAM_MIN_LENGTH, NGRAM_MAX_LENGTH + 1):
                    yield item, ngram_length

        for obj in iterable:
            document = xapian.Document()
            term_generator.set_document(document)

            def ngram_terms(value):
                for item, length in _get_ngram_lengths(value):
                    item_length = len(item)
                    for start in range(0, item_length - length + 1):
                        for size in range(length, length + 1):
                            end = start + size
                            if end > item_length:
                                continue
                            yield _to_xapian_term(item[start:end])

            def edge_ngram_terms(value):
                for item, length in _get_ngram_lengths(value):
                    yield _to_xapian_term(item[0:length])

            def add_edge_ngram_to_document(prefix, value, weight):
                """
                Splits the term in ngrams and adds each ngram to the index.
                The minimum and maximum size of the ngram is respectively
                NGRAM_MIN_LENGTH and NGRAM_MAX_LENGTH.
                """
                for term in edge_ngram_terms(value):
                    document.add_term(term, weight)
                    document.add_term(prefix + term, weight)

            def add_ngram_to_document(prefix, value, weight):
                """
                Splits the term in ngrams and adds each ngram to the index.
                The minimum and maximum size of the ngram is respectively
                NGRAM_MIN_LENGTH and NGRAM_MAX_LENGTH.
                """
                for term in ngram_terms(value):
                    document.add_term(term, weight)
                    document.add_term(prefix + term, weight)

            def add_non_text_to_document(prefix, term, weight):
                """
                Adds term to the document without positional information
                and without processing.

                If the term is alone, also adds it as "^<term>$"
                to allow exact matches on single terms.
                """
                document.add_term(term, weight)
                document.add_term(prefix + term, weight)

            def add_datetime_to_document(termpos, prefix, term, weight):
                """
                Adds a datetime to document with positional order
                to allow exact matches on it.
                """
                date, time = term.split()
                document.add_posting(date, termpos, weight)
                termpos += 1
                document.add_posting(time, termpos, weight)
                termpos += 1
                document.add_posting(prefix + date, termpos, weight)
                termpos += 1
                document.add_posting(prefix + time, termpos, weight)
                termpos += TERMPOS_DISTANCE + 1
                return termpos

            data = index.full_prepare(obj)
            weights = index.get_field_weights()

            termpos = term_generator.get_termpos()  # identifies the current position in the document.
            for field in self.schema:
                if field['field_name'] not in list(data.keys()):
                    # not supported fields are ignored.
                    continue

                if field['field_name'] in weights:
                    weight = int(weights[field['field_name']])
                else:
                    weight = 1

                value = data[field['field_name']]

                if field['field_name'] in (ID, DJANGO_ID, DJANGO_CT):
                    # Private fields are indexed in a different way:
                    # `django_id` is an int and `django_ct` is text;
                    # besides, they are indexed by their (unstemmed) value.
                    if field['field_name'] == DJANGO_ID:
                        value = int(value)
                    value = _term_to_xapian_value(value, field['type'])

                    document.add_term(TERM_PREFIXES[field['field_name']] + value, weight)
                    document.add_value(field['column'], value)
                    continue
                else:
                    prefix = TERM_PREFIXES['field'] + field['field_name'].upper()

                    # if not multi_valued, we add as a document value
                    # for sorting and facets
                    if field['multi_valued'] == 'false':
                        document.add_value(field['column'], _term_to_xapian_value(value, field['type']))
                    else:
                        for t in value:
                            # add the exact match of each value
                            term = _to_xapian_term(t)
                            termpos = add_text(termpos, prefix, term, weight)
                        continue

                    term = _to_xapian_term(value)
                    if term == '':
                        continue
                    # from here on the term is a string;
                    # we now decide how it is indexed

                    if field['type'] == 'text':
                        # text is indexed with positional information
                        termpos = add_text(termpos, prefix, term, weight)
                    elif field['type'] == 'datetime':
                        termpos = add_datetime_to_document(termpos, prefix, term, weight)
                    elif field['type'] == 'ngram':
                        add_ngram_to_document(prefix, value, weight)
                    elif field['type'] == 'edge_ngram':
                        add_edge_ngram_to_document(prefix, value, weight)
                    else:
                        # all other terms are added without positional information
                        add_non_text_to_document(prefix, term, weight)

            # store data without indexing it
            document.set_data(pickle.dumps(
                (obj._meta.app_label, obj._meta.model_name, obj.pk, data),
                pickle.HIGHEST_PROTOCOL
            ))

            # add the id of the document
            document_id = TERM_PREFIXES[ID] + get_identifier(obj)
            document.add_term(document_id)

            # finally, replace or add the document to the database
            database.replace_document(document_id, document)

    except UnicodeDecodeError:
        # NOTE(review): errors are deliberately swallowed (best-effort bulk
        # indexing); only a generic message is written to stderr.
        sys.stderr.write('Chunk failed.\n')
        pass

    finally:
        database.close()
@filelocked
def remove(self, obj, commit=True):
    """
    Remove indexes for `obj` from the database.

    We delete all instances of `Q<app_name>.<model_name>.<pk>` which
    should be unique to this object.

    Optional arguments:
        `commit` -- ignored
    """
    database = self._database(writable=True)
    # close in a finally block so the writable database (and its lock)
    # is released even if the deletion raises — consistent with `update`.
    try:
        database.delete_document(TERM_PREFIXES[ID] + get_identifier(obj))
    finally:
        database.close()
def clear(self, models=(), commit=True):
    """
    Clear all instances of `models` from the database or all models, if
    not specified.

    Optional Arguments:
        `models` -- Models to clear from the database (default = [])
        `commit` -- ignored

    If `models` is empty, the index directory is removed wholesale, which
    matches all documents. Otherwise, for each model, a `delete_document`
    call is issued with the term `XCONTENTTYPE<app_name>.<model_name>`,
    deleting all documents with the specified model type.
    """
    if not models:
        # Because there does not appear to be a "clear all" method,
        # it's much quicker to remove the contents of the `self.path`
        # folder than it is to remove each document one at a time.
        if os.path.exists(self.path):
            shutil.rmtree(self.path)
    else:
        database = self._database(writable=True)
        # release the writable database even if a deletion raises,
        # consistent with `update`/`remove`.
        try:
            for model in models:
                database.delete_document(TERM_PREFIXES[DJANGO_CT] + get_model_ct(model))
        finally:
            database.close()
def document_count(self):
    """Return the number of documents in the index, or 0 if it cannot be opened."""
    try:
        return self._database().get_doccount()
    except InvalidIndexError:
        return 0
def _build_models_query(self, query):
    """
    Narrow `query` so that only documents belonging to registered
    models can match; returns `query` unchanged when no models are
    registered.
    """
    registered_models_ct = self.build_models_list()
    if not registered_models_ct:
        return query
    model_queries = [
        xapian.Query('%s%s' % (TERM_PREFIXES[DJANGO_CT], model_ct))
        for model_ct in registered_models_ct
    ]
    allowed_models = xapian.Query(xapian.Query.OP_OR, model_queries)
    return xapian.Query(xapian.Query.OP_AND, query, allowed_models)
def _check_field_names(self, field_names):
    """
    Raises InvalidIndexError if any field_name in `field_names` is
    not indexed. `field_names` may be None/empty, in which case nothing
    is checked.
    """
    if not field_names:
        return
    for field_name in field_names:
        # membership test instead of try/except KeyError: clearer and
        # avoids chaining the KeyError onto the raised exception.
        if field_name not in self.column:
            raise InvalidIndexError('Trying to use non indexed field "%s"' % field_name)
@log_query
def search(self, query, sort_by=None, start_offset=0, end_offset=None,
           fields='', highlight=False, facets=None, date_facets=None,
           query_facets=None, narrow_queries=None, spelling_query=None,
           limit_to_registered_models=None, result_class=None, **kwargs):
    """
    Executes the Xapian::query as defined in `query`.

    Required arguments:
        `query` -- Search query to execute

    Optional arguments:
        `sort_by` -- Sort results by specified field (default = None)
        `start_offset` -- Slice results from `start_offset` (default = 0)
        `end_offset` -- Slice results at `end_offset` (default = None), if None, then all documents
        `fields` -- Filter results on `fields` (default = '')
        `highlight` -- Highlight terms in results (default = False)
        `facets` -- Facet results on fields (default = None)
        `date_facets` -- Facet results on date ranges (default = None)
        `query_facets` -- Facet results on queries (default = None)
        `narrow_queries` -- Narrow queries (default = None)
        `spelling_query` -- An optional query to execute spelling suggestion on
        `limit_to_registered_models` -- Limit returned results to models registered in
                                        the current `SearchSite` (default = True)

    Returns:
        A dictionary with the following keys:
            `results` -- A list of `SearchResult`
            `hits` -- The total available results
            `facets` -- A dictionary of facets with the following keys:
                `fields` -- A list of field facets
                `dates` -- A list of date facets
                `queries` -- A list of query facets

    If faceting was not used, the `facets` key will not be present.

    If `query` is None, returns no results.

    If `INCLUDE_SPELLING` was enabled in the connection options, the
    extra flag `FLAG_SPELLING_CORRECTION` will be passed to the query parser
    and any suggestions for spell correction will be returned as well as
    the results.
    """
    if xapian.Query.empty(query):
        return {
            'results': [],
            'hits': 0,
        }

    self._check_field_names(facets)
    self._check_field_names(date_facets)
    self._check_field_names(query_facets)

    database = self._database()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if result_class is None:
        result_class = SearchResult

    if self.include_spelling is True:
        spelling_suggestion = self._do_spelling_suggestion(database, query, spelling_query)
    else:
        spelling_suggestion = ''

    if narrow_queries is not None:
        # AND all narrow queries onto the main query
        query = xapian.Query(
            xapian.Query.OP_AND, query, xapian.Query(
                xapian.Query.OP_AND, [self.parse_query(narrow_query) for narrow_query in narrow_queries]
            )
        )

    if limit_to_registered_models:
        query = self._build_models_query(query)

    enquire = xapian.Enquire(database)
    if hasattr(settings, 'HAYSTACK_XAPIAN_WEIGHTING_SCHEME'):
        enquire.set_weighting_scheme(xapian.BM25Weight(*settings.HAYSTACK_XAPIAN_WEIGHTING_SCHEME))
    enquire.set_query(query)

    if sort_by:
        _xapian_sort(enquire, sort_by, self.column)

    results = []
    facets_dict = {
        'fields': {},
        'dates': {},
        'queries': {},
    }

    if not end_offset:
        end_offset = database.get_doccount() - start_offset

    ## prepare spies in case of facets
    if facets:
        facets_spies = self._prepare_facet_field_spies(facets)
        for spy in facets_spies:
            enquire.add_matchspy(spy)

    # print enquire.get_query()

    matches = self._get_enquire_mset(database, enquire, start_offset, end_offset)

    for match in matches:
        # each document's data is a pickled (app_label, model_name, pk, fields) tuple
        app_label, model_name, pk, model_data = pickle.loads(self._get_document_data(database, match.document))

        if highlight:
            model_data['highlighted'] = {
                self.content_field_name: self._do_highlight(
                    model_data.get(self.content_field_name), query
                )
            }

        results.append(
            result_class(app_label, model_name, pk, match.percent, **model_data)
        )

    if facets:
        # pick single valued facets from spies
        single_facets_dict = self._process_facet_field_spies(facets_spies)

        # pick multivalued valued facets from results
        multi_facets_dict = self._do_multivalued_field_facets(results, facets)

        # merge both results (http://stackoverflow.com/a/38990/931303)
        facets_dict['fields'] = dict(list(single_facets_dict.items()) + list(multi_facets_dict.items()))

    if date_facets:
        facets_dict['dates'] = self._do_date_facets(results, date_facets)

    if query_facets:
        facets_dict['queries'] = self._do_query_facets(results, query_facets)

    return {
        'results': results,
        'hits': self._get_hit_count(database, enquire),
        'facets': facets_dict,
        'spelling_suggestion': spelling_suggestion,
    }
def more_like_this(self, model_instance, additional_query=None,
                   start_offset=0, end_offset=None,
                   limit_to_registered_models=True, result_class=None, **kwargs):
    """
    Given a model instance, returns a result set of similar documents.

    Required arguments:
        `model_instance` -- The model instance to use as a basis for
                            retrieving similar documents.

    Optional arguments:
        `additional_query` -- An additional query to narrow results
        `start_offset` -- The starting offset (default=0)
        `end_offset` -- The ending offset (default=None), if None, then all documents
        `limit_to_registered_models` -- Limit returned results to models registered in
                                        the search (default = True)

    Returns:
        A dictionary with the following keys:
            `results` -- A list of `SearchResult`
            `hits` -- The total available results

    Opens a database connection, then builds a simple query using the
    `model_instance` to build the unique identifier.

    For each document retrieved (should always be one), adds an entry into
    an RSet (relevance set) with the document id, then uses the RSet
    to query for an ESet (a set of terms that can be used to suggest
    expansions to the original query), omitting any document that was in
    the original query.

    Finally, processes the resulting matches and returns.
    """
    database = self._database()

    if result_class is None:
        result_class = SearchResult

    query = xapian.Query(TERM_PREFIXES[ID] + get_identifier(model_instance))

    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    rset = xapian.RSet()

    if not end_offset:
        end_offset = database.get_doccount()

    # `match` keeps the last (normally the only) hit for this instance;
    # it stays None when the instance was never indexed.
    match = None
    for match in self._get_enquire_mset(database, enquire, 0, end_offset):
        rset.add_document(match.docid)

    if match is None:
        if not self.silently_fail:
            raise InvalidIndexError('Instance %s with id "%d" not indexed' %
                                    (get_identifier(model_instance), model_instance.id))
        else:
            return {'results': [],
                    'hits': 0}

    # build an OP_ELITE_SET query over the expansion terms suggested by
    # the relevance set, then exclude the source document itself.
    query = xapian.Query(
        xapian.Query.OP_ELITE_SET,
        [expand.term for expand in enquire.get_eset(match.document.termlist_count(), rset, XHExpandDecider())],
        match.document.termlist_count()
    )
    query = xapian.Query(
        xapian.Query.OP_AND_NOT, [query, TERM_PREFIXES[ID] + get_identifier(model_instance)]
    )

    if limit_to_registered_models:
        query = self._build_models_query(query)

    if additional_query:
        query = xapian.Query(
            xapian.Query.OP_AND, query, additional_query
        )

    enquire.set_query(query)

    results = []
    matches = self._get_enquire_mset(database, enquire, start_offset, end_offset)

    for match in matches:
        app_label, model_name, pk, model_data = pickle.loads(self._get_document_data(database, match.document))
        results.append(
            result_class(app_label, model_name, pk, match.percent, **model_data)
        )

    return {
        'results': results,
        'hits': self._get_hit_count(database, enquire),
        'facets': {
            'fields': {},
            'dates': {},
            'queries': {},
        },
        'spelling_suggestion': None,
    }
def parse_query(self, query_string):
    """
    Given a `query_string`, will attempt to return a xapian.Query

    Required arguments:
        ``query_string`` -- A query string to parse

    Returns a xapian.Query
    """
    if query_string == '*':
        return xapian.Query('')  # Match everything
    elif query_string == '':
        return xapian.Query()  # Match nothing

    qp = xapian.QueryParser()
    qp.set_database(self._database())
    qp.set_stemmer(xapian.Stem(self.language))
    qp.set_stemming_strategy(self.stemming_strategy)
    qp.set_default_op(XAPIAN_OPTS[DEFAULT_OPERATOR])
    qp.add_boolean_prefix(DJANGO_CT, TERM_PREFIXES[DJANGO_CT])

    for field_dict in self.schema:
        # since 'django_ct' has a boolean_prefix,
        # we ignore it here.
        if field_dict['field_name'] == DJANGO_CT:
            continue

        qp.add_prefix(
            field_dict['field_name'],
            TERM_PREFIXES['field'] + field_dict['field_name'].upper()
        )

    vrp = XHValueRangeProcessor(self)
    # NOTE(review): add_valuerangeprocessor is deprecated in newer Xapian
    # releases in favour of add_rangeprocessor — confirm against the
    # xapian version this package is pinned to before upgrading.
    qp.add_valuerangeprocessor(vrp)

    return qp.parse_query(query_string, self.flags)
def build_schema(self, fields):
    """
    Build the schema from fields.

    :param fields: mapping of field name -> haystack SearchField
    :returns: (content_field_name, schema_fields) where `schema_fields`
        is a list of dictionaries with the keys:
            'field_name': the name of the field in the index
            'type': the field type ('text' unless a recognised type)
            'multi_valued': 'true'/'false' string flag
            'column': the value-slot number assigned to the field
    """
    # field types stored under their own name; any other haystack type
    # (e.g. 'char') falls back to plain 'text'. This replaces the old
    # seven-branch elif chain with a single membership lookup.
    typed_field_types = ('date', 'datetime', 'integer', 'float',
                         'boolean', 'ngram', 'edge_ngram')

    content_field_name = ''
    schema_fields = [
        {'field_name': ID,
         'type': 'text',
         'multi_valued': 'false',
         'column': 0},
        {'field_name': DJANGO_ID,
         'type': 'integer',
         'multi_valued': 'false',
         'column': 1},
        {'field_name': DJANGO_CT,
         'type': 'text',
         'multi_valued': 'false',
         'column': 2},
    ]
    self._columns[ID] = 0
    self._columns[DJANGO_ID] = 1
    self._columns[DJANGO_CT] = 2

    # NOTE(review): self._columns is only ever added to, never reset, so
    # entries for removed fields linger until a restart — confirm this is
    # acceptable for long-running processes.
    column = len(schema_fields)

    for field_name, field_class in sorted(fields.items(), key=lambda n: n[0]):
        if field_class.document is True:
            content_field_name = field_class.index_fieldname

        if field_class.indexed is True:
            field_data = {
                'field_name': field_class.index_fieldname,
                'type': (field_class.field_type
                         if field_class.field_type in typed_field_types
                         else 'text'),
                'multi_valued': 'true' if field_class.is_multivalued else 'false',
                'column': column,
            }

            schema_fields.append(field_data)
            self._columns[field_data['field_name']] = column
            column += 1

    return content_field_name, schema_fields
@staticmethod
def _do_highlight(content, query, tag='em'):
"""
Highlight `query` terms in `content` with html `tag`.
This method assumes that the input text (`content`) does not contain
any special formatting. That is, it does not contain any html tags
or similar markup that could be screwed up by the highlighting.
Required arguments:
`content` -- Content to search for instances of `text`
`text` -- The text to be highlighted
"""
for term in query:
term = term.decode('utf-8')
for match in re.findall('[^A-Z]+', term): # Ignore field identifiers
match_re = re.compile(match, re.I)
content = match_re.sub('<%s>%s%s>' % (tag, term, tag), content)
return content
def _prepare_facet_field_spies(self, facets):
    """
    Build one `xapian.ValueCountMatchSpy` per faceted field.

    Each spy counts term frequencies for its field's value slot; the
    slot number is also stored on the spy itself (``spy.slot``) so it
    can be mapped back to a field later.
    """
    spies = []
    for facet in facets:
        slot_number = self.column[facet]
        spy = xapian.ValueCountMatchSpy(slot_number)
        # remember which column this spy observes
        spy.slot = slot_number
        spies.append(spy)
    return spies
def _process_facet_field_spies(self, spies):
    """
    Convert spy observations into a facet dict.

    Returns {field_name: [(value, frequency), ...]} for each spy that
    observed the enquire.
    """
    facet_dict = {}
    for spy in spies:
        field_entry = self.schema[spy.slot]
        field_name = field_entry['field_name']
        field_type = field_entry['type']

        entries = []
        for facet in spy.values():
            if field_type == 'float':
                # the float term is a Xapian serialized object, which is
                # in bytes — leave it undecoded.
                raw_term = facet.term
            else:
                raw_term = facet.term.decode('utf-8')
            entries.append((_from_xapian_value(raw_term, field_type),
                            facet.termfreq))
        facet_dict[field_name] = entries
    return facet_dict
def _do_multivalued_field_facets(self, results, field_facets):
    """
    Facet multi-valued fields by brute force over `results`.

    Xapian has no native support for faceting multi-valued slots
    (see http://trac.xapian.org/ticket/199), so each item of each
    result's field value is tallied by hand — O(N^2) in the worst case.
    Single-valued fields are skipped (they are handled by spies).
    """
    facet_dict = {}
    for field in field_facets:
        if not self._multi_value_field(field):
            continue
        tally = {}
        for result in results:
            # count every item of the MultiValueField
            for item in getattr(result, field):
                tally[item] = tally.get(item, 0) + 1
        facet_dict[field] = list(tally.items())
    return facet_dict
@staticmethod
def _do_date_facets(results, date_facets):
"""
Private method that facets a document by date ranges
Required arguments:
`results` -- A list SearchResults to facet
`date_facets` -- A dictionary containing facet parameters:
{'field': {'start_date': ..., 'end_date': ...: 'gap_by': '...', 'gap_amount': n}}
nb., gap must be one of the following:
year|month|day|hour|minute|second
For each date facet field in `date_facets`, generates a list
of date ranges (from `start_date` to `end_date` by `gap_by`) then
iterates through `results` and tallies the count for each date_facet.
Returns a dictionary of date facets (fields) containing a list with
entries for each range and a count of documents matching the range.
eg. {
'pub_date': [
(datetime.datetime(2009, 1, 1, 0, 0), 5),
(datetime.datetime(2009, 2, 1, 0, 0), 0),
(datetime.datetime(2009, 3, 1, 0, 0), 0),
(datetime.datetime(2008, 4, 1, 0, 0), 1),
(datetime.datetime(2008, 5, 1, 0, 0), 2),
],
}
"""
def next_datetime(previous, gap_value, gap_type):
year = previous.year
month = previous.month
if gap_type == 'year':
next = previous.replace(year=year + gap_value)
elif gap_type == 'month':
if month + gap_value <= 12:
next = previous.replace(month=month + gap_value)
else:
next = previous.replace(
month=((month + gap_value) % 12),
year=(year + (month + gap_value) // 12)
)
elif gap_type == 'day':
next = previous + datetime.timedelta(days=gap_value)
elif gap_type == 'hour':
return previous + datetime.timedelta(hours=gap_value)
elif gap_type == 'minute':
next = previous + datetime.timedelta(minutes=gap_value)
elif gap_type == 'second':
next = previous + datetime.timedelta(seconds=gap_value)
else:
raise TypeError('\'gap_by\' must be '
'{second, minute, day, month, year}')
return next
facet_dict = {}
for date_facet, facet_params in list(date_facets.items()):
gap_type = facet_params.get('gap_by')
gap_value = facet_params.get('gap_amount', 1)
date_range = facet_params['start_date']
# construct the bins of the histogram
facet_list = []
while date_range < facet_params['end_date']:
facet_list.append((date_range, 0))
date_range = next_datetime(date_range, gap_value, gap_type)
facet_list = sorted(facet_list, key=lambda x: x[0], reverse=True)
for result in results:
result_date = getattr(result, date_facet)
# convert date to datetime
if not isinstance(result_date, datetime.datetime):
result_date = datetime.datetime(result_date.year,
result_date.month,
result_date.day)
# ignore results outside the boundaries.
if facet_list[0][0] < result_date < facet_list[-1][0]:
continue
# populate the histogram by putting the result on the right bin.
for n, facet_date in enumerate(facet_list):
if result_date > facet_date[0]:
# equal to facet_list[n][1] += 1, but for a tuple
facet_list[n] = (facet_list[n][0], (facet_list[n][1] + 1))
break # bin found; go to next result
facet_dict[date_facet] = facet_list
return facet_dict
def _do_query_facets(self, results, query_facets):
    """
    Private method that facets a document by query.

    Required arguments:
        `results` -- A list of SearchResults to facet (unused; counts come
            from running each facet query)
        `query_facets` -- A dictionary (or item pairs) of
            {'field': 'query', ...}

    For each query, generates an entry keyed by field name whose value
    is a tuple of the query and its hit count, eg. {'name': ('a*', 5)}.
    """
    facet_dict = {}
    for field, field_query in dict(query_facets).items():
        hits = self.search(self.parse_query(field_query))['hits']
        facet_dict[field] = (field_query, hits)
    return facet_dict
@staticmethod
def _do_spelling_suggestion(database, query, spelling_query):
    """
    Return a spelling suggestion string.

    Required arguments:
        `database` -- The database to check spelling against
        `query` -- The query to check
        `spelling_query` -- If not None, this will be checked instead of `query`

    When `spelling_query` is given, each of its whitespace-separated
    words is corrected; otherwise the lowercase fragments of each term
    in `query` are corrected (uppercase field-identifier prefixes are
    ignored).
    """
    if spelling_query:
        if ' ' in spelling_query:
            corrected = [database.get_spelling_suggestion(word).decode('utf-8')
                         for word in spelling_query.split()]
            return ' '.join(corrected)
        return database.get_spelling_suggestion(spelling_query).decode('utf-8')

    suggestions = set()
    for term in query:
        # strip field identifiers (runs of uppercase) before checking
        for fragment in re.findall('[^A-Z]+', term.decode('utf-8')):
            suggestions.add(database.get_spelling_suggestion(fragment).decode('utf-8'))
    # NOTE(review): set iteration order is unspecified, so the word order
    # of the joined suggestion is not deterministic — confirm callers
    # don't rely on it.
    return ' '.join(suggestions)
def _database(self, writable=False):
    """
    Private method that returns a xapian.Database for use.

    Optional arguments:
        ``writable`` -- Open the database in read/write mode (default=False)

    Returns an instance of xapian.Database (read-only) or
    xapian.WritableDatabase; for MEMORY_DB_NAME a single in-memory
    database is created on first use and reused afterwards.

    Raises InvalidIndexError when a read-only open fails.
    """
    if self.path == MEMORY_DB_NAME:
        if not self.inmemory_db:
            self.inmemory_db = xapian.inmemory_open()
        return self.inmemory_db

    if writable:
        return xapian.WritableDatabase(self.path, xapian.DB_CREATE_OR_OPEN)

    try:
        return xapian.Database(self.path)
    except xapian.DatabaseOpeningError:
        raise InvalidIndexError('Unable to open index at %s' % self.path)
@staticmethod
def _get_enquire_mset(database, enquire, start_offset, end_offset, checkatleast=DEFAULT_CHECK_AT_LEAST):
    """
    A safer version of Xapian.enquire.get_mset

    Simply wraps the Xapian version and catches any `Xapian.DatabaseModifiedError`,
    attempting a `database.reopen` as needed.

    Required arguments:
        `database` -- The database to be read
        `enquire` -- An instance of an Xapian.enquire object
        `start_offset` -- The start offset to pass to `enquire.get_mset`
        `end_offset` -- The end offset to pass to `enquire.get_mset`

    Optional arguments:
        `checkatleast` -- minimum number of documents Xapian must examine,
            controlling the accuracy of the estimated hit count
            (default = DEFAULT_CHECK_AT_LEAST)
    """
    try:
        return enquire.get_mset(start_offset, end_offset, checkatleast)
    except xapian.DatabaseModifiedError:
        # the index changed under us; reopen and retry once
        database.reopen()
        return enquire.get_mset(start_offset, end_offset, checkatleast)
@staticmethod
def _get_document_data(database, document):
    """
    A safer version of Xapian.document.get_data

    Simply wraps the Xapian version and catches any `Xapian.DatabaseModifiedError`,
    attempting a `database.reopen` as needed.

    Required arguments:
        `database` -- The database to be read
        `document` -- An instance of an Xapian.document object
    """
    try:
        return document.get_data()
    except xapian.DatabaseModifiedError:
        # the index changed under us; reopen and retry once
        database.reopen()
        return document.get_data()
def _get_hit_count(self, database, enquire):
    """
    Return the estimated number of matches for `enquire` against
    `database`.

    Required arguments:
        `database` -- The database to be queried
        `enquire` -- The enquire instance
    """
    full_mset = self._get_enquire_mset(
        database, enquire, 0, database.get_doccount()
    )
    return full_mset.size()
def _multi_value_field(self, field):
    """
    Return True if `field` is declared multi-valued in the schema,
    False otherwise (including when `field` is not in the schema).

    Required arguments:
        `field` -- The field name to look up
    """
    matches = (
        entry['multi_valued'] == 'true'
        for entry in self.schema
        if entry['field_name'] == field
    )
    return next(matches, False)
class XapianSearchQuery(BaseSearchQuery):
    """
    This class is the Xapian specific version of the SearchQuery class.

    It acts as an intermediary between the ``SearchQuerySet`` and the
    ``SearchBackend`` itself.
    """
def build_params(self, *args, **kwargs):
    """
    Extend the base params, converting the absolute `end_offset` into
    a slice length relative to `start_offset`, as the backend expects.
    """
    params = super().build_params(*args, **kwargs)
    if self.end_offset is not None:
        params['end_offset'] = self.end_offset - self.start_offset
    return params
def build_query(self):
    """
    Build the final xapian.Query from the accumulated state:
    the filter tree (`self.query_filter`), the model restriction
    (`self.models`) and any term boosts (`self.boost`).
    """
    if not self.query_filter:
        query = xapian.Query('')
    else:
        query = self._query_from_search_node(self.query_filter)

    if self.models:
        # restrict to the selected models via their content-type terms
        subqueries = [
            xapian.Query(
                xapian.Query.OP_SCALE_WEIGHT,
                xapian.Query('%s%s' % (TERM_PREFIXES[DJANGO_CT], get_model_ct(model))),
                0  # Pure boolean sub-query
            ) for model in self.models
        ]
        query = xapian.Query(
            xapian.Query.OP_AND, query,
            xapian.Query(xapian.Query.OP_OR, subqueries)
        )

    if self.boost:
        # OP_AND_MAYBE: boosted terms influence ranking but are not required
        subqueries = [
            xapian.Query(
                xapian.Query.OP_SCALE_WEIGHT,
                self._term_query(term, None, None), value
            ) for term, value in list(self.boost.items())
        ]
        query = xapian.Query(
            xapian.Query.OP_AND_MAYBE, query,
            xapian.Query(xapian.Query.OP_OR, subqueries)
        )

    return query
def _query_from_search_node(self, search_node, is_not=False):
    """
    Recursively convert a haystack SearchNode tree into a xapian.Query,
    combining children with the node's connector (OR / AND).
    """
    subqueries = []
    for child in search_node.children:
        if isinstance(child, SearchNode):
            subqueries.append(
                self._query_from_search_node(child, child.negated)
            )
        else:
            expression, term = child
            field_name, filter_type = search_node.split_expression(expression)
            subqueries.extend(
                self._query_from_term(term, field_name, filter_type, is_not)
            )
    operator = (xapian.Query.OP_OR
                if search_node.connector == 'OR'
                else xapian.Query.OP_AND)
    return xapian.Query(operator, subqueries)
def _query_from_term(self, term, field_name, filter_type, is_not):
"""
Uses arguments to construct a list of xapian.Query's.
"""
if field_name != 'content' and field_name not in self.backend.column:
raise InvalidIndexError('field "%s" not indexed' % field_name)
# It it is an AutoQuery, it has no filters
# or others, thus we short-circuit the procedure.
if isinstance(term, AutoQuery):
if field_name != 'content':
query = '%s:%s' % (field_name, term.prepare(self))
else:
query = term.prepare(self)
return [self.backend.parse_query(query)]
query_list = []
# Handle `ValuesListQuerySet`.
if hasattr(term, 'values_list'):
term = list(term)
if field_name == 'content':
# content is the generic search:
# force no field_name search
# and the field_type to be 'text'.
field_name = None
field_type = 'text'
# we don't know what is the type(term), so we parse it.
# Ideally this would not be required, but
# some filters currently depend on the term to make decisions.
term = _to_xapian_term(term)
query_list.append(self._filter_contains(term, field_name, field_type, is_not))
# when filter has no filter_type, haystack uses
# filter_type = 'content'. Here we remove it
# since the above query is already doing this
if filter_type == 'content':
filter_type = None
else:
# get the field_type from the backend
field_type = self.backend.schema[self.backend.column[field_name]]['type']
# private fields don't accept 'contains' or 'startswith'
# since they have no meaning.
if filter_type in ('contains', 'startswith') and field_name in (ID, DJANGO_ID, DJANGO_CT):
filter_type = 'exact'
if field_type == 'text':
# we don't know what type "term" is, but we know we are searching as text
# so we parse it like that.
# Ideally this would not be required since _term_query does it, but
# some filters currently depend on the term to make decisions.
if isinstance(term, list):
term = [_to_xapian_term(term) for term in term]
else:
term = _to_xapian_term(term)
# todo: we should check that the filter is valid for this field_type or raise InvalidIndexError
if filter_type == 'contains':
query_list.append(self._filter_contains(term, field_name, field_type, is_not))
elif filter_type in ('content', 'exact'):
query_list.append(self._filter_exact(term, field_name, field_type, is_not))
elif filter_type == 'in':
query_list.append(self._filter_in(term, field_name, field_type, is_not))
elif filter_type == 'startswith':
query_list.append(self._filter_startswith(term, field_name, field_type, is_not))
elif filter_type == 'endswith':
raise NotImplementedError("The Xapian search backend doesn't support endswith queries.")
elif filter_type == 'gt':
query_list.append(self._filter_gt(term, field_name, field_type, is_not))
elif filter_type == 'gte':
query_list.append(self._filter_gte(term, field_name, field_type, is_not))
elif filter_type == 'lt':
query_list.append(self._filter_lt(term, field_name, field_type, is_not))
elif filter_type == 'lte':
query_list.append(self._filter_lte(term, field_name, field_type, is_not))
elif filter_type == 'range':
query_list.append(self._filter_range(term, field_name, field_type, is_not))
return query_list
def _all_query(self):
"""
Returns a match all query.
"""
return xapian.Query('')
def _filter_contains(self, term, field_name, field_type, is_not):
"""
Splits the sentence in terms and join them with OR,
using stemmed and un-stemmed.
Assumes term is not a list.
"""
if field_type == 'text':
term_list = term.split()
else:
term_list = [term]
query = self._or_query(term_list, field_name, field_type)
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
else:
return query
def _filter_in(self, term_list, field_name, field_type, is_not):
"""
Returns a query that matches exactly ANY term in term_list.
Notice that:
A in {B,C} <=> (A = B or A = C)
~(A in {B,C}) <=> ~(A = B or A = C)
Because OP_AND_NOT(C, D) <=> (C and ~D), then D=(A in {B,C}) requires `is_not=False`.
Assumes term is a list.
"""
query_list = [self._filter_exact(term, field_name, field_type, is_not=False)
for term in term_list]
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(),
xapian.Query(xapian.Query.OP_OR, query_list))
else:
return xapian.Query(xapian.Query.OP_OR, query_list)
def _filter_exact(self, term, field_name, field_type, is_not):
"""
Returns a query that matches exactly the un-stemmed term
with positional order.
Assumes term is not a list.
"""
if field_type == 'text' and field_name not in _EXACT_SEARCHFIELDS:
term = '^ %s $' % term
query = self._phrase_query(term.split(), field_name, field_type)
else:
query = self._term_query(term, field_name, field_type, stemmed=False)
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
else:
return query
def _filter_startswith(self, term, field_name, field_type, is_not):
"""
Returns a startswith query on the un-stemmed term.
Assumes term is not a list.
"""
if field_type == 'text':
if len(term.split()) == 1:
term = '^ %s*' % term
query = self.backend.parse_query(term)
else:
term = '^ %s' % term
query = self._phrase_query(term.split(), field_name, field_type)
else:
term = '^%s*' % term
query = self.backend.parse_query(term)
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
return query
def _or_query(self, term_list, field, field_type):
"""
Joins each item of term_list decorated by _term_query with an OR.
"""
term_list = [self._term_query(term, field, field_type) for term in term_list]
return xapian.Query(xapian.Query.OP_OR, term_list)
def _phrase_query(self, term_list, field_name, field_type):
"""
Returns a query that matches exact terms with
positional order (i.e. ["this", "thing"] != ["thing", "this"])
and no stem.
If `field_name` is not `None`, restrict to the field.
"""
term_list = [self._term_query(term, field_name, field_type,
stemmed=False) for term in term_list]
query = xapian.Query(xapian.Query.OP_PHRASE, term_list)
return query
def _term_query(self, term, field_name, field_type, stemmed=True):
"""
Constructs a query of a single term.
If `field_name` is not `None`, the term is search on that field only.
If exact is `True`, the search is restricted to boolean matches.
"""
constructor = '{prefix}{term}'
# construct the prefix to be used.
prefix = ''
if field_name:
prefix = TERM_PREFIXES['field'] + field_name.upper()
term = _to_xapian_term(term)
if field_name in (ID, DJANGO_ID, DJANGO_CT):
# to ensure the value is serialized correctly.
if field_name == DJANGO_ID:
term = int(term)
term = _term_to_xapian_value(term, field_type)
return xapian.Query('%s%s' % (TERM_PREFIXES[field_name], term))
# we construct the query dates in a slightly different way
if field_type == 'datetime':
date, time = term.split()
return xapian.Query(xapian.Query.OP_AND_MAYBE,
constructor.format(prefix=prefix, term=date),
constructor.format(prefix=prefix, term=time)
)
# only use stem if field is text or "None"
if field_type not in ('text', None):
stemmed = False
unstemmed_term = constructor.format(prefix=prefix, term=term)
if stemmed:
stem = xapian.Stem(self.backend.language)
stemmed_term = 'Z' + constructor.format(prefix=prefix, term=stem(term).decode('utf-8'))
return xapian.Query(xapian.Query.OP_OR,
xapian.Query(stemmed_term),
xapian.Query(unstemmed_term)
)
else:
return xapian.Query(unstemmed_term)
def _filter_gt(self, term, field_name, field_type, is_not):
return self._filter_lte(term, field_name, field_type, is_not=not is_not)
def _filter_lt(self, term, field_name, field_type, is_not):
return self._filter_gte(term, field_name, field_type, is_not=not is_not)
def _filter_gte(self, term, field_name, field_type, is_not):
"""
Private method that returns a xapian.Query that searches for any term
that is greater than `term` in a specified `field`.
"""
vrp = XHValueRangeProcessor(self.backend)
pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term, field_type)), '*')
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT,
self._all_query(),
xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
)
return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
def _filter_lte(self, term, field_name, field_type, is_not):
"""
Private method that returns a xapian.Query that searches for any term
that is less than `term` in a specified `field`.
"""
vrp = XHValueRangeProcessor(self.backend)
pos, begin, end = vrp('%s:' % field_name, '%s' % _term_to_xapian_value(term, field_type))
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT,
self._all_query(),
xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
)
return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
def _filter_range(self, term, field_name, field_type, is_not):
"""
Private method that returns a xapian.Query that searches for any term
that is between the values from the `term` list.
"""
vrp = XHValueRangeProcessor(self.backend)
pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term[0], field_type)),
'%s' % _term_to_xapian_value(term[1], field_type))
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT,
self._all_query(),
xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
)
return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
def _term_to_xapian_value(term, field_type):
    """
    Serialize ``term`` into the string form Xapian stores for a value
    slot of the given ``field_type``.
    """
    assert field_type in FIELD_TYPES

    def strf(dt):
        """
        Equivalent to datetime.datetime.strptime(dt, DATETIME_FORMAT)
        but accepts years below 1900 (see http://stackoverflow.com/q/10263956/931303)
        """
        return '%04d%02d%02d%02d%02d%02d' % (
            dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)

    if field_type == 'boolean':
        assert isinstance(term, bool)
        # Booleans are stored as single characters.
        return 't' if term else 'f'
    if field_type == 'integer':
        # Zero-padded fixed-width form so lexicographic order == numeric order.
        return INTEGER_FORMAT % term
    if field_type == 'float':
        return xapian.sortable_serialise(term)
    if field_type in ('date', 'datetime'):
        if field_type == 'date':
            # Promote a date to a datetime at midnight.
            # http://stackoverflow.com/a/1937636/931303 and comments
            term = datetime.datetime.combine(term, datetime.time())
        return strf(term)
    # field_type == 'text'
    return _to_xapian_term(term)
def _to_xapian_term(term):
"""
Converts a Python type to a
Xapian term that can be indexed.
"""
return str(term).lower()
def _from_xapian_value(value, field_type):
    """
    Converts a serialized Xapian value
    to Python equivalent based on the field_type.

    Doesn't accept multivalued fields.

    Raises InvalidIndexError when a boolean field holds a value other
    than 't'/'f'.
    """
    assert field_type in FIELD_TYPES
    if field_type == 'boolean':
        if value == 't':
            return True
        elif value == 'f':
            return False
        else:
            # Bug fix: the exception was previously constructed but never
            # raised, and the '%d' placeholder crashed on the str field_type.
            raise InvalidIndexError('Field type "%s" does not accept value "%s"' % (field_type, value))
    elif field_type == 'integer':
        return int(value)
    elif field_type == 'float':
        return xapian.sortable_unserialise(value)
    elif field_type in ['date', 'datetime']:
        datetime_value = datetime.datetime.strptime(value, DATETIME_FORMAT)
        if field_type == 'datetime':
            return datetime_value
        else:
            return datetime_value.date()
    else:  # field_type == 'text'
        return value
def _xapian_sort(enquire, sort_by, column):
    # Build a composite sort key from the requested fields and apply it to
    # the enquire session, keeping relevance as the tie-breaker.
    sorter = xapian.MultiValueKeyMaker()

    for sort_field in sort_by:
        if sort_field.startswith('-'):
            reverse = False
            sort_field = sort_field[1:]  # Strip the '-'
        else:
            reverse = True
        # NOTE(review): the reverse flags look inverted ('-' -> False),
        # presumably to match Xapian's key-maker ordering semantics --
        # confirm against xapian.MultiValueKeyMaker docs before changing.
        sorter.add_value(column[sort_field], reverse)

    enquire.set_sort_by_key_then_relevance(sorter, True)
class XapianEngine(BaseEngine):
    """Haystack engine entry point wiring the Xapian backend and query classes."""
    backend = XapianSearchBackend
    query = XapianSearchQuery
xapian-haystack-3.1.0/xapian_wheel_builder.sh 0000775 0000000 0000000 00000015034 14405573772 0021332 0 ustar 00root root 0000000 0000000 #!/bin/sh
uname_sysname="$(uname -s)"
# Only Linux, macOS (Darwin) and FreeBSD are supported build hosts;
# bail out early on anything else.
case "${uname_sysname}" in
    Linux)
        ;;
    Darwin)
        ;;
    FreeBSD)
        ;;
    *)
        echo "Platform ${uname_sysname} is not supported"
        exit 1
esac
# Default interpreter; -p overrides the python used to build the bindings.
PYTHON=$(command -v python3)

# Re-set the positional parameters from getopt output so the loop below can
# consume options. Word-splitting of $(getopt ...) is intentional here.
# shellcheck disable=SC2046
set -- $(getopt p: "$@")
for opt; do
    case "$opt" in
        -p)
            PYTHON="$2"; shift 2 ;;
        --)
            shift ; break ;;
    esac
done
usage() {
    # Print invocation help to stderr.
    # Fix: the '<python>' placeholder after -p had been lost (the message
    # read "[-p ]"), leaving the option undocumented.
    echo "usage: $0 [-p <python>] version_number" 1>&2
}
version_at_least() {
    # Succeed when ${VERSION} is >= the version given as $1.
    # 'sort -V' puts the smaller version string first; if that is $1,
    # then VERSION is at least $1.
    [ "$(printf '%s\n%s' "${VERSION}" "${1}" | sort -V | head -n1)" = "${1}" ]
}
# Version to build: first positional argument, falling back to the
# XAPIAN_VERSION environment variable.
VERSION=${1-${XAPIAN_VERSION}}

if [ -z "${VERSION}" ]; then
    usage
    exit 1
fi

# The interpreter must exist and be executable (either the auto-detected
# python3 or the one passed with -p).
if [ -z "${PYTHON}" ] || [ ! -x "${PYTHON}" ]; then
    usage
    echo "error: could not find python3, please specify with -p" 1>&2
    exit 1
fi
# Cleanup hook, a no-op until there is something to clean up; it is
# redefined later once the temp dir exists.
exittrap() { :; }
sigtrap() {
    # Whether or not exittrap runs on EXIT due to a signal is not defined.
    # We use it for cleanup, and cleaning up twice is not a problem,
    # so let's do that.
    exittrap
    # Exit with the conventional 128+signal status.
    exit $(($1 + 128))
}
# Install sigtrap for HUP(1), INT(2), PIPE(13) and TERM(15).
for sig in 1 2 13 15; do
    # shellcheck disable=SC2064
    trap "sigtrap $sig" $sig
done
trap 'exittrap' EXIT
# Remember where to drop the finished wheel before leaving the cwd.
WHL_DEST=$(pwd)
# NOTE(review): 'die' is called here but not defined anywhere in view --
# confirm it exists earlier in the file or that mktemp failure is acceptable.
TMPDIR=$(mktemp -d -t "xapian-builder-XXXXXX") || die "Unable to mktemp"
# From now on, remove the temp dir on exit.
exittrap() { rm -rf "${TMPDIR}"; }
set -e

echo "Building in ${TMPDIR}."
cd "${TMPDIR}"

# A throw-away virtualenv provides pip/wheel/setuptools and sphinx for the
# bindings' documentation build.
echo "Preparing build virtualenv..."
VE="${TMPDIR}/ve"
"${PYTHON}" -m venv "${VE}"
"${VE}/bin/python" -m pip install --upgrade pip wheel setuptools
# xapian before 1.4.12 had issues building with sphinx>=2
if version_at_least "1.4.12"; then
    "${VE}/bin/pip" install sphinx
else
    "${VE}/bin/pip" install "sphinx<2"
fi

BASE_URI="https://oligarchy.co.uk/xapian/"
CORE="xapian-core-${VERSION}"
BINDINGS="xapian-bindings-${VERSION}"
CORE_URI="${BASE_URI}${VERSION}/${CORE}.tar.xz"
BINDINGS_URI="${BASE_URI}${VERSION}/${BINDINGS}.tar.xz"

echo "Downloading source..."
curl -O "${CORE_URI}"
curl -O "${BINDINGS_URI}"

echo "Extracting source..."
mkdir src
tar -C src -xf "${CORE}.tar.xz"
tar -C src -xf "${BINDINGS}.tar.xz"

# building xapian-core
mkdir target
prefix=${TMPDIR}/target
pprefix=${prefix}
case "${uname_sysname}" in
    Linux|FreeBSD)
        # NOTE(review): padding presumably guarantees the embedded prefix
        # string is long enough to be overwritten with '$ORIGIN' (7 chars)
        # by binary_patch_rpath below -- confirm.
        while [ ${#pprefix} -lt 7 ]; do
            # add padding as needed
            pprefix=${pprefix}/.
        done
        ;;
esac
echo "Building xapian core..."
(
    cd "src/${CORE}"
    ./configure --prefix="${pprefix}"
    make
    make install
)
# Left unquoted-glob on purpose: expanded later where it is used.
XAPIAN_CONFIG="${prefix}/bin/xapian-config*"
echo "Building xapian python3 bindings..."
(
    cd "src/${BINDINGS}"
    # We're building python3 bindings here, and we need to contort things to make it work.
    # We want the xapian-config we just built.
    # We want the sphinx we just put in a virtualenv because the xapian bindings insist on making their docs.
    # We use the python3 from that same virtualenv, because the xapian bindings don't use the shebang line of sphinx-build.
    # We override PYTHON3_LIB because if we don't then the bindings will be installed in the virtualenv, despite what we set prefix to.
    case "${uname_sysname}" in
        FreeBSD)
            sed -i '' -e 's|-lstdc++||' configure
            ;;
    esac
    ./configure --prefix="$prefix" --with-python3 XAPIAN_CONFIG="${XAPIAN_CONFIG}" SPHINX_BUILD="${VE}/bin/sphinx-build" PYTHON3="${VE}/bin/python3" PYTHON3_LIB="${prefix}"
    make
    make install
)
binary_patch_rpath() {
    # Rewrite a freshly-built binary so it finds libxapian next to itself
    # ('$ORIGIN' / '@loader_path') instead of in the throw-away build prefix.
    # Side effect: sets ${libxapian_name}, used by the callers to copy the
    # library into place.
    file="${1}"
    case "${uname_sysname}" in
        Linux|FreeBSD)
            # Binary patch rpath to be '$ORIGIN' as needed.
            # Locate the byte offset of the embedded build-prefix rpath...
            rpath_offset=$(strings -t d "${file}" | grep "${pprefix}/lib" | awk '{ printf $1; }')
            # ...and overwrite it in place with '$ORIGIN' + NUL terminator.
            printf "\$ORIGIN\\000" | dd of="${file}" obs=1 seek="${rpath_offset}" conv=notrunc 2>/dev/null
            # Verify
            readelf -d "${file}" | grep RUNPATH | grep -q ORIGIN
            libxapian_name=$(ldd "${file}" | grep libxapian | awk '{ printf $1; }')
            ;;
        Darwin)
            # On macOS, retarget the install name via install_name_tool.
            libxapian_name=$(otool -L "${file}" | grep -o 'libxapian.*' | awk '{ printf $1; }')
            install_name_tool -change "${prefix}/lib/${libxapian_name}" "@loader_path/${libxapian_name}" "${file}"
            ;;
    esac
}
echo "preparing xapian wheel..."
# Patch every built extension module; this also sets ${libxapian_name}.
for file in "${prefix}"/xapian/*.so; do
    binary_patch_rpath "${file}"
done

# Copy libxapian into place alongside the python bindings.
cp "${prefix}/lib/${libxapian_name}" "${prefix}/xapian"
case "${uname_sysname}" in
    Darwin)
        # The bundled library must advertise its new, relative install name.
        install_name_tool -id "@loader_path/${libxapian_name}" "${prefix}/xapian/${libxapian_name}"
        ;;
esac

# Ship the xapian-delve utility inside the package as well.
for file in "${prefix}"/bin/xapian-delve*; do
    binary_patch_rpath "${file}"
    cp "${file}" "${prefix}/xapian"
done
# Prepare the scaffolding for the wheel
cat > "${prefix}/setup.py" < "${prefix}/MANIFEST.in" < "${prefix}/README" <