pax_global_header00006660000000000000000000000064130061310560014505gustar00rootroot0000000000000052 comment=663786e621c7ba9d71a2d6bfb7d6a92cce7d6586 pysolr-3.6.0/000077500000000000000000000000001300613105600130435ustar00rootroot00000000000000pysolr-3.6.0/.gitchangelog.rc000066400000000000000000000141321300613105600161030ustar00rootroot00000000000000## ## Format ## ## ACTION: [AUDIENCE:] COMMIT_MSG [!TAG ...] ## ## Description ## ## ACTION is one of 'chg', 'fix', 'new' ## ## Is WHAT the change is about. ## ## 'chg' is for refactor, small improvement, cosmetic changes... ## 'fix' is for bug fixes ## 'new' is for new features, big improvement ## ## AUDIENCE is optional and one of 'dev', 'usr', 'pkg', 'test', 'doc' ## ## Is WHO is concerned by the change. ## ## 'dev' is for developpers (API changes, refactors...) ## 'usr' is for final users (UI changes) ## 'pkg' is for packagers (packaging changes) ## 'test' is for testers (test only related changes) ## 'doc' is for doc guys (doc only changes) ## ## COMMIT_MSG is ... well ... the commit message itself. ## ## TAGs are additionnal adjective as 'refactor' 'minor' 'cosmetic' ## ## They are preceded with a '!' or a '@' (prefer the former, as the ## latter is wrongly interpreted in github.) Commonly used tags are: ## ## 'refactor' is obviously for refactoring code only ## 'minor' is for a very meaningless change (a typo, adding a comment) ## 'cosmetic' is for cosmetic driven change (re-indentation, 80-col...) ## 'wip' is for partial functionality but complete subfunctionality. ## ## Example: ## ## new: usr: support of bazaar implemented ## chg: re-indentend some lines !cosmetic ## new: dev: updated code to be compatible with last version of killer lib. ## fix: pkg: updated year of licence coverage. ## new: test: added a bunch of test around user usability of feature X. ## fix: typo in spelling my name in comment. 
!minor ## ## Please note that multi-line commit message are supported, and only the ## first line will be considered as the "summary" of the commit message. So ## tags, and other rules only applies to the summary. The body of the commit ## message will be displayed in the changelog without reformatting. ## ## ``ignore_regexps`` is a line of regexps ## ## Any commit having its full commit message matching any regexp listed here ## will be ignored and won't be reported in the changelog. ## ignore_regexps = [ r'@minor', r'!minor', r'@cosmetic', r'!cosmetic', r'@refactor', r'!refactor', r'@wip', r'!wip', r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*[p|P]kg:', r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*[d|D]ev:', r'^(.{3,3}\s*:)?\s*[fF]irst commit.?\s*$', ] ## ``section_regexps`` is a list of 2-tuples associating a string label and a ## list of regexp ## ## Commit messages will be classified in sections thanks to this. Section ## titles are the label, and a commit is classified under this section if any ## of the regexps associated is matching. ## section_regexps = [ ('New', [ r'^[nN]ew\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', ]), ('Changes', [ r'^[cC]hg\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', ]), ('Fix', [ r'^[fF]ix\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$', ]), ('Other', None ## Match all lines ), ] ## ``body_process`` is a callable ## ## This callable will be given the original body and result will ## be used in the changelog. ## ## Available constructs are: ## ## - any python callable that take one txt argument and return txt argument. ## ## - ReSub(pattern, replacement): will apply regexp substitution. ## ## - Indent(chars=" "): will indent the text with the prefix ## Please remember that template engines gets also to modify the text and ## will usually indent themselves the text if needed. ## ## - Wrap(regexp=r"\n\n"): re-wrap text in separate paragraph to fill 80-Columns ## ## - noop: do nothing ## ## - ucfirst: ensure the first letter is uppercase. 
## (usually used in the ``subject_process`` pipeline) ## ## - final_dot: ensure text finishes with a dot ## (usually used in the ``subject_process`` pipeline) ## ## - strip: remove any spaces before or after the content of the string ## ## Additionally, you can `pipe` the provided filters, for instance: #body_process = Wrap(regexp=r'\n(?=\w+\s*:)') | Indent(chars=" ") #body_process = Wrap(regexp=r'\n(?=\w+\s*:)') #body_process = noop body_process = ReSub(r'((^|\n)[A-Z]\w+(-\w+)*: .*(\n\s+.*)*)+$', r'') | strip ## ``subject_process`` is a callable ## ## This callable will be given the original subject and result will ## be used in the changelog. ## ## Available constructs are those listed in ``body_process`` doc. subject_process = (strip | ReSub(r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n@]*)(@[a-z]+\s+)*$', r'\4') | ucfirst | final_dot) ## ``tag_filter_regexp`` is a regexp ## ## Tags that will be used for the changelog must match this regexp. ## tag_filter_regexp = r'^v[0-9]+\.[0-9]+(\.[0-9]+)?$' ## ``unreleased_version_label`` is a string ## ## This label will be used as the changelog Title of the last set of changes ## between last valid tag and HEAD if any. unreleased_version_label = "%%version%% (unreleased)" ## ``output_engine`` is a callable ## ## This will change the output format of the generated changelog file ## ## Available choices are: ## ## - rest_py ## ## Legacy pure python engine, outputs ReSTructured text. ## This is the default. ## ## - mustache() ## ## Template name could be any of the available templates in ## ``templates/mustache/*.tpl``. ## Requires python package ``pystache``. ## Examples: ## - mustache("markdown") ## - mustache("restructuredtext") ## ## - makotemplate() ## ## Template name could be any of the available templates in ## ``templates/mako/*.tpl``. ## Requires python package ``mako``. 
## Examples: ## - makotemplate("restructuredtext") ## output_engine = rest_py #output_engine = mustache("restructuredtext") #output_engine = mustache("markdown") #output_engine = makotemplate("restructuredtext") ## ``include_merge`` is a boolean ## ## This option tells git-log whether to include merge commits in the log. ## The default is to include them. include_merge = True pysolr-3.6.0/.github/000077500000000000000000000000001300613105600144035ustar00rootroot00000000000000pysolr-3.6.0/.github/issue_template.md000066400000000000000000000005051300613105600177500ustar00rootroot00000000000000# I have * [ ] Tested with the latest release * [ ] Tested with the current master branch * [ ] Searched for similar existing issues ## Expected behaviour ## Actual behaviour ## Steps to reproduce the behaviour 1. ## Configuration * Operating system version: * Search engine version: * Python version: * pysolr version:pysolr-3.6.0/.github/pull_request_template.md000066400000000000000000000006101300613105600213410ustar00rootroot00000000000000# Hey, thanks for contributing to pysolr. Please confirm that [the tests pass](https://github.com/django-haystack/pysolr/blob/master/README.rst#running-tests) locally # Once your pull request has been submitted, the full test suite will be executed on https://travis-ci.org/django-haystack/pysolr/pull_requests. Pull requests with passing tests are far more likely to be reviewed and merged.pysolr-3.6.0/.gitignore000066400000000000000000000000341300613105600150300ustar00rootroot00000000000000.tox solr*.tgz solr-app solrpysolr-3.6.0/.travis.yml000066400000000000000000000012421300613105600151530ustar00rootroot00000000000000sudo: false language: python python: - "2.7" - "3.3" - "3.4" - "3.5" - "pypy" cache: apt: true pip: true directories: - $HOME/download-cache env: - SOLRCLOUD=false - SOLRCLOUD=true matrix: allow_failures: - python: 'pypy' addons: apt_packages: - default-jdk install: - "pip install 'requests>2'" - "pip install ." 
- 'if [[ $TRAVIS_PYTHON_VERSION == "2.7" ]]; then travis_retry pip install faulthandler; fi' - 'if [[ "${SOLRCLOUD:-false}" == "true" ]]; then pip install -e .[solrcloud]; fi' script: - python run-tests.py notifications: # irc: "irc.freenode.org#pysolr" email: false pysolr-3.6.0/AUTHORS000066400000000000000000000030341300613105600141130ustar00rootroot00000000000000Primaries: * Joseph Kocherhans * Daniel Lindsley * Jacob Kaplan-Moss * Chris Adams Contributors: * initcrash for a patch regarding datetime formatting. * maciekp.lists for a patch correcting URL construction. * jarek & dekstop for a patch regarding sending Unicode documents. * Tomasz.Wegrzanowski for a patch to enable document boosting. * thomas.j.lee for a patch to add stats support. * Chak for a patch regarding empty string being unnecessarily sent. * james.colin.brady for a patch to enable working with the cores. * anti-social for a patch on charset sending. * akaihola for a patch regarding long queries. * bochecha for various patches. * stugots for an invalid character patch. * notanumber for a field boosting patch. * acdha for various patches. * zyegfryed for various patches. * girasquid for a patch related to server string. * David Cramer (dcramer) for various patches. * dourvais for a query time patch. * soypunk for a debug patch. * cordmata for a patch to handle how Solr 3.X returns suggestions. * pabluk for Tika integration improvements. * gthb for a patch to add grouping support. 
* timsavage for a patch making add() compatible with generators * Karol Sikora (@sicarrots) for Solr 4 softCommit support * Çağatay Çallı (@faraday) for Solr 4 field update support * Emmanuel Leblond (@touilleMan) for fixing error handling on Python 3 * Michał Jaworski (@swistakm) for improved Sentry-friendly logging * Upayavira (@upayavira) for SolrCloud support * Kwame Porter Robinson (@robinsonkwame) for adding overwrite support to Solr.add pysolr-3.6.0/CHANGELOG.rst000066400000000000000000000737271300613105600151040ustar00rootroot00000000000000Changelog ========= %%version%% (unreleased) ------------------------ New ~~~ - Support for nested documents (closes #170) [Chris Adams] This adds support for Solr's nested documents in `Solr.add` Thanks to @skirsdeda for the patch - ZooKeeper can receive an existing KazooClient instance. [Chris Adams] This simplifies advanced customization by allowing you to pass in an existing instance which is configured in whatever manner necessary. Changes ~~~~~~~ - Logging: pass full request body + headers as extra data. [Chris Adams] This doesn't affect the normal logging output but is helpful for aggregation systems such as Sentry where the full request information may be displayed for debugging purposes - Basic max_retries for ZooKeeper. [Chris Adams] Kazoo sets the default to hang forever which is frequently not the desired error-handling behavior. This makes it easier to set a limit on the number of retries and we use it in testing to avoid the suite hanging endlessly. - Better error message for Solr failures. [Chris Adams] Previously when ZooKeeper had no active shards pysolr would return an error when `random.shuffle` received an empty list. Now it will raise an exception which will hopefully indicate just how bad the situation is. - Remove __del__ methods. 
[Chris Adams] The __del__ methods were added in an attempt to avoid Kazoo-related failures as part of the SolrCloud support but can cause other problems on different versions of Python (see #193). Since the errors in question were observed during testing this commit removes the __del__ methods and we will have to find an alternative for making tests fail safely. Fix ~~~ - Set KazooClient timeout. [Chris Adams] `__init__` was not actually passing this to the ZooKeeper client Other ~~~~~ - Better docstring for SolrCoreAdmin. [Chris Adams] Thanks to Patricio Del Boca (@pdelboca) for the patch. Closes #185 - Require requests >= 2.9.1 (closes #177) [Chris Adams] This will avoid compatibility issues on Python 3 which can produce confusing errors. - Merge pull request #203 from bendemott/documentation. [Chris Adams] updated typo in documentation example - Updated typo in documentation example. [Ben DeMott] "Zookeeper" should be "ZooKeeper" on line 104 in README.rst - Docs: note that add has commit=True by default (see #46) [Chris Adams] Thanks to @mlissner - Adds note about commit=True being the default. [Mike Lissner] - Correctly handle time-zone aware dates (#201) [Andrew Kuchling] Thanks to Andrew Kuchling (@akuchling) for the patch. Closes #197, #198 - Oops.. Add a missing assert in tests. [Tadas Dailyda] - Refactor _build_doc to be recursive and allow deeper document nesting, fix tests accordingly. [Tadas Dailyda] - Add some block join queries to test_search. [Tadas Dailyda] - Add some nested docs to the tests. [Tadas Dailyda] - Implement nested documents functionality. [Tadas Dailyda] - ZooKeeper: by default use the same timeout for commands and connections. [Chris Adams] - Tox: run SolrCloud tests (parity with Travis CI) [Chris Adams] - Update project URL. 
[Chris Adams] v3.5.0 (2016-05-24) ------------------- New ~~~ - Expose the full Solr response in `Results` [Chris Adams] This makes life easier for anyone using custom extensions by removing the need to create a `Results` subclass just to get access to an extra dictionary key. - More flexible control of request handlers. [nuarhu] This allows configuring the default search handler and overriding it for every query method Thanks to @nuarhu for the patch - Start maintaining a changelog from gitchangelog. [Chris Adams] - Overwrite flag for Solr.add (closes #182) [Chris Adams] Thanks to @robinsonkwame for the patch - SolrCloud support (see #138) [Chris Adams] This optionally adds support for SolrCloud using the Kazoo client library. Thanks to @upayavira Other ~~~~~ - V3.5.0. [Chris Adams] - Merge pull request #192 from dhruvpathak/optimize_commit_flag. [Chris Adams] chg: `optimize()` also accepts `commit` flag - Included commit flag in optimize() to let optimize call run with or without commit. [dhruv.pathak] - Merge pull request #188 from TigorC/master. [Chris Adams] Removed py26 from tox.ini - Removed py26 from tox.ini. [Igor Tokarev] - Tests: avoid timeout-based CI failures. [Chris Adams] These caused sporadic CI build failures and weren’t otherwise testing actual functionality since we don’t have a test which does something like SIGSTOP the test Solr server long enough to confirm a timeout. We’ll confirm that the timeout is passed through but otherwise use the defaults. - Update Travis CI badge in the README. [Chris Adams] - Merge pull request #184 from atuljangra/master. [Chris Adams] Correct documentation for `_update` Thanks to @atuljangra for the patch! - Merge branch 'master' of https://github.com/atuljangra/pysolr. [atuljangra] - Misleading comments. [atuljangra] - Travis: use build matrix for regular and SolrCloud tests. [Chris Adams] - Test_cloud: remove dead code. 
[Chris Adams] The first instance of test_custom_results_class was broken because it used the wrong port but this wasn’t failing because the same method name was redefined further down in the file and that used the updated port config. - PEP-8. [Chris Adams] - ZooKeeper: log unexpected format changes to watched aliases. [Chris Adams] - ZooKeeper: restore JSON blob decoding. [Chris Adams] - PEP-8. [Chris Adams] - PEP-8 unused imports. [Chris Adams] - PEP-8. [Chris Adams] - PEP-8. [Chris Adams] - PEP-8. [Chris Adams] - Setup.cfg: add pep8 and isort config. [Chris Adams] - Tear down requests.Session instance at close. [Chris Adams] This avoids log-spew on modern unittest implementations which report unclosed file handles at the end of a run. - Remove Python 2.6 from Travis test matrix. [Chris Adams] - Add __future__ absolute_import. [Chris Adams] This is currently moot but avoids any chance of regression between Python 2.x and 3.x. - PEP-8. [Chris Adams] - Drop support for Python 2.6. [Chris Adams] We have some old import dances and other overhead for Python 2.6 support, which the CPython developers dropped support for in 2013: http://www.curiousefficiency.org/posts/2015/04/stop-supporting-python26.html - Allow queries to be directed to different search handlers. [Chris Adams] The `search` method now allows you override the default `select` handler when your Solr instance has multiple search handlers. Thanks to @k-patel for the patch. Closes #174 Closes #175 v3.4.0 (2016-02-02) ------------------- - Update version numbers for v3.4.0. [Chris Adams] - Logging: better message for HTTP status != 200. [Chris Adams] We already extract error message from Solr responses and that is great. Unfortunately it can contain the data that may change with every request (like document id). This creates an issue when user uses Sentry or other solution that captures logging or exceptions. 
Previous implementation causes many duplicated events in Sentry if message extracted using `self._extract_error(resp)` contained such variable data. This change uses 'non-mutable' message that is complemented with extracted data using the string formatting option supplied by Python logging. Thanks to this, Sentry and other solutions can perform better grouping of logging messages (by status code). This is the approach that is already used in handling other errors. - Fix response error handling on Python 3 (closes #162) [Chris Adams] Previously the error handling did not work correctly on Python 3 because a byte-string response wasn't decoded before processing. Thanks to Emmanuel Leblond (@touilleMan) for the patch. - Merge pull request #167 from swistakm/master. [Chris Adams] Refactor common response processing to Results class - Move response manipulation responsibility to Results class and allow custom results classes. [mjaworski] - Add Python 3.5 to automated test matrix. [Chris Adams] v3.3.3 (2015-10-24) ------------------- - V3.3.3. [Chris Adams] - Fix response error handling on Python 3 (closes #162) [Chris Adams] Previously the error handling did not work correctly on Python 3 because a byte-string response wasn't decoded before processing. Thanks to Emmanuel Leblond (@touilleMan) for the patch. - Tests: upgrade Solr to 4.10.4. [Chris Adams] * Resync test Solr script with django-haystack These are still not quite the same; at some point it would be nice to look into a common tool which both projects could use * Update Solr configuration script to set correct libpath for solr-cell to avoid lazy-load failures during testing as was reported on e.g. #162 - Tests: update Solr download script for recent API change. [Chris Adams] - Merge pull request #142 from yspanchal/master. [Chris Adams] Add support for cursormark pagination - Added cursormark deep pagination support. [Yogesh Panchal] v3.3.2 (2015-05-26) ------------------- - Version 3.3.2. 
[Chris Adams] - Python 2.6 backwards compatibility. [Chris Adams] Python 2.6 shipped with ElementTree 1.2.x. Among other differences, it lacks support for the attribute selectors used to process valid XML error messages, which was added in ElementTree 1.3. - Merge pull request #155 from domenkozar/solr4/exceptions. [Chris Adams] Support Solr 4 XML error format parsing Thanks @domenkozar for the patch - Overhaul Travis config. [Chris Adams] * Sidestep use of Tox in favor of Travis-managed Python versions * Enable container-based builds * Enable caching for Solr server downloads - Use builtin unittest2 runner on Python 2.7 as well. [Chris Adams] - Simple error extraction. [Chris Adams] Previously pysolr depended on lxml and cssselect to extract text from Tomcat’s error messages, which was unreliable. This change uses regular expressions to deal with invalid XML rather than lxml’s salvaging parser and avoids having to maintain the code which attempted to find the main error message in tag soup Closes #149 - Update test Solr download script to work with default Python 3. [Chris Adams] v3.3.1 (2015-05-12) ------------------- - Version 3.3.1. [Chris Adams] - Prepare for 3.3.1 release. [Chris Adams] - Convert all HTTP client errors to SolrError. [Chris Adams] This commit ensures that an outside caller can handle all HTTP-related errors by catching SolrError without knowing whether the exception class is owned by requests, urllib3, or httplib. - Merge pull request #146 from gryphius/fix_doc_typo. [Chris Adams] Fix typo in ExtractingRequestHandler documentation Thanks @gryphius - Doc fix: a very simply model -> a very simple model. [Oli] - Merge pull request #139 from upayavira/feature/no-optimize. [Daniel Lindsley] Optimize is no longer recommended - Optimize is no longer recommended. [Upayavira] Since Solr 3.6, Solr has used the TieredMergePolicy which makes, in most scenarios, optimization a harmful rather than beneficial step. 
v3.3.0 (2015-02-03) ------------------- - Bumped to v3.3.0! [Daniel Lindsley] - Added @acdha to primaries for all his hard work. [Daniel Lindsley] - Support Solr 4+ individual field updates (closes #129) [Chris Adams] Now fields can be updated individually: conn.add(docs, fieldUpdates={'myfield1_ss': 'add', 'myfield2_s': 'set', 'myfield3_i': 'inc'}) Thanks to Çağatay Çallı (@faraday) for the patch. - Merge pull request #137 from LuRsT/patch-1. [Chris Adams] Fixed syntax error in README.rst example (thanks @LuRsT) - Fixed syntax error in README.rst example. [Gil Gonçalves] - Add softCommit support (closes #98) [Chris Adams] add() and commit() may now be called with softCommit=True Thanks to @sicarrots for the patch - Merge pull request #123 from ulivedit/master. [Chris Adams] Python 3 compatibility for error message extraction (thanks @ulivedit) - Fix python 3.4 error with forcing unicode strings. [Eric Hagman] - Merge pull request #135 from Grokzen/master. [Chris Adams] Use DEBUG_PYSOLR environmental variable to configure logging This offers an alternative to editing pysolr.py or reconfiguring logging elsewhere - Make it easier to debug pysolr via environment variable. [Johan Andersson] - Merge pull request #131 from andreif/highlighted-readme. [Chris Adams] Highlight Python code in README.rst (thanks @andreif) - Highlight Python code in README.rst. [Andrei Fokau] - Add support for error responses in JSON format (closes #113) [Chris Adams] Thanks to @andreif for the patch and tests - Merge pull request #125 from phill-tornroth/patch-1. [Chris Adams] Fix get-solr-download-url.py for Python 2.6 - Fixes 'zero field length' error from `format()` [Phill Tornroth] Unless I'm missing something... :) - Travis: download Solr before starting tests. [Chris Adams] This should avoid download errors being presented as test failures - Tests: increase Solr startup timeout. [Chris Adams] - Add test Solr tarball downloads to .gitignore. [Chris Adams] - Tests: add Python 3.4 targets. 
[Chris Adams] - Tests: use Solr 4.7.2 from nearest mirror (closes #115) [Chris Adams] - Tests: add a script to retrieve the closest Apache mirror. [Chris Adams] See #115 - Merge pull request #111 from redjack/py26-tests. [Chris Adams] Update 'run-tests.py' to invoke unittest2 correctly on Python 2.6 - Update 'run-tests.py' to invoke unittest2 correctly on Python 2.6. [Andy Freeland] - Expanded testing section of the README. [Chris Adams] - Merge pull request #36 from glenbot/master. [Chris Adams] Update to SolrCoreAdmin.create to use correct action - Updated create command in SolrCoreAdmin to use correct action. [glenbot] - Fix type in SolrAdmin.create default parameter. [Chris Adams] See #36 - Updated ignores. [Daniel Lindsley] v3.2.0 (2014-01-27) ------------------- - Bumped to v3.2.0! [Daniel Lindsley] - Merge pull request #104 from tongwang/master. [Chris Adams] Fix content extraction (thanks @tongwang) - Remove unnecessary comment. [Tong Wang] - Fixed both issues https://github.com/toastdriven/pysolr/issues/96 and https://github.com/toastdriven/pysolr/issues/90 and updated test Solr server from 4.1.0 to 4.6.0. All tests pass. [Tong Wang] - Tests: set Tox basepython versions for tomcat tests. [Chris Adams] - Tests: update test_full_url for multi-core config. [Chris Adams] - Tests: expect content extraction to fail. [Chris Adams] Once https://github.com/toastdriven/pysolr/issues/90 is fixed we can re-enable this test - Skip tomcat error tests when lxml is unavailable. [Chris Adams] Until _scrap_response has a Tomcat path which doesn't depend on lxml.html there's no point in running these tests on a different config - Enable Travis CI. [Chris Adams] - Use tox for testing multiple versions. [Chris Adams] * Add a simple test-runner which handles starting and stopping Solr * Added a basic tox.ini for Python 2.6, 2.7 and 3.3 with and without Tomcat to keep us honest about extra_requires… - Move test setup to script & update README. 
[Chris Adams] This avoids the README drifting out of sync - Bump requests dependency to 2.x for Unicode handling. [Chris Adams] - Update testing instructions in the README after the Solr mirror went away. [Chris Adams] This uses the canonical Apache archive which should be more stable than the mirror we were using - Merge remote-tracking branch 'anti-social/clean_xml' [Daniel Lindsley] - Fixed error when invalid xml chars present in document. [Alexander Koval] - Merge remote-tracking branch 'anti-social/absolute_import' [Daniel Lindsley] - Added absolute_import. [Alexander Koval] - Ignored env3. [Daniel Lindsley] v3.1.0 (2013-07-17) ------------------- - Bumped to v3.1.0! [Daniel Lindsley] - Better Unicode behavior under Python 3. [Daniel Lindsley] - Merge pull request #69 from zyegfryed/patch-1. [Daniel Lindsley] Added MoreLikeThis handler to solrconfig.xml test cores. - Added MoreLikeThis handler to solrconfig.xml test cores. [Sébastien Fievet] - README tweaks. Thanks to @msabramo for the original patch! [Daniel Lindsley] - Slightly better tomcat errors. [Daniel Lindsley] - Improved scraping of tomcat error. [Dougal Matthews] When scraping for the HTML error message include the description if found. - Merge pull request #86 from anti-social/fix_eval. [Chris Adams] Fixed eval in the _to_python method (thanks @anti-social) Ah, nice: since we no longer support Python 2.5 this is a great move. - Fixed eval in the _to_python method. [Alexander Koval] - Solr.add generator expression support (closes #81) [Chris Adams] The only compatibility issue before was a logging statement using len() on the input docs variable, which fails on generator expressions. Thanks to @timsavage for a patch changing this to measuring the message which is actually sent to Solr instead - Enable request's session pooling (closes #82) [Chris Adams] Performing requests using a session enables urllib3's connection pooling, reducing connection latency. 
Thanks @cody-young for the patch Closes #83 v3.0.6 (2013-04-13) ------------------- - Setup.py: require lxml 3.0+ for tomcat error messages. [Chris Adams] * Bumped version to 3.0.6 - Merge pull request #71 from mjumbewu/master. [Daniel Lindsley] Trailing slash in the base URL will break requests - Make sure trailing and leading slashes do not collide. [Mjumbe Wawatu Ukweli] v3.0.5 (2013-02-16) ------------------- - Update error message string interpolation (closes #70) [Chris Adams] Python's string interpolation requires a tuple, not a list v3.0.4 (2013-02-11) ------------------- - Tag version 3.0.4 for PyPI. [Chris Adams] 3.x had a minor bug (see SHA:74b0a36) but it broke logging for Solr errors which seems worth an easily deployed fix - Correct log.error syntax on timeouts. [Chris Adams] v3.0.3 (2013-01-24) ------------------- - Update version to 3.0.3. [Chris Adams] Since python 2.6 compatibility was broken in 3.0+ this seems worth an update - Force_unicode: backwards compatibility with Python 2.6. [Chris Adams] v3.0.2 (2013-01-24) ------------------- - Update version to 3.0.2. [Chris Adams] - Fix rich content extraction method & tests. [Chris Adams] * Update test setup instructions with content extraction handler dependencies * Enable file upload support to _send_request * Added simple extract test - Fix field boosting, simplify _build_doc. [Chris Adams] * Ensure that numbers are converted to strings to avoid lxml choking when asked to serialize a number (in 2013!). * Refactor logic to have a single code-path for both single and multi-value fields * Refactor use **kwargs style so there's a single Element() create call - Force_unicode support for non-string types. [Chris Adams] Now force_unicode(1.0) will return u"1.0" for consistency and to avoid confusion with the Django function of the same name v3.0.1 (2013-01-23) ------------------- - Bumped to v3.0.1! 
[Daniel Lindsley] - Updated README to include testing info & made sure the README gets included in the package. [Daniel Lindsley] - Updated ignores. [Daniel Lindsley] v3.0.0 (2013-01-23) ------------------- - Bumped to v3.0.0, adding Python3 support! [Daniel Lindsley] Dependencies have changed & been slimmed down. - Bumped to v2.1.0! [Daniel Lindsley] - Catch socket errors for httplib fallback path. [Chris Adams] - Catch IOError in _send_request. [Chris Adams] httplib2 can raise a bare socket.error in _send_request, which handles only AttributeError. This change catches all IOError subclasses, tells logging to include exception information and moves logging code outside of the try/except block to avoid any possibility of an exception in a log handler being caught by mistake. - Fall back to HTML title when scraping error messages. [Chris Adams] Solr 3.6 + Jetty is not reliably detected by the existing approach but it does return a reasonably useful message in the title which is a lot more informative than "None" - Provide full headers & response to logging handlers. [Chris Adams] This allows handlers such as Raven / Sentry to do something smart with the full HTTP headers and/or response body. Among other things this should provide more insight in situations when pysolr currently logs "Response: None" - Full exception logging for basic connection failures. [Chris Adams] - Logging: use obvious exc_info= syntax. [Chris Adams] As per the documentation, logging exc_info just needs to evaluate to True. This change makes it obvious that the passed in value is not actually used in any other way - Added gthb to AUTHORS. [Daniel Lindsley] - PEP-8 nitpicks. [Chris Adams] - Don't bork on response with no "response" attr. [Gunnlaugur Þór Briem] (happens e.g. in grouped queries) - Support 'grouped' in Solr results. [Gunnlaugur Þór Briem] - Added ``extra_requires`` to cover the ``BeautifulSoup`` dependency. Thanks to kylemacfarlane for the report! 
[Daniel Lindsley] - Added pabluk to AUTHORS. [Daniel Lindsley] - Updated README file with optional requirement. [Pablo SEMINARIO] - Added kwargs to extract() method. [Pablo SEMINARIO] - Avoid forcing string interpolation when logging. [Chris Adams] This allows aggregators like Sentry and other consumers to see the raw, unformatted string and variables so they can e.g. group all instances of the same message even if the specific request values differ. - Added HTTPS support for httplib. [Richard Mitchell] - Added a long description for PyPI. [Daniel Lindsley] - Added support for Solr rich-content extraction. [Chris Adams] This exposes Solr's http://wiki.apache.org/solr/ExtractingRequestHandler which allows you to index text content from structured file formats like PDF, Microsoft Office, etc. - Bumped for the next round of beta. [Daniel Lindsley] - Added cordmata to AUTHORS. [Daniel Lindsley] - Updated suggest_terms so that it correctly handles response from Solr 3.x releases. [Matt Cordial] - Edited README via GitHub. [Daniel Lindsley] - Bumped to v2.0.15! [Daniel Lindsley] - Fixed a bug where ``server_string`` could come back as ``None``. Thanks to croddy for the report! [Daniel Lindsley] - Added dourvais & soypunk to AUTHORS. [Daniel Lindsley] - Unescape html entities in error messages. [David Cramer] - Added support for getting at the Solr querying debug data when using search(). [Shawn Medero] Passing ``debug=True`` as kwarg, the ``search()`` method will activate this property in the JSON results. - Fixed bug, qtime wasn't set when it was 0. [Daniel Dourvaris] - Added query time to results as attribute. [Daniel Dourvaris] - Bumped revision for dev on the next release. [Daniel Lindsley] v2.0.14 (2011-04-29) -------------------- - V2.0.14. [Daniel Lindsley] - Always send commit if its not-null. [David Cramer] - Add support for waitFlush and waitSearcher on update queries. Added support for expungeDeletes on commit(). 
Added support for maxSegments on optimize() [David Cramer] - Ensure port is coerced to an integer as (at least some version of) socket does not handle unicode ports nicely. [David Cramer] - Add support for commitWithin on Solr.add. [David Cramer] - Better compatibility with the latest revisions of lxml. Thanks to ghostmob for pointing this out! [Daniel Lindsley] - Fixed occasionally trying to call ``lower`` on ``None``. Thanks to girasquid for the report & original patch! [Daniel Lindsley] v2.0.13 (2010-09-15) -------------------- - Cleaned up how parameters are checked. Thanks to zyegfryed for the patch. v2.0.13. [Daniel Lindsley] - Fixed a bug in the weighting when given a string field that's weighted. Thanks to akaihola for the report. [Daniel Lindsley] - Fixed the case where the data being converted would be clean unicode. Thanks to acdha for submitting another version of this patch. [Daniel Lindsley] - Fixed the long URL support to correctly deal with sequences. [Daniel Lindsley] - Fixed a bug where additional parameters could cause the URL to be longer than 1024 even if the query is not. Thanks to zyegfryed for the report & patch! [Daniel Lindsley] - Boost values are now coerced into a string. Thanks to notanumber for the patch! [Daniel Lindsley] - All params are now safely encoded. Thanks to acdha for the patch! [Daniel Lindsley] - Added term suggestion. Requires Solr 1.4+. Thanks to acdha for the patch! [Daniel Lindsley] - If invalid characters are found, replace them. Thanks to stugots for the report and fix. [Daniel Lindsley] - Slicing ``None`` doesn't work. Make it a string... [Daniel Lindsley] - Added basic logging support. Thanks to sjaday for the suggestion. [Daniel Lindsley] v2.0.12 (2010-06-20) -------------------- - Releasing version v2.0.12. [Daniel Lindsley] - Added a more helpful message for the ever classic "'NoneType' object has no attribute 'makefile'" error when providing an incorrect URL. 
[Daniel Lindsley] - Added better error support when using Tomcat. Thanks to bochecha for the original patch. [Daniel Lindsley] - Fixed a long-standing TODO, allowing commits to happen without a second request. Thanks to lyblandin for finally chiding me into fixing it. [Daniel Lindsley] - Fixed a bug when sending long queries. Thanks to akaihola & gthb for the report and patch. [Daniel Lindsley] - Corrected a bug where Unicode character might not transmit correctly. Thanks to anti-social for the initial patch. [Daniel Lindsley] - Added field-based boost support. Thanks to notanumber for the patch. [David Sauve] - Better error messages are now provided when things go south. Thanks to bochecha for the patch. [Daniel Lindsley] - Added support for working with Solr cores. Thanks to james.colin.brady for the original patch. [Daniel Lindsley] - Fixed a bug where empty strings/``None`` would be erroneously sent. Thanks to Chak for the patch. [Daniel Lindsley] - Added support for the Stats component. Thanks to thomas.j.lee for the original patch. [Daniel Lindsley] - Fixed datetime/date handling to use ``isoformat`` instead of manually constructing the string. Thanks to joegermuska for the suggestion. [Daniel Lindsley] - Added document boost support. Thanks to Tomasz.Wegrzanowski for the patch. [Daniel Lindsley] - Fixed pysolr to add documents explicitly using UTF-8. Thanks to jarek & dekstop for the patch. [Daniel Lindsley] v2.0.11 (2010-04-28) -------------------- - Fixed initialization parameters on ``Results``. Thanks to jonathan.slenders for pointing this out. v2.0.11. [Daniel Lindsley] - Added a sane .gitignore. [Daniel Lindsley] v2.0.10 (2010-04-28) -------------------- - Fixed a bug in URL construction with httplib2. Thanks to maciekp.lists for the patch. v2.0.10. [Daniel Lindsley] - Added a way to handle queries longer than 1024. Adapted from cogtree's Python Solr fork. 
[Daniel Lindsley] - Fixed isinstance bug that can occur with the now potentially different datetime/date objects. [Daniel Lindsley] - Altered pysolr to use, if available, Django's implementation of datetime for dates before 1900. Falls back to the default implementation of datetime. [Daniel Lindsley] - If MLT was enabled but no reindexing was performed, Solr returns null instead of no docs. Handle this slightly more gracefully. [Daniel Lindsley] - Corrected a regression when errors occur while using httplib. [Daniel Lindsley] - Bumped version number for previous commit. [Daniel Lindsley] - Altered the '_extract_error' method to be a little more useful when things go south. [Daniel Lindsley] - Bumped version for previous commit. [polarcowz] - Added (optional but default) sanitizing for updates. This cleans the XML sent of control characters which cause Solr's XML parser to break. [polarcowz] - Fixed up a couple distribution bits. [polarcowz] - Added spellchecking support. [polarcowz] - Added timeouts (optional if httplib2 is installed). [polarcowz] - Fixed DATETIME_REGEX & _from_python to match Solr documentation. Thanks initcrash! [polarcowz] - Under some circumstances, Solr returns a regular data type instead of a string. Deal with it in _to_python as best as possible. [polarcowz] - Added '_to_python' method for converting data back to its native Python type. Backward compatible (requires manually calling). [polarcowz] - Updated pysolr to version 2.0. [polarcowz] New bits: * Now uses JSON instead of parsing XML. (jkocherhans) * Added support for passing many types of query parameters to Solr. (daniellindsley) * Added support for More Like This (requires Solr 1.3+). (daniellindsley) * Added support for highlighting. (daniellindsley) * Added support for faceting. (daniellindsley) Ought to be fairly backward-compatible (no known issues) but caution is advised when upgrading. Newly requires either the 'json' or 'simplejson' modules. 
- Added the stuff needed to easy_install pysolr. And a LICENSE, since I just made fun of another project for not having one. [jacob.kaplanmoss] - It would probably help if I imported the correct thing. [jkocherhans] - This is getting a bit hairy, but try to import ElementTree from lxml as well. [jkocherhans] - Use cElementTree if it's available. [jkocherhans] - Removed unused import. Thanks, jarek.zgoda. [jkocherhans] - Removed default values for start and rows from the search method. Thanks, jarek.zgoda. This will allow people to let solr determine what the default for those should be. [jkocherhans] - Added converters for float and decimal. This references Issue 1. Thanks, jarek.zgoda. [jkocherhans] - Fixed a bug for connections that don't specify a port number. [jkocherhans] - Fixed Python 2.5-ism. [jkocherhans] - Allowed for connections to solr instances that don't live at /solr. [jkocherhans] - Added multiValue field handling support. [jkocherhans] - Broke results out into a separate object with docs and hits attributes. [jkocherhans] - Fixed typo that caused breakage with python < 2.5. [jkocherhans] - Fixed a small typo. [jkocherhans] - Initial import of pysolr. [jkocherhans] - Initial directory structure. [(no author)] pysolr-3.6.0/LICENSE000066400000000000000000000030251300613105600140500ustar00rootroot00000000000000Copyright (c) Joseph Kocherhans, Jacob Kaplan-Moss, Daniel Lindsley. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of pysolr nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pysolr-3.6.0/MANIFEST.in000066400000000000000000000000711300613105600145770ustar00rootroot00000000000000include LICENSE include README.rst include CHANGELOG.rst pysolr-3.6.0/README.rst000066400000000000000000000113141300613105600145320ustar00rootroot00000000000000====== pysolr ====== ``pysolr`` is a lightweight Python wrapper for `Apache Solr`_. It provides an interface that queries the server and returns results based on the query. .. _`Apache Solr`: http://lucene.apache.org/solr/ Status ====== .. image:: https://secure.travis-ci.org/django-haystack/pysolr.png :target: https://secure.travis-ci.org/django-haystack/pysolr `Changelog `_ Features ======== * Basic operations such as selecting, updating & deleting. * Index optimization. * `"More Like This" `_ support (if set up in Solr). * `Spelling correction `_ (if set up in Solr). * Timeout support. 
* SolrCloud awareness Requirements ============ * Python 2.7 - 3.5 * Requests 2.0+ * **Optional** - ``simplejson`` * **Optional** - ``kazoo`` for SolrCloud mode Installation ============ ``sudo python setup.py install`` or drop the ``pysolr.py`` file anywhere on your PYTHONPATH. Usage ===== Basic usage looks like: .. code-block:: python # If on Python 2.X from __future__ import print_function import pysolr # Setup a Solr instance. The timeout is optional. solr = pysolr.Solr('http://localhost:8983/solr/', timeout=10) # How you'd index data. solr.add([ { "id": "doc_1", "title": "A test document", }, { "id": "doc_2", "title": "The Banana: Tasty or Dangerous?", }, ]) # Note that the add method has commit=True by default, so this is # immediately committed to your index. # Later, searching is easy. In the simple case, just a plain Lucene-style # query is fine. results = solr.search('bananas') # The ``Results`` object stores total results found, by default the top # ten most relevant results and any additional data like # facets/highlighting/spelling/etc. print("Saw {0} result(s).".format(len(results))) # Just loop over it to access the results. for result in results: print("The title is '{0}'.".format(result['title'])) # For a more advanced query, say involving highlighting, you can pass # additional options to Solr. results = solr.search('bananas', **{ 'hl': 'true', 'hl.fragsize': 10, }) # You can also perform More Like This searches, if your Solr is configured # correctly. similar = solr.more_like_this(q='id:doc_2', mltfl='text') # Finally, you can delete either individual documents... solr.delete(id='doc_1') # ...or all documents. solr.delete(q='*:*') .. code-block:: python # For SolrCloud mode, initialize your Solr like this: zookeeper = pysolr.ZooKeeper("zkhost1:2181,zkhost2:2181,zkhost3:2181") solr = pysolr.SolrCloud(zookeeper, "collection1") Multicore Index ~~~~~~~~~~~~~~~ Simply point the URL to the index core: .. code-block:: python # Setup a Solr instance. 
The timeout is optional. solr = pysolr.Solr('http://localhost:8983/solr/core_0/', timeout=10) Custom Request Handlers ~~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python # Setup a Solr instance. The trailing slash is optional. solr = pysolr.Solr('http://localhost:8983/solr/core_0/', search_handler='/autocomplete', use_qt_param=False) If ``use_qt_param`` is ``True`` it is essential that the name of the handler is exactly what is configured in ``solrconfig.xml``, including the leading slash if any (though with the ``qt`` parameter a leading slash is not a requirement by SOLR). If ``use_qt_param`` is ``False`` (default), the leading and trailing slashes can be omitted. If ``search_handler`` is not specified, pysolr will default to ``/select``. The handlers for MoreLikeThis, Update, Terms etc. all default to the values set in the ``solrconfig.xml`` SOLR ships with: ``mlt``, ``update``, ``terms`` etc. The specific methods of pysolr's ``Solr`` class (like ``more_like_this``, ``suggest_terms`` etc.) allow for a kwarg ``handler`` to override that value. This includes the ``search`` method. Setting a handler in ``search`` explicitly overrides the ``search_handler`` setting (if any). LICENSE ======= ``pysolr`` is licensed under the New BSD license. Running Tests ============= The ``run-tests.py`` script will automatically perform the steps below and is recommended for testing by default unless you need more control. 
Running a test Solr instance ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Downloading, configuring and running Solr 4 looks like this:: ./start-solr-test-server.sh Running the tests ~~~~~~~~~~~~~~~~~ The test suite requires the unittest2 library: Python 2:: python -m unittest2 tests Python 3:: python3 -m unittest tests pysolr-3.6.0/get-solr-download-url.py000077500000000000000000000035401300613105600175630ustar00rootroot00000000000000#!/usr/bin/env python # encoding: utf-8 from __future__ import absolute_import, division, print_function, unicode_literals from itertools import chain import sys import requests # Try to import urllib from the Python 3 reorganized stdlib first: try: from urllib.parse import urljoin except ImportError: try: from urlparse.parse import urljoin except ImportError: from urlparse import urljoin if len(sys.argv) != 2: print('Usage: %s SOLR_VERSION' % sys.argv[0], file=sys.stderr) sys.exit(1) solr_version = sys.argv[1] tarball = 'solr-{0}.tgz'.format(solr_version) dist_path = 'lucene/solr/{0}/{1}'.format(solr_version, tarball) download_url = urljoin('https://archive.apache.org/dist/', dist_path) mirror_response = requests.get("https://www.apache.org/dyn/mirrors/mirrors.cgi/%s?asjson=1" % dist_path) if not mirror_response.ok: print('Apache mirror request returned HTTP %d' % mirror_response.status_code, file=sys.stderr) sys.exit(1) mirror_data = mirror_response.json() # Since the Apache mirrors are often unreliable and releases may disappear without notice we'll # try the preferred mirror, all of the alternates and backups, and fall back to the main Apache # archive server: for base_url in chain((mirror_data['preferred'], ), mirror_data['http'], mirror_data['backup'], ('https://archive.apache.org/dist/', )): test_url = urljoin(base_url, mirror_data['path_info']) # The Apache mirror script's response format has recently changed to exclude the actual file paths: if not test_url.endswith(tarball): test_url = urljoin(test_url, dist_path) if requests.head(test_url, 
allow_redirects=True).status_code == 200: download_url = test_url break else: print('None of the Apache mirrors have %s' % dist_path, file=sys.stderr) sys.exit(1) print(download_url) pysolr-3.6.0/pysolr.py000066400000000000000000001264361300613105600147610ustar00rootroot00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, unicode_literals import ast import datetime import logging import os import random import re import time from xml.etree import ElementTree import requests try: from kazoo.client import KazooClient, KazooState except ImportError: KazooClient = KazooState = None try: # Prefer simplejson, if installed. import simplejson as json except ImportError: import json try: # Python 3.X from urllib.parse import urlencode except ImportError: # Python 2.X from urllib import urlencode try: # Python 3.X import html.entities as htmlentities except ImportError: # Python 2.X import htmlentitydefs as htmlentities try: # Python 3.X from http.client import HTTPException except ImportError: from httplib import HTTPException try: # Python 2.X unicode_char = unichr except NameError: # Python 3.X unicode_char = chr # Ugh. long = int __author__ = 'Daniel Lindsley, Joseph Kocherhans, Jacob Kaplan-Moss' __all__ = ['Solr'] __version__ = (3, 5, 0) def get_version(): return "%s.%s.%s" % __version__[:3] DATETIME_REGEX = re.compile('^(?P\d{4})-(?P\d{2})-(?P\d{2})T(?P\d{2}):(?P\d{2}):(?P\d{2})(\.\d+)?Z$') # dict key used to add nested documents to a document NESTED_DOC_KEY = '_childDocuments_' class NullHandler(logging.Handler): def emit(self, record): pass # Add the ``NullHandler`` to avoid logging by default while still allowing # others to attach their own handlers. LOG = logging.getLogger('pysolr') h = NullHandler() LOG.addHandler(h) # For debugging... 
if os.environ.get("DEBUG_PYSOLR", "").lower() in ("true", "1"): LOG.setLevel(logging.DEBUG) stream = logging.StreamHandler() LOG.addHandler(stream) def is_py3(): try: basestring return False except NameError: return True IS_PY3 = is_py3() def force_unicode(value): """ Forces a bytestring to become a Unicode string. """ if IS_PY3: # Python 3.X if isinstance(value, bytes): value = value.decode('utf-8', errors='replace') elif not isinstance(value, str): value = str(value) else: # Python 2.X if isinstance(value, str): value = value.decode('utf-8', 'replace') elif not isinstance(value, basestring): value = unicode(value) return value def force_bytes(value): """ Forces a Unicode string to become a bytestring. """ if IS_PY3: if isinstance(value, str): value = value.encode('utf-8', 'backslashreplace') else: if isinstance(value, unicode): value = value.encode('utf-8') return value def unescape_html(text): """ Removes HTML or XML character references and entities from a text string. @param text The HTML (or XML) source text. @return The plain text, as a Unicode string, if necessary. Source: http://effbot.org/zone/re-sub.htm#unescape-html """ def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": return unicode_char(int(text[3:-1], 16)) else: return unicode_char(int(text[2:-1])) except ValueError: pass else: # named entity try: text = unicode_char(htmlentities.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is return re.sub("&#?\w+;", fixup, text) def safe_urlencode(params, doseq=0): """ UTF-8-safe version of safe_urlencode The stdlib safe_urlencode prior to Python 3.x chokes on UTF-8 values which can't fail down to ascii. 
""" if IS_PY3: return urlencode(params, doseq) if hasattr(params, "items"): params = params.items() new_params = list() for k, v in params: k = k.encode("utf-8") if isinstance(v, (list, tuple)): new_params.append((k, [force_bytes(i) for i in v])) else: new_params.append((k, force_bytes(v))) return urlencode(new_params, doseq) def is_valid_xml_char_ordinal(i): """ Defines whether char is valid to use in xml document XML standard defines a valid char as:: Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] """ # conditions ordered by presumed frequency return ( 0x20 <= i <= 0xD7FF or i in (0x9, 0xA, 0xD) or 0xE000 <= i <= 0xFFFD or 0x10000 <= i <= 0x10FFFF ) def clean_xml_string(s): """ Cleans string from invalid xml chars Solution was found there:: http://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python """ return ''.join(c for c in s if is_valid_xml_char_ordinal(ord(c))) class SolrError(Exception): pass class Results(object): """ Default results class for wrapping decoded (from JSON) solr responses. Required ``decoded`` argument must be a Solr response dictionary. Individual documents can be retrieved either through ``docs`` attribute or by iterating over results instance. Example:: results = Results({ 'response': { 'docs': [{'id': 1}, {'id': 2}, {'id': 3}], 'numFound': 3, } }) # this: for doc in results: print doc # ... is equivalent to: for doc in results.docs: print doc # also: list(results) == results.docs Note that ``Results`` object does not support indexing and slicing. If you need to retrieve documents by index just use ``docs`` attribute. Other common response metadata (debug, highlighting, qtime, etc.) are available as attributes. The full response from Solr is provided as the `raw_response` dictionary for use with features which change the response format. 
""" def __init__(self, decoded): self.raw_response = decoded # main response part of decoded Solr response response_part = decoded.get('response') or {} self.docs = response_part.get('docs', ()) self.hits = response_part.get('numFound', 0) # other response metadata self.debug = decoded.get('debug', {}) self.highlighting = decoded.get('highlighting', {}) self.facets = decoded.get('facet_counts', {}) self.spellcheck = decoded.get('spellcheck', {}) self.stats = decoded.get('stats', {}) self.qtime = decoded.get('responseHeader', {}).get('QTime', None) self.grouped = decoded.get('grouped', {}) self.nextCursorMark = decoded.get('nextCursorMark', None) def __len__(self): return len(self.docs) def __iter__(self): return iter(self.docs) class Solr(object): """ The main object for working with Solr. Optionally accepts ``decoder`` for an alternate JSON decoder instance. Default is ``json.JSONDecoder()``. Optionally accepts ``timeout`` for wait seconds until giving up on a request. Default is ``60`` seconds. Optionally accepts ``results_cls`` that specifies class of results object returned by ``.search()`` and ``.more_like_this()`` methods. Default is ``pysolr.Results``. Usage:: solr = pysolr.Solr('http://localhost:8983/solr') # With a 10 second timeout. 
solr = pysolr.Solr('http://localhost:8983/solr', timeout=10) # with a dict as a default results class instead of pysolr.Results solr = pysolr.Solr('http://localhost:8983/solr', results_cls=dict) """ def __init__(self, url, decoder=None, timeout=60, results_cls=Results, search_handler='select', use_qt_param=False): self.decoder = decoder or json.JSONDecoder() self.url = url self.timeout = timeout self.log = self._get_log() self.session = None self.results_cls = results_cls self.search_handler = search_handler self.use_qt_param = use_qt_param def get_session(self): if self.session is None: self.session = requests.Session() self.session.stream = False return self.session def _get_log(self): return LOG def _create_full_url(self, path=''): if len(path): return '/'.join([self.url.rstrip('/'), path.lstrip('/')]) # No path? No problem. return self.url def _send_request(self, method, path='', body=None, headers=None, files=None): url = self._create_full_url(path) method = method.lower() log_body = body if headers is None: headers = {} if log_body is None: log_body = '' elif not isinstance(log_body, str): log_body = repr(body) self.log.debug("Starting request to '%s' (%s) with body '%s'...", url, method, log_body[:10]) start_time = time.time() session = self.get_session() try: requests_method = getattr(session, method) except AttributeError as err: raise SolrError("Unable to use unknown HTTP method '{0}.".format(method)) # Everything except the body can be Unicode. The body must be # encoded to bytes to work properly on Py3. 
bytes_body = body if bytes_body is not None: bytes_body = force_bytes(body) try: resp = requests_method(url, data=bytes_body, headers=headers, files=files, timeout=self.timeout) except requests.exceptions.Timeout as err: error_message = "Connection to server '%s' timed out: %s" self.log.error(error_message, url, err, exc_info=True) raise SolrError(error_message % (url, err)) except requests.exceptions.ConnectionError as err: error_message = "Failed to connect to server at '%s', are you sure that URL is correct? Checking it in a browser might help: %s" params = (url, err) self.log.error(error_message, *params, exc_info=True) raise SolrError(error_message % params) except HTTPException as err: error_message = "Unhandled error: %s %s: %s" self.log.error(error_message, method, url, err, exc_info=True) raise SolrError(error_message % (method, url, err)) end_time = time.time() self.log.info("Finished '%s' (%s) with body '%s' in %0.3f seconds, with status %s", url, method, log_body[:10], end_time - start_time, resp.status_code) if int(resp.status_code) != 200: error_message = "Solr responded with an error (HTTP %s): %s" solr_message = self._extract_error(resp) self.log.error(error_message, resp.status_code, solr_message, extra={'data': {'headers': resp.headers, 'response': resp.content, 'request_body': bytes_body, 'request_headers': headers}}) raise SolrError(error_message % (resp.status_code, solr_message)) return force_unicode(resp.content) def _select(self, params, handler=None): """ :param params: :param handler: defaults to self.search_handler (fallback to 'select') :return: """ # specify json encoding of results params['wt'] = 'json' custom_handler = handler or self.search_handler handler = 'select' if custom_handler: if self.use_qt_param: params['qt'] = custom_handler else: handler = custom_handler params_encoded = safe_urlencode(params, True) if len(params_encoded) < 1024: # Typical case. 
path = '%s/?%s' % (handler, params_encoded) return self._send_request('get', path) else: # Handles very long queries by submitting as a POST. path = '%s/' % handler headers = { 'Content-type': 'application/x-www-form-urlencoded; charset=utf-8', } return self._send_request('post', path, body=params_encoded, headers=headers) def _mlt(self, params, handler='mlt'): return self._select(params, handler) def _suggest_terms(self, params, handler='terms'): return self._select(params, handler) def _update(self, message, clean_ctrl_chars=True, commit=True, softCommit=False, waitFlush=None, waitSearcher=None, overwrite=None, handler='update'): """ Posts the given xml message to http:///update and returns the result. Passing `clean_ctrl_chars` as False will prevent the message from being cleaned of control characters (default True). This is done by default because these characters would cause Solr to fail to parse the XML. Only pass False if you're positive your data is clean. """ # Per http://wiki.apache.org/solr/UpdateXmlMessages, we can append a # ``commit=true`` to the URL and have the commit happen without a # second request. query_vars = [] path_handler = handler if self.use_qt_param: path_handler = 'select' query_vars.append('qt=%s' % safe_urlencode(handler, True)) path = '%s/' % path_handler if commit: query_vars.append('commit=%s' % str(bool(commit)).lower()) elif softCommit: query_vars.append('softCommit=%s' % str(bool(softCommit)).lower()) if waitFlush is not None: query_vars.append('waitFlush=%s' % str(bool(waitFlush)).lower()) if overwrite is not None: query_vars.append('overwrite=%s' % str(bool(overwrite)).lower()) if waitSearcher is not None: query_vars.append('waitSearcher=%s' % str(bool(waitSearcher)).lower()) if query_vars: path = '%s?%s' % (path, '&'.join(query_vars)) # Clean the message of ctrl characters. 
if clean_ctrl_chars: message = sanitize(message) return self._send_request('post', path, message, {'Content-type': 'text/xml; charset=utf-8'}) def _extract_error(self, resp): """ Extract the actual error message from a solr response. """ reason = resp.headers.get('reason', None) full_response = None if reason is None: try: # if response is in json format reason = resp.json()['error']['msg'] except KeyError: # if json response has unexpected structure full_response = resp.content except ValueError: # otherwise we assume it's html reason, full_html = self._scrape_response(resp.headers, resp.content) full_response = unescape_html(full_html) msg = "[Reason: %s]" % reason if reason is None: msg += "\n%s" % full_response return msg def _scrape_response(self, headers, response): """ Scrape the html response. """ # identify the responding server server_type = None server_string = headers.get('server', '') if server_string and 'jetty' in server_string.lower(): server_type = 'jetty' if server_string and 'coyote' in server_string.lower(): server_type = 'tomcat' reason = None full_html = '' dom_tree = None # In Python3, response can be made of bytes if IS_PY3 and hasattr(response, 'decode'): response = response.decode() if response.startswith(']*>\s*(.+?)\s*', response, re.IGNORECASE) if m: reason = m.group(2) else: full_html = "%s" % response else: # Let's assume others do produce a valid XML response try: dom_tree = ElementTree.fromstring(response) reason_node = None # html page might be different for every server if server_type == 'jetty': reason_node = dom_tree.find('body/pre') else: reason_node = dom_tree.find('head/title') if reason_node is not None: reason = reason_node.text if reason is None: full_html = ElementTree.tostring(dom_tree) except SyntaxError as err: LOG.warning('Unable to extract error message from invalid XML: %s', err, extra={'data': {'response': response}}) full_html = "%s" % response full_html = force_unicode(full_html) full_html = 
full_html.replace('\n', '') full_html = full_html.replace('\r', '') full_html = full_html.replace('
', '') full_html = full_html.replace('
', '') full_html = full_html.strip() return reason, full_html # Conversion ############################################################# def _from_python(self, value): """ Converts python values to a form suitable for insertion into the xml we send to solr. """ if hasattr(value, 'strftime'): if hasattr(value, 'hour'): offset = value.utcoffset() if offset: value = value - offset value = value.replace(tzinfo=None).isoformat() + 'Z' else: value = "%sT00:00:00Z" % value.isoformat() elif isinstance(value, bool): if value: value = 'true' else: value = 'false' else: if IS_PY3: # Python 3.X if isinstance(value, bytes): value = str(value, errors='replace') else: # Python 2.X if isinstance(value, str): value = unicode(value, errors='replace') value = "{0}".format(value) return clean_xml_string(value) def _to_python(self, value): """ Converts values from Solr to native Python values. """ if isinstance(value, (int, float, long, complex)): return value if isinstance(value, (list, tuple)): value = value[0] if value == 'true': return True elif value == 'false': return False is_string = False if IS_PY3: if isinstance(value, bytes): value = force_unicode(value) if isinstance(value, str): is_string = True else: if isinstance(value, str): value = force_unicode(value) if isinstance(value, basestring): is_string = True if is_string: possible_datetime = DATETIME_REGEX.search(value) if possible_datetime: date_values = possible_datetime.groupdict() for dk, dv in date_values.items(): date_values[dk] = int(dv) return datetime.datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second']) try: # This is slightly gross but it's hard to tell otherwise what the # string's original type might have been. return ast.literal_eval(value) except (ValueError, SyntaxError): # If it fails, continue on. pass return value def _is_null_value(self, value): """ Check if a given value is ``null``. 
Criteria for this is based on values that shouldn't be included in the Solr ``add`` request at all. """ if value is None: return True if IS_PY3: # Python 3.X if isinstance(value, str) and len(value) == 0: return True else: # Python 2.X if isinstance(value, basestring) and len(value) == 0: return True # TODO: This should probably be removed when solved in core Solr level? return False # API Methods ############################################################ def search(self, q, search_handler=None, **kwargs): """ Performs a search and returns the results. Requires a ``q`` for a string version of the query to run. Optionally accepts ``**kwargs`` for additional options to be passed through the Solr URL. Returns ``self.results_cls`` class object (defaults to ``pysolr.Results``) Usage:: # All docs. results = solr.search('*:*') # Search with highlighting. results = solr.search('ponies', **{ 'hl': 'true', 'hl.fragsize': 10, }) """ params = {'q': q} params.update(kwargs) response = self._select(params, handler=search_handler) decoded = self.decoder.decode(response) self.log.debug( "Found '%s' search results.", # cover both cases: there is no response key or value is None (decoded.get('response', {}) or {}).get('numFound', 0) ) return self.results_cls(decoded) def more_like_this(self, q, mltfl, handler='mlt', **kwargs): """ Finds and returns results similar to the provided query. Returns ``self.results_cls`` class object (defaults to ``pysolr.Results``) Requires Solr 1.3+. 
Usage:: similar = solr.more_like_this('id:doc_234', 'text') """ params = { 'q': q, 'mlt.fl': mltfl, } params.update(kwargs) response = self._mlt(params, handler=handler) decoded = self.decoder.decode(response) self.log.debug( "Found '%s' MLT results.", # cover both cases: there is no response key or value is None (decoded.get('response', {}) or {}).get('numFound', 0) ) return self.results_cls(decoded) def suggest_terms(self, fields, prefix, handler='terms', **kwargs): """ Accepts a list of field names and a prefix Returns a dictionary keyed on field name containing a list of ``(term, count)`` pairs Requires Solr 1.4+. """ params = { 'terms.fl': fields, 'terms.prefix': prefix, } params.update(kwargs) response = self._suggest_terms(params, handler=handler) result = self.decoder.decode(response) terms = result.get("terms", {}) res = {} # in Solr 1.x the value of terms is a flat list: # ["field_name", ["dance",23,"dancers",10,"dancing",8,"dancer",6]] # # in Solr 3.x the value of terms is a dict: # {"field_name": ["dance",23,"dancers",10,"dancing",8,"dancer",6]} if isinstance(terms, (list, tuple)): terms = dict(zip(terms[0::2], terms[1::2])) for field, values in terms.items(): tmp = list() while values: tmp.append((values.pop(0), values.pop(0))) res[field] = tmp self.log.debug("Found '%d' Term suggestions results.", sum(len(j) for i, j in res.items())) return res def _build_doc(self, doc, boost=None, fieldUpdates=None): doc_elem = ElementTree.Element('doc') for key, value in doc.items(): if key == NESTED_DOC_KEY: for child in value: doc_elem.append(self._build_doc(child, boost, fieldUpdates)) continue if key == 'boost': doc_elem.set('boost', force_unicode(value)) continue # To avoid multiple code-paths we'd like to treat all of our values as iterables: if isinstance(value, (list, tuple)): values = value else: values = (value, ) for bit in values: if self._is_null_value(bit): continue attrs = {'name': key} if fieldUpdates and key in fieldUpdates: attrs['update'] = 
fieldUpdates[key] if boost and key in boost: attrs['boost'] = force_unicode(boost[key]) field = ElementTree.Element('field', **attrs) field.text = self._from_python(bit) doc_elem.append(field) return doc_elem def add(self, docs, boost=None, fieldUpdates=None, commit=True, softCommit=False, commitWithin=None, waitFlush=None, waitSearcher=None, overwrite=None, handler='update'): """ Adds or updates documents. Requires ``docs``, which is a list of dictionaries. Each key is the field name and each value is the value to index. Optionally accepts ``commit``. Default is ``True``. Optionally accepts ``softCommit``. Default is ``False``. Optionally accepts ``boost``. Default is ``None``. Optionally accepts ``fieldUpdates``. Default is ``None``. Optionally accepts ``commitWithin``. Default is ``None``. Optionally accepts ``waitFlush``. Default is ``None``. Optionally accepts ``waitSearcher``. Default is ``None``. Optionally accepts ``overwrite``. Default is ``None``. Usage:: solr.add([ { "id": "doc_1", "title": "A test document", }, { "id": "doc_2", "title": "The Banana: Tasty or Dangerous?", }, ]) """ start_time = time.time() self.log.debug("Starting to build add request...") message = ElementTree.Element('add') if commitWithin: message.set('commitWithin', commitWithin) for doc in docs: el = self._build_doc(doc, boost=boost, fieldUpdates=fieldUpdates) message.append(el) # This returns a bytestring. Ugh. m = ElementTree.tostring(message, encoding='utf-8') # Convert back to Unicode please. m = force_unicode(m) end_time = time.time() self.log.debug("Built add request of %s docs in %0.2f seconds.", len(message), end_time - start_time) return self._update(m, commit=commit, softCommit=softCommit, waitFlush=waitFlush, waitSearcher=waitSearcher, overwrite=overwrite, handler=handler) def delete(self, id=None, q=None, commit=True, softCommit=False, waitFlush=None, waitSearcher=None, handler='update'): """ Deletes documents. Requires *either* ``id`` or ``query``. 
``id`` is if you know the specific document id to remove. ``query`` is a Lucene-style query indicating a collection of documents to delete. Optionally accepts ``commit``. Default is ``True``. Optionally accepts ``softCommit``. Default is ``False``. Optionally accepts ``waitFlush``. Default is ``None``. Optionally accepts ``waitSearcher``. Default is ``None``. Usage:: solr.delete(id='doc_12') solr.delete(q='*:*') """ if id is None and q is None: raise ValueError('You must specify "id" or "q".') elif id is not None and q is not None: raise ValueError('You many only specify "id" OR "q", not both.') elif id is not None: m = '%s' % id elif q is not None: m = '%s' % q return self._update(m, commit=commit, softCommit=softCommit, waitFlush=waitFlush, waitSearcher=waitSearcher, handler=handler) def commit(self, softCommit=False, waitFlush=None, waitSearcher=None, expungeDeletes=None, handler='update'): """ Forces Solr to write the index data to disk. Optionally accepts ``expungeDeletes``. Default is ``None``. Optionally accepts ``waitFlush``. Default is ``None``. Optionally accepts ``waitSearcher``. Default is ``None``. Optionally accepts ``softCommit``. Default is ``False``. Usage:: solr.commit() """ if expungeDeletes is not None: msg = '' % str(bool(expungeDeletes)).lower() else: msg = '' return self._update(msg, commit=not softCommit, softCommit=softCommit, waitFlush=waitFlush, waitSearcher=waitSearcher, handler=handler) def optimize(self, commit=True, waitFlush=None, waitSearcher=None, maxSegments=None, handler='update'): """ Tells Solr to streamline the number of segments used, essentially a defragmentation operation. Optionally accepts ``maxSegments``. Default is ``None``. Optionally accepts ``waitFlush``. Default is ``None``. Optionally accepts ``waitSearcher``. Default is ``None``. 
Usage:: solr.optimize() """ if maxSegments: msg = '' % maxSegments else: msg = '' return self._update(msg, commit=commit, waitFlush=waitFlush, waitSearcher=waitSearcher, handler=handler) def extract(self, file_obj, extractOnly=True, handler='update/extract', **kwargs): """ POSTs a file to the Solr ExtractingRequestHandler so rich content can be processed using Apache Tika. See the Solr wiki for details: http://wiki.apache.org/solr/ExtractingRequestHandler The ExtractingRequestHandler has a very simple model: it extracts contents and metadata from the uploaded file and inserts it directly into the index. This is rarely useful as it allows no way to store additional data or otherwise customize the record. Instead, by default we'll use the extract-only mode to extract the data without indexing it so the caller has the opportunity to process it as appropriate; call with ``extractOnly=False`` if you want to insert with no additional processing. Returns None if metadata cannot be extracted; otherwise returns a dictionary containing at least two keys: :contents: Extracted full-text content, if applicable :metadata: key:value pairs of text strings """ if not hasattr(file_obj, "name"): raise ValueError("extract() requires file-like objects which have a defined name property") params = { "extractOnly": "true" if extractOnly else "false", "lowernames": "true", "wt": "json", } params.update(kwargs) try: # We'll provide the file using its true name as Tika may use that # as a file type hint: resp = self._send_request('post', handler, body=params, files={'file': (file_obj.name, file_obj)}) except (IOError, SolrError) as err: self.log.error("Failed to extract document metadata: %s", err, exc_info=True) raise try: data = json.loads(resp) except ValueError as err: self.log.error("Failed to load JSON response: %s", err, exc_info=True) raise data['contents'] = data.pop(file_obj.name, None) data['metadata'] = metadata = {} raw_metadata = data.pop("%s_metadata" % file_obj.name, None) 
if raw_metadata: # The raw format is somewhat annoying: it's a flat list of # alternating keys and value lists while raw_metadata: metadata[raw_metadata.pop()] = raw_metadata.pop() return data class SolrCoreAdmin(object): """ Handles core admin operations: see http://wiki.apache.org/solr/CoreAdmin This must be initialized with the full admin cores URL:: solr_admin = SolrCoreAdmin('http://localhost:8983/solr/admin/cores') status = solr_admin.status() Operations offered by Solr are: 1. STATUS 2. CREATE 3. RELOAD 4. RENAME 5. ALIAS 6. SWAP 7. UNLOAD 8. LOAD (not currently implemented) """ def __init__(self, url, *args, **kwargs): super(SolrCoreAdmin, self).__init__(*args, **kwargs) self.url = url def _get_url(self, url, params={}, headers={}): resp = requests.get(url, data=safe_urlencode(params), headers=headers) return force_unicode(resp.content) def status(self, core=None): """http://wiki.apache.org/solr/CoreAdmin#head-9be76f5a459882c5c093a7a1456e98bea7723953""" params = { 'action': 'STATUS', } if core is not None: params.update(core=core) return self._get_url(self.url, params=params) def create(self, name, instance_dir=None, config='solrconfig.xml', schema='schema.xml'): """http://wiki.apache.org/solr/CoreAdmin#head-7ca1b98a9df8b8ca0dcfbfc49940ed5ac98c4a08""" params = { 'action': 'CREATE', 'name': name, 'config': config, 'schema': schema, } if instance_dir is None: params.update(instanceDir=name) else: params.update(instanceDir=instance_dir) return self._get_url(self.url, params=params) def reload(self, core): """http://wiki.apache.org/solr/CoreAdmin#head-3f125034c6a64611779442539812067b8b430930""" params = { 'action': 'RELOAD', 'core': core, } return self._get_url(self.url, params=params) def rename(self, core, other): """http://wiki.apache.org/solr/CoreAdmin#head-9473bee1abed39e8583ba45ef993bebb468e3afe""" params = { 'action': 'RENAME', 'core': core, 'other': other, } return self._get_url(self.url, params=params) def swap(self, core, other): 
"""http://wiki.apache.org/solr/CoreAdmin#head-928b872300f1b66748c85cebb12a59bb574e501b""" params = { 'action': 'SWAP', 'core': core, 'other': other, } return self._get_url(self.url, params=params) def unload(self, core): """http://wiki.apache.org/solr/CoreAdmin#head-f5055a885932e2c25096a8856de840b06764d143""" params = { 'action': 'UNLOAD', 'core': core, } return self._get_url(self.url, params=params) def load(self, core): raise NotImplementedError('Solr 1.4 and below do not support this operation.') # Using two-tuples to preserve order. REPLACEMENTS = ( # Nuke nasty control characters. (b'\x00', b''), # Start of heading (b'\x01', b''), # Start of heading (b'\x02', b''), # Start of text (b'\x03', b''), # End of text (b'\x04', b''), # End of transmission (b'\x05', b''), # Enquiry (b'\x06', b''), # Acknowledge (b'\x07', b''), # Ring terminal bell (b'\x08', b''), # Backspace (b'\x0b', b''), # Vertical tab (b'\x0c', b''), # Form feed (b'\x0e', b''), # Shift out (b'\x0f', b''), # Shift in (b'\x10', b''), # Data link escape (b'\x11', b''), # Device control 1 (b'\x12', b''), # Device control 2 (b'\x13', b''), # Device control 3 (b'\x14', b''), # Device control 4 (b'\x15', b''), # Negative acknowledge (b'\x16', b''), # Synchronous idle (b'\x17', b''), # End of transmission block (b'\x18', b''), # Cancel (b'\x19', b''), # End of medium (b'\x1a', b''), # Substitute character (b'\x1b', b''), # Escape (b'\x1c', b''), # File separator (b'\x1d', b''), # Group separator (b'\x1e', b''), # Record separator (b'\x1f', b''), # Unit separator ) def sanitize(data): fixed_string = force_bytes(data) for bad, good in REPLACEMENTS: fixed_string = fixed_string.replace(bad, good) return force_unicode(fixed_string) class SolrCloud(Solr): def __init__(self, zookeeper, collection, decoder=None, timeout=60, retry_timeout=0.2, *args, **kwargs): url = zookeeper.getRandomURL(collection) super(SolrCloud, self).__init__(url, decoder=decoder, timeout=timeout, *args, **kwargs) self.zookeeper = zookeeper 
self.collection = collection self.retry_timeout = retry_timeout def _randomized_request(self, method, path, body, headers, files): self.url = self.zookeeper.getRandomURL(self.collection) LOG.debug('Using random URL: %s', self.url) return Solr._send_request(self, method, path, body, headers, files) def _send_request(self, method, path='', body=None, headers=None, files=None): # FIXME: this needs to have a maximum retry counter rather than waiting endlessly try: return self._randomized_request(method, path, body, headers, files) except requests.exceptions.RequestException: LOG.warning('RequestException, retrying after %fs', self.retry_timeout, exc_info=True) time.sleep(self.retry_timeout) # give zookeeper time to notice return self._randomized_request(method, path, body, headers, files) except SolrError: LOG.warning('SolrException, retrying after %fs', self.retry_timeout, exc_info=True) time.sleep(self.retry_timeout) # give zookeeper time to notice return self._randomized_request(method, path, body, headers, files) def _update(self, *args, **kwargs): self.url = self.zookeeper.getLeaderURL(self.collection) LOG.debug('Using random leader URL: %s', self.url) return Solr._update(self, *args, **kwargs) class ZooKeeper(object): # Constants used by the REST API: LIVE_NODES_ZKNODE = '/live_nodes' ALIASES = '/aliases.json' CLUSTER_STATE = '/clusterstate.json' SHARDS = 'shards' REPLICAS = 'replicas' STATE = 'state' ACTIVE = 'active' LEADER = 'leader' BASE_URL = 'base_url' TRUE = 'true' FALSE = 'false' COLLECTION = 'collection' def __init__(self, zkServerAddress, timeout=15, max_retries=-1, kazoo_client=None): if KazooClient is None: logging.error('ZooKeeper requires the `kazoo` library to be installed') raise RuntimeError self.collections = {} self.liveNodes = {} self.aliases = {} self.state = None if kazoo_client is None: self.zk = KazooClient(zkServerAddress, read_only=True, timeout=timeout, command_retry={'max_tries': max_retries}, connection_retry={'max_tries': 
max_retries}) else: self.zk = kazoo_client self.zk.start() def connectionListener(state): if state == KazooState.LOST: self.state = state elif state == KazooState.SUSPENDED: self.state = state self.zk.add_listener(connectionListener) @self.zk.DataWatch(ZooKeeper.CLUSTER_STATE) def watchClusterState(data, *args, **kwargs): if not data: LOG.warning("No cluster state available: no collections defined?") else: self.collections = json.loads(data.decode('utf-8')) LOG.info('Updated collections: %s', self.collections) @self.zk.ChildrenWatch(ZooKeeper.LIVE_NODES_ZKNODE) def watchLiveNodes(children): self.liveNodes = children LOG.info("Updated live nodes: %s", children) @self.zk.DataWatch(ZooKeeper.ALIASES) def watchAliases(data, stat): if data: json_data = json.loads(data.decode('utf-8')) if ZooKeeper.COLLECTION in json_data: self.aliases = json_data[ZooKeeper.COLLECTION] else: LOG.warning('Expected to find %s in alias update %s', ZooKeeper.COLLECTION, json_data.keys()) else: self.aliases = None LOG.info("Updated aliases: %s", self.aliases) def getHosts(self, collname, only_leader=False, seen_aliases=None): if self.aliases and collname in self.aliases: return self.getAliasHosts(collname, only_leader, seen_aliases) hosts = [] if collname not in self.collections: raise SolrError("Unknown collection: %s", collname) collection = self.collections[collname] shards = collection[ZooKeeper.SHARDS] for shardname in shards.keys(): shard = shards[shardname] if shard[ZooKeeper.STATE] == ZooKeeper.ACTIVE: replicas = shard[ZooKeeper.REPLICAS] for replicaname in replicas.keys(): replica = replicas[replicaname] if replica[ZooKeeper.STATE] == ZooKeeper.ACTIVE: if not only_leader or (replica.get(ZooKeeper.LEADER, None) == ZooKeeper.TRUE): base_url = replica[ZooKeeper.BASE_URL] if base_url not in hosts: hosts.append(base_url) return hosts def getAliasHosts(self, collname, only_leader, seen_aliases): if seen_aliases: if collname in seen_aliases: LOG.warn("%s in circular alias definition - 
ignored", collname) return [] else: seen_aliases = [] seen_aliases.append(collname) collections = self.aliases[collname].split(",") hosts = [] for collection in collections: for host in self.getHosts(collection, only_leader, seen_aliases): if host not in hosts: hosts.append(host) return hosts def getRandomURL(self, collname, only_leader=False): hosts = self.getHosts(collname, only_leader=only_leader) if not hosts: raise SolrError('ZooKeeper returned no active shards!') return '%s/%s' % (random.choice(hosts), collname) def getLeaderURL(self, collname): return self.getRandomURL(collname, only_leader=True) pysolr-3.6.0/run-tests.py000077500000000000000000000012731300613105600153670ustar00rootroot00000000000000#!/usr/bin/env python # encoding: utf-8 from __future__ import absolute_import, print_function, unicode_literals import signal import traceback import unittest from tests import utils as test_utils def main(): try: import faulthandler faulthandler.register(signal.SIGUSR1, all_threads=True) print('Installed SIGUSR1 handler to print stack traces: pkill -USR1 -f run-tests') except ImportError: pass test_utils.prepare() test_utils.start_solr() try: unittest.main(module='tests', verbosity=1) finally: print('Tests complete; halting Solr servers…') test_utils.stop_solr() if __name__ == "__main__": main() pysolr-3.6.0/setup.cfg000066400000000000000000000003211300613105600146600ustar00rootroot00000000000000[wheel] universal = 1 [pep8] ignore = W503 [pylama] ignore = D400,D401 [isort] combine_as_imports = true default_section = THIRDPARTY known_first_party = pysolr multi_line_output = 0 not_skip = __init__.py pysolr-3.6.0/setup.py000066400000000000000000000017521300613105600145620ustar00rootroot00000000000000try: from setuptools import setup except ImportError: from distutils.core import setup setup( name="pysolr", version="3.6.0", description="Lightweight python wrapper for Apache Solr.", author='Daniel Lindsley', author_email='daniel@toastdriven.com', 
long_description=open('README.rst', 'r').read(), py_modules=[ 'pysolr' ], classifiers=[ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'License :: OSI Approved :: BSD License', 'Operating System :: OS Independent', 'Programming Language :: Python', 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 3', ], url='https://github.com/django-haystack/pysolr/', license='BSD', install_requires=[ 'requests>=2.9.1' ], extras_require={ 'solrcloud': [ 'kazoo==2.2' ] } ) pysolr-3.6.0/start-solr-test-server.sh000077500000000000000000000135711300613105600200040ustar00rootroot00000000000000#!/bin/bash set -e # Redirect output to log files when stdin is not a TTY: if [ ! -t 0 ]; then exec 1>test-solr.stdout.log 2>test-solr.stderr.log fi SOLR_VERSION=4.10.4 ROOT=$(cd `dirname $0`; pwd) APP=$ROOT/solr-app PIDS=$ROOT/solr.pids export SOLR_ARCHIVE="solr-${SOLR_VERSION}.tgz" LOGS=$ROOT/logs cd $ROOT function download_solr() { if [ -d "${HOME}/download-cache/" ]; then export SOLR_ARCHIVE="${HOME}/download-cache/${SOLR_ARCHIVE}" fi if [ -f ${SOLR_ARCHIVE} ]; then # If the tarball doesn't extract cleanly, remove it so it'll download again: tar -tf ${SOLR_ARCHIVE} > /dev/null || rm ${SOLR_ARCHIVE} fi if [ ! 
-f ${SOLR_ARCHIVE} ]; then SOLR_DOWNLOAD_URL=$(python get-solr-download-url.py $SOLR_VERSION) curl -Lo $SOLR_ARCHIVE ${SOLR_DOWNLOAD_URL} || (echo "Unable to download ${SOLR_DOWNLOAD_URL}"; exit 2) fi } function extract_solr() { APP=solr-app echo "Extracting Solr ${SOLR_VERSION} to `pwd`/$APP" rm -rf $APP mkdir $APP tar -C $APP -xf ${SOLR_ARCHIVE} --strip-components 1 solr-${SOLR_VERSION} } function prepare_solr_home() { SOLR_HOME=$1 HOST=$2 echo "Preparing SOLR_HOME at $SOLR_HOME for host $HOST" APP=$(pwd)/solr-app mkdir -p ${SOLR_HOME} cp solr-app/example/solr/solr.xml ${SOLR_HOME}/ cp solr-app/example/solr/zoo.cfg ${SOLR_HOME}/ } function prepare_core() { SOLR_HOME=$1 CORE=$2 echo "Preparing core $CORE" CORE_DIR=${SOLR_HOME}/${CORE} mkdir -p ${CORE_DIR} cp -r solr-app/example/solr/collection1/conf ${CORE_DIR}/ perl -p -i -e 's|\n \n\n\n