From 48a63b751424fce6737dd9a9f8b7e7a409e5720d Mon Sep 17 00:00:00 2001 From: Michael Fladischer Date: Tue, 7 Jul 2015 22:12:20 +0200 Subject: [PATCH] django-haystack (2.4.0-2) unstable; urgency=low * Switch buildsystem to pybuild. * Add Python3 support through a separate package. * Add lintian override for missing upstream changelog. # imported from the archive --- AUTHORS | 106 ++ LICENSE | 31 + MANIFEST.in | 5 + PKG-INFO | 79 ++ README.rst | 59 + debian/changelog | 44 + debian/clean | 6 + debian/compat | 1 + debian/control | 71 ++ debian/copyright | 55 + debian/python-django-haystack-doc.doc-base | 8 + debian/python-django-haystack-doc.docs | 1 + ...thon-django-haystack-doc.lintian-overrides | 2 + debian/python-django-haystack.docs | 1 + .../python-django-haystack.lintian-overrides | 2 + debian/python3-django-haystack.docs | 1 + .../python3-django-haystack.lintian-overrides | 2 + debian/rules | 15 + debian/source/format | 1 + debian/source/lintian-overrides | 2 + debian/watch | 3 + django_haystack.egg-info/PKG-INFO | 79 ++ django_haystack.egg-info/SOURCES.txt | 99 ++ django_haystack.egg-info/dependency_links.txt | 1 + django_haystack.egg-info/not-zip-safe | 1 + django_haystack.egg-info/pbr.json | 1 + django_haystack.egg-info/requires.txt | 1 + django_haystack.egg-info/top_level.txt | 1 + docs/Makefile | 80 ++ docs/_build/.gitignore | 0 docs/_static/.gitignore | 0 docs/_templates/.gitignore | 0 docs/admin.rst | 47 + docs/architecture_overview.rst | 66 ++ docs/autocomplete.rst | 220 ++++ docs/backend_support.rst | 127 ++ docs/best_practices.rst | 263 +++++ docs/boost.rst | 123 ++ docs/conf.py | 207 ++++ docs/contributing.rst | 132 +++ docs/creating_new_backends.rst | 34 + docs/debugging.rst | 107 ++ docs/faceting.rst | 328 ++++++ docs/faq.rst | 117 ++ docs/glossary.rst | 76 ++ docs/haystack_theme/layout.html | 22 + docs/haystack_theme/static/documentation.css | 29 + docs/haystack_theme/theme.conf | 2 + docs/highlighting.rst | 77 ++ docs/index.rst | 117 ++ docs/inputtypes.rst | 177 +++ docs/installing_search_engines.rst | 222 ++++ docs/management_commands.rst | 201 ++++ docs/migration_from_1_to_2.rst | 285 +++++ docs/multiple_index.rst | 201 ++++ docs/other_apps.rst | 98 ++ docs/python3.rst | 47 + docs/rich_content_extraction.rst | 68 ++ docs/running_tests.rst | 70 ++ docs/searchbackend_api.rst | 124 ++ docs/searchfield_api.rst | 262 +++++ docs/searchindex_api.rst | 618 ++++++++++ docs/searchquery_api.rst | 336 ++++++ docs/searchqueryset_api.rst | 893 ++++++++++++++ docs/searchresult_api.rst | 62 + docs/settings.rst | 289 +++++ docs/signal_processors.rst | 117 ++ docs/spatial.rst | 412 +++++++ docs/templatetags.rst | 68 ++ docs/toc.rst | 53 + docs/tutorial.rst | 398 +++++++ docs/utils.rst | 18 + docs/views_and_forms.rst | 408 +++++++ docs/who_uses.rst | 357 ++++++ haystack/__init__.py | 71 ++ haystack/admin.py | 163 +++ haystack/backends/__init__.py | 1041 +++++++++++++++++ haystack/backends/elasticsearch_backend.py | 944 +++++++++++++++ haystack/backends/simple_backend.py | 135 +++ haystack/backends/solr_backend.py | 718 ++++++++++++ haystack/backends/whoosh_backend.py | 916 +++++++++++++++ haystack/constants.py | 33 + haystack/exceptions.py | 53 + haystack/fields.py | 441 +++++++ haystack/forms.py | 133 +++ haystack/generic_views.py | 126 ++ haystack/indexes.py | 497 ++++++++ haystack/inputs.py | 159 +++ haystack/management/__init__.py | 0 haystack/management/commands/__init__.py | 0 .../management/commands/build_solr_schema.py | 70 ++ haystack/management/commands/clear_index.py | 59 + 
haystack/management/commands/haystack_info.py | 21 + haystack/management/commands/rebuild_index.py | 26 + haystack/management/commands/update_index.py | 289 +++++ haystack/manager.py | 107 ++ haystack/models.py | 247 ++++ haystack/panels.py | 86 ++ haystack/query.py | 841 +++++++++++++ haystack/routers.py | 18 + haystack/signals.py | 90 ++ haystack/templates/panels/haystack.html | 33 + .../templates/search_configuration/solr.xml | 166 +++ haystack/templatetags/__init__.py | 0 haystack/templatetags/highlight.py | 119 ++ haystack/templatetags/more_like_this.py | 108 ++ haystack/urls.py | 16 + haystack/utils/__init__.py | 88 ++ haystack/utils/app_loading.py | 90 ++ haystack/utils/geo.py | 78 ++ haystack/utils/highlighting.py | 165 +++ haystack/utils/loading.py | 334 ++++++ haystack/utils/log.py | 25 + haystack/views.py | 235 ++++ setup.cfg | 22 + setup.py | 68 ++ 116 files changed, 17467 insertions(+) create mode 100644 AUTHORS create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 PKG-INFO create mode 100644 README.rst create mode 100644 debian/changelog create mode 100644 debian/clean create mode 100644 debian/compat create mode 100644 debian/control create mode 100644 debian/copyright create mode 100644 debian/python-django-haystack-doc.doc-base create mode 100644 debian/python-django-haystack-doc.docs create mode 100644 debian/python-django-haystack-doc.lintian-overrides create mode 100644 debian/python-django-haystack.docs create mode 100644 debian/python-django-haystack.lintian-overrides create mode 100644 debian/python3-django-haystack.docs create mode 100644 debian/python3-django-haystack.lintian-overrides create mode 100755 debian/rules create mode 100644 debian/source/format create mode 100644 debian/source/lintian-overrides create mode 100644 debian/watch create mode 100644 django_haystack.egg-info/PKG-INFO create mode 100644 django_haystack.egg-info/SOURCES.txt create mode 100644 django_haystack.egg-info/dependency_links.txt create mode 100644 django_haystack.egg-info/not-zip-safe create mode 100644 django_haystack.egg-info/pbr.json create mode 100644 django_haystack.egg-info/requires.txt create mode 100644 django_haystack.egg-info/top_level.txt create mode 100644 docs/Makefile create mode 100644 docs/_build/.gitignore create mode 100644 docs/_static/.gitignore create mode 100644 docs/_templates/.gitignore create mode 100644 docs/admin.rst create mode 100644 docs/architecture_overview.rst create mode 100644 docs/autocomplete.rst create mode 100644 docs/backend_support.rst create mode 100644 docs/best_practices.rst create mode 100644 docs/boost.rst create mode 100644 docs/conf.py create mode 100644 docs/contributing.rst create mode 100644 docs/creating_new_backends.rst create mode 100644 docs/debugging.rst create mode 100644 docs/faceting.rst create mode 100644 docs/faq.rst create mode 100644 docs/glossary.rst create mode 100644 docs/haystack_theme/layout.html create mode 100644 docs/haystack_theme/static/documentation.css create mode 100644 docs/haystack_theme/theme.conf create mode 100644 docs/highlighting.rst create mode 100644 docs/index.rst create mode 100644 docs/inputtypes.rst create mode 100644 docs/installing_search_engines.rst create mode 100644 docs/management_commands.rst create mode 100644 docs/migration_from_1_to_2.rst create mode 100644 docs/multiple_index.rst create mode 100644 docs/other_apps.rst create mode 100644 docs/python3.rst create mode 100644 docs/rich_content_extraction.rst create mode 100644 docs/running_tests.rst create mode 100644 
docs/searchbackend_api.rst create mode 100644 docs/searchfield_api.rst create mode 100644 docs/searchindex_api.rst create mode 100644 docs/searchquery_api.rst create mode 100644 docs/searchqueryset_api.rst create mode 100644 docs/searchresult_api.rst create mode 100644 docs/settings.rst create mode 100644 docs/signal_processors.rst create mode 100644 docs/spatial.rst create mode 100644 docs/templatetags.rst create mode 100644 docs/toc.rst create mode 100644 docs/tutorial.rst create mode 100644 docs/utils.rst create mode 100644 docs/views_and_forms.rst create mode 100644 docs/who_uses.rst create mode 100644 haystack/__init__.py create mode 100644 haystack/admin.py create mode 100644 haystack/backends/__init__.py create mode 100644 haystack/backends/elasticsearch_backend.py create mode 100644 haystack/backends/simple_backend.py create mode 100644 haystack/backends/solr_backend.py create mode 100644 haystack/backends/whoosh_backend.py create mode 100644 haystack/constants.py create mode 100644 haystack/exceptions.py create mode 100644 haystack/fields.py create mode 100644 haystack/forms.py create mode 100644 haystack/generic_views.py create mode 100644 haystack/indexes.py create mode 100644 haystack/inputs.py create mode 100644 haystack/management/__init__.py create mode 100644 haystack/management/commands/__init__.py create mode 100644 haystack/management/commands/build_solr_schema.py create mode 100644 haystack/management/commands/clear_index.py create mode 100644 haystack/management/commands/haystack_info.py create mode 100644 haystack/management/commands/rebuild_index.py create mode 100755 haystack/management/commands/update_index.py create mode 100644 haystack/manager.py create mode 100644 haystack/models.py create mode 100644 haystack/panels.py create mode 100644 haystack/query.py create mode 100644 haystack/routers.py create mode 100644 haystack/signals.py create mode 100644 haystack/templates/panels/haystack.html create mode 100644 haystack/templates/search_configuration/solr.xml create mode 100644 haystack/templatetags/__init__.py create mode 100644 haystack/templatetags/highlight.py create mode 100644 haystack/templatetags/more_like_this.py create mode 100644 haystack/urls.py create mode 100644 haystack/utils/__init__.py create mode 100755 haystack/utils/app_loading.py create mode 100644 haystack/utils/geo.py create mode 100644 haystack/utils/highlighting.py create mode 100644 haystack/utils/loading.py create mode 100644 haystack/utils/log.py create mode 100644 haystack/views.py create mode 100644 setup.cfg create mode 100755 setup.py diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..a40e7a7 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,106 @@ +Primary Authors: + + * Daniel Lindsley + * Matt Croydon (some documentation, sanity checks and the sweet name) + * Travis Cline (the original SQ implementation, improvements to ModelSearchIndex) + * David Sauve (notanumber) for the Xapian backend, the simple backend and various patches. + * Jannis Leidel (jezdez) + * Chris Adams (acdha) + * Justin Caratzas (bigjust) + * Andrew Schoen (andrewschoen) + * Dan Watson (dcwatson) + * Matt Woodward (mpwoodward) + * Alex Vidal (avidal) + * Zach Smith (zmsmith) + * Stefan Wehrmeyer (stefanw) + * George Hickman (ghickman) + * Ben Spaulding (benspaulding) + + +Thanks to + * Jacob Kaplan-Moss & Joseph Kocherhans for the original implementation of + djangosearch, of which portions were used, as well as basic API feedback. + * Christian Metts for designing the logo and building a better site. 
+ * Nathan Borror for testing and advanced form usage. + * Malcolm Tredinnick for API feedback. + * Mediaphormedia for funding the development on More Like This and faceting. + * Travis Cline for API feedback, Git help and improvements to the reindex command. + * Brian Rosner for various patches. + * Richard Boulton for feedback and suggestions. + * Cyberdelia for feedback and patches. + * Ask Solem for for patching the setup.py. + * Ben Spaulding for feedback and documentation patches. + * smulloni for various patches. + * JoeGermuska for various patches. + * SmileyChris for various patches. + * sk1p for various patches. + * Ryszard Szopa (ryszard) for various patches. + * Patryk Zawadzki (patrys) for various patches and feedback. + * Frank Wiles for documentation patches. + * Chris Adams (acdha) for various patches. + * Kyle MacFarlane for various patches. + * Alex Gaynor (alex) for help with handling deferred models with More Like This. + * RobertGawron for a patch to the Highlighter. + * Simon Willison (simonw) for various proposals and patches. + * Ben Firshman (bfirsh) for faceting improvements and suggestions. + * Peter Bengtsson for a patch regarding passing a customized site. + * Sam Bull (osirius) for a patch regarding initial data on SearchForms. + * slai for a patch regarding Whoosh and fetching all documents of a certain model type. + * alanwj for a patch regarding Whoosh and empty MultiValueFields. + * alanzoppa for a patch regarding highlighting. + * piquadrat for a patch regarding the more_like_this template tag. + * dedsm for a patch regarding the pickling of SearchResult objects. + * EmilStenstrom for a patch to the Highlighter. + * symroe for a patch regarding the more_like_this template tag. + * ghostrocket for a patch regarding the simple backend. + * Rob Hudson (robhudson) for improvements to the admin search. + * apollo13 for simplifying ``SearchForm.__init__``. + * Carl Meyer (carljm) for a patch regarding character primary keys. + * oyiptong for a patch regarding pickling. + * alfredo for a patch to generate epub docs. + * Luke Hatcher (lukeman) for documentation patches. + * Trey Hunner (treyhunner) for a Whoosh field boosting patch. + * Kent Gormat of Retail Catalyst for funding the development of multiple index support. + * Gidsy for funding the initial geospatial implementation + * CMGdigital for funding the development on: + * a multiprocessing-enabled version of ``update_index``. + * the addition of ``--start/--end`` options in ``update_index``. + * the ability to specify both apps & models to ``update_index``. + * A significant portion of the geospatial feature. + * A significant portion of the input types feature. + * Aram Dulyan (Aramgutang) for fixing the included admin class to be Django 1.4 compatible. + * Honza Kral (HonzaKral) for various Elasticsearch tweaks & testing. + * Alex Vidal (avidal) for a patch allowing developers to override the queryset used for update operations. + * Igor Támara (ikks) for a patch related to Unicode ``verbose_name_plural``. + * Dan Helfman (witten) for a patch related to highlighting. + * Matt DeBoard for refactor of ``SolrSearchBackend.search`` method to allow simpler extension of the class. + * Rodrigo Guzman (rz) for a fix to query handling in the ``simple`` backend. + * Martin J. Laubach (mjl) for fixing the logic used when combining querysets + * Eric Holscher (ericholscher) for a docs fix. 
+ * Erik Rose (erikrose) for a quick pyelasticsearch-compatibility patch + * Stefan Wehrmeyer (stefanw) for a simple search filter fix + * Dan Watson (dcwatson) for various patches. + * Andrew Schoen (andrewschoen) for the addition of ``HAYSTACK_IDENTIFIER_METHOD`` + * Pablo SEMINARIO (pabluk) for a docs fix, and a fix in the ElasticSearch backend. + * Eric Thurgood (ethurgood) for a import fix in the Elasticssearch backend. + * Revolution Systems & The Python Software Foundation for funding a significant portion of the port to Python 3! + * Artem Kostiuk (postatum) for patch allowing to search for slash character in ElasticSearch since Lucene 4.0. + * Luis Barrueco (luisbarrueco) for a simple fix regarding updating indexes using multiple backends. + * Szymon Teżewski (jasisz) for an update to the bounding-box calculation for spatial queries + * Chris Wilson (qris) and Orlando Fiol (overflow) for an update allowing the use of multiple order_by() + fields with Whoosh as long as they share a consistent sort direction + * Steven Skoczen (@skoczen) for an ElasticSearch bug fix + * @Xaroth for updating the app loader to be compatible with Django 1.7 + * Jaroslav Gorjatsev (jarig) for a bugfix with index_fieldname + * Dirk Eschler (@deschler) for app loader Django 1.7 compatibility fixes + * Wictor (wicol) for a patch improving the error message given when model_attr references a non-existent + field + * Pierre Dulac (dulaccc) for a patch updating distance filters for ElasticSearch 1.x + * Andrei Fokau (andreif) for adding support for ``SQ`` in ``SearchQuerySet.narrow()`` + * Phill Tornroth (phill-tornroth) for several patches improving UnifiedIndex and ElasticSearch support + * Philippe Luickx (philippeluickx) for documenting how to provide backend-specific facet options + * Felipe Prenholato (@chronossc) for a patch making it easy to exclude documents from indexing using custom logic + * Alfredo Armanini (@phingage) for a patch fixing compatibility with database API changes in Django 1.8 + * Ben Spaulding (@benspaulding) for many updates for Django 1.8 support + * Troy Grosfield (@troygrosfield) for fixing the test runner for Django 1.8 + * Ilan Steemers (@Koed00) for fixing Django 1.9 deprecation warnings diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0bb702e --- /dev/null +++ b/LICENSE @@ -0,0 +1,31 @@ +Copyright (c) 2009-2013, Daniel Lindsley. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of Haystack nor the names of its contributors may be used + to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +Prior to April 17, 2009, this software was released under the MIT license. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..ac95f3f --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +recursive-include docs * +recursive-include haystack/templates *.xml *.html +include AUTHORS +include LICENSE +include README.rst diff --git a/PKG-INFO b/PKG-INFO new file mode 100644 index 0000000..6c444ac --- /dev/null +++ b/PKG-INFO @@ -0,0 +1,79 @@ +Metadata-Version: 1.1 +Name: django-haystack +Version: 2.4.0 +Summary: Pluggable search for Django. +Home-page: http://haystacksearch.org/ +Author: Daniel Lindsley +Author-email: daniel@toastdriven.com +License: UNKNOWN +Description: ======== + Haystack + ======== + + :author: Daniel Lindsley + :date: 2013/07/28 + + Haystack provides modular search for Django. It features a unified, familiar + API that allows you to plug in different search backends (such as Solr_, + Elasticsearch_, Whoosh_, Xapian_, etc.) without having to modify your code. + + .. _Solr: http://lucene.apache.org/solr/ + .. _Elasticsearch: http://elasticsearch.org/ + .. _Whoosh: https://bitbucket.org/mchaput/whoosh/ + .. _Xapian: http://xapian.org/ + + Haystack is BSD licensed, plays nicely with third-party app without needing to + modify the source and supports advanced features like faceting, More Like This, + highlighting, spatial search and spelling suggestions. + + You can find more information at http://haystacksearch.org/. + + + Getting Help + ============ + + There is a mailing list (http://groups.google.com/group/django-haystack/) + available for general discussion and an IRC channel (#haystack on + irc.freenode.net). + + + Documentation + ============= + + * Development version: http://docs.haystacksearch.org/ + * v2.3.X: http://django-haystack.readthedocs.org/en/v2.3.0/ + * v2.2.X: http://django-haystack.readthedocs.org/en/v2.2.0/ + * v2.1.X: http://django-haystack.readthedocs.org/en/v2.1.0/ + * v2.0.X: http://django-haystack.readthedocs.org/en/v2.0.0/ + * v1.2.X: http://django-haystack.readthedocs.org/en/v1.2.7/ + * v1.1.X: http://django-haystack.readthedocs.org/en/v1.1/ + + Build Status + ============ + + .. image:: https://travis-ci.org/django-haystack/django-haystack.svg?branch=master + :target: https://travis-ci.org/django-haystack/django-haystack + + Requirements + ============ + + Haystack has a relatively easily-met set of requirements. + + * Python 2.7+ or Python 3.3+ + * Django 1.6+ + + Additionally, each backend has its own requirements. You should refer to + http://django-haystack.readthedocs.org/en/latest/installing_search_engines.html for more + details. 
+ +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Environment :: Web Environment +Classifier: Framework :: Django +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 3 +Classifier: Topic :: Utilities diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..2c0e549 --- /dev/null +++ b/README.rst @@ -0,0 +1,59 @@ +======== +Haystack +======== + +:author: Daniel Lindsley +:date: 2013/07/28 + +Haystack provides modular search for Django. It features a unified, familiar +API that allows you to plug in different search backends (such as Solr_, +Elasticsearch_, Whoosh_, Xapian_, etc.) without having to modify your code. + +.. _Solr: http://lucene.apache.org/solr/ +.. _Elasticsearch: http://elasticsearch.org/ +.. _Whoosh: https://bitbucket.org/mchaput/whoosh/ +.. _Xapian: http://xapian.org/ + +Haystack is BSD licensed, plays nicely with third-party app without needing to +modify the source and supports advanced features like faceting, More Like This, +highlighting, spatial search and spelling suggestions. + +You can find more information at http://haystacksearch.org/. + + +Getting Help +============ + +There is a mailing list (http://groups.google.com/group/django-haystack/) +available for general discussion and an IRC channel (#haystack on +irc.freenode.net). + + +Documentation +============= + +* Development version: http://docs.haystacksearch.org/ +* v2.3.X: http://django-haystack.readthedocs.org/en/v2.3.0/ +* v2.2.X: http://django-haystack.readthedocs.org/en/v2.2.0/ +* v2.1.X: http://django-haystack.readthedocs.org/en/v2.1.0/ +* v2.0.X: http://django-haystack.readthedocs.org/en/v2.0.0/ +* v1.2.X: http://django-haystack.readthedocs.org/en/v1.2.7/ +* v1.1.X: http://django-haystack.readthedocs.org/en/v1.1/ + +Build Status +============ + +.. image:: https://travis-ci.org/django-haystack/django-haystack.svg?branch=master + :target: https://travis-ci.org/django-haystack/django-haystack + +Requirements +============ + +Haystack has a relatively easily-met set of requirements. + +* Python 2.7+ or Python 3.3+ +* Django 1.6+ + +Additionally, each backend has its own requirements. You should refer to +http://django-haystack.readthedocs.org/en/latest/installing_search_engines.html for more +details. diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 0000000..5f4a725 --- /dev/null +++ b/debian/changelog @@ -0,0 +1,44 @@ +django-haystack (2.4.0-2) unstable; urgency=low + + * Switch buildsystem to pybuild. + * Add Python3 support through a separate package. + * Add lintian override for missing upstream changelog. + + -- Michael Fladischer Tue, 07 Jul 2015 22:12:20 +0200 + +django-haystack (2.4.0-1) unstable; urgency=low + + * New upstream release. + * Remove files from d/copyright which are no longer shipped by + upstream. + * Use pypi.debian.net service for uscan. + * Change my email address to fladi@debian.org. + + -- Michael Fladischer Tue, 07 Jul 2015 16:18:03 +0200 + +django-haystack (2.3.1-1) unstable; urgency=medium + + * New upstream release (Closes: #755599). + * Bump Standards-Version to 3.9.6. + * Disable tests as they require a live SOLR and elasticsearch server. + * Change file names for solr configuration files in d/copyright. + * Make pysolr require at least version 3.2.0. 
+ * Add python-elasticsearch to Suggests. + * Drop packages required by tests from Build-Depends: + + python-django + + python-httplib2 + + python-mock + + python-pysolr + + python-whoosh + * Drop python-xapian from suggests as the xapian backend is not + included. + * Add django_haystack.egg-info/requires.txt to d/clean. + * Remove empty lines at EOF for d/clean and d/rules. + + -- Michael Fladischer Mon, 20 Oct 2014 14:18:24 +0200 + +django-haystack (2.1.0-1) unstable; urgency=low + + * Initial release (Closes: #563311). + + -- Michael Fladischer Thu, 13 Mar 2014 19:11:15 +0100 diff --git a/debian/clean b/debian/clean new file mode 100644 index 0000000..2573b6b --- /dev/null +++ b/debian/clean @@ -0,0 +1,6 @@ +django_haystack.egg-info/PKG-INFO +django_haystack.egg-info/SOURCES.txt +django_haystack.egg-info/dependency_links.txt +django_haystack.egg-info/not-zip-safe +django_haystack.egg-info/top_level.txt +django_haystack.egg-info/requires.txt diff --git a/debian/compat b/debian/compat new file mode 100644 index 0000000..ec63514 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +9 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..0aa8c1d --- /dev/null +++ b/debian/control @@ -0,0 +1,71 @@ +Source: django-haystack +Section: python +Priority: optional +Maintainer: Debian Python Modules Team +Uploaders: Michael Fladischer +Build-Depends: debhelper (>= 9), + dh-python, + python-all, + python-setuptools, + python-sphinx (>= 1.0.7+dfsg), + python3-all, + python3-setuptools +Standards-Version: 3.9.6 +X-Python-Version: >= 2.6 +X-Python3-Version: >= 3.3 +Homepage: https://github.com/toastdriven/django-haystack +Vcs-Svn: svn://anonscm.debian.org/python-modules/packages/django-haystack/trunk/ +Vcs-Browser: http://anonscm.debian.org/viewvc/python-modules/packages/django-haystack/trunk/ + +Package: python-django-haystack +Architecture: all +Depends: python-django (>= 1.5), + ${misc:Depends}, + ${python:Depends} +Suggests: python-elasticsearch, + python-httplib2, + python-pysolr (>= 3.2.0), + python-whoosh +Description: modular search for Django + Haystack provides modular search for Django. It features a unified, familiar + API that allows you to plug in different search backends (such as Solr, + Elasticsearch, Whoosh, Xapian, etc.) without having to modify your code. + . + It plays nicely with third-party app without needing to modify the source and + supports advanced features like faceting, More Like This, highlighting, spatial + search and spelling suggestions. + +Package: python3-django-haystack +Architecture: all +Depends: python3-django, + ${misc:Depends}, + ${python3:Depends} +Suggests: python3-elasticsearch, + python3-httplib2, + python3-whoosh +Description: modular search for Django (Python3 version) + Haystack provides modular search for Django. It features a unified, familiar + API that allows you to plug in different search backends (such as Solr, + Elasticsearch, Whoosh, Xapian, etc.) without having to modify your code. + . + It plays nicely with third-party app without needing to modify the source and + supports advanced features like faceting, More Like This, highlighting, spatial + search and spelling suggestions. + . + This package contains the Python 3 version of the library. + +Package: python-django-haystack-doc +Section: doc +Architecture: all +Depends: ${misc:Depends}, + ${sphinxdoc:Depends} +Description: modular search for Django (Documentation) + Haystack provides modular search for Django. 
It features a unified, familiar + API that allows you to plug in different search backends (such as Solr, + Elasticsearch, Whoosh, Xapian, etc.) without having to modify your code. + . + It plays nicely with third-party app without needing to modify the source and + supports advanced features like faceting, More Like This, highlighting, spatial + search and spelling suggestions. + . + This package contains the documentation. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..8e706d2 --- /dev/null +++ b/debian/copyright @@ -0,0 +1,55 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: django-haystack +Upstream-Contact: Daniel Lindsley +Source: https://github.com/toastdriven/django-haystack + +Files: * +Copyright: 2009-2013, Daniel Lindsley +License: BSD-3-clause + +Files: haystack/templates/search_configuration/solr.xml +Copyright: Apache Software Foundation +License: Apache + +Files: debian/* +Copyright: 2013, Fladischer Michael +License: BSD-3-clause + +License: BSD-3-clause + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + . + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + . + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + . + 3. Neither the name of Haystack nor the names of its contributors may be used + to endorse or promote products derived from this software without + specific prior written permission. + . + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +License: Apache + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + . + http://www.apache.org/licenses/LICENSE-2.0 + . + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/debian/python-django-haystack-doc.doc-base b/debian/python-django-haystack-doc.doc-base new file mode 100644 index 0000000..af562f5 --- /dev/null +++ b/debian/python-django-haystack-doc.doc-base @@ -0,0 +1,8 @@ +Document: python-django-haystack-doc +Title: Dango Haystack Documentation +Author: Daniel Lindsley +Section: Programming/Python + +Format: HTML +Index: /usr/share/doc/python-django-haystack-doc/html/index.html +Files: /usr/share/doc/python-django-haystack-doc/html/*.html diff --git a/debian/python-django-haystack-doc.docs b/debian/python-django-haystack-doc.docs new file mode 100644 index 0000000..4ecc793 --- /dev/null +++ b/debian/python-django-haystack-doc.docs @@ -0,0 +1 @@ +docs/_build/html diff --git a/debian/python-django-haystack-doc.lintian-overrides b/debian/python-django-haystack-doc.lintian-overrides new file mode 100644 index 0000000..c5cea0f --- /dev/null +++ b/debian/python-django-haystack-doc.lintian-overrides @@ -0,0 +1,2 @@ +# Upstream does not provide a changelog. +python-django-haystack-doc: no-upstream-changelog diff --git a/debian/python-django-haystack.docs b/debian/python-django-haystack.docs new file mode 100644 index 0000000..a1320b1 --- /dev/null +++ b/debian/python-django-haystack.docs @@ -0,0 +1 @@ +README.rst diff --git a/debian/python-django-haystack.lintian-overrides b/debian/python-django-haystack.lintian-overrides new file mode 100644 index 0000000..717776b --- /dev/null +++ b/debian/python-django-haystack.lintian-overrides @@ -0,0 +1,2 @@ +# Upstream does not provide a changelog. +python-django-haystack: no-upstream-changelog diff --git a/debian/python3-django-haystack.docs b/debian/python3-django-haystack.docs new file mode 100644 index 0000000..a1320b1 --- /dev/null +++ b/debian/python3-django-haystack.docs @@ -0,0 +1 @@ +README.rst diff --git a/debian/python3-django-haystack.lintian-overrides b/debian/python3-django-haystack.lintian-overrides new file mode 100644 index 0000000..bf957e8 --- /dev/null +++ b/debian/python3-django-haystack.lintian-overrides @@ -0,0 +1,2 @@ +# Upstream does not provide a changelog. +python3-django-haystack: no-upstream-changelog diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..b1bc88b --- /dev/null +++ b/debian/rules @@ -0,0 +1,15 @@ +#!/usr/bin/make -f + +export PYBUILD_NAME=django-haystack +export PYBUILD_DISABLE=test + +%: + dh $@ --with python2,python3,sphinxdoc --buildsystem=pybuild + +override_dh_auto_build: + PYTHONPATH=. sphinx-build -b html -d docs/_build/.doctrees -N docs docs/_build/html + dh_auto_build + +override_dh_clean: + rm -rf docs/_build + dh_clean diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/debian/source/lintian-overrides b/debian/source/lintian-overrides new file mode 100644 index 0000000..ded6f93 --- /dev/null +++ b/debian/source/lintian-overrides @@ -0,0 +1,2 @@ +# Upstream does not provide PGP signatures for their release tarballs. 
+django-haystack source: debian-watch-may-check-gpg-signature diff --git a/debian/watch b/debian/watch new file mode 100644 index 0000000..44dc910 --- /dev/null +++ b/debian/watch @@ -0,0 +1,3 @@ +version=3 +opts=uversionmangle=s/(rc|a|b|c)/~$1/ \ +http://pypi.debian.net/django-haystack/django-haystack-(.+)\.(?:zip|tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz))) \ No newline at end of file diff --git a/django_haystack.egg-info/PKG-INFO b/django_haystack.egg-info/PKG-INFO new file mode 100644 index 0000000..6c444ac --- /dev/null +++ b/django_haystack.egg-info/PKG-INFO @@ -0,0 +1,79 @@ +Metadata-Version: 1.1 +Name: django-haystack +Version: 2.4.0 +Summary: Pluggable search for Django. +Home-page: http://haystacksearch.org/ +Author: Daniel Lindsley +Author-email: daniel@toastdriven.com +License: UNKNOWN +Description: ======== + Haystack + ======== + + :author: Daniel Lindsley + :date: 2013/07/28 + + Haystack provides modular search for Django. It features a unified, familiar + API that allows you to plug in different search backends (such as Solr_, + Elasticsearch_, Whoosh_, Xapian_, etc.) without having to modify your code. + + .. _Solr: http://lucene.apache.org/solr/ + .. _Elasticsearch: http://elasticsearch.org/ + .. _Whoosh: https://bitbucket.org/mchaput/whoosh/ + .. _Xapian: http://xapian.org/ + + Haystack is BSD licensed, plays nicely with third-party app without needing to + modify the source and supports advanced features like faceting, More Like This, + highlighting, spatial search and spelling suggestions. + + You can find more information at http://haystacksearch.org/. + + + Getting Help + ============ + + There is a mailing list (http://groups.google.com/group/django-haystack/) + available for general discussion and an IRC channel (#haystack on + irc.freenode.net). + + + Documentation + ============= + + * Development version: http://docs.haystacksearch.org/ + * v2.3.X: http://django-haystack.readthedocs.org/en/v2.3.0/ + * v2.2.X: http://django-haystack.readthedocs.org/en/v2.2.0/ + * v2.1.X: http://django-haystack.readthedocs.org/en/v2.1.0/ + * v2.0.X: http://django-haystack.readthedocs.org/en/v2.0.0/ + * v1.2.X: http://django-haystack.readthedocs.org/en/v1.2.7/ + * v1.1.X: http://django-haystack.readthedocs.org/en/v1.1/ + + Build Status + ============ + + .. image:: https://travis-ci.org/django-haystack/django-haystack.svg?branch=master + :target: https://travis-ci.org/django-haystack/django-haystack + + Requirements + ============ + + Haystack has a relatively easily-met set of requirements. + + * Python 2.7+ or Python 3.3+ + * Django 1.6+ + + Additionally, each backend has its own requirements. You should refer to + http://django-haystack.readthedocs.org/en/latest/installing_search_engines.html for more + details. 
+ +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Environment :: Web Environment +Classifier: Framework :: Django +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 3 +Classifier: Topic :: Utilities diff --git a/django_haystack.egg-info/SOURCES.txt b/django_haystack.egg-info/SOURCES.txt new file mode 100644 index 0000000..53c2d69 --- /dev/null +++ b/django_haystack.egg-info/SOURCES.txt @@ -0,0 +1,99 @@ +AUTHORS +LICENSE +MANIFEST.in +README.rst +setup.cfg +setup.py +django_haystack.egg-info/PKG-INFO +django_haystack.egg-info/SOURCES.txt +django_haystack.egg-info/dependency_links.txt +django_haystack.egg-info/not-zip-safe +django_haystack.egg-info/pbr.json +django_haystack.egg-info/requires.txt +django_haystack.egg-info/top_level.txt +docs/Makefile +docs/admin.rst +docs/architecture_overview.rst +docs/autocomplete.rst +docs/backend_support.rst +docs/best_practices.rst +docs/boost.rst +docs/conf.py +docs/contributing.rst +docs/creating_new_backends.rst +docs/debugging.rst +docs/faceting.rst +docs/faq.rst +docs/glossary.rst +docs/highlighting.rst +docs/index.rst +docs/inputtypes.rst +docs/installing_search_engines.rst +docs/management_commands.rst +docs/migration_from_1_to_2.rst +docs/multiple_index.rst +docs/other_apps.rst +docs/python3.rst +docs/rich_content_extraction.rst +docs/running_tests.rst +docs/searchbackend_api.rst +docs/searchfield_api.rst +docs/searchindex_api.rst +docs/searchquery_api.rst +docs/searchqueryset_api.rst +docs/searchresult_api.rst +docs/settings.rst +docs/signal_processors.rst +docs/spatial.rst +docs/templatetags.rst +docs/toc.rst +docs/tutorial.rst +docs/utils.rst +docs/views_and_forms.rst +docs/who_uses.rst +docs/_build/.gitignore +docs/_static/.gitignore +docs/_templates/.gitignore +docs/haystack_theme/layout.html +docs/haystack_theme/theme.conf +docs/haystack_theme/static/documentation.css +haystack/__init__.py +haystack/admin.py +haystack/constants.py +haystack/exceptions.py +haystack/fields.py +haystack/forms.py +haystack/generic_views.py +haystack/indexes.py +haystack/inputs.py +haystack/manager.py +haystack/models.py +haystack/panels.py +haystack/query.py +haystack/routers.py +haystack/signals.py +haystack/urls.py +haystack/views.py +haystack/backends/__init__.py +haystack/backends/elasticsearch_backend.py +haystack/backends/simple_backend.py +haystack/backends/solr_backend.py +haystack/backends/whoosh_backend.py +haystack/management/__init__.py +haystack/management/commands/__init__.py +haystack/management/commands/build_solr_schema.py +haystack/management/commands/clear_index.py +haystack/management/commands/haystack_info.py +haystack/management/commands/rebuild_index.py +haystack/management/commands/update_index.py +haystack/templates/panels/haystack.html +haystack/templates/search_configuration/solr.xml +haystack/templatetags/__init__.py +haystack/templatetags/highlight.py +haystack/templatetags/more_like_this.py +haystack/utils/__init__.py +haystack/utils/app_loading.py +haystack/utils/geo.py +haystack/utils/highlighting.py +haystack/utils/loading.py +haystack/utils/log.py \ No newline at end of file diff --git a/django_haystack.egg-info/dependency_links.txt b/django_haystack.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ 
b/django_haystack.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/django_haystack.egg-info/not-zip-safe b/django_haystack.egg-info/not-zip-safe new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/django_haystack.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/django_haystack.egg-info/pbr.json b/django_haystack.egg-info/pbr.json new file mode 100644 index 0000000..694ff89 --- /dev/null +++ b/django_haystack.egg-info/pbr.json @@ -0,0 +1 @@ +{"is_release": false, "git_version": "ebf1a5c"} \ No newline at end of file diff --git a/django_haystack.egg-info/requires.txt b/django_haystack.egg-info/requires.txt new file mode 100644 index 0000000..eec1cf1 --- /dev/null +++ b/django_haystack.egg-info/requires.txt @@ -0,0 +1 @@ +Django \ No newline at end of file diff --git a/django_haystack.egg-info/top_level.txt b/django_haystack.egg-info/top_level.txt new file mode 100644 index 0000000..d755762 --- /dev/null +++ b/django_haystack.egg-info/top_level.txt @@ -0,0 +1 @@ +haystack diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..791d8d6 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,80 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d _build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html web pickle htmlhelp latex changes linkcheck + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " changes to make an overview over all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + +clean: + -rm -rf _build/* + +html: + mkdir -p _build/html _build/doctrees + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) _build/html + @echo + @echo "Build finished. The HTML pages are in _build/html." + +pickle: + mkdir -p _build/pickle _build/doctrees + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) _build/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +web: pickle + +json: + mkdir -p _build/json _build/doctrees + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) _build/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + mkdir -p _build/htmlhelp _build/doctrees + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) _build/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in _build/htmlhelp." + +latex: + mkdir -p _build/latex _build/doctrees + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) _build/latex + @echo + @echo "Build finished; the LaTeX files are in _build/latex." + @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ + "run these through (pdf)latex." + +changes: + mkdir -p _build/changes _build/doctrees + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) _build/changes + @echo + @echo "The overview file is in _build/changes." + +linkcheck: + mkdir -p _build/linkcheck _build/doctrees + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) _build/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in _build/linkcheck/output.txt." 
+ +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) _build/epub + @echo + @echo "Build finished. The epub file is in _build/epub." diff --git a/docs/_build/.gitignore b/docs/_build/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/docs/_static/.gitignore b/docs/_static/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/docs/_templates/.gitignore b/docs/_templates/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/docs/admin.rst b/docs/admin.rst new file mode 100644 index 0000000..c7a2d2a --- /dev/null +++ b/docs/admin.rst @@ -0,0 +1,47 @@ +.. _ref-admin: + +=================== +Django Admin Search +=================== + +Haystack comes with a base class to support searching via Haystack in the +Django admin. To use Haystack to search, inherit from ``haystack.admin.SearchModelAdmin`` +instead of ``django.contrib.admin.ModelAdmin``. + +For example:: + + from haystack.admin import SearchModelAdmin + from .models import MockModel + + + class MockModelAdmin(SearchModelAdmin): + haystack_connection = 'solr' + date_hierarchy = 'pub_date' + list_display = ('author', 'pub_date') + + + admin.site.register(MockModel, MockModelAdmin) + +You can also specify the Haystack connection used by the search with the +``haystack_connection`` property on the model admin class. If not specified, +the default connection will be used. + +If you already have a base model admin class you use, there is also a mixin +you can use instead:: + + from django.contrib import admin + from haystack.admin import SearchModelAdminMixin + from .models import MockModel + + + class MyCustomModelAdmin(admin.ModelAdmin): + pass + + + class MockModelAdmin(SearchModelAdminMixin, MyCustomModelAdmin): + haystack_connection = 'solr' + date_hierarchy = 'pub_date' + list_display = ('author', 'pub_date') + + + admin.site.register(MockModel, MockModelAdmin) diff --git a/docs/architecture_overview.rst b/docs/architecture_overview.rst new file mode 100644 index 0000000..f56a58b --- /dev/null +++ b/docs/architecture_overview.rst @@ -0,0 +1,66 @@ +.. _ref-architecture-overview: + +===================== +Architecture Overview +===================== + +``SearchQuerySet`` +------------------ + +One main implementation. + +* Standard API that loosely follows ``QuerySet`` +* Handles most queries +* Allows for custom "parsing"/building through API +* Dispatches to ``SearchQuery`` for actual query +* Handles automatically creating a query +* Allows for raw queries to be passed straight to backend. + + +``SearchQuery`` +--------------- + +Implemented per-backend. + +* Method for building the query out of the structured data. +* Method for cleaning a string of reserved characters used by the backend. + +Main class provides: + +* Methods to add filters/models/order-by/boost/limits to the search. +* Method to perform a raw search. +* Method to get the number of hits. +* Method to return the results provided by the backend (likely not a full list). + + +``SearchBackend`` +----------------- + +Implemented per-backend. + +* Connects to search engine +* Method for saving new docs to index +* Method for removing docs from index +* Method for performing the actual query + + +``SearchSite`` +-------------- + +One main implementation. + +* Standard API that loosely follows ``django.contrib.admin.sites.AdminSite`` +* Handles registering/unregistering models to search on a per-site basis. +* Provides a means of adding custom indexes to a model, like ``ModelAdmins``. 
+ + +``SearchIndex`` +--------------- + +Implemented per-model you wish to index. + +* Handles generating the document to be indexed. +* Populates additional fields to accompany the document. +* Provides a way to limit what types of objects get indexed. +* Provides a way to index the document(s). +* Provides a way to remove the document(s). diff --git a/docs/autocomplete.rst b/docs/autocomplete.rst new file mode 100644 index 0000000..0ff7005 --- /dev/null +++ b/docs/autocomplete.rst @@ -0,0 +1,220 @@ +.. _ref-autocomplete: + +============ +Autocomplete +============ + +Autocomplete is becoming increasingly common as an add-on to search. Haystack +makes it relatively simple to implement. There are two steps in the process, +one to prepare the data and one to implement the actual search. + +Step 1. Setup The Data +====================== + +To do autocomplete effectively, the search backend uses n-grams (essentially +a small window passed over the string). Because this alters the way your +data needs to be stored, the best approach is to add a new field to your +``SearchIndex`` that contains the text you want to autocomplete on. + +You have two choices: ``NgramField`` and ``EdgeNgramField``. Though very similar, +the choice of field is somewhat important. + +* If you're working with standard text, ``EdgeNgramField`` tokenizes on + whitespace. This prevents incorrect matches when part of two different words + are mashed together as one n-gram. **This is what most users should use.** +* If you're working with Asian languages or want to be able to autocomplete + across word boundaries, ``NgramField`` should be what you use. + +Example (continuing from the tutorial):: + + import datetime + from haystack import indexes + from myapp.models import Note + + + class NoteIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + author = indexes.CharField(model_attr='user') + pub_date = indexes.DateTimeField(model_attr='pub_date') + # We add this for autocomplete. + content_auto = indexes.EdgeNgramField(model_attr='content') + + def get_model(self): + return Note + + def index_queryset(self, using=None): + """Used when the entire index for model is updated.""" + return Note.objects.filter(pub_date__lte=datetime.datetime.now()) + +As with all schema changes, you'll need to rebuild/update your index after +making this change. + + +Step 2. Performing The Query +============================ + +Haystack ships with a convenience method to perform most autocomplete searches. +You simply provide a field and the query you wish to search on to the +``SearchQuerySet.autocomplete`` method. Given the previous example, an example +search would look like:: + + from haystack.query import SearchQuerySet + + SearchQuerySet().autocomplete(content_auto='old') + # Result match things like 'goldfish', 'cuckold' and 'older'. + +The results from the ``SearchQuerySet.autocomplete`` method are full search +results, just like any regular filter. + +If you need more control over your results, you can use standard +``SearchQuerySet.filter`` calls. For instance:: + + from haystack.query import SearchQuerySet + + sqs = SearchQuerySet().filter(content_auto=request.GET.get('q', '')) + +This can also be extended to use ``SQ`` for more complex queries (and is what's +being done under the hood in the ``SearchQuerySet.autocomplete`` method). + + +Example Implementation +====================== + +The above is the low-level backend portion of how you implement autocomplete. 
+To make it work in browser, you need both a view to run the autocomplete +and some Javascript to fetch the results. + +Since it comes up often, here is an example implementation of those things. + +.. warning:: + + This code comes with no warranty. Don't ask for support on it. If you + copy-paste it and it burns down your server room, I'm not liable for any + of it. + + It worked this one time on my machine in a simulated environment. + + And yeah, semicolon-less + 2 space + comma-first. Deal with it. + +A stripped-down view might look like:: + + # views.py + import simplejson as json + from django.http import HttpResponse + from haystack.query import SearchQuerySet + + + def autocomplete(request): + sqs = SearchQuerySet().autocomplete(content_auto=request.GET.get('q', ''))[:5] + suggestions = [result.title for result in sqs] + # Make sure you return a JSON object, not a bare list. + # Otherwise, you could be vulnerable to an XSS attack. + the_data = json.dumps({ + 'results': suggestions + }) + return HttpResponse(the_data, content_type='application/json') + +The template might look like:: + + + + + + Autocomplete Example + + +

+    [An HTML page with a search box and a short piece of JavaScript that
+    queries the view above and renders the returned suggestions.]
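+
+As a hedged sketch of the round trip that JavaScript performs (assuming the
+view above is wired up at ``/autocomplete/``, which is a made-up URL), the same
+request can be exercised from Python with Django's test client::
+
+    import json
+
+    from django.test import Client
+
+    # Hit the (assumed) /autocomplete/ route with a partial query string...
+    response = Client().get('/autocomplete/', {'q': 'old'})
+
+    # ...and decode the JSON object the view returns, e.g. {"results": [...]}.
+    suggestions = json.loads(response.content.decode('utf-8'))['results']
+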
+ + + + + diff --git a/docs/backend_support.rst b/docs/backend_support.rst new file mode 100644 index 0000000..4ab3bc6 --- /dev/null +++ b/docs/backend_support.rst @@ -0,0 +1,127 @@ +.. _ref-backend-support: + +=============== +Backend Support +=============== + + +Supported Backends +================== + +* Solr_ +* Elasticsearch_ +* Whoosh_ +* Xapian_ + +.. _Solr: http://lucene.apache.org/solr/ +.. _Elasticsearch: http://elasticsearch.org/ +.. _Whoosh: https://bitbucket.org/mchaput/whoosh/ +.. _Xapian: http://xapian.org/ + + +Backend Capabilities +==================== + +Solr +---- + +**Complete & included with Haystack.** + +* Full SearchQuerySet support +* Automatic query building +* "More Like This" functionality +* Term Boosting +* Faceting +* Stored (non-indexed) fields +* Highlighting +* Spatial search +* Requires: pysolr (2.0.13+) & Solr 3.5+ + +Elasticsearch +------------- + +**Complete & included with Haystack.** + +* Full SearchQuerySet support +* Automatic query building +* "More Like This" functionality +* Term Boosting +* Faceting (up to 100 facets) +* Stored (non-indexed) fields +* Highlighting +* Spatial search +* Requires: elasticsearch-py > 1.0 & Elasticsearch 1.0+ + +Whoosh +------ + +**Complete & included with Haystack.** + +* Full SearchQuerySet support +* Automatic query building +* "More Like This" functionality +* Term Boosting +* Stored (non-indexed) fields +* Highlighting +* Requires: whoosh (2.0.0+) + +Xapian +------ + +**Complete & available as a third-party download.** + +* Full SearchQuerySet support +* Automatic query building +* "More Like This" functionality +* Term Boosting +* Faceting +* Stored (non-indexed) fields +* Highlighting +* Requires: Xapian 1.0.5+ & python-xapian 1.0.5+ +* Backend can be downloaded here: `xapian-haystack `_ + +Backend Support Matrix +====================== + ++----------------+------------------------+---------------------+----------------+------------+----------+---------------+--------------+---------+ +| Backend | SearchQuerySet Support | Auto Query Building | More Like This | Term Boost | Faceting | Stored Fields | Highlighting | Spatial | ++================+========================+=====================+================+============+==========+===============+==============+=========+ +| Solr | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | ++----------------+------------------------+---------------------+----------------+------------+----------+---------------+--------------+---------+ +| Elasticsearch | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | ++----------------+------------------------+---------------------+----------------+------------+----------+---------------+--------------+---------+ +| Whoosh | Yes | Yes | Yes | Yes | No | Yes | Yes | No | ++----------------+------------------------+---------------------+----------------+------------+----------+---------------+--------------+---------+ +| Xapian | Yes | Yes | Yes | Yes | Yes | Yes | Yes (plugin) | No | ++----------------+------------------------+---------------------+----------------+------------+----------+---------------+--------------+---------+ + + +Wishlist +======== + +The following are search backends that would be nice to have in Haystack but are +licensed in a way that prevents them from being officially bundled. If the +community expresses interest in any of these, there may be future development. + +* Riak_ +* Lupyne_ +* Sphinx_ + +.. _Riak: http://www.basho.com/ +.. _Lupyne: http://code.google.com/p/lupyne/ +.. 
_Sphinx: http://www.sphinxsearch.com/ + + +Sphinx +------ + +This backend is unlikely to be built. Sphinx is pretty gimpy & doesn't do +blended search results across all models the way the other engines can. +Very limited featureset as well. + +* Full SearchQuerySet support +* Automatic query building +* Term Boosting +* Stored (non-indexed) fields +* Highlighting +* Requires: sphinxapi.py (Comes with Sphinx) diff --git a/docs/best_practices.rst b/docs/best_practices.rst new file mode 100644 index 0000000..bf70eaf --- /dev/null +++ b/docs/best_practices.rst @@ -0,0 +1,263 @@ +.. _ref-best-practices: + +============== +Best Practices +============== + +What follows are some general recommendations on how to improve your search. +Some tips represent performance benefits, some provide a better search index. +You should evaluate these options for yourself and pick the ones that will +work best for you. Not all situations are created equal and many of these +options could be considered mandatory in some cases and unnecessary premature +optimizations in others. Your mileage may vary. + + +Good Search Needs Good Content +============================== + +Most search engines work best when they're given corpuses with predominantly +text (as opposed to other data like dates, numbers, etc.) in decent quantities +(more than a couple words). This is in stark contrast to the databases most +people are used to, which rely heavily on non-text data to create relationships +and for ease of querying. + +To this end, if search is important to you, you should take the time to +carefully craft your ``SearchIndex`` subclasses to give the search engine the +best information you can. This isn't necessarily hard but is worth the +investment of time and thought. Assuming you've only ever used the +``BasicSearchIndex``, in creating custom ``SearchIndex`` classes, there are +some easy improvements to make that will make your search better: + +* For your ``document=True`` field, use a well-constructed template. +* Add fields for data you might want to be able to filter by. +* If the model has related data, you can squash good content from those + related models into the parent model's ``SearchIndex``. +* Similarly, if you have heavily de-normalized models, it may be best + represented by a single indexed model rather than many indexed models. + +Well-Constructed Templates +-------------------------- + +A relatively unique concept in Haystack is the use of templates associated with +``SearchIndex`` fields. These are data templates, will never been seen by users +and ideally contain no HTML. They are used to collect various data from the +model and structure it as a document for the search engine to analyze and index. + +.. note:: + + If you read nothing else, this is the single most important thing you can + do to make search on your site better for your users. Good templates can + make or break your search and providing the search engine with good content + to index is critical. + +Good templates structure the data well and incorporate as much pertinent text +as possible. This may include additional fields such as titles, author +information, metadata, tags/categories. Without being artificial, you want to +construct as much context as you can. This doesn't mean you should necessarily +include every field, but you should include fields that provide good content +or include terms you think your users may frequently search on. 
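+
+As a minimal, illustrative sketch (the app label, model and field names are
+assumptions), the document field and its template are wired up like this; the
+template itself is plain text that simply concatenates the useful content::
+
+    from haystack import indexes
+
+    from myapp.models import Note
+
+
+    class NoteIndex(indexes.SearchIndex, indexes.Indexable):
+        # Rendered from search/indexes/myapp/note_text.txt, e.g. a template
+        # that outputs the title, the author's name, the tags and the body.
+        text = indexes.CharField(document=True, use_template=True)
+
+        def get_model(self):
+            return Note
+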
+ +Unless you have very unique numbers or dates, neither of these types of data +are a good fit within templates. They are usually better suited to other +fields for filtering within a ``SearchQuerySet``. + +Additional Fields For Filtering +------------------------------- + +Documents by themselves are good for generating indexes of content but are +generally poor for filtering content, for instance, by date. All search engines +supported by Haystack provide a means to associate extra data as +attributes/fields on a record. The database analogy would be adding extra +columns to the table for filtering. + +Good candidates here are date fields, number fields, de-normalized data from +related objects, etc. You can expose these things to users in the form of a +calendar range to specify, an author to look up or only data from a certain +series of numbers to return. + +You will need to plan ahead and anticipate what you might need to filter on, +though with each field you add, you increase storage space usage. It's generally +**NOT** recommended to include every field from a model, just ones you are +likely to use. + +Related Data +------------ + +Related data is somewhat problematic to deal with, as most search engines are +better with documents than they are with relationships. One way to approach this +is to de-normalize a related child object or objects into the parent's document +template. The inclusion of a foreign key's relevant data or a simple Django +``{% for %}`` templatetag to iterate over the related objects can increase the +salient data in your document. Be careful what you include and how you structure +it, as this can have consequences on how well a result might rank in your +search. + + +Avoid Hitting The Database +========================== + +A very easy but effective thing you can do to drastically reduce hits on the +database is to pre-render your search results using stored fields then disabling +the ``load_all`` aspect of your ``SearchView``. + +.. warning:: + + This technique may cause a substantial increase in the size of your index + as you are basically using it as a storage mechanism. + +To do this, you setup one or more stored fields (`indexed=False`) on your +``SearchIndex`` classes. You should specify a template for the field, filling it +with the data you'd want to display on your search results pages. When the model +attached to the ``SearchIndex`` is placed in the index, this template will get +rendered and stored in the index alongside the record. + +.. note:: + + The downside of this method is that the HTML for the result will be locked + in once it is indexed. To make changes to the structure, you'd have to + reindex all of your content. It also limits you to a single display of the + content (though you could use multiple fields if that suits your needs). + +The second aspect is customizing your ``SearchView`` and its templates. First, +pass the ``load_all=False`` to your ``SearchView``, ideally in your URLconf. +This prevents the ``SearchQuerySet`` from loading all models objects for results +ahead of time. Then, in your template, simply display the stored content from +your ``SearchIndex`` as the HTML result. + +.. warning:: + + To do this, you must absolutely avoid using ``{{ result.object }}`` or any + further accesses beyond that. 
+    That call will hit the database, not only nullifying your work on
+    lessening database hits, but actually making it worse, as there will now
+    be at least one query for each result, up from a single query for each
+    type of model with ``load_all=True``.
+
+
+Content-Type Specific Templates
+===============================
+
+Frequently, when displaying results, you'll want to customize the HTML output
+based on what model the result represents.
+
+In practice, the best way to handle this is through the use of ``include``
+along with the data on the ``SearchResult``.
+
+Your existing loop might look something like::
+
+    {% for result in page.object_list %}
+        <p>
+            {{ result.object.title }}
+        </p>
+    {% empty %}
+        <p>No results found.</p>
+    {% endfor %}
+
+An improved version might look like::
+
+    {% for result in page.object_list %}
+        {% if result.content_type == "blog.post" %}
+            {% include "search/includes/blog/post.html" %}
+        {% endif %}
+        {% if result.content_type == "media.photo" %}
+            {% include "search/includes/media/photo.html" %}
+        {% endif %}
+    {% empty %}
+        <p>No results found.</p>
+    {% endfor %}
+
+Those include files might look like::
+
+    # search/includes/blog/post.html
+    <div>
+        <h2>{{ result.object.title }}</h2>
+
+        <p>{{ result.object.tease }}</p>
+    </div>
+
+    # search/includes/media/photo.html
+    <div>
+        <p>Taken By {{ result.object.taken_by }}</p>
+    </div>
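+
+The next paragraph suggests generating the include filename with a template
+tag or filter rather than branching on every content type. A rough sketch of
+such a filter (the ``search_include`` name matches the usage example below;
+the module path is an assumption and must live in a ``templatetags`` package
+of an installed app)::
+
+    # myapp/templatetags/search_tags.py
+    from django import template
+
+    register = template.Library()
+
+
+    @register.filter
+    def search_include(result):
+        # ``SearchResult`` objects expose ``app_label`` & ``model_name``, so
+        # the path can mirror the "search/includes/<app>/<model>.html" layout.
+        return "search/includes/%s/%s.html" % (result.app_label, result.model_name)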
+
+You can make this even better by standardizing on an includes layout, then
+writing a template tag or filter that generates the include filename. Usage
+might look something like::
+
+    {% for result in page.object_list %}
+        {% with result|search_include as fragment %}
+            {% include fragment %}
+        {% endwith %}
+    {% empty %}
+        <p>No results found.</p>
+ {% endfor %} + + +Real-Time Search +================ + +If your site sees heavy search traffic and up-to-date information is very +important, Haystack provides a way to constantly keep your index up to date. + +You can enable the ``RealtimeSignalProcessor`` within your settings, which +will allow Haystack to automatically update the index whenever a model is +saved/deleted. + +You can find more information within the :doc:`signal_processors` documentation. + + +Use Of A Queue For A Better User Experience +=========================================== + +By default, you have to manually reindex content, Haystack immediately tries to merge +it into the search index. If you have a write-heavy site, this could mean your +search engine may spend most of its time churning on constant merges. If you can +afford a small delay between when a model is saved and when it appears in the +search results, queuing these merges is a good idea. + +You gain a snappier interface for users as updates go into a queue (a fast +operation) and then typical processing continues. You also get a lower churn +rate, as most search engines deal with batches of updates better than many +single updates. You can also use this to distribute load, as the queue consumer +could live on a completely separate server from your webservers, allowing you +to tune more efficiently. + +Implementing this is relatively simple. There are two parts, creating a new +``QueuedSignalProcessor`` class and creating a queue processing script to +handle the actual updates. + +For the ``QueuedSignalProcessor``, you should inherit from +``haystack.signals.BaseSignalProcessor``, then alter the ``setup/teardown`` +methods to call an enqueuing method instead of directly calling +``handle_save/handle_delete``. For example:: + + from haystack import signals + + + class QueuedSignalProcessor(signals.BaseSignalProcessor): + # Override the built-in. + def setup(self): + models.signals.post_save.connect(self.enqueue_save) + models.signals.post_delete.connect(self.enqueue_delete) + + # Override the built-in. + def teardown(self): + models.signals.post_save.disconnect(self.enqueue_save) + models.signals.post_delete.disconnect(self.enqueue_delete) + + # Add on a queuing method. + def enqueue_save(self, sender, instance, **kwargs): + # Push the save & information onto queue du jour here... + + # Add on a queuing method. + def enqueue_delete(self, sender, instance, **kwargs): + # Push the delete & information onto queue du jour here... + +For the consumer, this is much more specific to the queue used and your desired +setup. At a minimum, you will need to periodically consume the queue, fetch the +correct index from the ``SearchSite`` for your application, load the model from +the message and pass that model to the ``update_object`` or ``remove_object`` +methods on the ``SearchIndex``. Proper grouping, batching and intelligent +handling are all additional things that could be applied on top to further +improve performance. diff --git a/docs/boost.rst b/docs/boost.rst new file mode 100644 index 0000000..4a56931 --- /dev/null +++ b/docs/boost.rst @@ -0,0 +1,123 @@ +.. _ref-boost: + +===== +Boost +===== + + +Scoring is a critical component of good search. Normal full-text searches +automatically score a document based on how well it matches the query provided. +However, sometimes you want certain documents to score better than they +otherwise would. Boosting is a way to achieve this. 
There are three types of +boost: + +* Term Boost +* Document Boost +* Field Boost + +.. note:: + + Document & Field boost support was added in Haystack 1.1. + +Despite all being types of boost, they take place at different times and have +slightly different effects on scoring. + +Term boost happens at query time (when the search query is run) and is based +around increasing the score if a certain word/phrase is seen. + +On the other hand, document & field boosts take place at indexing time (when +the document is being added to the index). Document boost causes the relevance +of the entire result to go up, where field boost causes only searches within +that field to do better. + +.. warning:: + + Be warned that boost is very, very sensitive & can hurt overall search + quality if over-zealously applied. Even very small adjustments can affect + relevance in a big way. + +Term Boost +========== + +Term boosting is achieved by using ``SearchQuerySet.boost``. You provide it +the term you want to boost on & a floating point value (based around ``1.0`` +as 100% - no boost). + +Example:: + + # Slight increase in relevance for documents that include "banana". + sqs = SearchQuerySet().boost('banana', 1.1) + + # Big decrease in relevance for documents that include "blueberry". + sqs = SearchQuerySet().boost('blueberry', 0.8) + +See the :doc:`searchqueryset_api` docs for more details on using this method. + + +Document Boost +============== + +Document boosting is done by adding a ``boost`` field to the prepared data +``SearchIndex`` creates. The best way to do this is to override +``SearchIndex.prepare``:: + + from haystack import indexes + from notes.models import Note + + + class NoteSearchIndex(indexes.SearchIndex, indexes.Indexable): + # Your regular fields here then... + + def prepare(self, obj): + data = super(NoteSearchIndex, self).prepare(obj) + data['boost'] = 1.1 + return data + + +Another approach might be to add a new field called ``boost``. However, this +can skew your schema and is not encouraged. + + +Field Boost +=========== + +Field boosting is enabled by setting the ``boost`` kwarg on the desired field. +An example of this might be increasing the significance of a ``title``:: + + from haystack import indexes + from notes.models import Note + + + class NoteSearchIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + title = indexes.CharField(model_attr='title', boost=1.125) + + def get_model(self): + return Note + +.. note:: + + Field boosting only has an effect when the SearchQuerySet filters on the + field which has been boosted. If you are using a default search view or + form you will need override the search method or other include the field + in your search query. 
This example CustomSearchForm searches the automatic + ``content`` field and the ``title`` field which has been boosted:: + + from haystack.forms import SearchForm + + class CustomSearchForm(SearchForm): + + def search(self): + if not self.is_valid(): + return self.no_query_found() + + if not self.cleaned_data.get('q'): + return self.no_query_found() + + q = self.cleaned_data['q'] + sqs = self.searchqueryset.filter(SQ(content=AutoQuery(q)) | SQ(title=AutoQuery(q))) + + if self.load_all: + sqs = sqs.load_all() + + return sqs.highlight() diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..db3e990 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,207 @@ +# -*- coding: utf-8 -*- +# +# Haystack documentation build configuration file, created by +# sphinx-quickstart on Wed Apr 15 08:50:46 2009. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import sys + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.append(os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8' + +# The master toctree document. +master_doc = 'toc' + +# General information about the project. +project = u'Haystack' +copyright = u'2009-2013, Daniel Lindsley' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '2.1.1' +# The full version, including alpha/beta/rc tags. +release = '2.1.1-dev' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of documents that shouldn't be included in the build. +#unused_docs = [] + +# List of directories, relative to source directory, that shouldn't be searched +# for source files. +exclude_trees = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. 
+#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. Major themes that come with +# Sphinx are currently 'default' and 'sphinxdoc'. +# html_theme = 'haystack_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# html_theme_options = { +# "rightsidebar": "true", +# "bodyfont": "'Helvetica Neue', Arial, sans-serif", +# "sidebarbgcolor": "#303c0c", +# "sidebartextcolor": "#effbcb", +# "sidebarlinkcolor": "#eef7ab", +# "relbarbgcolor": "#caecff", +# "relbartextcolor": "#262511", +# "relbarlinkcolor": "#262511", +# "footerbgcolor": "#262511", +# } + +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = ['.'] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_use_modindex = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = '' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Haystackdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). 
+latex_documents = [ + ('index', 'Haystack.tex', u'Haystack Documentation', + u'Daniel Lindsley', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_use_modindex = True diff --git a/docs/contributing.rst b/docs/contributing.rst new file mode 100644 index 0000000..7806c19 --- /dev/null +++ b/docs/contributing.rst @@ -0,0 +1,132 @@ +============ +Contributing +============ + +Haystack is open-source and, as such, grows (or shrinks) & improves in part +due to the community. Below are some guidelines on how to help with the project. + + +Philosophy +========== + +* Haystack is BSD-licensed. All contributed code must be either + + * the original work of the author, contributed under the BSD, or... + * work taken from another project released under a BSD-compatible license. + +* GPL'd (or similar) works are not eligible for inclusion. +* Haystack's git master branch should always be stable, production-ready & + passing all tests. +* Major releases (1.x.x) are commitments to backward-compatibility of the public APIs. + Any documented API should ideally not change between major releases. + The exclusion to this rule is in the event of either a security issue + or to accommodate changes in Django itself. +* Minor releases (x.3.x) are for the addition of substantial features or major + bugfixes. +* Patch releases (x.x.4) are for minor features or bugfixes. + + +Guidelines For Reporting An Issue/Feature +========================================= + +So you've found a bug or have a great idea for a feature. Here's the steps you +should take to help get it added/fixed in Haystack: + +* First, check to see if there's an existing issue/pull request for the + bug/feature. All issues are at https://github.com/toastdriven/django-haystack/issues + and pull reqs are at https://github.com/toastdriven/django-haystack/pulls. +* If there isn't one there, please file an issue. The ideal report includes: + + * A description of the problem/suggestion. + * How to recreate the bug. + * If relevant, including the versions of your: + + * Python interpreter + * Django + * Haystack + * Search engine used (as well as bindings) + * Optionally of the other dependencies involved + + * Ideally, creating a pull request with a (failing) test case demonstrating + what's wrong. This makes it easy for us to reproduce & fix the problem. + Instructions for running the tests are at :doc:`index` + +You might also hop into the IRC channel (``#haystack`` on ``irc.freenode.net``) +& raise your question there, as there may be someone who can help you with a +work-around. + + +Guidelines For Contributing Code +================================ + +If you're ready to take the plunge & contribute back some code/docs, the +process should look like: + +* Fork the project on GitHub into your own account. +* Clone your copy of Haystack. +* Make a new branch in git & commit your changes there. +* Push your new branch up to GitHub. +* Again, ensure there isn't already an issue or pull request out there on it. + If there is & you feel you have a better fix, please take note of the issue + number & mention it in your pull request. 
+* Create a new pull request (based on your branch), including what the + problem/feature is, versions of your software & referencing any related + issues/pull requests. + +In order to be merged into Haystack, contributions must have the following: + +* A solid patch that: + + * is clear. + * works across all supported versions of Python/Django. + * follows the existing style of the code base (mostly PEP-8). + * comments included as needed. + +* A test case that demonstrates the previous flaw that now passes + with the included patch. +* If it adds/changes a public API, it must also include documentation + for those changes. +* Must be appropriately licensed (see "Philosophy"). +* Adds yourself to the AUTHORS file. + +If your contribution lacks any of these things, they will have to be added +by a core contributor before being merged into Haystack proper, which may take +substantial time for the all-volunteer team to get to. + + +Guidelines For Core Contributors +================================ + +If you've been granted the commit bit, here's how to shepherd the changes in: + +* Any time you go to work on Haystack, please use ``git pull --rebase`` to fetch + the latest changes. +* Any new features/bug fixes must meet the above guidelines for contributing + code (solid patch/tests passing/docs included). +* Commits are typically cherry-picked onto a branch off master. + + * This is done so as not to include extraneous commits, as some people submit + pull reqs based on their git master that has other things applied to it. + +* A set of commits should be squashed down to a single commit. + + * ``git merge --squash`` is a good tool for performing this, as is + ``git rebase -i HEAD~N``. + * This is done to prevent anyone using the git repo from accidently pulling + work-in-progress commits. + +* Commit messages should use past tense, describe what changed & thank anyone + involved. Examples:: + + """Added support for the latest version of Whoosh (v2.3.2).""" + """Fixed a bug in ``solr_backend.py``. Thanks to joeschmoe for the report!""" + """BACKWARD-INCOMPATIBLE: Altered the arguments passed to ``SearchBackend``. + + Further description appears here if the change warrants an explanation + as to why it was done.""" + +* For any patches applied from a contributor, please ensure their name appears + in the AUTHORS file. +* When closing issues or pull requests, please reference the SHA in the closing + message (i.e. ``Thanks! Fixed in SHA: 6b93f6``). GitHub will automatically + link to it. diff --git a/docs/creating_new_backends.rst b/docs/creating_new_backends.rst new file mode 100644 index 0000000..df5551c --- /dev/null +++ b/docs/creating_new_backends.rst @@ -0,0 +1,34 @@ +.. _ref-creating-new-backends: + +===================== +Creating New Backends +===================== + +The process should be fairly simple. + +#. Create new backend file. Name is important. +#. Two classes inside. + + #. SearchBackend (inherit from haystack.backends.BaseSearchBackend) + #. SearchQuery (inherit from haystack.backends.BaseSearchQuery) + + +SearchBackend +============= + +Responsible for the actual connection and low-level details of interacting with +the backend. + +* Connects to search engine +* Method for saving new docs to index +* Method for removing docs from index +* Method for performing the actual query + + +SearchQuery +=========== + +Responsible for taking structured data about the query and converting it into a +backend appropriate format. 
+ +* Method for creating the backend specific query - ``build_query``. diff --git a/docs/debugging.rst b/docs/debugging.rst new file mode 100644 index 0000000..f2e2b4a --- /dev/null +++ b/docs/debugging.rst @@ -0,0 +1,107 @@ +.. ref-debugging: + +================== +Debugging Haystack +================== + +There are some common problems people run into when using Haystack for the first +time. Some of the common problems and things to try appear below. + +.. note:: + + As a general suggestion, your best friend when debugging an issue is to + use the ``pdb`` library included with Python. By dropping a + ``import pdb; pdb.set_trace()`` in your code before the issue occurs, you + can step through and examine variable/logic as you progress through. Make + sure you don't commit those ``pdb`` lines though. + + +"No module named haystack." +=========================== + +This problem usually occurs when first adding Haystack to your project. + +* Are you using the ``haystack`` directory within your ``django-haystack`` + checkout/install? +* Is the ``haystack`` directory on your ``PYTHONPATH``? Alternatively, is + ``haystack`` symlinked into your project? +* Start a Django shell (``./manage.py shell``) and try ``import haystack``. + You may receive a different, more descriptive error message. +* Double-check to ensure you have no circular imports. (i.e. module A tries + importing from module B which is trying to import from module A.) + + +"No results found." (On the web page) +===================================== + +Several issues can cause no results to be found. Most commonly it is either +not running a ``rebuild_index`` to populate your index or having a blank +``document=True`` field, resulting in no content for the engine to search on. + +* Do you have a ``search_indexes.py`` located within an installed app? +* Do you have data in your database? +* Have you run a ``./manage.py rebuild_index`` to index all of your content? +* Try running ``./manage.py rebuild_index -v2`` for more verbose output to + ensure data is being processed/inserted. +* Start a Django shell (``./manage.py shell``) and try:: + + >>> from haystack.query import SearchQuerySet + >>> sqs = SearchQuerySet().all() + >>> sqs.count() + +* You should get back an integer > 0. If not, check the above and reindex. + + >>> sqs[0] # Should get back a SearchResult object. + >>> sqs[0].id # Should get something back like 'myapp.mymodel.1'. + >>> sqs[0].text # ... or whatever your document=True field is. + +* If you get back either ``u''`` or ``None``, it means that your data isn't + making it into the main field that gets searched. You need to check that the + field either has a template that uses the model data, a ``model_attr`` that + pulls data directly from the model or a ``prepare/prepare_FOO`` method that + populates the data at index time. +* Check the template for your search page and ensure it is looping over the + results properly. Also ensure that it's either accessing valid fields coming + back from the search engine or that it's trying to access the associated + model via the ``{{ result.object.foo }}`` lookup. + + +"LockError: [Errno 17] File exists: '/path/to/whoosh_index/_MAIN_LOCK'" +======================================================================= + +This is a Whoosh-specific traceback. It occurs when the Whoosh engine in one +process/thread is locks the index files for writing while another process/thread +tries to access them. 
This is a common error when using ``RealtimeSignalProcessor`` +with Whoosh under any kind of load, which is why it's only recommended for +small sites or development. + +The only real solution is to set up a cron job that runs +``./manage.py rebuild_index`` (optionally with ``--age=24``) that runs nightly +(or however often you need) to refresh the search indexes. Then disable the +use of the ``RealtimeSignalProcessor`` within your settings. + +The downside to this is that you lose real-time search. For many people, this +isn't an issue and this will allow you to scale Whoosh up to a much higher +traffic. If this is not acceptable, you should investigate either the Solr or +Xapian backends. + + +"Failed to add documents to Solr: [Reason: None]" +================================================= + +This is a Solr-specific traceback. It generally occurs when there is an error +with your ``HAYSTACK_CONNECTIONS[]['URL']``. Since Solr acts as a webservice, you should +test the URL in your web browser. If you receive an error, you may need to +change your URL. + +This can also be caused when using old versions of pysolr (2.0.9 and before) with httplib2 and +including a trailing slash in your ``HAYSTACK_CONNECTIONS[]['URL']``. If this applies to +you, please upgrade to the current version of pysolr. + + +"Got an unexpected keyword argument 'boost'" +============================================ + +This is a Solr-specific traceback. This can also be caused when using old +versions of pysolr (2.0.12 and before). Please upgrade your version of +pysolr (2.0.13+). diff --git a/docs/faceting.rst b/docs/faceting.rst new file mode 100644 index 0000000..f2e64f4 --- /dev/null +++ b/docs/faceting.rst @@ -0,0 +1,328 @@ +.. _ref-faceting: + +======== +Faceting +======== + +What Is Faceting? +----------------- + +Faceting is a way to provide users with feedback about the number of documents +which match terms they may be interested in. At its simplest, it gives +document counts based on words in the corpus, date ranges, numeric ranges or +even advanced queries. + +Faceting is particularly useful when trying to provide users with drill-down +capabilities. The general workflow in this regard is: + + #. You can choose what you want to facet on. + #. The search engine will return the counts it sees for that match. + #. You display those counts to the user and provide them with a link. + #. When the user chooses a link, you narrow the search query to only include + those conditions and display the results, potentially with further facets. + +.. note:: + + Faceting can be difficult, especially in providing the user with the right + number of options and/or the right areas to be able to drill into. This + is unique to every situation and demands following what real users need. + + You may want to consider logging queries and looking at popular terms to + help you narrow down how you can help your users. + +Haystack provides functionality so that all of the above steps are possible. +From the ground up, let's build a faceted search setup. This assumes that you +have been to work through the :doc:`tutorial` and have a working Haystack +installation. The same setup from the :doc:`tutorial` applies here. + +1. Determine Facets And ``SearchQuerySet`` +------------------------------------------ + +Determining what you want to facet on isn't always easy. For our purposes, +we'll facet on the ``author`` field. 
+ +In order to facet effectively, the search engine should store both a standard +representation of your data as well as exact version to facet on. This is +generally accomplished by duplicating the field and storing it via two +different types. Duplication is suggested so that those fields are still +searchable in the standard ways. + +To inform Haystack of this, you simply pass along a ``faceted=True`` parameter +on the field(s) you wish to facet on. So to modify our existing example:: + + class NoteIndex(SearchIndex, indexes.Indexable): + text = CharField(document=True, use_template=True) + author = CharField(model_attr='user', faceted=True) + pub_date = DateTimeField(model_attr='pub_date') + +Haystack quietly handles all of the backend details for you, creating a similar +field to the type you specified with ``_exact`` appended. Our example would now +have both a ``author`` and ``author_exact`` field, though this is largely an +implementation detail. + +To pull faceting information out of the index, we'll use the +``SearchQuerySet.facet`` method to setup the facet and the +``SearchQuerySet.facet_counts`` method to retrieve back the counts seen. + +Experimenting in a shell (``./manage.py shell``) is a good way to get a feel +for what various facets might look like:: + + >>> from haystack.query import SearchQuerySet + >>> sqs = SearchQuerySet().facet('author') + >>> sqs.facet_counts() + { + 'dates': {}, + 'fields': { + 'author': [ + ('john', 4), + ('daniel', 2), + ('sally', 1), + ('terry', 1), + ], + }, + 'queries': {} + } + +.. note:: + + Note that, despite the duplication of fields, you should provide the + regular name of the field when faceting. Haystack will intelligently + handle the underlying details and mapping. + +As you can see, we get back a dictionary which provides access to the three +types of facets available: ``fields``, ``dates`` and ``queries``. Since we only +faceted on the ``author`` field (which actually facets on the ``author_exact`` +field managed by Haystack), only the ``fields`` key has any data +associated with it. In this case, we have a corpus of eight documents with four +unique authors. + +.. note:: + Facets are chainable, like most ``SearchQuerySet`` methods. However, unlike + most ``SearchQuerySet`` methods, they are *NOT* affected by ``filter`` or + similar methods. The only method that has any effect on facets is the + ``narrow`` method (which is how you provide drill-down). + +Configuring facet behaviour +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can configure the behaviour of your facets by passing options +for each facet in your SearchQuerySet. These options can be backend specific. + +**limit** +*tested on Solr* + +The ``limit`` parameter limits the results for each query. On Solr, the default `facet.limit`_ is 100 and a +negative number removes the limit. + +.. _facet.limit: https://wiki.apache.org/solr/SimpleFacetParameters#facet.limit + +Example usage:: + + >>> from haystack.query import SearchQuerySet + >>> sqs = SearchQuerySet().facet('author', limit=-1) + >>> sqs.facet_counts() + { + 'dates': {}, + 'fields': { + 'author': [ + ('abraham', 1), + ('benny', 2), + ('cindy', 1), + ('diana', 5), + ], + }, + 'queries': {} + } + + >>> sqs = SearchQuerySet().facet('author', limit=2) + >>> sqs.facet_counts() + { + 'dates': {}, + 'fields': { + 'author': [ + ('abraham', 1), + ('benny', 2), + ], + }, + 'queries': {} + } + +**sort** +*tested on Solr* + +The ``sort`` parameter will sort the results for each query. 
Solr's default +`facet.sort`_ is ``index``, which will sort the facets alphabetically. Changing +the parameter to ``count`` will sort the facets by the number of results for +each facet value. + +.. _facet.sort: https://wiki.apache.org/solr/SimpleFacetParameters#facet.sort + + +Example usage:: + + >>> from haystack.query import SearchQuerySet + >>> sqs = SearchQuerySet().facet('author', sort='index', ) + >>> sqs.facet_counts() + { + 'dates': {}, + 'fields': { + 'author': [ + ('abraham', 1), + ('benny', 2), + ('cindy', 1), + ('diana', 5), + ], + }, + 'queries': {} + } + + >>> sqs = SearchQuerySet().facet('author', sort='count', ) + >>> sqs.facet_counts() + { + 'dates': {}, + 'fields': { + 'author': [ + ('diana', 5), + ('benny', 2), + ('abraham', 1), + ('cindy', 1), + ], + }, + 'queries': {} + } + + +Now that we have the facet we want, it's time to implement it. + +2. Switch to the ``FacetedSearchView`` and ``FacetedSearchForm`` +---------------------------------------------------------------- + +There are three things that we'll need to do to expose facets to our frontend. +The first is construct the ``SearchQuerySet`` we want to use. We should have +that from the previous step. The second is to switch to the +``FacetedSearchView``. This view is useful because it prepares the facet counts +and provides them in the context as ``facets``. + +Optionally, the third step is to switch to the ``FacetedSearchForm``. As it +currently stands, this is only useful if you want to provide drill-down, though +it may provide more functionality in the future. We'll do it for the sake of +having it in place but know that it's not required. + +In your URLconf, you'll need to switch to the ``FacetedSearchView``. Your +URLconf should resemble:: + + from django.conf.urls.defaults import * + from haystack.forms import FacetedSearchForm + from haystack.query import SearchQuerySet + from haystack.views import FacetedSearchView + + + sqs = SearchQuerySet().facet('author') + + + urlpatterns = patterns('haystack.views', + url(r'^$', FacetedSearchView(form_class=FacetedSearchForm, searchqueryset=sqs), name='haystack_search'), + ) + +The ``FacetedSearchView`` will now instantiate the ``FacetedSearchForm`` and use +the ``SearchQuerySet`` we provided. Now, a ``facets`` variable will be present +in the context. This is added in an overridden ``extra_context`` method. + + +3. Display The Facets In The Template +------------------------------------- + +Templating facets involves simply adding an extra bit of processing to display +the facets (and optionally to link to provide drill-down). An example template +might look like this:: + +
+    <form method="get" action=".">
+        <table>
+            {{ form.as_table }}
+            <tr>
+                <td>&nbsp;</td>
+                <td>
+                    <input type="submit" value="Search">
+                </td>
+            </tr>
+        </table>
+    </form>
+
+    {% if query %}
+        <h2>By Author</h2>
+
+        <dl>
+            {% if facets.fields.author %}
+                <dt>Author</dt>
+                {# Provide only the top 5 authors #}
+                {% for author in facets.fields.author|slice:":5" %}
+                    <dd>{{ author.0 }} ({{ author.1 }})</dd>
+                {% endfor %}
+            {% else %}
+                <p>No author facets.</p>
+            {% endif %}
+        </dl>
+
+        {% for result in page.object_list %}
+            <div>
+                <h2>{{ result.object.title }}</h2>
+
+                <p>{{ result.object.body|truncatewords:80 }}</p>
+            </div>
+        {% empty %}
+            <p>Sorry, no results found.</p>
+ {% endfor %} + {% endif %} + +Displaying the facets is a matter of looping through the facets you want and +providing the UI to suit. The ``author.0`` is the facet text from the backend +and the ``author.1`` is the facet count. + +4. Narrowing The Search +----------------------- + +We've also set ourselves up for the last bit, the drill-down aspect. By +appending on the ``selected_facets`` to the URLs, we're informing the +``FacetedSearchForm`` that we want to narrow our results to only those +containing the author we provided. + +For a concrete example, if the facets on author come back as:: + + { + 'dates': {}, + 'fields': { + 'author': [ + ('john', 4), + ('daniel', 2), + ('sally', 1), + ('terry', 1), + ], + }, + 'queries': {} + } + +You should present a list similar to:: + + + +.. warning:: + + Haystack can automatically handle most details around faceting. However, + since ``selected_facets`` is passed directly to narrow, it must use the + duplicated field name. Improvements to this are planned but incomplete. + +This is simply the default behavior but it is possible to override or provide +your own form which does additional processing. You could also write your own +faceted ``SearchView``, which could provide additional/different facets based +on facets chosen. There is a wide range of possibilities available to help the +user navigate your content. diff --git a/docs/faq.rst b/docs/faq.rst new file mode 100644 index 0000000..94fd0d3 --- /dev/null +++ b/docs/faq.rst @@ -0,0 +1,117 @@ +.. _ref-frequently-asked-questions: + +============================== +(In)Frequently Asked Questions +============================== + + +What is Haystack? +================= + +Haystack is meant to be a portable interface to a search engine of your choice. +Some might call it a search framework, an abstraction layer or what have you. +The idea is that you write your search code once and should be able to freely +switch between backends as your situation necessitates. + + +Why should I consider using Haystack? +===================================== + +Haystack is targeted at the following use cases: + +* If you want to feature search on your site and search solutions like Google or + Yahoo search don't fit your needs. +* If you want to be able to customize your search and search on more than just + the main content. +* If you want to have features like drill-down (faceting) or "More Like This". +* If you want a interface that is non-search engine specific, allowing you to + change your mind later without much rewriting. + + +When should I not be using Haystack? +==================================== + +* Non-Model-based data. If you just want to index random data (flat files, + alternate sources, etc.), Haystack isn't a good solution. Haystack is very + ``Model``-based and doesn't work well outside of that use case. +* Ultra-high volume. Because of the very nature of Haystack (abstraction layer), + there's more overhead involved. This makes it portable, but as with all + abstraction layers, you lose a little performance. You also can't take full + advantage of the exact feature-set of your search engine. This is the price + of pluggable backends. + + +Why was Haystack created when there are so many other search options? +===================================================================== + +The proliferation of search options in Django is a relatively recent development +and is actually one of the reasons for Haystack's existence. 
There are too +many options that are only partial solutions or are too engine specific. + +Further, most use an unfamiliar API and documentation is lacking in most cases. + +Haystack is an attempt to unify these efforts into one solution. That's not to +say there should be no alternatives, but Haystack should provide a good +solution to 80%+ of the search use cases out there. + + +What's the history behind Haystack? +=================================== + +Haystack started because of my frustration with the lack of good search options +(before many other apps came out) and as the result of extensive use of +Djangosearch. Djangosearch was a decent solution but had a number of +shortcomings, such as: + +* Tied to the models.py, so you'd have to modify the source of third-party ( + or django.contrib) apps in order to effectively use it. +* All or nothing approach to indexes. So all indexes appear on all sites and + in all places. +* Lack of tests. +* Lack of documentation. +* Uneven backend implementations. + +The initial idea was to simply fork Djangosearch and improve on these (and +other issues). However, after stepping back, I decided to overhaul the entire +API (and most of the underlying code) to be more representative of what I would +want as an end-user. The result was starting afresh and reusing concepts (and +some code) from Djangosearch as needed. + +As a result of this heritage, you can actually still find some portions of +Djangosearch present in Haystack (especially in the ``SearchIndex`` and +``SearchBackend`` classes) where it made sense. The original authors of +Djangosearch are aware of this and thus far have seemed to be fine with this +reuse. + + +Why doesn't have a backend included in Haystack? +================================================================== + +Several possibilities on this. + +#. Licensing + + A common problem is that the Python bindings for a specific engine may + have been released under an incompatible license. The goal is for Haystack + to remain BSD licensed and importing bindings with an incompatible license + can technically convert the entire codebase to that license. This most + commonly occurs with GPL'ed bindings. + +#. Lack of time + + The search engine in question may be on the list of backends to add and we + simply haven't gotten to it yet. We welcome patches for additional backends. + +#. Incompatible API + + In order for an engine to work well with Haystack, a certain baseline set of + features is needed. This is often an issue when the engine doesn't support + ranged queries or additional attributes associated with a search record. + +#. We're not aware of the engine + + If you think we may not be aware of the engine you'd like, please tell us + about it (preferably via the group - + http://groups.google.com/group/django-haystack/). Be sure to check through + the backends (in case it wasn't documented) and search the history on the + group to minimize duplicates. diff --git a/docs/glossary.rst b/docs/glossary.rst new file mode 100644 index 0000000..f6a1e6e --- /dev/null +++ b/docs/glossary.rst @@ -0,0 +1,76 @@ +.. _ref-glossary: + +======== +Glossary +======== + +Search is a domain full of its own jargon and definitions. As this may be an +unfamiliar territory to many developers, what follows are some commonly used +terms and what they mean. + + +Engine + An engine, for the purposes of Haystack, is a third-party search solution. + It might be a full service (i.e. Solr_) or a library to build an + engine with (i.e. Whoosh_) + +.. 
_Solr: http://lucene.apache.org/solr/ +.. _Whoosh: https://bitbucket.org/mchaput/whoosh/ + +Index + The datastore used by the engine is called an index. Its structure can vary + wildly between engines but commonly they resemble a document store. This is + the source of all information in Haystack. + +Document + A document is essentially a record within the index. It usually contains at + least one blob of text that serves as the primary content the engine searches + and may have additional data hung off it. + +Corpus + A term for a collection of documents. When talking about the documents stored + by the engine (rather than the technical implementation of the storage), this + term is commonly used. + +Field + Within the index, each document may store extra data with the main content as + a field. Also sometimes called an attribute, this usually represents metadata + or extra content about the document. Haystack can use these fields for + filtering and display. + +Term + A term is generally a single word (or word-like) string of characters used + in a search query. + +Stemming + A means of determining if a word has any root words. This varies by language, + but in English, this generally consists of removing plurals, an action form of + the word, et cetera. For instance, in English, 'giraffes' would stem to + 'giraffe'. Similarly, 'exclamation' would stem to 'exclaim'. This is useful + for finding variants of the word that may appear in other documents. + +Boost + Boost provides a means to take a term or phrase from a search query and alter + the relevance of a result based on if that term is found in the result, a form + of weighting. For instance, if you wanted to more heavily weight results that + included the word 'zebra', you'd specify a boost for that term within the + query. + +More Like This + Incorporating techniques from information retrieval and artificial + intelligence, More Like This is a technique for finding other documents within + the index that closely resemble the document in question. This is useful for + programmatically generating a list of similar content for a user to browse + based on the current document they are viewing. + +Faceting + Faceting is a way to provide insight to the user into the contents of your + corpus. In its simplest form, it is a set of document counts returned with + results when performing a query. These counts can be used as feedback for + the user, allowing the user to choose interesting aspects of their search + results and "drill down" into those results. + + An example might be providing a facet on an ``author`` field, providing back a + list of authors and the number of documents in the index they wrote. This + could be presented to the user with a link, allowing the user to click and + narrow their original search to all results by that author. 
diff --git a/docs/haystack_theme/layout.html b/docs/haystack_theme/layout.html new file mode 100644 index 0000000..e1d4ab3 --- /dev/null +++ b/docs/haystack_theme/layout.html @@ -0,0 +1,22 @@ +{% extends "basic/layout.html" %} + +{%- block extrahead %} + + +{% endblock %} + +{%- block header %} + +{% endblock %} \ No newline at end of file diff --git a/docs/haystack_theme/static/documentation.css b/docs/haystack_theme/static/documentation.css new file mode 100644 index 0000000..3e9492c --- /dev/null +++ b/docs/haystack_theme/static/documentation.css @@ -0,0 +1,29 @@ +a, a:link, a:hover { background-color: transparent !important; color: #CAECFF; outline-color: transparent !important; text-decoration: underline; } +dl dt { text-decoration: underline; } +dl.class dt, dl.method dt { background-color: #444444; padding: 5px; text-decoration: none; } +tt.descname { font-weight: normal; } +dl.method dt span.optional { font-weight: normal; } +div#header { margin-bottom: 0px; } +div.document, div.related, div.footer { width: 900px; margin: 0 auto; } +div.document { margin-top: 10px; } +div.related { background-color: #262511; padding-left: 10px; padding-right: 10px; } +div.documentwrapper { width:640px; float:left;} +div.body h1, +div.body h2, +div.body h3, +div.body h4, +div.body h5, +div.body h6 { + background-color: #053211; + font-weight: normal; + border-bottom: 2px solid #262511; + margin: 20px -20px 10px -20px; + padding: 3px 0 3px 10px; +} +div.sphinxsidebar { width:220px; float:right;} +div.sphinxsidebar ul { padding-left: 10px; } +div.sphinxsidebar ul ul { padding-left: 10px; margin-left: 10px; } +div.bodywrapper { margin: 0px; } +div.highlight-python, div.highlight { background-color: #262511; margin-bottom: 10px; padding: 10px; } +div.footer { background-color:#262511; font-size: 90%; padding: 10px; } +table thead { background-color: #053211; border-bottom: 1px solid #262511; } \ No newline at end of file diff --git a/docs/haystack_theme/theme.conf b/docs/haystack_theme/theme.conf new file mode 100644 index 0000000..3161b4d --- /dev/null +++ b/docs/haystack_theme/theme.conf @@ -0,0 +1,2 @@ +[theme] +inherit = basic \ No newline at end of file diff --git a/docs/highlighting.rst b/docs/highlighting.rst new file mode 100644 index 0000000..5c95619 --- /dev/null +++ b/docs/highlighting.rst @@ -0,0 +1,77 @@ +.. _ref-highlighting: + +============ +Highlighting +============ + +Haystack supports two different methods of highlighting. You can either use +``SearchQuerySet.highlight`` or the built-in ``{% highlight %}`` template tag, +which uses the ``Highlighter`` class. Each approach has advantages and +disadvantages you need to weigh when deciding which to use. + +If you want portable, flexible, decently fast code, the +``{% highlight %}`` template tag (or manually using the underlying +``Highlighter`` class) is the way to go. On the other hand, if you care more +about speed and will only ever be using one backend, +``SearchQuerySet.highlight`` may suit your needs better. + +Use of ``SearchQuerySet.highlight`` is documented in the +:doc:`searchqueryset_api` documentation and the ``{% highlight %}`` tag is +covered in the :doc:`templatetags` documentation, so the rest of this material +will cover the ``Highlighter`` implementation. + + +``Highlighter`` +--------------- + +The ``Highlighter`` class is a pure-Python implementation included with Haystack +that's designed for flexibility. If you use the ``{% highlight %}`` template +tag, you'll be automatically using this class. 
You can also use it manually in +your code. For example:: + + >>> from haystack.utils import Highlighter + + >>> my_text = 'This is a sample block that would be more meaningful in real life.' + >>> my_query = 'block meaningful' + + >>> highlight = Highlighter(my_query) + >>> highlight.highlight(my_text) + u'...block that would be more meaningful in real life.' + +The default implementation takes three optional kwargs: ``html_tag``, +``css_class`` and ``max_length``. These allow for basic customizations to the +output, like so:: + + >>> from haystack.utils import Highlighter + + >>> my_text = 'This is a sample block that would be more meaningful in real life.' + >>> my_query = 'block meaningful' + + >>> highlight = Highlighter(my_query, html_tag='div', css_class='found', max_length=35) + >>> highlight.highlight(my_text) + u'...
<div class="found">block</div> that would be more <div class="found">meaningful</div>
...' + +Further, if this implementation doesn't suit your needs, you can define your own +custom highlighter class. As long as it implements the API you've just seen, it +can highlight however you choose. For example:: + + # In ``myapp/utils.py``... + from haystack.utils import Highlighter + + class BorkHighlighter(Highlighter): + def render_html(self, highlight_locations=None, start_offset=None, end_offset=None): + highlighted_chunk = self.text_block[start_offset:end_offset] + + for word in self.query_words: + highlighted_chunk = highlighted_chunk.replace(word, 'Bork!') + + return highlighted_chunk + +Then set the ``HAYSTACK_CUSTOM_HIGHLIGHTER`` setting to +``myapp.utils.BorkHighlighter``. Usage would then look like:: + + >>> highlight = BorkHighlighter(my_query) + >>> highlight.highlight(my_text) + u'Bork! that would be more Bork! in real life.' + +Now the ``{% highlight %}`` template tag will also use this highlighter. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..9ca34eb --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,117 @@ +Welcome to Haystack! +==================== + +Haystack provides modular search for Django. It features a unified, familiar +API that allows you to plug in different search backends (such as Solr_, +Elasticsearch_, Whoosh_, Xapian_, etc.) without having to modify your code. + +.. _Solr: http://lucene.apache.org/solr/ +.. _Elasticsearch: http://elasticsearch.org/ +.. _Whoosh: https://bitbucket.org/mchaput/whoosh/ +.. _Xapian: http://xapian.org/ + + +.. note:: + + This documentation represents Haystack 2.x. For old versions of the documentation: `1.2`_, `1.1`_. + +.. _`1.2`: http://django-haystack.readthedocs.org/en/v1.2.6/index.html +.. _`1.1`: http://django-haystack.readthedocs.org/en/v1.1/index.html + +Getting Started +--------------- + +If you're new to Haystack, you may want to start with these documents to get +you up and running: + +.. toctree:: + :maxdepth: 2 + + tutorial + +.. toctree:: + :maxdepth: 1 + + views_and_forms + templatetags + glossary + management_commands + faq + who_uses + other_apps + installing_search_engines + debugging + + migration_from_1_to_2 + python3 + contributing + + +Advanced Uses +------------- + +Once you've got Haystack working, here are some of the more complex features +you may want to include in your application. + +.. toctree:: + :maxdepth: 1 + + best_practices + highlighting + faceting + autocomplete + boost + signal_processors + multiple_index + rich_content_extraction + spatial + admin + + +Reference +--------- + +If you're an experienced user and are looking for a reference, you may be +looking for API documentation and advanced usage as detailed in: + +.. toctree:: + :maxdepth: 2 + + searchqueryset_api + searchindex_api + inputtypes + searchfield_api + searchresult_api + searchquery_api + searchbackend_api + + architecture_overview + backend_support + settings + utils + + +Developing +---------- + +Finally, if you're looking to help out with the development of Haystack, +the following links should help guide you on running tests and creating +additional backends: + +.. toctree:: + :maxdepth: 1 + + running_tests + creating_new_backends + + +Requirements +------------ + +Haystack has a relatively easily-met set of requirements. + +* Python 2.7+ or Python 3.3+ +* Django 1.6+ + +Additionally, each backend has its own requirements. You should refer to +:doc:`installing_search_engines` for more details. 
diff --git a/docs/inputtypes.rst b/docs/inputtypes.rst new file mode 100644 index 0000000..fe839e6 --- /dev/null +++ b/docs/inputtypes.rst @@ -0,0 +1,177 @@ +.. _ref-inputtypes: + +=========== +Input Types +=========== + +Input types allow you to specify more advanced query behavior. They serve as a +way to alter the query, often in backend-specific ways, without altering your +Python code; as well as enabling use of more advanced features. + +Input types currently are only useful with the ``filter/exclude`` methods on +``SearchQuerySet``. Expanding this support to other methods is on the roadmap. + + +Available Input Types +===================== + +Included with Haystack are the following input types: + +``Raw`` +------- + +.. class:: haystack.inputs.Raw + +Raw allows you to specify backend-specific query syntax. If Haystack doesn't +provide a way to access special query functionality, you can make use of this +input type to pass it along. + +Example:: + + # Fielded. + sqs = SearchQuerySet().filter(author=Raw('daniel OR jones')) + + # Non-fielded. + # See ``AltParser`` for a better way to construct this. + sqs = SearchQuerySet().filter(content=Raw('{!dismax qf=author mm=1}haystack')) + + +``Clean`` +--------- + +.. class:: haystack.inputs.Clean + +``Clean`` takes standard user (untrusted) input and sanitizes it. It ensures +that no unintended operators or special characters make it into the query. + +This is roughly analogous to Django's ``autoescape`` support. + +.. note:: + + By default, if you hand a ``SearchQuerySet`` a bare string, it will get + wrapped in this class. + +Example:: + + # This becomes "daniel or jones". + sqs = SearchQuerySet().filter(content=Clean('daniel OR jones')) + + # Things like ``:`` & ``/`` get escaped. + sqs = SearchQuerySet().filter(url=Clean('http://www.example.com')) + + # Equivalent (automatically wrapped in ``Clean``). + sqs = SearchQuerySet().filter(url='http://www.example.com') + + +``Exact`` +--------- + +.. class:: haystack.inputs.Exact + +``Exact`` allows for making sure a phrase is exactly matched, unlike the usual +``AND`` lookups, where words may be far apart. + +Example:: + + sqs = SearchQuerySet().filter(author=Exact('n-gram support')) + + # Equivalent. + sqs = SearchQuerySet().filter(author__exact='n-gram support') + + +``Not`` +------- + +.. class:: haystack.inputs.Not + +``Not`` allows negation of the query fragment it wraps. As ``Not`` is a subclass +of ``Clean``, it will also sanitize the query. + +This is generally only used internally. Most people prefer to use the +``SearchQuerySet.exclude`` method. + +Example:: + + sqs = SearchQuerySet().filter(author=Not('daniel')) + + +``AutoQuery`` +------------- + +.. class:: haystack.inputs.AutoQuery + +``AutoQuery`` takes a more complex user query (that includes simple, standard +query syntax bits) & forms a proper query out of them. It also handles +sanitizing that query using ``Clean`` to ensure the query doesn't break. + +``AutoQuery`` accommodates for handling regular words, NOT-ing words & +extracting exact phrases. + +Example:: + + # Against the main text field with an accidental ":" before "search". + # Generates a query like ``haystack (NOT whoosh) "fast search"`` + sqs = SearchQuerySet().filter(content=AutoQuery('haystack -whoosh "fast :search"')) + + # Equivalent. + sqs = SearchQuerySet().auto_query('haystack -whoosh "fast :search"') + + # Fielded. + sqs = SearchQuerySet().filter(author=AutoQuery('daniel -day -lewis')) + + +``AltParser`` +------------- + +.. 
class:: haystack.inputs.AltParser + +``AltParser`` lets you specify that a portion of the query should use a +separate parser in the search engine. This is search-engine-specific, so it may +decrease the portability of your app. + +Currently only supported under Solr. + +Example:: + + # DisMax. + sqs = SearchQuerySet().filter(content=AltParser('dismax', 'haystack', qf='text', mm=1)) + + # Prior to the spatial support, you could do... + sqs = SearchQuerySet().filter(content=AltParser('dismax', 'haystack', qf='author', mm=1)) + + +Creating Your Own Input Types +============================= + +Building your own input type is relatively simple. All input types are simple +classes that provide an ``__init__`` & a ``prepare`` method. + +The ``__init__`` may accept any ``args/kwargs``, though the typical use usually +just involves a query string. + +The ``prepare`` method lets you alter the query the user provided before it +becomes of the main query. It is lazy, called as late as possible, right before +the final query is built & shipped to the engine. + +A full, if somewhat silly, example looks like:: + + from haystack.inputs import Clean + + + class NoShoutCaps(Clean): + input_type_name = 'no_shout_caps' + # This is the default & doesn't need to be specified. + post_process = True + + def __init__(self, query_string, **kwargs): + # Stash the original, if you need it. + self.original = query_string + super(NoShoutCaps, self).__init__(query_string, **kwargs) + + def prepare(self, query_obj): + # We need a reference to the current ``SearchQuery`` object this + # will run against, in case we need backend-specific code. + query_string = super(NoShoutCaps, self).prepare(query_obj) + + # Take that, capital letters! + return query_string.lower() diff --git a/docs/installing_search_engines.rst b/docs/installing_search_engines.rst new file mode 100644 index 0000000..e9599a0 --- /dev/null +++ b/docs/installing_search_engines.rst @@ -0,0 +1,222 @@ +.. _ref-installing-search-engines: + +========================= +Installing Search Engines +========================= + +Solr +==== + +Official Download Location: http://www.apache.org/dyn/closer.cgi/lucene/solr/ + +Solr is Java but comes in a pre-packaged form that requires very little other +than the JRE and Jetty. It's very performant and has an advanced featureset. +Haystack suggests using Solr 3.5+, though it's possible to get it working on +Solr 1.4 with a little effort. Installation is relatively simple:: + + curl -LO https://archive.apache.org/dist/lucene/solr/4.10.2/solr-4.10.2.tgz + tar xvzf solr-4.10.2.tgz + cd solr-4.10.2 + cd example + java -jar start.jar + +You'll need to revise your schema. You can generate this from your application +(once Haystack is installed and setup) by running +``./manage.py build_solr_schema``. Take the output from that command and place +it in ``solr-4.10.2/example/solr/collection1/conf/schema.xml``. Then restart Solr. + +.. note:: + ``build_solr_schema`` uses a template to generate ``schema.xml``. Haystack + provides a default template using some sensible defaults. If you would like + to provide your own template, you will need to place it in + ``search_configuration/solr.xml``, inside a directory specified by your app's + ``TEMPLATE_DIRS`` setting. Examples:: + + /myproj/myapp/templates/search_configuration/solr.xml + # ...or... + /myproj/templates/search_configuration/solr.xml + +You'll also need a Solr binding, ``pysolr``. The official ``pysolr`` package, +distributed via PyPI, is the best version to use (2.1.0+). 
Place ``pysolr.py`` +somewhere on your ``PYTHONPATH``. + +.. note:: + + ``pysolr`` has its own dependencies that aren't covered by Haystack. See + https://pypi.python.org/pypi/pysolr for the latest documentation. + +More Like This +-------------- + +To enable the "More Like This" functionality in Haystack, you'll need +to enable the ``MoreLikeThisHandler``. Add the following line to your +``solrconfig.xml`` file within the ``config`` tag:: + + + +Spelling Suggestions +-------------------- + +To enable the spelling suggestion functionality in Haystack, you'll need to +enable the ``SpellCheckComponent``. + +The first thing to do is create a special field on your ``SearchIndex`` class +that mirrors the ``text`` field, but uses ``FacetCharField``. This disables +the post-processing that Solr does, which can mess up your suggestions. +Something like the following is suggested:: + + class MySearchIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + # ... normal fields then... + suggestions = indexes.FacetCharField() + + def prepare(self, obj): + prepared_data = super(MySearchIndex, self).prepare(obj) + prepared_data['suggestions'] = prepared_data['text'] + return prepared_data + +Then, you enable it in Solr by adding the following line to your +``solrconfig.xml`` file within the ``config`` tag:: + + + + textSpell + + + default + suggestions + ./spellchecker1 + true + + + +Then change your default handler from:: + + + +... to ...:: + + + + spellcheck + + + +Be warned that the ``suggestions`` portion will be specific to +your ``SearchIndex`` classes (in this case, assuming the main field is called +``text``). + + +Elasticsearch +============= + +Official Download Location: http://www.elasticsearch.org/download/ + +Elasticsearch is Java but comes in a pre-packaged form that requires very +little other than the JRE. It's also very performant, scales easily and has +an advanced featureset. Haystack requires at least version 0.90.0+. +Installation is best done using a package manager:: + + # On Mac OS X... + brew install elasticsearch + + # On Ubuntu... + apt-get install elasticsearch + + # Then start via: + elasticsearch -f -D es.config= + + # Example: + elasticsearch -f -D es.config=/usr/local/Cellar/elasticsearch/0.90.0/config/elasticsearch.yml + +You may have to alter the configuration to run on ``localhost`` when developing +locally. Modifications should be done in a YAML file, the stock one being +``config/elasticsearch.yml``:: + + # Unicast Discovery (disable multicast) + discovery.zen.ping.multicast.enabled: false + discovery.zen.ping.unicast.hosts: ["127.0.0.1"] + + # Name your cluster here to whatever. + # My machine is called "Venus", so... + cluster: + name: venus + + network: + host: 127.0.0.1 + + path: + logs: /usr/local/var/log + data: /usr/local/var/data + +You'll also need an Elasticsearch binding: elasticsearch-py_ (**NOT** +``pyes``). Place ``elasticsearch`` somewhere on your ``PYTHONPATH`` +(usually ``python setup.py install`` or ``pip install elasticsearch``). + +.. _elasticsearch-py: http://pypi.python.org/pypi/elasticsearch/ + +.. note:: + + Elasticsearch 1.0 is slightly backwards incompatible so you need to make sure + you have the proper version of `elasticsearch-py` installed - releases with + major version 1 (1.X.Y) are to be used with Elasticsearch 1.0 and later, 0.4 + releases are meant to work with Elasticsearch 0.90.X. + +.. note:: + + ``elasticsearch`` has its own dependencies that aren't covered by + Haystack. 
You'll also need ``urllib3``. + + +Whoosh +====== + +Official Download Location: http://bitbucket.org/mchaput/whoosh/ + +Whoosh is pure Python, so it's a great option for getting started quickly and +for development, though it does work for small scale live deployments. The +current recommended version is 1.3.1+. You can install via PyPI_ using +``sudo easy_install whoosh`` or ``sudo pip install whoosh``. + +Note that, while capable otherwise, the Whoosh backend does not currently +support "More Like This" or faceting. Support for these features has recently +been added to Whoosh itself & may be present in a future release. + +.. _PyPI: http://pypi.python.org/pypi/Whoosh/ + + +Xapian +====== + +Official Download Location: http://xapian.org/download + +Xapian is written in C++ so it requires compilation (unless your OS has a +package for it). Installation looks like:: + + curl -O http://oligarchy.co.uk/xapian/1.2.18/xapian-core-1.2.18.tar.xz + curl -O http://oligarchy.co.uk/xapian/1.2.18/xapian-bindings-1.2.18.tar.xz + + unxz xapian-core-1.2.18.tar.xz + unxz xapian-bindings-1.2.18.tar.xz + + tar xvf xapian-core-1.2.18.tar + tar xvf xapian-bindings-1.2.18.tar + + cd xapian-core-1.2.18 + ./configure + make + sudo make install + + cd .. + cd xapian-bindings-1.2.18 + ./configure + make + sudo make install + +Xapian is a third-party supported backend. It is not included in Haystack +proper due to licensing. To use it, you need both Haystack itself as well as +``xapian-haystack``. You can download the source from +http://github.com/notanumber/xapian-haystack/tree/master. Installation +instructions can be found on that page as well. The backend, written +by David Sauve (notanumber), fully implements the `SearchQuerySet` API and is +an excellent alternative to Solr. diff --git a/docs/management_commands.rst b/docs/management_commands.rst new file mode 100644 index 0000000..e167923 --- /dev/null +++ b/docs/management_commands.rst @@ -0,0 +1,201 @@ +.. _ref-management-commands: + +=================== +Management Commands +=================== + +Haystack comes with several management commands to make working with Haystack +easier. + + +``clear_index`` +=============== + +The ``clear_index`` command wipes out your entire search index. Use with +caution. In addition to the standard management command options, it accepts the +following arguments:: + + ``--noinput``: + If provided, the interactive prompts are skipped and the index is + uncerimoniously wiped out. + ``--verbosity``: + Accepted but ignored. + ``--using``: + If provided, determines which connection should be used. Default is + ``default``. + ``--nocommit``: + If provided, it will pass commit=False to the backend. This means that the + update will not become immediately visible and will depend on another explicit commit + or the backend's commit strategy to complete the update. + +By default, this is an **INTERACTIVE** command and assumes that you do **NOT** +wish to delete the entire index. + +.. note:: + + The ``--nocommit`` argument is only supported by the Solr backend. + +.. warning:: + + Depending on the backend you're using, this may simply delete the entire + directory, so be sure your ``HAYSTACK_CONNECTIONS[]['PATH']`` setting is correctly + pointed at just the index directory. + + +``update_index`` +================ + +.. note:: + + If you use the ``--start/--end`` flags on this command, you'll need to + install dateutil_ to handle the datetime parsing. + + .. 
_dateutil: http://pypi.python.org/pypi/python-dateutil/1.5 + +The ``update_index`` command will freshen all of the content in your index. It +iterates through all indexed models and updates the records in the index. In +addition to the standard management command options, it accepts the following +arguments:: + + ``--age``: + Number of hours back to consider objects new. Useful for nightly + reindexes (``--age=24``). Requires ``SearchIndexes`` to implement + the ``get_updated_field`` method. Default is ``None``. + ``--start``: + The start date for indexing within. Can be any dateutil-parsable string, + recommended to be YYYY-MM-DDTHH:MM:SS. Requires ``SearchIndexes`` to + implement the ``get_updated_field`` method. Default is ``None``. + ``--end``: + The end date for indexing within. Can be any dateutil-parsable string, + recommended to be YYYY-MM-DDTHH:MM:SS. Requires ``SearchIndexes`` to + implement the ``get_updated_field`` method. Default is ``None``. + ``--batch-size``: + Number of items to index at once. Default is 1000. + ``--remove``: + Remove objects from the index that are no longer present in the + database. + ``--workers``: + Allows for the use multiple workers to parallelize indexing. Requires + ``multiprocessing``. + ``--verbosity``: + If provided, dumps out more information about what's being done. + + * ``0`` = No output + * ``1`` = Minimal output describing what models were indexed + and how many records. + * ``2`` = Full output, including everything from ``1`` plus output + on each batch that is indexed, which is useful when debugging. + ``--using``: + If provided, determines which connection should be used. Default is + ``default``. + ``--nocommit``: + If provided, it will pass commit=False to the backend. This means that the + updates will not become immediately visible and will depend on another explicit commit + or the backend's commit strategy to complete the update. + +.. note:: + + The ``--nocommit`` argument is only supported by the Solr and Elasticsearch backends. + +Examples:: + + # Update everything. + ./manage.py update_index --settings=settings.prod + + # Update everything with lots of information about what's going on. + ./manage.py update_index --settings=settings.prod --verbosity=2 + + # Update everything, cleaning up after deleted models. + ./manage.py update_index --remove --settings=settings.prod + + # Update everything changed in the last 2 hours. + ./manage.py update_index --age=2 --settings=settings.prod + + # Update everything between Dec. 1, 2011 & Dec 31, 2011 + ./manage.py update_index --start='2011-12-01T00:00:00' --end='2011-12-31T23:59:59' --settings=settings.prod + + # Update just a couple apps. + ./manage.py update_index blog auth comments --settings=settings.prod + + # Update just a single model (in a complex app). + ./manage.py update_index auth.User --settings=settings.prod + + # Crazy Go-Nuts University + ./manage.py update_index events.Event media news.Story --start='2011-01-01T00:00:00 --remove --using=hotbackup --workers=12 --verbosity=2 --settings=settings.prod + +.. note:: + + This command *ONLY* updates records in the index. It does *NOT* handle + deletions unless the ``--remove`` flag is provided. You might consider + a queue consumer if the memory requirements for ``--remove`` don't + fit your needs. Alternatively, you can use the + ``RealtimeSignalProcessor``, which will automatically handle deletions. + + +``rebuild_index`` +================= + +A shortcut for ``clear_index`` followed by ``update_index``. 
It accepts any/all +of the arguments of the following arguments:: + + ``--age``: + Number of hours back to consider objects new. Useful for nightly + reindexes (``--age=24``). Requires ``SearchIndexes`` to implement + the ``get_updated_field`` method. + ``--batch-size``: + Number of items to index at once. Default is 1000. + ``--site``: + The site object to use when reindexing (like `search_sites.mysite`). + ``--noinput``: + If provided, the interactive prompts are skipped and the index is + uncerimoniously wiped out. + ``--remove``: + Remove objects from the index that are no longer present in the + database. + ``--verbosity``: + If provided, dumps out more information about what's being done. + + * ``0`` = No output + * ``1`` = Minimal output describing what models were indexed + and how many records. + * ``2`` = Full output, including everything from ``1`` plus output + on each batch that is indexed, which is useful when debugging. + ``--using``: + If provided, determines which connection should be used. Default is + ``default``. + ``--nocommit``: + If provided, it will pass commit=False to the backend. This means that the + update will not become immediately visible and will depend on another explicit commit + or the backend's commit strategy to complete the update. + +For when you really, really want a completely rebuilt index. + + +``build_solr_schema`` +===================== + +Once all of your ``SearchIndex`` classes are in place, this command can be used +to generate the XML schema Solr needs to handle the search data. It accepts the +following arguments:: + + ``--filename``: + If provided, directs output to a file instead of stdout. + ``--using``: + If provided, determines which connection should be used. Default is + ``default``. + +.. warning:: + + This command does NOT update the ``schema.xml`` file for you. You either + have to specify a ``filename`` flag or have to + copy-paste (or redirect) the output to the correct file. Haystack has no + way of knowing where your Solr is setup (or if it's even on the same + machine), hence the manual step. + + +``haystack_info`` +================= + +Provides some basic information about how Haystack is setup and what models it +is handling. It accepts no arguments. Useful when debugging or when using +Haystack-enabled third-party apps. diff --git a/docs/migration_from_1_to_2.rst b/docs/migration_from_1_to_2.rst new file mode 100644 index 0000000..6159e06 --- /dev/null +++ b/docs/migration_from_1_to_2.rst @@ -0,0 +1,285 @@ +.. _ref-migration_from_1_to_2: + +=========================================== +Migrating From Haystack 1.X to Haystack 2.X +=========================================== + +Haystack introduced several backward-incompatible changes in the process of +moving from the 1.X series to the 2.X series. These were done to clean up the +API, to support new features & to clean up problems in 1.X. At a high level, +they consisted of: + +* The removal of ``SearchSite`` & ``haystack.site``. +* The removal of ``handle_registrations`` & ``autodiscover``. +* The addition of multiple index support. +* The addition of ``SignalProcessors`` & the removal of ``RealTimeSearchIndex``. +* The removal/renaming of various settings. + +This guide will help you make the changes needed to be compatible with Haystack +2.X. + + +Settings +======== + +Most prominently, the old way of specifying a backend & its settings has changed +to support the multiple index feature. 
A complete Haystack 1.X example might +look like:: + + HAYSTACK_SEARCH_ENGINE = 'solr' + HAYSTACK_SOLR_URL = 'http://localhost:9001/solr/default' + HAYSTACK_SOLR_TIMEOUT = 60 * 5 + HAYSTACK_INCLUDE_SPELLING = True + HAYSTACK_BATCH_SIZE = 100 + + # Or... + HAYSTACK_SEARCH_ENGINE = 'whoosh' + HAYSTACK_WHOOSH_PATH = '/home/search/whoosh_index' + HAYSTACK_WHOOSH_STORAGE = 'file' + HAYSTACK_WHOOSH_POST_LIMIT = 128 * 1024 * 1024 + HAYSTACK_INCLUDE_SPELLING = True + HAYSTACK_BATCH_SIZE = 100 + + # Or... + HAYSTACK_SEARCH_ENGINE = 'xapian' + HAYSTACK_XAPIAN_PATH = '/home/search/xapian_index' + HAYSTACK_INCLUDE_SPELLING = True + HAYSTACK_BATCH_SIZE = 100 + +In Haystack 2.X, you can now supply as many backends as you like, so all of the +above settings can now be active at the same time. A translated set of settings +would look like:: + + HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.solr_backend.SolrEngine', + 'URL': 'http://localhost:9001/solr/default', + 'TIMEOUT': 60 * 5, + 'INCLUDE_SPELLING': True, + 'BATCH_SIZE': 100, + }, + 'autocomplete': { + 'ENGINE': 'haystack.backends.whoosh_backend.WhooshEngine', + 'PATH': '/home/search/whoosh_index', + 'STORAGE': 'file', + 'POST_LIMIT': 128 * 1024 * 1024, + 'INCLUDE_SPELLING': True, + 'BATCH_SIZE': 100, + }, + 'slave': { + 'ENGINE': 'xapian_backend.XapianEngine', + 'PATH': '/home/search/xapian_index', + 'INCLUDE_SPELLING': True, + 'BATCH_SIZE': 100, + }, + } + +You are required to have at least one connection listed within +``HAYSTACK_CONNECTIONS``, it must be named ``default`` & it must have a valid +``ENGINE`` within it. Bare minimum looks like:: + + HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.simple_backend.SimpleEngine' + } + } + +The key for each backend is an identifier you use to describe the backend within +your app. You should refer to the :ref:`ref-multiple_index` documentation for +more information on using the new multiple indexes & routing features. + +Also note that the ``ENGINE`` setting has changed from a lowercase "short name" +of the engine to a full path to a new ``Engine`` class within the backend. +Available options are: + +* ``haystack.backends.solr_backend.SolrEngine`` +* ``haystack.backends.whoosh_backend.WhooshEngine`` +* ``haystack.backends.simple_backend.SimpleEngine`` + +Additionally, the following settings were outright removed & will generate +an exception if found: + +* ``HAYSTACK_SITECONF`` - Remove this setting & the file it pointed to. +* ``HAYSTACK_ENABLE_REGISTRATIONS`` +* ``HAYSTACK_INCLUDE_SPELLING`` + + +Backends +======== + +The ``dummy`` backend was outright removed from Haystack, as it served very +little use after the ``simple`` (pure-ORM-powered) backend was introduced. + +If you wrote a custom backend, please refer to the "Custom Backends" section +below. + + +Indexes +======= + +The other major changes affect the ``SearchIndex`` class. As the concept of +``haystack.site`` & ``SearchSite`` are gone, you'll need to modify your indexes. 
+ +A Haystack 1.X index might've looked like:: + + import datetime + from haystack.indexes import * + from haystack import site + from myapp.models import Note + + + class NoteIndex(SearchIndex): + text = CharField(document=True, use_template=True) + author = CharField(model_attr='user') + pub_date = DateTimeField(model_attr='pub_date') + + def get_queryset(self): + """Used when the entire index for model is updated.""" + return Note.objects.filter(pub_date__lte=datetime.datetime.now()) + + + site.register(Note, NoteIndex) + +A converted Haystack 2.X index should look like:: + + import datetime + from haystack import indexes + from myapp.models import Note + + + class NoteIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + author = indexes.CharField(model_attr='user') + pub_date = indexes.DateTimeField(model_attr='pub_date') + + def get_model(self): + return Note + + def index_queryset(self, using=None): + """Used when the entire index for model is updated.""" + return self.get_model().objects.filter(pub_date__lte=datetime.datetime.now()) + +Note the import on ``site`` & the registration statements are gone. Newly added +are is the ``NoteIndex.get_model`` method. This is a **required** method & +should simply return the ``Model`` class the index is for. + +There's also a new, additional class added to the ``class`` definition. The +``indexes.Indexable`` class is a simple mixin that serves to identify the +classes Haystack should automatically discover & use. If you have a custom +base class (say ``QueuedSearchIndex``) that other indexes inherit from, simply +leave the ``indexes.Indexable`` off that declaration & Haystack won't try to +use it. + +Additionally, the name of the ``document=True`` field is now enforced to be +``text`` across all indexes. If you need it named something else, you should +set the ``HAYSTACK_DOCUMENT_FIELD`` setting. For example:: + + HAYSTACK_DOCUMENT_FIELD = 'pink_polka_dot' + +Finally, the ``index_queryset`` method should supplant the ``get_queryset`` +method. This was present in the Haystack 1.2.X series (with a deprecation warning +in 1.2.4+) but has been removed in Haystack v2. + +Finally, if you were unregistering other indexes before, you should make use of +the new ``EXCLUDED_INDEXES`` setting available in each backend's settings. It +should be a list of strings that contain the Python import path to the indexes +that should not be loaded & used. For example:: + + HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.solr_backend.SolrEngine', + 'URL': 'http://localhost:9001/solr/default', + 'EXCLUDED_INDEXES': [ + # Imagine that these indexes exist. They don't. + 'django.contrib.auth.search_indexes.UserIndex', + 'third_party_blog_app.search_indexes.EntryIndex', + ] + } + } + +This allows for reliable swapping of the index that handles a model without +relying on correct import order. + + +Removal of ``RealTimeSearchIndex`` +================================== + +Use of the ``haystack.indexes.RealTimeSearchIndex`` is no longer valid. It has +been removed in favor of ``RealtimeSignalProcessor``. To migrate, first change +the inheritance of all your ``RealTimeSearchIndex`` subclasses to use +``SearchIndex`` instead:: + + # Old. + class MySearchIndex(indexes.RealTimeSearchIndex, indexes.Indexable): + # ... + + + # New. + class MySearchIndex(indexes.SearchIndex, indexes.Indexable): + # ... 
+ +Then update your settings to enable use of the ``RealtimeSignalProcessor``:: + + HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor' + + +Done! +===== + +For most basic uses of Haystack, this is all that is necessary to work with +Haystack 2.X. You should rebuild your index if needed & test your new setup. + + +Advanced Uses +============= + +Swapping Backend +---------------- + +If you were manually swapping the ``SearchQuery`` or ``SearchBackend`` being +used by ``SearchQuerySet`` in the past, it's now preferable to simply setup +another connection & use the ``SearchQuerySet.using`` method to select that +connection instead. + +Also, if you were manually instantiating ``SearchBackend`` or ``SearchQuery``, +it's now preferable to rely on the connection's engine to return the right +thing. For example:: + + from haystack import connections + backend = connections['default'].get_backend() + query = connections['default'].get_query() + + +Custom Backends +--------------- + +If you had written a custom ``SearchBackend`` and/or custom ``SearchQuery``, +there's a little more work needed to be Haystack 2.X compatible. + +You should, but don't have to, rename your ``SearchBackend`` & ``SearchQuery`` +classes to be more descriptive/less collide-y. For example, +``solr_backend.SearchBackend`` became ``solr_backend.SolrSearchBackend``. This +prevents non-namespaced imports from stomping on each other. + +You need to add a new class to your backend, subclassing ``BaseEngine``. This +allows specifying what ``backend`` & ``query`` should be used on a connection +with less duplication/naming trickery. It goes at the bottom of the file (so +that the classes are defined above it) and should look like:: + + from haystack.backends import BaseEngine + from haystack.backends.solr_backend import SolrSearchQuery + + # Code then... + + class MyCustomSolrEngine(BaseEngine): + # Use our custom backend. + backend = MySolrBackend + # Use the built-in Solr query. + query = SolrSearchQuery + +Your ``HAYSTACK_CONNECTIONS['default']['ENGINE']`` should then point to the +full Python import path to your new ``BaseEngine`` subclass. + +Finally, you will likely have to adjust the ``SearchBackend.__init__`` & +``SearchQuery.__init__``, as they have changed significantly. Please refer to +the commits for those backends. diff --git a/docs/multiple_index.rst b/docs/multiple_index.rst new file mode 100644 index 0000000..c51b734 --- /dev/null +++ b/docs/multiple_index.rst @@ -0,0 +1,201 @@ +.. _ref-multiple_index: + +================ +Multiple Indexes +================ + +Much like Django's `multiple database support`_, Haystack has "multiple index" +support. This allows you to talk to several different engines at the same time. +It enables things like master-slave setups, multiple language indexing, +separate indexes for general search & autocomplete as well as other options. + +.. _`multiple database support`: http://docs.djangoproject.com/en/1.3/topics/db/multi-db/ + + +Specifying Available Connections +================================ + +You can supply as many backends as you like, each with a descriptive name. 
A +complete setup that accesses all backends might look like:: + + HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.solr_backend.SolrEngine', + 'URL': 'http://localhost:9001/solr/default', + 'TIMEOUT': 60 * 5, + 'INCLUDE_SPELLING': True, + 'BATCH_SIZE': 100, + 'SILENTLY_FAIL': True, + }, + 'autocomplete': { + 'ENGINE': 'haystack.backends.whoosh_backend.WhooshEngine', + 'PATH': '/home/search/whoosh_index', + 'STORAGE': 'file', + 'POST_LIMIT': 128 * 1024 * 1024, + 'INCLUDE_SPELLING': True, + 'BATCH_SIZE': 100, + 'SILENTLY_FAIL': True, + }, + 'slave': { + 'ENGINE': 'xapian_backend.XapianEngine', + 'PATH': '/home/search/xapian_index', + 'INCLUDE_SPELLING': True, + 'BATCH_SIZE': 100, + 'SILENTLY_FAIL': True, + }, + 'db': { + 'ENGINE': 'haystack.backends.simple_backend.SimpleEngine', + 'SILENTLY_FAIL': True, + } + } + +You are required to have at least one connection listed within +``HAYSTACK_CONNECTIONS``, it must be named ``default`` & it must have a valid +``ENGINE`` within it. + + +Management Commands +=================== + +All management commands that manipulate data use **ONLY** one connection at a +time. By default, they use the ``default`` index but accept a ``--using`` flag +to specify a different connection. For example:: + + ./manage.py rebuild_index --noinput --using=whoosh + + +Automatic Routing +================= + +To make the selection of the correct index easier, Haystack (like Django) has +the concept of "routers". All provided routers are checked whenever a read or +write happens, stopping at the first router that knows how to handle it. + +Haystack ships with a ``DefaultRouter`` enabled. It looks like:: + + class DefaultRouter(BaseRouter): + def for_read(self, **hints): + return DEFAULT_ALIAS + + def for_write(self, **hints): + return DEFAULT_ALIAS + +On a read (when a search query is executed), the ``DefaultRouter.for_read`` +method is checked & returns the ``DEFAULT_ALIAS`` (which is ``default``), +telling whatever requested it that it should perform the query against the +``default`` connection. The same process is followed for writes. + +If the ``for_read`` or ``for_write`` method returns ``None``, that indicates +that the current router can't handle the data. The next router is then checked. + +The ``hints`` passed can be anything that helps the router make a decision. This +data should always be considered optional & be guarded against. At current, +``for_write`` receives an ``index`` option (pointing to the ``SearchIndex`` +calling it) while ``for_read`` may receive ``models`` (being a list of ``Model`` +classes the ``SearchQuerySet`` may be looking at). + +You may provide as many routers as you like by overriding the +``HAYSTACK_ROUTERS`` setting. 
For example:: + + HAYSTACK_ROUTERS = ['myapp.routers.MasterRouter', 'myapp.routers.SlaveRouter', 'haystack.routers.DefaultRouter'] + + +Master-Slave Example +-------------------- + +The ``MasterRouter`` & ``SlaveRouter`` might look like:: + + from haystack import routers + + + class MasterRouter(routers.BaseRouter): + def for_write(self, **hints): + return 'master' + + def for_read(self, **hints): + return None + + + class SlaveRouter(routers.BaseRouter): + def for_write(self, **hints): + return None + + def for_read(self, **hints): + return 'slave' + +The observant might notice that since the methods don't overlap, this could be +combined into one ``Router`` like so:: + + from haystack import routers + + + class MasterSlaveRouter(routers.BaseRouter): + def for_write(self, **hints): + return 'master' + + def for_read(self, **hints): + return 'slave' + + +Manually Selecting +================== + +There may be times when automatic selection of the correct index is undesirable, +such as when fixing erroneous data in an index or when you know exactly where +data should be located. + +For this, the ``SearchQuerySet`` class allows for manually selecting the index +via the ``SearchQuerySet.using`` method:: + + from haystack.query import SearchQuerySet + + # Uses the routers' opinion. + sqs = SearchQuerySet().auto_query('banana') + + # Forces the default. + sqs = SearchQuerySet().using('default').auto_query('banana') + + # Forces the slave connection (presuming it was setup). + sqs = SearchQuerySet().using('slave').auto_query('banana') + +.. warning:: + + Note that the models a ``SearchQuerySet`` is trying to pull from must all come + from the same index. Haystack is not able to combine search queries against + different indexes. + + +Custom Index Selection +====================== + +If a specific backend has been selected, the ``SearchIndex.index_queryset`` and +``SearchIndex.read_queryset`` will receive the backend name, giving indexes the +opportunity to customize the returned queryset. + +For example, a site which uses separate indexes for recent items and older +content might define ``index_queryset`` to filter the items based on date:: + + def index_queryset(self, using=None): + qs = Note.objects.all() + archive_limit = datetime.datetime.now() - datetime.timedelta(days=90) + + if using == "archive": + return qs.filter(pub_date__lte=archive_limit) + else: + return qs.filter(pub_date__gte=archive_limit) + + +Multi-lingual Content +--------------------- + +Most search engines require you to set the language at the index level. For +example, a multi-lingual site using Solr can use `multiple cores `_ and corresponding Haystack +backends using the language name. Under this scenario, queries are simple:: + + sqs = SearchQuerySet.using(lang).auto_query(…) + +During index updates, the Index's ``index_queryset`` method will need to filter +the items to avoid sending the wrong content to the search engine:: + + def index_queryset(self, using=None): + return Post.objects.filter(language=using) diff --git a/docs/other_apps.rst b/docs/other_apps.rst new file mode 100644 index 0000000..e9751ff --- /dev/null +++ b/docs/other_apps.rst @@ -0,0 +1,98 @@ +.. _ref-other_apps: + +============================= +Haystack-Related Applications +============================= + +Sub Apps +======== + +These are apps that build on top of the infrastructure provided by Haystack. +Useful for essentially extending what Haystack can do. 
+ +queued_search +------------- + +http://github.com/toastdriven/queued_search (2.X compatible) + +Provides a queue-based setup as an alternative to ``RealtimeSignalProcessor`` or +constantly running the ``update_index`` command. Useful for high-load, short +update time situations. + +celery-haystack +--------------- + +https://github.com/jezdez/celery-haystack (1.X and 2.X compatible) + +Also provides a queue-based setup, this time centered around Celery. Useful +for keeping the index fresh per model instance or with the included task +to call the ``update_index`` management command instead. + +haystack-rqueue +--------------- + +https://github.com/mandx/haystack-rqueue (2.X compatible) + +Also provides a queue-based setup, this time centered around RQ. Useful +for keeping the index fresh using ``./manage.py rqworker``. + +django-celery-haystack +---------------------- + +https://github.com/mixcloud/django-celery-haystack-SearchIndex + +Another queue-based setup, also around Celery. Useful +for keeping the index fresh. + +saved_searches +-------------- + +http://github.com/toastdriven/saved_searches (2.X compatible) + +Adds personalization to search. Retains a history of queries run by the various +users on the site (including anonymous users). This can be used to present the +user with their search history and provide most popular/most recent queries +on the site. + +saved-search +------------ + +https://github.com/DirectEmployers/saved-search + +An alternate take on persisting user searches, this has a stronger focus +on locale-based searches as well as further integration. + +haystack-static-pages +--------------------- + +http://github.com/trapeze/haystack-static-pages + +Provides a simple way to index flat (non-model-based) content on your site. +By using the management command that comes with it, it can crawl all pertinent +pages on your site and add them to search. + +django-tumbleweed +----------------- + +http://github.com/mcroydon/django-tumbleweed + +Provides a tumblelog-like view to any/all Haystack-enabled models on your +site. Useful for presenting date-based views of search data. Attempts to avoid +the database completely where possible. + + +Haystack-Enabled Apps +===================== + +These are reusable apps that ship with ``SearchIndexes``, suitable for quick +integration with Haystack. + +* django-faq (freq. asked questions app) - http://github.com/benspaulding/django-faq +* django-essays (blog-like essay app) - http://github.com/bkeating/django-essays +* gtalug (variety of apps) - http://github.com/myles/gtalug +* sciencemuseum (science museum open data) - http://github.com/simonw/sciencemuseum +* vz-wiki (wiki) - http://github.com/jobscry/vz-wiki +* ffmff (events app) - http://github.com/stefreak/ffmff +* Dinette (forums app) - http://github.com/uswaretech/Dinette +* fiftystates_site (site) - http://github.com/sunlightlabs/fiftystates_site +* Open-Knesset (site) - http://github.com/ofri/Open-Knesset diff --git a/docs/python3.rst b/docs/python3.rst new file mode 100644 index 0000000..310ced2 --- /dev/null +++ b/docs/python3.rst @@ -0,0 +1,47 @@ +.. _ref-python3: + +================ +Python 3 Support +================ + +As of Haystack v2.1.0, it has been ported to support both Python 2 & Python 3 +within the same codebase. This builds on top of what `six`_ & `Django`_ provide. + +No changes are required for anyone running an existing Haystack +installation. 
The API is completely backward-compatible, so you should be able +to run your existing software without modification. + +Virtually all tests pass under both Python 2 & 3, with a small number of +expected failures under Python (typically related to ordering, see below). + +.. _`six`: http://pythonhosted.org/six/ +.. _`Django`: https://docs.djangoproject.com/en/1.5/topics/python3/#str-and-unicode-methods + + +Supported Backends +================== + +The following backends are fully supported under Python 3. However, you may +need to update these dependencies if you have a pre-existing setup. + +* Solr (pysolr>=3.1.0) +* Elasticsearch + + +Notes +===== + +Testing +------- + +If you were testing things such as the query generated by a given +``SearchQuerySet`` or how your forms would render, under Python 3.3.2+, +`hash randomization`_ is in effect, which means that the ordering of +dictionaries is no longer consistent, even on the same platform. + +Haystack took the approach of abandoning making assertions about the entire +structure. Instead, we either simply assert that the new object contains the +right things or make a call to ``sorted(...)`` around it to ensure order. It is +recommended you take a similar approach. + +.. _`hash randomization`: http://docs.python.org/3/whatsnew/3.3.html#builtin-functions-and-types diff --git a/docs/rich_content_extraction.rst b/docs/rich_content_extraction.rst new file mode 100644 index 0000000..a23c85d --- /dev/null +++ b/docs/rich_content_extraction.rst @@ -0,0 +1,68 @@ +.. _ref-rich_content_extraction: + +======================= +Rich Content Extraction +======================= + +For some projects it is desirable to index text content which is stored in +structured files such as PDFs, Microsoft Office documents, images, etc. +Currently only Solr's `ExtractingRequestHandler`_ is directly supported by +Haystack but the approach below could be used with any backend which supports +this feature. + +.. _`ExtractingRequestHandler`: http://wiki.apache.org/solr/ExtractingRequestHandler + +Extracting Content +================== + +:meth:`SearchBackend.extract_file_contents` accepts a file or file-like object +and returns a dictionary containing two keys: ``metadata`` and ``contents``. The +``contents`` value will be a string containing all of the text which the backend +managed to extract from the file contents. ``metadata`` will always be a +dictionary but the keys and values will vary based on the underlying extraction +engine and the type of file provided. + +Indexing Extracted Content +========================== + +Generally you will want to include the extracted text in your main document +field along with everything else specified in your search template. This example +shows how to override a hypothetical ``FileIndex``'s ``prepare`` method to +include the extract content along with information retrieved from the database:: + + def prepare(self, obj): + data = super(FileIndex, self).prepare(obj) + + # This could also be a regular Python open() call, a StringIO instance + # or the result of opening a URL. 
Note that due to a library limitation + # file_obj must have a .name attribute even if you need to set one + # manually before calling extract_file_contents: + file_obj = obj.the_file.open() + + extracted_data = self.backend.extract_file_contents(file_obj) + + # Now we'll finally perform the template processing to render the + # text field with *all* of our metadata visible for templating: + t = loader.select_template(('search/indexes/myapp/file_text.txt', )) + data['text'] = t.render(Context({'object': obj, + 'extracted': extracted_data})) + + return data + +This allows you to insert the extracted text at the appropriate place in your +template, modified or intermixed with database content as appropriate: + +.. code-block:: html+django + + {{ object.title }} + {{ object.owner.name }} + + … + + {% for k, v in extracted.metadata.items %} + {% for val in v %} + {{ k }}: {{ val|safe }} + {% endfor %} + {% endfor %} + + {{ extracted.contents|striptags|safe }} \ No newline at end of file diff --git a/docs/running_tests.rst b/docs/running_tests.rst new file mode 100644 index 0000000..6b928f7 --- /dev/null +++ b/docs/running_tests.rst @@ -0,0 +1,70 @@ +.. _ref-running-tests: + +============= +Running Tests +============= + +Everything +========== + +The simplest way to get up and running with Haystack's tests is to run:: + + python setup.py test + +This installs all of the backend libraries & all dependencies for getting the +tests going and runs the tests. You will still have to setup search servers +(for running Solr tests, the spatial Solr tests & the Elasticsearch tests). + + +Cherry-Picked +============= + +If you'd rather not run all the tests, run only the backends you need since +tests for backends that are not running will be skipped. + +``Haystack`` is maintained with all tests passing at all times, so if you +receive any errors during testing, please check your setup and file a report if +the errors persist. + +To run just a portion of the tests you can use the script ``run_tests.py`` and +just specify the files or directories you wish to run, for example:: + + cd test_haystack + ./run_tests.py whoosh_tests test_loading.py + +The ``run_tests.py`` script is just a tiny wrapper around the nose_ library and +any options you pass to it will be passed on; including ``--help`` to get a +list of possible options:: + + cd test_haystack + ./run_tests.py --help + +.. _nose: https://nose.readthedocs.org/en/latest/ + +Configuring Solr +================ + +Haystack assumes that you have a Solr server running on port ``9001`` which +uses the schema and configuration provided in the +``test_haystack/solr_tests/server/`` directory. For convenience, a script is +provided which will download, configure and start a test Solr server:: + + test_haystack/solr_tests/server/start-solr-test-server.sh + +If no server is found all solr-related tests will be skipped. + +Configuring Elasticsearch +========================= + +The test suite will try to connect to Elasticsearch on port ``9200``. If no +server is found all elasticsearch tests will be skipped. Note that the tests +are destructive - during the teardown phase they will wipe the cluster clean so +make sure you don't run them against an instance with data you wish to keep. + +If you want to run the geo-django tests you may need to review the +`GeoDjango GEOS and GDAL settings`_ before running these commands:: + + cd test_haystack + ./run_tests.py elasticsearch_tests + +.. 
_GeoDjango GEOS and GDAL settings: https://docs.djangoproject.com/en/1.7/ref/contrib/gis/install/geolibs/#geos-library-path diff --git a/docs/searchbackend_api.rst b/docs/searchbackend_api.rst new file mode 100644 index 0000000..d077fbf --- /dev/null +++ b/docs/searchbackend_api.rst @@ -0,0 +1,124 @@ +.. _ref-searchbackend-api: + +===================== +``SearchBackend`` API +===================== + +.. class:: SearchBackend(connection_alias, **connection_options) + +The ``SearchBackend`` class handles interaction directly with the backend. The +search query it performs is usually fed to it from a ``SearchQuery`` class that +has been built for that backend. + +This class must be at least partially implemented on a per-backend basis and +is usually accompanied by a ``SearchQuery`` class within the same module. + +Unless you are writing a new backend, it is unlikely you need to directly +access this class. + + +Method Reference +================ + +``update`` +---------- + +.. method:: SearchBackend.update(self, index, iterable) + +Updates the backend when given a ``SearchIndex`` and a collection of +documents. + +This method MUST be implemented by each backend, as it will be highly +specific to each one. + +``remove`` +---------- + +.. method:: SearchBackend.remove(self, obj_or_string) + +Removes a document/object from the backend. Can be either a model +instance or the identifier (i.e. ``app_name.model_name.id``) in the +event the object no longer exists. + +This method MUST be implemented by each backend, as it will be highly +specific to each one. + +``clear`` +--------- + +.. method:: SearchBackend.clear(self, models=[]) + +Clears the backend of all documents/objects for a collection of models. + +This method MUST be implemented by each backend, as it will be highly +specific to each one. + +``search`` +---------- + +.. method:: SearchBackend.search(self, query_string, sort_by=None, start_offset=0, end_offset=None, fields='', highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None, spelling_query=None, limit_to_registered_models=None, result_class=None, **kwargs) + +Takes a query to search on and returns a dictionary. + +The query should be a string that is appropriate syntax for the backend. + +The returned dictionary should contain the keys 'results' and 'hits'. +The 'results' value should be an iterable of populated ``SearchResult`` +objects. The 'hits' should be an integer count of the number of matched +results the search backend found. + +This method MUST be implemented by each backend, as it will be highly +specific to each one. + +``extract_file_contents`` +------------------------- + +.. method:: SearchBackend.extract_file_contents(self, file_obj) + +Perform text extraction on the provided file or file-like object. Returns either +None or a dictionary containing the keys ``contents`` and ``metadata``. The +``contents`` field will always contain the extracted text content returned by +the underlying search engine but ``metadata`` may vary considerably based on +the backend and the input file. + +``prep_value`` +-------------- + +.. method:: SearchBackend.prep_value(self, value) + +Hook to give the backend a chance to prep an attribute value before +sending it to the search engine. + +By default, just force it to unicode. + +``more_like_this`` +------------------ + +.. 
method:: SearchBackend.more_like_this(self, model_instance, additional_query_string=None, result_class=None) + +Takes a model object and returns results the backend thinks are similar. + +This method MUST be implemented by each backend, as it will be highly +specific to each one. + +``build_schema`` +---------------- + +.. method:: SearchBackend.build_schema(self, fields) + +Takes a dictionary of fields and returns schema information. + +This method MUST be implemented by each backend, as it will be highly +specific to each one. + +``build_models_list`` +--------------------- + +.. method:: SearchBackend.build_models_list(self) + +Builds a list of models for searching. + +The ``search`` method should use this and the ``django_ct`` field to +narrow the results (unless the user indicates not to). This helps ignore +any results that are not currently handled models and ensures +consistent caching. diff --git a/docs/searchfield_api.rst b/docs/searchfield_api.rst new file mode 100644 index 0000000..bf8466b --- /dev/null +++ b/docs/searchfield_api.rst @@ -0,0 +1,262 @@ +.. _ref-searchfield-api: + +=================== +``SearchField`` API +=================== + +.. class:: SearchField + +The ``SearchField`` and its subclasses provides a way to declare what data +you're interested in indexing. They are used with ``SearchIndexes``, much like +``forms.*Field`` are used within forms or ``models.*Field`` within models. + +They provide both the means for storing data in the index, as well as preparing +the data before it's placed in the index. Haystack uses all fields from all +``SearchIndex`` classes to determine what the engine's index schema ought to +look like. + +In practice, you'll likely never actually use the base ``SearchField``, as the +subclasses are much better at handling real data. + + +Subclasses +========== + +Included with Haystack are the following field types: + +* ``BooleanField`` +* ``CharField`` +* ``DateField`` +* ``DateTimeField`` +* ``DecimalField`` +* ``EdgeNgramField`` +* ``FloatField`` +* ``IntegerField`` +* ``LocationField`` +* ``MultiValueField`` +* ``NgramField`` + +And equivalent faceted versions: + +* ``FacetBooleanField`` +* ``FacetCharField`` +* ``FacetDateField`` +* ``FacetDateTimeField`` +* ``FacetDecimalField`` +* ``FacetFloatField`` +* ``FacetIntegerField`` +* ``FacetMultiValueField`` + +.. note:: + + There is no faceted variant of the n-gram fields. Because of how the engine + generates n-grams, faceting on these field types (``NgramField`` & + ``EdgeNgram``) would make very little sense. + + +Usage +===== + +While ``SearchField`` objects can be used on their own, they're generally used +within a ``SearchIndex``. You use them in a declarative manner, just like +fields in ``django.forms.Form`` or ``django.db.models.Model`` objects. For +example:: + + from haystack import indexes + from myapp.models import Note + + + class NoteIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + author = indexes.CharField(model_attr='user') + pub_date = indexes.DateTimeField(model_attr='pub_date') + + def get_model(self): + return Note + +This will hook up those fields with the index and, when updating a ``Model`` +object, pull the relevant data out and prepare it for storage in the index. + + +Field Options +============= + +``default`` +----------- + +.. attribute:: SearchField.default + +Provides a means for specifying a fallback value in the event that no data is +found for the field. Can be either a value or a callable. 
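+
+Example (a short sketch; the ``category`` and ``indexed_on`` fields here are
+purely illustrative and not part of the ``Note`` examples used elsewhere in
+these docs)::
+
+    import datetime
+
+    from haystack import indexes
+    from myapp.models import Note
+
+
+    class NoteIndex(indexes.SearchIndex, indexes.Indexable):
+        text = indexes.CharField(document=True, use_template=True)
+        # A literal fallback, used when no data is found for this field.
+        category = indexes.CharField(model_attr='category', default='uncategorized')
+        # As noted above, a callable may also be supplied as the fallback.
+        indexed_on = indexes.DateTimeField(default=datetime.datetime.now)
+
+        def get_model(self):
+            return Note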
+ +``document`` +------------ + +.. attribute:: SearchField.document + +A boolean flag that indicates which of the fields in the ``SearchIndex`` ought +to be the primary field for searching within. Default is ``False``. + +.. note:: + + Only one field can be marked as the ``document=True`` field, so you should + standardize this name and the format of the field between all of your + ``SearchIndex`` classes. + +``indexed`` +----------- + +.. attribute:: SearchField.indexed + +A boolean flag for indicating whether or not the data from this field will +be searchable within the index. Default is ``True``. + +The companion of this option is ``stored``. + +``index_fieldname`` +------------------- + +.. attribute:: SearchField.index_fieldname + +The ``index_fieldname`` option allows you to force the name of the field in the +index. This does not change how Haystack refers to the field. This is useful +when using Solr's dynamic attributes or when integrating with other external +software. + +Default is variable name of the field within the ``SearchIndex``. + +``model_attr`` +-------------- + +.. attribute:: SearchField.model_attr + +The ``model_attr`` option is a shortcut for preparing data. Rather than having +to manually fetch data out of a ``Model``, ``model_attr`` allows you to specify +a string that will automatically pull data out for you. For example:: + + # Automatically looks within the model and populates the field with + # the ``last_name`` attribute. + author = CharField(model_attr='last_name') + +It also handles callables:: + + # On a ``User`` object, pulls the full name as pieced together by the + # ``get_full_name`` method. + author = CharField(model_attr='get_full_name') + +And can look through relations:: + + # Pulls the ``bio`` field from a ``UserProfile`` object that has a + # ``OneToOneField`` relationship to a ``User`` object. + biography = CharField(model_attr='user__profile__bio') + +``null`` +-------- + +.. attribute:: SearchField.null + +A boolean flag for indicating whether or not it's permissible for the field +not to contain any data. Default is ``False``. + +.. note:: + + Unlike Django's database layer, which injects a ``NULL`` into the database + when a field is marked nullable, ``null=True`` will actually exclude that + field from being included with the document. This is more efficient for the + search engine to deal with. + +``stored`` +---------- + +.. attribute:: SearchField.stored + +A boolean flag for indicating whether or not the data from this field will +be stored within the index. Default is ``True``. + +This is useful for pulling data out of the index along with the search result +in order to save on hits to the database. + +The companion of this option is ``indexed``. + +``template_name`` +----------------- + +.. attribute:: SearchField.template_name + +Allows you to override the name of the template to use when preparing data. By +default, the data templates for fields are located within your ``TEMPLATE_DIRS`` +under a path like ``search/indexes/{app_label}/{model_name}_{field_name}.txt``. +This option lets you override that path (though still within ``TEMPLATE_DIRS``). + +Example:: + + bio = CharField(use_template=True, template_name='myapp/data/bio.txt') + +You can also provide a list of templates, as ``loader.select_template`` is used +under the hood. + +Example:: + + bio = CharField(use_template=True, template_name=['myapp/data/bio.txt', 'myapp/bio.txt', 'bio.txt']) + + +``use_template`` +---------------- + +.. 
attribute:: SearchField.use_template + +A boolean flag for indicating whether or not a field should prepare its data +via a data template or not. Default is False. + +Data templates are extremely useful, as they let you easily tie together +different parts of the ``Model`` (and potentially related models). This leads +to better search results with very little effort. + + + +Method Reference +================ + +``__init__`` +------------ + +.. method:: SearchField.__init__(self, model_attr=None, use_template=False, template_name=None, document=False, indexed=True, stored=True, faceted=False, default=NOT_PROVIDED, null=False, index_fieldname=None, facet_class=None, boost=1.0, weight=None) + +Instantiates a fresh ``SearchField`` instance. + +``has_default`` +--------------- + +.. method:: SearchField.has_default(self) + +Returns a boolean of whether this field has a default value. + +``prepare`` +----------- + +.. method:: SearchField.prepare(self, obj) + +Takes data from the provided object and prepares it for storage in the +index. + +``prepare_template`` +-------------------- + +.. method:: SearchField.prepare_template(self, obj) + +Flattens an object for indexing. + +This loads a template +(``search/indexes/{app_label}/{model_name}_{field_name}.txt``) and +returns the result of rendering that template. ``object`` will be in +its context. + +``convert`` +----------- + +.. method:: SearchField.convert(self, value) + +Handles conversion between the data found and the type of the field. + +Extending classes should override this method and provide correct +data coercion. diff --git a/docs/searchindex_api.rst b/docs/searchindex_api.rst new file mode 100644 index 0000000..8263f80 --- /dev/null +++ b/docs/searchindex_api.rst @@ -0,0 +1,618 @@ +.. _ref-searchindex-api: + +=================== +``SearchIndex`` API +=================== + +.. class:: SearchIndex() + +The ``SearchIndex`` class allows the application developer a way to provide data to +the backend in a structured format. Developers familiar with Django's ``Form`` +or ``Model`` classes should find the syntax for indexes familiar. + +This class is arguably the most important part of integrating Haystack into your +application, as it has a large impact on the quality of the search results and +how easy it is for users to find what they're looking for. Care and effort +should be put into making your indexes the best they can be. + + +Quick Start +=========== + +For the impatient:: + + import datetime + from haystack import indexes + from myapp.models import Note + + + class NoteIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + author = indexes.CharField(model_attr='user') + pub_date = indexes.DateTimeField(model_attr='pub_date') + + def get_model(self): + return Note + + def index_queryset(self, using=None): + "Used when the entire index for model is updated." + return self.get_model().objects.filter(pub_date__lte=datetime.datetime.now()) + + +Background +========== + +Unlike relational databases, most search engines supported by Haystack are +primarily document-based. They focus on a single text blob which they tokenize, +analyze and index. When searching, this field is usually the primary one that +is searched. + +Further, the schema used by most engines is the same for all types of data +added, unlike a relational database that has a table schema for each chunk of +data. 
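+
+For instance, a single indexed record for the ``Note`` examples used throughout
+these docs can be pictured as a flat mapping of field names to values (an
+illustrative sketch only; the exact payload varies by backend)::
+
+    {
+        'id': 'myapp.note.1',
+        'django_ct': 'myapp.note',
+        'django_id': '1',
+        'text': 'The full blob of text that gets tokenized, analyzed and searched...',
+        'author': 'daniel',
+        'pub_date': '2011-07-04T00:00:00',
+    }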
+
+It may be helpful to think of your search index as something closer to a
+key-value store instead of imagining it in terms of an RDBMS.
+
+
+Why Create Fields?
+------------------
+
+Despite being primarily document-driven, most search engines also support the
+ability to associate other relevant data with the indexed document. These
+attributes can be mapped through the use of fields within Haystack.
+
+Common uses include storing pertinent data, categorizations of the
+document, author information and related data. By adding fields for these pieces
+of data, you provide a means to further narrow/filter search terms. This can
+be useful from either a UI perspective (a better advanced search form) or from a
+developer standpoint (section-dependent search, off-loading certain tasks to
+search, et cetera).
+
+.. warning::
+
+    Haystack reserves the following field names for internal use: ``id``,
+    ``django_ct``, ``django_id`` & ``content``. The ``name`` & ``type`` names
+    used to be reserved but no longer are.
+
+    You can override these field names using the ``HAYSTACK_ID_FIELD``,
+    ``HAYSTACK_DJANGO_CT_FIELD`` & ``HAYSTACK_DJANGO_ID_FIELD`` settings if needed.
+
+
+Significance Of ``document=True``
+---------------------------------
+
+Most search engines that were candidates for inclusion in Haystack had a
+central concept of a document that they indexed. These documents form a corpus
+within which to primarily search. Because this ideal is so central and most of
+Haystack is designed to have pluggable backends, it is important to ensure that
+all engines have at least a bare minimum of the data they need to function.
+
+As a result, when creating a ``SearchIndex``, at least one field must be marked
+with ``document=True``. This signifies to Haystack that whatever is placed in
+this field while indexing is to be the primary text the search engine indexes.
+The name of this field can be almost anything, but ``text`` is one of the
+more common names used.
+
+
+Stored/Indexed Fields
+---------------------
+
+One shortcoming of search is that you rarely have all or the most
+up-to-date information about an object in the index. As a result, when
+retrieving search results, you will likely have to access the object in the
+database to provide better information.
+
+However, this can also hit the database quite heavily (think
+``.get(pk=result.id)`` per object). If your search is popular, this can lead
+to a big performance hit. There are two ways to prevent this. The first way is
+``SearchQuerySet.load_all``, which tries to group all similar objects and pull
+them through one query instead of many. This still hits the DB and incurs a
+performance penalty.
+
+The other option is to leverage stored fields. By default, all fields in
+Haystack are both indexed (searchable by the engine) and stored (retained by
+the engine and presented in the results). By using a stored field, you can
+store commonly used data in such a way that you don't need to hit the database
+when processing the search result to get more information.
+
+For example, one great way to leverage this is to pre-render an object's
+search result template DURING indexing. You define an additional field, render
+a template with it and it follows the main indexed record into the index. Then,
+when that record comes back as a match for a query, you can simply display the
+contents of that field, which avoids the database hit.
+
+Within ``myapp/search_indexes.py``::
+
+    class NoteIndex(SearchIndex, indexes.Indexable):
+        text = CharField(document=True, use_template=True)
+        author = CharField(model_attr='user')
+        pub_date = DateTimeField(model_attr='pub_date')
+        # Define the additional field.
+        rendered = CharField(use_template=True, indexed=False)
+
+Then, inside a template named ``search/indexes/myapp/note_rendered.txt``::
+

{{ object.title }}

+ +

{{ object.content }}

+ +And finally, in ``search/search.html``:: + + ... + + {% for result in page.object_list %} +
+ {{ result.rendered|safe }} +
+ {% endfor %} + + +Keeping The Index Fresh +======================= + +There are several approaches to keeping the search index in sync with your +database. None are more correct than the others and depending the traffic you +see, the churn rate of your data and what concerns are important to you +(CPU load, how recent, et cetera). + +The conventional method is to use ``SearchIndex`` in combination with cron +jobs. Running a ``./manage.py update_index`` every couple hours will keep your +data in sync within that timeframe and will handle the updates in a very +efficient batch. Additionally, Whoosh (and to a lesser extent Xapian) behaves +better when using this approach. + +Another option is to use ``RealtimeSignalProcessor``, which uses Django's +signals to immediately update the index any time a model is saved/deleted. This +yields a much more current search index at the expense of being fairly +inefficient. Solr & Elasticsearch are the only backends that handles this well +under load, and even then, you should make sure you have the server capacity +to spare. + +A third option is to develop a custom ``QueuedSignalProcessor`` that, much like +``RealtimeSignalProcessor``, uses Django's signals to enqueue messages for +updates/deletes. Then writing a management command to consume these messages +in batches, yielding a nice compromise between the previous two options. + +For more information see :doc:`signal_processors`. + +.. note:: + + Haystack doesn't ship with a ``QueuedSignalProcessor`` largely because there is + such a diversity of lightweight queuing options and that they tend to + polarize developers. Queuing is outside of Haystack's goals (provide good, + powerful search) and, as such, is left to the developer. + + Additionally, the implementation is relatively trivial & there are already + good third-party add-ons for Haystack to enable this. + + +Advanced Data Preparation +========================= + +In most cases, using the `model_attr` parameter on your fields allows you to +easily get data from a Django model to the document in your index, as it handles +both direct attribute access as well as callable functions within your model. + +.. note:: + + The ``model_attr`` keyword argument also can look through relations in + models. So you can do something like ``model_attr='author__first_name'`` + to pull just the first name of the author, similar to some lookups used + by Django's ORM. + +However, sometimes, even more control over what gets placed in your index is +needed. To facilitate this, ``SearchIndex`` objects have a 'preparation' stage +that populates data just before it is indexed. You can hook into this phase in +several ways. + +This should be very familiar to developers who have used Django's ``forms`` +before as it loosely follows similar concepts, though the emphasis here is +less on cleansing data from user input and more on making the data friendly +to the search backend. + +1. ``prepare_FOO(self, object)`` +-------------------------------- + +The most common way to affect a single field's data is to create a +``prepare_FOO`` method (where FOO is the name of the field). As a parameter +to this method, you will receive the instance that is attempting to be indexed. + +.. note:: + + This method is analogous to Django's ``Form.clean_FOO`` methods. + +To keep with our existing example, one use case might be altering the name +inside the ``author`` field to be "firstname lastname ". 
In this case, +you might write the following code:: + + class NoteIndex(SearchIndex, indexes.Indexable): + text = CharField(document=True, use_template=True) + author = CharField(model_attr='user') + pub_date = DateTimeField(model_attr='pub_date') + + def get_model(self): + return Note + + def prepare_author(self, obj): + return "%s <%s>" % (obj.user.get_full_name(), obj.user.email) + +This method should return a single value (or list/tuple/dict) to populate that +field's data upon indexing. Note that this method takes priority over whatever +data may come from the field itself. + +Just like ``Form.clean_FOO``, the field's ``prepare`` runs before the +``prepare_FOO``, allowing you to access ``self.prepared_data``. For example:: + + class NoteIndex(SearchIndex, indexes.Indexable): + text = CharField(document=True, use_template=True) + author = CharField(model_attr='user') + pub_date = DateTimeField(model_attr='pub_date') + + def get_model(self): + return Note + + def prepare_author(self, obj): + # Say we want last name first, the hard way. + author = u'' + + if 'author' in self.prepared_data: + name_bits = self.prepared_data['author'].split() + author = "%s, %s" % (name_bits[-1], ' '.join(name_bits[:-1])) + + return author + +This method is fully function with ``model_attr``, so if there's no convenient +way to access the data you want, this is an excellent way to prepare it:: + + class NoteIndex(SearchIndex, indexes.Indexable): + text = CharField(document=True, use_template=True) + author = CharField(model_attr='user') + categories = MultiValueField() + pub_date = DateTimeField(model_attr='pub_date') + + def get_model(self): + return Note + + def prepare_categories(self, obj): + # Since we're using a M2M relationship with a complex lookup, + # we can prepare the list here. + return [category.id for category in obj.category_set.active().order_by('-created')] + + +2. ``prepare(self, object)`` +---------------------------- + +Each ``SearchIndex`` gets a ``prepare`` method, which handles collecting all +the data. This method should return a dictionary that will be the final data +used by the search backend. + +Overriding this method is useful if you need to collect more than one piece +of data or need to incorporate additional data that is not well represented +by a single ``SearchField``. An example might look like:: + + class NoteIndex(SearchIndex, indexes.Indexable): + text = CharField(document=True, use_template=True) + author = CharField(model_attr='user') + pub_date = DateTimeField(model_attr='pub_date') + + def get_model(self): + return Note + + def prepare(self, object): + self.prepared_data = super(NoteIndex, self).prepare(object) + + # Add in tags (assuming there's a M2M relationship to Tag on the model). + # Note that this would NOT get picked up by the automatic + # schema tools provided by Haystack. + self.prepared_data['tags'] = [tag.name for tag in object.tags.all()] + + return self.prepared_data + +If you choose to use this method, you should make a point to be careful to call +the ``super()`` method before altering the data. Without doing so, you may have +an incomplete set of data populating your indexes. + +This method has the final say in all data, overriding both what the fields +provide as well as any ``prepare_FOO`` methods on the class. + +.. note:: + + This method is roughly analogous to Django's ``Form.full_clean`` and + ``Form.clean`` methods. However, unlike these methods, it is not fired + as the result of trying to access ``self.prepared_data``. 
It requires + an explicit call. + + +3. Overriding ``prepare(self, object)`` On Individual ``SearchField`` Objects +----------------------------------------------------------------------------- + +The final way to manipulate your data is to implement a custom ``SearchField`` +object and write its ``prepare`` method to populate/alter the data any way you +choose. For instance, a (naive) user-created ``GeoPointField`` might look +something like:: + + from django.utils import six + from haystack import indexes + + class GeoPointField(indexes.CharField): + def __init__(self, **kwargs): + kwargs['default'] = '0.00-0.00' + super(GeoPointField, self).__init__(**kwargs) + + def prepare(self, obj): + return six.text_type("%s-%s" % (obj.latitude, obj.longitude)) + +The ``prepare`` method simply returns the value to be used for that field. It's +entirely possible to include data that's not directly referenced to the object +here, depending on your needs. + +Note that this is NOT a recommended approach to storing geographic data in a +search engine (there is no formal suggestion on this as support is usually +non-existent), merely an example of how to extend existing fields. + +.. note:: + + This method is analagous to Django's ``Field.clean`` methods. + + +Adding New Fields +================= + +If you have an existing ``SearchIndex`` and you add a new field to it, Haystack +will add this new data on any updates it sees after that point. However, this +will not populate the existing data you already have. + +In order for the data to be picked up, you will need to run ``./manage.py +rebuild_index``. This will cause all backends to rebuild the existing data +already present in the quickest and most efficient way. + +.. note:: + + With the Solr backend, you'll also have to add to the appropriate + ``schema.xml`` for your configuration before running the ``rebuild_index``. + + +``Search Index`` +================ + +``get_model`` +------------- + +.. method:: SearchIndex.get_model(self) + +Should return the ``Model`` class (not an instance) that the rest of the +``SearchIndex`` should use. + +This method is required & you must override it to return the correct class. + +``index_queryset`` +------------------ + +.. method:: SearchIndex.index_queryset(self, using=None) + +Get the default QuerySet to index when doing a full update. + +Subclasses can override this method to avoid indexing certain objects. + +``read_queryset`` +----------------- + +.. method:: SearchIndex.read_queryset(self, using=None) + +Get the default QuerySet for read actions. + +Subclasses can override this method to work with other managers. +Useful when working with default managers that filter some objects. + +``build_queryset`` +------------------- + +.. method:: SearchIndex.build_queryset(self, start_date=None, end_date=None) + +Get the default QuerySet to index when doing an index update. + +Subclasses can override this method to take into account related +model modification times. + +The default is to use ``SearchIndex.index_queryset`` and filter +based on ``SearchIndex.get_updated_field`` + +``prepare`` +----------- + +.. method:: SearchIndex.prepare(self, obj) + +Fetches and adds/alters data before indexing. + +``get_content_field`` +--------------------- + +.. method:: SearchIndex.get_content_field(self) + +Returns the field that supplies the primary document to be indexed. + +``update`` +---------- + +.. method:: SearchIndex.update(self, using=None) + +Updates the entire index. 
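+
+As an illustrative sketch only (the ``Note`` model and the ``default``
+connection alias are assumptions carried over from the earlier examples, and
+``./manage.py update_index`` remains the usual way to trigger this)::
+
+    from haystack import connections
+    from myapp.models import Note
+
+    # Look up the registered index for Note on the 'default' connection...
+    note_index = connections['default'].get_unified_index().get_index(Note)
+
+    # ...and reindex everything returned by index_queryset().
+    note_index.update()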
+ +If ``using`` is provided, it specifies which connection should be +used. Default relies on the routers to decide which backend should +be used. + +``update_object`` +----------------- + +.. method:: SearchIndex.update_object(self, instance, using=None, **kwargs) + +Update the index for a single object. Attached to the class's +post-save hook. + +If ``using`` is provided, it specifies which connection should be +used. Default relies on the routers to decide which backend should +be used. + +``remove_object`` +----------------- + +.. method:: SearchIndex.remove_object(self, instance, using=None, **kwargs) + +Remove an object from the index. Attached to the class's +post-delete hook. + +If ``using`` is provided, it specifies which connection should be +used. Default relies on the routers to decide which backend should +be used. + +``clear`` +--------- + +.. method:: SearchIndex.clear(self, using=None) + +Clears the entire index. + +If ``using`` is provided, it specifies which connection should be +used. Default relies on the routers to decide which backend should +be used. + +``reindex`` +----------- + +.. method:: SearchIndex.reindex(self, using=None) + +Completely clears the index for this model and rebuilds it. + +If ``using`` is provided, it specifies which connection should be +used. Default relies on the routers to decide which backend should +be used. + +``get_updated_field`` +--------------------- + +.. method:: SearchIndex.get_updated_field(self) + +Get the field name that represents the updated date for the model. + +If specified, this is used by the reindex command to filter out results +from the ``QuerySet``, enabling you to reindex only recent records. This +method should either return None (reindex everything always) or a +string of the ``Model``'s ``DateField``/``DateTimeField`` name. + +``should_update`` +----------------- + +.. method:: SearchIndex.should_update(self, instance, **kwargs) + +Determine if an object should be updated in the index. + +It's useful to override this when an object may save frequently and +cause excessive reindexing. You should check conditions on the instance +and return False if it is not to be indexed. + +The ``kwargs`` passed along to this method can be the same as the ones passed +by Django when a Model is saved/delete, so it's possible to check if the object +has been created or not. See ``django.db.models.signals.post_save`` for details +on what is passed. + +By default, returns True (always reindex). + +``load_all_queryset`` +--------------------- + +.. method:: SearchIndex.load_all_queryset(self) + +Provides the ability to override how objects get loaded in conjunction +with ``RelatedSearchQuerySet.load_all``. This is useful for post-processing the +results from the query, enabling things like adding ``select_related`` or +filtering certain data. + +.. warning:: + + Utilizing this functionality can have negative performance implications. + Please see the section on ``RelatedSearchQuerySet`` within + :doc:`searchqueryset_api` for further information. + +By default, returns ``all()`` on the model's default manager. + +Example:: + + class NoteIndex(SearchIndex, indexes.Indexable): + text = CharField(document=True, use_template=True) + author = CharField(model_attr='user') + pub_date = DateTimeField(model_attr='pub_date') + + def get_model(self): + return Note + + def load_all_queryset(self): + # Pull all objects related to the Note in search results. 
+ return Note.objects.all().select_related() + +When searching, the ``RelatedSearchQuerySet`` appends on a call to ``in_bulk``, so be +sure that the ``QuerySet`` you provide can accommodate this and that the ids +passed to ``in_bulk`` will map to the model in question. + +If you need a specific ``QuerySet`` in one place, you can specify this at the +``RelatedSearchQuerySet`` level using the ``load_all_queryset`` method. See +:doc:`searchqueryset_api` for usage. + + +``ModelSearchIndex`` +==================== + +The ``ModelSearchIndex`` class allows for automatic generation of a +``SearchIndex`` based on the fields of the model assigned to it. + +With the exception of the automated introspection, it is a ``SearchIndex`` +class, so all notes above pertaining to ``SearchIndexes`` apply. As with the +``ModelForm`` class in Django, it employs an inner class called ``Meta``, which +should contain a ``model`` attribute. By default all non-relational model +fields are included as search fields on the index, but fields can be restricted +by way of a ``fields`` whitelist, or excluded with an ``excludes`` list, to +prevent certain fields from appearing in the class. + +In addition, it adds a `text` field that is the ``document=True`` field and +has `use_template=True` option set, just like the ``BasicSearchIndex``. + +.. warning:: + + Usage of this class might result in inferior ``SearchIndex`` objects, which + can directly affect your search results. Use this to establish basic + functionality and move to custom `SearchIndex` objects for better control. + +At this time, it does not handle related fields. + +Quick Start +----------- + +For the impatient:: + + import datetime + from haystack import indexes + from myapp.models import Note + + # All Fields + class AllNoteIndex(indexes.ModelSearchIndex, indexes.Indexable): + class Meta: + model = Note + + # Blacklisted Fields + class LimitedNoteIndex(indexes.ModelSearchIndex, indexes.Indexable): + class Meta: + model = Note + excludes = ['user'] + + # Whitelisted Fields + class NoteIndex(indexes.ModelSearchIndex, indexes.Indexable): + class Meta: + model = Note + fields = ['user', 'pub_date'] + + # Note that regular ``SearchIndex`` methods apply. + def index_queryset(self, using=None): + "Used when the entire index for model is updated." + return Note.objects.filter(pub_date__lte=datetime.datetime.now()) + diff --git a/docs/searchquery_api.rst b/docs/searchquery_api.rst new file mode 100644 index 0000000..305557e --- /dev/null +++ b/docs/searchquery_api.rst @@ -0,0 +1,336 @@ +.. _ref-searchquery-api: + +=================== +``SearchQuery`` API +=================== + +.. class:: SearchQuery(using=DEFAULT_ALIAS) + +The ``SearchQuery`` class acts as an intermediary between ``SearchQuerySet``'s +abstraction and ``SearchBackend``'s actual search. Given the metadata provided +by ``SearchQuerySet``, ``SearchQuery`` builds the actual query and interacts +with the ``SearchBackend`` on ``SearchQuerySet``'s behalf. + +This class must be at least partially implemented on a per-backend basis, as portions +are highly specific to the backend. It usually is bundled with the accompanying +``SearchBackend``. + +Most people will **NOT** have to use this class directly. ``SearchQuerySet`` +handles all interactions with ``SearchQuery`` objects and provides a nicer +interface to work with. + +Should you need advanced/custom behavior, you can supply your version of +``SearchQuery`` that overrides/extends the class in the manner you see fit. 
+You can either hook it up in a ``BaseEngine`` subclass or ``SearchQuerySet`` +objects take a kwarg parameter ``query`` where you can pass in your class. + + +``SQ`` Objects +============== + +For expressing more complex queries, especially involving AND/OR/NOT in +different combinations, you should use ``SQ`` objects. Like +``django.db.models.Q`` objects, ``SQ`` objects can be passed to +``SearchQuerySet.filter`` and use the familiar unary operators (``&``, ``|`` and +``~``) to generate complex parts of the query. + +.. warning:: + + Any data you pass to ``SQ`` objects is passed along **unescaped**. If + you don't trust the data you're passing along, you should use + the ``clean`` method on your ``SearchQuery`` to sanitize the data. + +Example:: + + from haystack.query import SQ + + # We want "title: Foo AND (tags:bar OR tags:moof)" + sqs = SearchQuerySet().filter(title='Foo').filter(SQ(tags='bar') | SQ(tags='moof')) + + # To clean user-provided data: + sqs = SearchQuerySet() + clean_query = sqs.query.clean(user_query) + sqs = sqs.filter(SQ(title=clean_query) | SQ(tags=clean_query)) + +Internally, the ``SearchQuery`` object maintains a tree of ``SQ`` objects. Each +``SQ`` object supports what field it looks up against, what kind of lookup (i.e. +the ``__`` filters), what value it's looking for, if it's a AND/OR/NOT and +tracks any children it may have. The ``SearchQuery.build_query`` method starts +with the root of the tree, building part of the final query at each node until +the full final query is ready for the ``SearchBackend``. + + +Backend-Specific Methods +======================== + +When implementing a new backend, the following methods will need to be created: + +``build_query_fragment`` +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. method:: SearchQuery.build_query_fragment(self, field, filter_type, value) + +Generates a query fragment from a field, filter type and a value. + +Must be implemented in backends as this will be highly backend specific. + + +Inheritable Methods +=================== + +The following methods have a complete implementation in the base class and +can largely be used unchanged. + +``build_query`` +~~~~~~~~~~~~~~~ + +.. method:: SearchQuery.build_query(self) + +Interprets the collected query metadata and builds the final query to +be sent to the backend. + +``build_params`` +~~~~~~~~~~~~~~~~ + +.. method:: SearchQuery.build_params(self, spelling_query=None) + +Generates a list of params to use when searching. + +``clean`` +~~~~~~~~~ + +.. method:: SearchQuery.clean(self, query_fragment) + +Provides a mechanism for sanitizing user input before presenting the +value to the backend. + +A basic (override-able) implementation is provided. + +``run`` +~~~~~~~ + +.. method:: SearchQuery.run(self, spelling_query=None, **kwargs) + +Builds and executes the query. Returns a list of search results. + +Optionally passes along an alternate query for spelling suggestions. + +Optionally passes along more kwargs for controlling the search query. + +``run_mlt`` +~~~~~~~~~~~ + +.. method:: SearchQuery.run_mlt(self, **kwargs) + +Executes the More Like This. Returns a list of search results similar +to the provided document (and optionally query). + +``run_raw`` +~~~~~~~~~~~ + +.. method:: SearchQuery.run_raw(self, **kwargs) + +Executes a raw query. Returns a list of search results. + +``get_count`` +~~~~~~~~~~~~~ + +.. method:: SearchQuery.get_count(self) + +Returns the number of results the backend found for the query. 
+
+If the query has not been run, this will execute the query and store
+the results.
+
+``get_results``
+~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.get_results(self, **kwargs)
+
+Returns the results received from the backend.
+
+If the query has not been run, this will execute the query and store
+the results.
+
+``get_facet_counts``
+~~~~~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.get_facet_counts(self)
+
+Returns the facet counts received from the backend.
+
+If the query has not been run, this will execute the query and store
+the results.
+
+``boost_fragment``
+~~~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.boost_fragment(self, boost_word, boost_value)
+
+Generates the query fragment for boosting a single word/value pair.
+
+``matching_all_fragment``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.matching_all_fragment(self)
+
+Generates the query that matches all documents.
+
+``add_filter``
+~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.add_filter(self, expression, value, use_not=False, use_or=False)
+
+Narrows the search by requiring certain conditions.
+
+``add_order_by``
+~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.add_order_by(self, field)
+
+Orders the search results by a field.
+
+``clear_order_by``
+~~~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.clear_order_by(self)
+
+Clears out all ordering that has already been added, reverting the
+query to relevancy.
+
+``add_model``
+~~~~~~~~~~~~~
+
+.. method:: SearchQuery.add_model(self, model)
+
+Restricts the query to matches in the given model.
+
+This builds upon previous additions, so you can limit to multiple models
+by chaining this method several times.
+
+``set_limits``
+~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.set_limits(self, low=None, high=None)
+
+Restricts the query by altering either the start offset, the end offset or both.
+
+``clear_limits``
+~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.clear_limits(self)
+
+Clears any existing limits.
+
+``add_boost``
+~~~~~~~~~~~~~
+
+.. method:: SearchQuery.add_boost(self, term, boost_value)
+
+Adds a boosted term and the amount to boost it by to the query.
+
+``raw_search``
+~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.raw_search(self, query_string, **kwargs)
+
+Runs a raw query (no parsing) against the backend.
+
+This method causes the ``SearchQuery`` to ignore the standard query-generating
+facilities, running only what was provided instead.
+
+Note that any kwargs passed along will override anything provided
+to the rest of the ``SearchQuerySet``.
+
+``more_like_this``
+~~~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.more_like_this(self, model_instance)
+
+Allows backends with support for "More Like This" to return results
+similar to the provided instance.
+
+``add_stats_query``
+~~~~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.add_stats_query(self, stats_field, stats_facets)
+
+Adds stats and stats_facets queries for the Solr backend.
+
+``add_highlight``
+~~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.add_highlight(self)
+
+Adds highlighting to the search results.
+
+``add_within``
+~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.add_within(self, field, point_1, point_2)
+
+Adds bounding box parameters to the search query.
+
+``add_dwithin``
+~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.add_dwithin(self, field, point, distance)
+
+Adds radius-based parameters to the search query.
+
+``add_distance``
+~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuery.add_distance(self, field, point)
+
+Denotes that results should include distance measurements from the
+point passed in.
+
+``add_field_facet``
+~~~~~~~~~~~~~~~~~~~
+
+.. 
method:: SearchQuery.add_field_facet(self, field, **options) + +Adds a regular facet on a field. + +``add_date_facet`` +~~~~~~~~~~~~~~~~~~ + +.. method:: SearchQuery.add_date_facet(self, field, start_date, end_date, gap_by, gap_amount) + +Adds a date-based facet on a field. + +``add_query_facet`` +~~~~~~~~~~~~~~~~~~~ + +.. method:: SearchQuery.add_query_facet(self, field, query) + +Adds a query facet on a field. + +``add_narrow_query`` +~~~~~~~~~~~~~~~~~~~~ + +.. method:: SearchQuery.add_narrow_query(self, query) + +Narrows a search to a subset of all documents per the query. + +Generally used in conjunction with faceting. + +``set_result_class`` +~~~~~~~~~~~~~~~~~~~~ + +.. method:: SearchQuery.set_result_class(self, klass) + +Sets the result class to use for results. + +Overrides any previous usages. If ``None`` is provided, Haystack will +revert back to the default ``SearchResult`` object. + +``using`` +~~~~~~~~~ + +.. method:: SearchQuery.using(self, using=None) + +Allows for overriding which connection should be used. This +disables the use of routers when performing the query. + +If ``None`` is provided, it has no effect on what backend is used. diff --git a/docs/searchqueryset_api.rst b/docs/searchqueryset_api.rst new file mode 100644 index 0000000..03bb34a --- /dev/null +++ b/docs/searchqueryset_api.rst @@ -0,0 +1,893 @@ +.. _ref-searchqueryset-api: + +====================== +``SearchQuerySet`` API +====================== + +.. class:: SearchQuerySet(using=None, query=None) + +The ``SearchQuerySet`` class is designed to make performing a search and +iterating over its results easy and consistent. For those familiar with Django's +ORM ``QuerySet``, much of the ``SearchQuerySet`` API should feel familiar. + + +Why Follow ``QuerySet``? +======================== + +A couple reasons to follow (at least in part) the ``QuerySet`` API: + +#. Consistency with Django +#. Most Django programmers have experience with the ORM and can use this + knowledge with ``SearchQuerySet``. + +And from a high-level perspective, ``QuerySet`` and ``SearchQuerySet`` do very similar +things: given certain criteria, provide a set of results. Both are powered by +multiple backends, both are abstractions on top of the way a query is performed. + + +Quick Start +=========== + +For the impatient:: + + from haystack.query import SearchQuerySet + all_results = SearchQuerySet().all() + hello_results = SearchQuerySet().filter(content='hello') + hello_world_results = SearchQuerySet().filter(content='hello world') + unfriendly_results = SearchQuerySet().exclude(content='hello').filter(content='world') + recent_results = SearchQuerySet().order_by('-pub_date')[:5] + + # Using the new input types... + from haystack.inputs import AutoQuery, Exact, Clean + sqs = SearchQuerySet().filter(content=AutoQuery(request.GET['q']), product_type=Exact('ancient book')) + + if request.GET['product_url']: + sqs = sqs.filter(product_url=Clean(request.GET['product_url'])) + +For more on the ``AutoQuery``, ``Exact``, ``Clean`` classes & friends, see the +:ref:`ref-inputtypes` documentation. + + +``SearchQuerySet`` +================== + +By default, ``SearchQuerySet`` provide the documented functionality. You can +extend with your own behavior by simply subclassing from ``SearchQuerySet`` and +adding what you need, then using your subclass in place of ``SearchQuerySet``. + +Most methods in ``SearchQuerySet`` "chain" in a similar fashion to ``QuerySet``. 
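+
+As a brief sketch of the subclassing approach described above (the
+``PublicNoteSearchQuerySet`` name and the ``is_public`` index field are
+illustrative assumptions, not part of Haystack)::
+
+    from haystack.query import SearchQuerySet
+
+    class PublicNoteSearchQuerySet(SearchQuerySet):
+        def public(self):
+            # Clones keep this class, so the new method chains freely
+            # with the built-in ones.
+            return self.filter(is_public=True)
+
+    # Used in place of SearchQuerySet.
+    results = PublicNoteSearchQuerySet().public().filter(content='hello')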
+Additionally, like ``QuerySet``, ``SearchQuerySet`` is lazy (meaning it evaluates the +query as late as possible). So the following is valid:: + + from haystack.query import SearchQuerySet + results = SearchQuerySet().exclude(content='hello').filter(content='world').order_by('-pub_date').boost('title', 0.5)[10:20] + + +The ``content`` Shortcut +======================== + +Searching your document fields is a very common activity. To help mitigate +possible differences in ``SearchField`` names (and to help the backends deal +with search queries that inspect the main corpus), there is a special field +called ``content``. You may use this in any place that other fields names would +work (e.g. ``filter``, ``exclude``, etc.) to indicate you simply want to +search the main documents. + +For example:: + + from haystack.query import SearchQuerySet + + # This searches whatever fields were marked ``document=True``. + results = SearchQuerySet().exclude(content='hello') + +This special pseudo-field works best with the ``exact`` lookup and may yield +strange or unexpected results with the other lookups. + + +``SearchQuerySet`` Methods +========================== + +The primary interface to search in Haystack is through the ``SearchQuerySet`` +object. It provides a clean, programmatic, portable API to the search backend. +Many aspects are also "chainable", meaning you can call methods one after another, each +applying their changes to the previous ``SearchQuerySet`` and further narrowing +the search. + +All ``SearchQuerySet`` objects implement a list-like interface, meaning you can +perform actions like getting the length of the results, accessing a result at an +offset or even slicing the result list. + + +Methods That Return A ``SearchQuerySet`` +---------------------------------------- + +``all`` +~~~~~~~ + +.. method:: SearchQuerySet.all(self): + +Returns all results for the query. This is largely a no-op (returns an identical +copy) but useful for denoting exactly what behavior is going on. + +``none`` +~~~~~~~~ + +.. method:: SearchQuerySet.none(self): + +Returns an ``EmptySearchQuerySet`` that behaves like a ``SearchQuerySet`` but +always yields no results. + +``filter`` +~~~~~~~~~~ + +.. method:: SearchQuerySet.filter(self, **kwargs) + +Filters the search by looking for (and including) certain attributes. + +The lookup parameters (``**kwargs``) should follow the `Field lookups`_ below. +If you specify more than one pair, they will be joined in the query according to +the ``HAYSTACK_DEFAULT_OPERATOR`` setting (defaults to ``AND``). + +You can pass it either strings or a variety of :ref:`ref-inputtypes` if you +need more advanced query behavior. + +.. warning:: + + Any data you pass to ``filter`` gets auto-escaped. If you need to send + non-escaped data, use the ``Raw`` input type (:ref:`ref-inputtypes`). + + Also, if a string with one or more spaces in it is specified as the value, the + string will get passed along **AS IS**. This will mean that it will **NOT** + be treated as a phrase (like Haystack 1.X's behavior). + + If you want to match a phrase, you should use either the ``__exact`` filter + type or the ``Exact`` input type (:ref:`ref-inputtypes`). + +Examples:: + + sqs = SearchQuerySet().filter(content='foo') + + sqs = SearchQuerySet().filter(content='foo', pub_date__lte=datetime.date(2008, 1, 1)) + + # Identical to the previous example. 
+ sqs = SearchQuerySet().filter(content='foo').filter(pub_date__lte=datetime.date(2008, 1, 1)) + + # To send unescaped data: + from haystack.inputs import Raw + sqs = SearchQuerySet().filter(title=Raw(trusted_query)) + + # To use auto-query behavior on a non-``document=True`` field. + from haystack.inputs import AutoQuery + sqs = SearchQuerySet().filter(title=AutoQuery(user_query)) + + +``exclude`` +~~~~~~~~~~~ + +.. method:: SearchQuerySet.exclude(self, **kwargs) + +Narrows the search by ensuring certain attributes are not included. + +.. warning:: + + Any data you pass to ``exclude`` gets auto-escaped. If you need to send + non-escaped data, use the ``Raw`` input type (:ref:`ref-inputtypes`). + +Example:: + + sqs = SearchQuerySet().exclude(content='foo') + + +``filter_and`` +~~~~~~~~~~~~~~ + +.. method:: SearchQuerySet.filter_and(self, **kwargs) + +Narrows the search by looking for (and including) certain attributes. Join +behavior in the query is forced to be ``AND``. Used primarily by the ``filter`` +method. + +``filter_or`` +~~~~~~~~~~~~~ + +.. method:: SearchQuerySet.filter_or(self, **kwargs) + +Narrows the search by looking for (and including) certain attributes. Join +behavior in the query is forced to be ``OR``. Used primarily by the ``filter`` +method. + +``order_by`` +~~~~~~~~~~~~ + +.. method:: SearchQuerySet.order_by(self, *args) + +Alters the order in which the results should appear. Arguments should be strings +that map to the attributes/fields within the index. You may specify multiple +fields by comma separating them:: + + SearchQuerySet().filter(content='foo').order_by('author', 'pub_date') + +Default behavior is ascending order. To specify descending order, prepend the +string with a ``-``:: + + SearchQuerySet().filter(content='foo').order_by('-pub_date') + +.. note:: + + In general, ordering is locale-specific. Haystack makes no effort to try to + reconcile differences between characters from different languages. This + means that accented characters will sort closely with the same character + and **NOT** necessarily close to the unaccented form of the character. + + If you want this kind of behavior, you should override the ``prepare_FOO`` + methods on your ``SearchIndex`` objects to transliterate the characters + as you see fit. + +``highlight`` +~~~~~~~~~~~~~ + +.. method:: SearchQuerySet.highlight(self) + +If supported by the backend, the ``SearchResult`` objects returned will include +a highlighted version of the result:: + + sqs = SearchQuerySet().filter(content='foo').highlight() + result = sqs[0] + result.highlighted['text'][0] # u'Two computer scientists walk into a bar. The bartender says "Foo!".' + +``models`` +~~~~~~~~~~ + +.. method:: SearchQuerySet.models(self, *models) + +Accepts an arbitrary number of Model classes to include in the search. This will +narrow the search results to only include results from the models specified. + +Example:: + + SearchQuerySet().filter(content='foo').models(BlogEntry, Comment) + +``result_class`` +~~~~~~~~~~~~~~~~ + +.. method:: SearchQuerySet.result_class(self, klass) + +Allows specifying a different class to use for results. + +Overrides any previous usages. If ``None`` is provided, Haystack will +revert back to the default ``SearchResult`` object. + +Example:: + + SearchQuerySet().result_class(CustomResult) + +``boost`` +~~~~~~~~~ + +.. method:: SearchQuerySet.boost(self, term, boost_value) + +Boosts a certain term of the query. You provide the term to be boosted and the +value is the amount to boost it by. 
Boost amounts may be either an integer or a +float. + +Example:: + + SearchQuerySet().filter(content='foo').boost('bar', 1.5) + +``facet`` +~~~~~~~~~ + +.. method:: SearchQuerySet.facet(self, field, **options) + +Adds faceting to a query for the provided field. You provide the field (from one +of the ``SearchIndex`` classes) you like to facet on. Any keyword options you +provide will be passed along to the backend for that facet. + +Example:: + + # For SOLR (setting f.author.facet.*; see http://wiki.apache.org/solr/SimpleFacetParameters#Parameters) + SearchQuerySet().facet('author', mincount=1, limit=10) + # For ElasticSearch (see http://www.elasticsearch.org/guide/reference/api/search/facets/terms-facet.html) + SearchQuerySet().facet('author', size=10, order='term') + +In the search results you get back, facet counts will be populated in the +``SearchResult`` object. You can access them via the ``facet_counts`` method. + +Example:: + + # Count document hits for each author within the index. + SearchQuerySet().filter(content='foo').facet('author') + +``date_facet`` +~~~~~~~~~~~~~~ + +.. method:: SearchQuerySet.date_facet(self, field, start_date, end_date, gap_by, gap_amount=1) + +Adds faceting to a query for the provided field by date. You provide the field +(from one of the ``SearchIndex`` classes) you like to facet on, a ``start_date`` +(either ``datetime.datetime`` or ``datetime.date``), an ``end_date`` and the +amount of time between gaps as ``gap_by`` (one of ``'year'``, ``'month'``, +``'day'``, ``'hour'``, ``'minute'`` or ``'second'``). + +You can also optionally provide a ``gap_amount`` to specify a different +increment than ``1``. For example, specifying gaps by week (every seven days) +would be ``gap_by='day', gap_amount=7``). + +In the search results you get back, facet counts will be populated in the +``SearchResult`` object. You can access them via the ``facet_counts`` method. + +Example:: + + # Count document hits for each day between 2009-06-07 to 2009-07-07 within the index. + SearchQuerySet().filter(content='foo').date_facet('pub_date', start_date=datetime.date(2009, 6, 7), end_date=datetime.date(2009, 7, 7), gap_by='day') + +``query_facet`` +~~~~~~~~~~~~~~~ + +.. method:: SearchQuerySet.query_facet(self, field, query) + +Adds faceting to a query for the provided field with a custom query. You provide +the field (from one of the ``SearchIndex`` classes) you like to facet on and the +backend-specific query (as a string) you'd like to execute. + +Please note that this is **NOT** portable between backends. The syntax is entirely +dependent on the backend. No validation/cleansing is performed and it is up to +the developer to ensure the query's syntax is correct. + +In the search results you get back, facet counts will be populated in the +``SearchResult`` object. You can access them via the ``facet_counts`` method. + +Example:: + + # Count document hits for authors that start with 'jo' within the index. + SearchQuerySet().filter(content='foo').query_facet('author', 'jo*') + +``within`` +~~~~~~~~~~ + +.. method:: SearchQuerySet.within(self, field, point_1, point_2): + +Spatial: Adds a bounding box search to the query. + +See the :ref:`ref-spatial` docs for more information. + +``dwithin`` +~~~~~~~~~~~ + +.. method:: SearchQuerySet.dwithin(self, field, point, distance): + +Spatial: Adds a distance-based search to the query. + +See the :ref:`ref-spatial` docs for more information. + +``stats`` +~~~~~~~~~ + +.. 
method:: SearchQuerySet.stats(self, field)
+
+Adds stats to a query for the provided field. This is supported on
+Solr only. You provide the field (from one of the ``SearchIndex``
+classes) you would like stats on.
+
+In the search results you get back, stats will be populated in the
+``SearchResult`` object. You can access them via the ``stats_results`` method.
+
+Example::
+
+    # Get stats on the author field.
+    SearchQuerySet().filter(content='foo').stats('author')
+
+``stats_facet``
+~~~~~~~~~~~~~~~
+
+.. method:: SearchQuerySet.stats_facet(self, field, facet_fields=None)
+
+Adds a stats facet for the given field; ``facet_fields`` lists the fields to
+facet the stats on. This is supported on Solr only.
+
+Example::
+
+    # Get stats on the author field, and stats on the author field
+    # faceted by bookstore.
+    SearchQuerySet().filter(content='foo').stats_facet('author', 'bookstore')
+
+
+``distance``
+~~~~~~~~~~~~
+
+.. method:: SearchQuerySet.distance(self, field, point)
+
+Spatial: Denotes results must have distance measurements from the
+provided point.
+
+See the :ref:`ref-spatial` docs for more information.
+
+``narrow``
+~~~~~~~~~~
+
+.. method:: SearchQuerySet.narrow(self, query)
+
+Pulls a subset of documents from the search engine to search within. This is
+for advanced usage, especially useful when faceting.
+
+Example::
+
+    # Search, from recipes containing 'blend', for recipes containing 'banana'.
+    SearchQuerySet().narrow('blend').filter(content='banana')
+
+    # Using a fielded search where the recipe's title contains 'smoothie', find all recipes published before 2009.
+    SearchQuerySet().narrow('title:smoothie').filter(pub_date__lte=datetime.datetime(2009, 1, 1))
+
+By using ``narrow``, you can create drill-down interfaces for faceting by
+applying ``narrow`` calls for each facet that gets selected.
+
+This method is different from ``SearchQuerySet.filter()`` in that it does not
+affect the query sent to the engine. It pre-limits the document set being
+searched. Generally speaking, if you're in doubt about whether to use
+``filter`` or ``narrow``, use ``filter``.
+
+.. note::
+
+    This method is, generally speaking, not necessarily portable between
+    backends. The syntax is entirely dependent on the backend, though most
+    backends have a similar syntax for basic fielded queries. No
+    validation/cleansing is performed and it is up to the developer to ensure
+    the query's syntax is correct.
+
+``raw_search``
+~~~~~~~~~~~~~~
+
+.. method:: SearchQuerySet.raw_search(self, query_string, **kwargs)
+
+Passes a raw query directly to the backend. This is for advanced usage, where
+the desired query cannot be expressed via ``SearchQuerySet``.
+
+This method is still supported; however, it now uses the much more flexible
+``Raw`` input type (:ref:`ref-inputtypes`).
+
+.. warning::
+
+    Unlike Haystack 1.X, this method no longer causes immediate
+    evaluation & now chains appropriately.
+
+Example::
+
+    # In the case of Solr... (this example could be expressed with SearchQuerySet)
+    SearchQuerySet().raw_search('django_ct:blog.blogentry "However, it is"')
+
+    # Equivalent.
+    from haystack.inputs import Raw
+    sqs = SearchQuerySet().filter(content=Raw('django_ct:blog.blogentry "However, it is"'))
+
+Please note that this is **NOT** portable between backends. The syntax is entirely
+dependent on the backend. No validation/cleansing is performed and it is up to
+the developer to ensure the query's syntax is correct.
+
+Further, the use of ``**kwargs`` is intentionally left undocumented.
If +a third-party backend can implement special features beyond what's present, it +should use those ``**kwargs`` for passing that information. Developers should +be careful to make sure there are no conflicts with the backend's ``search`` +method, as that is called directly. + +``load_all`` +~~~~~~~~~~~~ + +.. method:: SearchQuerySet.load_all(self) + +Efficiently populates the objects in the search results. Without using this +method, DB lookups are done on a per-object basis, resulting in many individual +trips to the database. If ``load_all`` is used, the ``SearchQuerySet`` will +group similar objects into a single query, resulting in only as many queries as +there are different object types returned. + +Example:: + + SearchQuerySet().filter(content='foo').load_all() + +``auto_query`` +~~~~~~~~~~~~~~ + +.. method:: SearchQuerySet.auto_query(self, query_string, fieldname=None) + +Performs a best guess constructing the search query. + +This method is intended for common use directly with a user's query. This +method is still supported, however it now uses the much more flexible +``AutoQuery`` input type (:ref:`ref-inputtypes`). + +It handles exact matches (specified with single or double quotes), negation ( +using a ``-`` immediately before the term) and joining remaining terms with the +operator specified in ``HAYSTACK_DEFAULT_OPERATOR``. + +Example:: + + sqs = SearchQuerySet().auto_query('goldfish "old one eye" -tank') + + # Equivalent. + from haystack.inputs import AutoQuery + sqs = SearchQuerySet().filter(content=AutoQuery('goldfish "old one eye" -tank')) + + # Against a different field. + sqs = SearchQuerySet().filter(title=AutoQuery('goldfish "old one eye" -tank')) + + +``autocomplete`` +~~~~~~~~~~~~~~~~ + +A shortcut method to perform an autocomplete search. + +Must be run against fields that are either ``NgramField`` or +``EdgeNgramField``. + +Example:: + + SearchQuerySet().autocomplete(title_autocomplete='gol') + +``more_like_this`` +~~~~~~~~~~~~~~~~~~ + +.. method:: SearchQuerySet.more_like_this(self, model_instance) + +Finds similar results to the object passed in. + +You should pass in an instance of a model (for example, one fetched via a +``get`` in Django's ORM). This will execute a query on the backend that searches +for similar results. The instance you pass in should be an indexed object. +Previously called methods will have an effect on the provided results. + +It will evaluate its own backend-specific query and populate the +``SearchQuerySet`` in the same manner as other methods. + +Example:: + + entry = Entry.objects.get(slug='haystack-one-oh-released') + mlt = SearchQuerySet().more_like_this(entry) + mlt.count() # 5 + mlt[0].object.title # "Haystack Beta 1 Released" + + # ...or... + mlt = SearchQuerySet().filter(public=True).exclude(pub_date__lte=datetime.date(2009, 7, 21)).more_like_this(entry) + mlt.count() # 2 + mlt[0].object.title # "Haystack Beta 1 Released" + +``using`` +~~~~~~~~~ + +.. method:: SearchQuerySet.using(self, connection_name) + +Allows switching which connection the ``SearchQuerySet`` uses to search in. + +Example:: + + # Let the routers decide which connection to use. + sqs = SearchQuerySet().all() + + # Specify the 'default'. + sqs = SearchQuerySet().all().using('default') + + +Methods That Do Not Return A ``SearchQuerySet`` +----------------------------------------------- + +``count`` +~~~~~~~~~ + +.. method:: SearchQuerySet.count(self) + +Returns the total number of matching results. 
+
+This returns an integer count of the total number of results the search backend
+found that matched. This method causes the query to evaluate and run the search.
+
+Example::
+
+    SearchQuerySet().filter(content='foo').count()
+
+``best_match``
+~~~~~~~~~~~~~~
+
+.. method:: SearchQuerySet.best_match(self)
+
+Returns the best/top search result that matches the query.
+
+This method causes the query to evaluate and run the search. This method returns
+a ``SearchResult`` object that is the best match the search backend found::
+
+    foo = SearchQuerySet().filter(content='foo').best_match()
+    foo.id # Something like 5.
+
+    # Identical to:
+    foo = SearchQuerySet().filter(content='foo')[0]
+
+``latest``
+~~~~~~~~~~
+
+.. method:: SearchQuerySet.latest(self, date_field)
+
+Returns the most recent search result that matches the query.
+
+This method causes the query to evaluate and run the search. This method returns
+a ``SearchResult`` object that is the most recent match the search backend
+found::
+
+    foo = SearchQuerySet().filter(content='foo').latest('pub_date')
+    foo.id # Something like 3.
+
+    # Identical to:
+    foo = SearchQuerySet().filter(content='foo').order_by('-pub_date')[0]
+
+``facet_counts``
+~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuerySet.facet_counts(self)
+
+Returns the facet counts found by the query. This will cause the query to
+execute and should generally be used when presenting the data (template-level).
+
+You receive back a dictionary with three keys: ``fields``, ``dates`` and
+``queries``. Each contains the facet counts for whatever facets you specified
+within your ``SearchQuerySet``.
+
+.. note::
+
+    The resulting dictionary may change before 1.0 release. It's fairly
+    backend-specific at the time of writing. Standardizing is waiting on
+    implementing other backends that support faceting and ensuring that the
+    results presented will meet their needs as well.
+
+Example::
+
+    # Count document hits for each author.
+    sqs = SearchQuerySet().filter(content='foo').facet('author')
+
+    sqs.facet_counts()
+    # Gives the following response:
+    # {
+    #     'dates': {},
+    #     'fields': {
+    #         'author': [
+    #             ('john', 4),
+    #             ('daniel', 2),
+    #             ('sally', 1),
+    #             ('terry', 1),
+    #         ],
+    #     },
+    #     'queries': {}
+    # }
+
+``stats_results``
+~~~~~~~~~~~~~~~~~
+
+.. method:: SearchQuerySet.stats_results(self)
+
+Returns the stats results found by the query.
+
+This will cause the query to execute and should generally be used when
+presenting the data (template-level).
+
+You receive back a dictionary of statistics (minimum, maximum, sum, count,
+mean, et cetera) for each field you requested stats on within your
+``SearchQuerySet``.
+
+.. note::
+
+    The resulting dictionary may change before 1.0 release. It's fairly
+    backend-specific at the time of writing. Standardizing is waiting on
+    implementing other backends that support stats and ensuring that the
+    results presented will meet their needs as well.
+
+Example::
+
+    # Get stats on the price field.
+    sqs = SearchQuerySet().filter(content='foo').stats('price')
+
+    sqs.stats_results()
+
+    # Gives the following response:
+    # {
+    #     'stats_fields': {
+    #         'price': {
+    #             'min': 0.0,
+    #             'max': 2199.0,
+    #             'sum': 5251.2699999999995,
+    #             'count': 15,
+    #             'missing': 11,
+    #             'sumOfSquares': 6038619.160300001,
+    #             'mean': 350.08466666666664,
+    #             'stddev': 547.737557906113
+    #         }
+    #     }
+    #
+    # }
+
+
+``spelling_suggestion``
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. 
method:: SearchQuerySet.spelling_suggestion(self, preferred_query=None) + +Returns the spelling suggestion found by the query. + +To work, you must set ``INCLUDE_SPELLING`` within your connection's +settings dictionary to ``True``, and you must rebuild your index afterwards. +Otherwise, ``None`` will be returned. + +This method causes the query to evaluate and run the search if it hasn't already +run. Search results will be populated as normal but with an additional spelling +suggestion. Note that this does *NOT* run the revised query, only suggests +improvements. + +If provided, the optional argument to this method lets you specify an alternate +query for the spelling suggestion to be run on. This is useful for passing along +a raw user-provided query, especially when there are many methods chained on the +``SearchQuerySet``. + +Example:: + + sqs = SearchQuerySet().auto_query('mor exmples') + sqs.spelling_suggestion() # u'more examples' + + # ...or... + suggestion = SearchQuerySet().spelling_suggestion('moar exmples') + suggestion # u'more examples' + +``values`` +~~~~~~~~~~ + +.. method:: SearchQuerySet.values(self, *fields) + +Returns a list of dictionaries, each containing the key/value pairs for the +result, exactly like Django's ``ValuesQuerySet``. + +This method causes the query to evaluate and run the search if it hasn't already +run. + +You must provide a list of one or more fields as arguments. These fields will +be the ones included in the individual results. + +Example:: + + sqs = SearchQuerySet().auto_query('banana').values('title', 'description') + + +``values_list`` +~~~~~~~~~~~~~~~ + +.. method:: SearchQuerySet.values_list(self, *fields, **kwargs) + +Returns a list of field values as tuples, exactly like Django's +``ValuesListQuerySet``. + +This method causes the query to evaluate and run the search if it hasn't already +run. + +You must provide a list of one or more fields as arguments. These fields will +be the ones included in the individual results. + +You may optionally also provide a ``flat=True`` kwarg, which in the case of a +single field being provided, will return a flat list of that field rather than +a list of tuples. + +Example:: + + sqs = SearchQuerySet().auto_query('banana').values_list('title', 'description') + + # ...or just the titles as a flat list... + sqs = SearchQuerySet().auto_query('banana').values_list('title', flat=True) + + +.. _field-lookups: + +Field Lookups +------------- + +The following lookup types are supported: + +* contains +* exact +* gt +* gte +* lt +* lte +* in +* startswith +* range + +These options are similar in function to the way Django's lookup types work. +The actual behavior of these lookups is backend-specific. + +.. warning:: + + The ``startswith`` filter is strongly affected by the other ways the engine + parses data, especially in regards to stemming (see :doc:`glossary`). This + can mean that if the query ends in a vowel or a plural form, it may get + stemmed before being evaluated. + + This is both backend-specific and yet fairly consistent between engines, + and may be the cause of sometimes unexpected results. + +.. warning:: + + The ``contains`` filter became the new default filter as of Haystack v2.X + (the default in Haystack v1.X was ``exact``). This changed because ``exact`` + caused problems and was unintuitive for new people trying to use Haystack. + ``contains`` is a much more natural usage. 
+ + If you had an app built on Haystack v1.X & are upgrading, you'll need to + sanity-check & possibly change any code that was relying on the default. + The solution is just to add ``__exact`` to any "bare" field in a + ``.filter(...)`` clause. + +Example:: + + SearchQuerySet().filter(content='foo') + + # Identical to: + SearchQuerySet().filter(content__contains='foo') + + # Phrase matching. + SearchQuerySet().filter(content__exact='hello world') + + # Other usages look like: + SearchQuerySet().filter(pub_date__gte=datetime.date(2008, 1, 1), pub_date__lt=datetime.date(2009, 1, 1)) + SearchQuerySet().filter(author__in=['daniel', 'john', 'jane']) + SearchQuerySet().filter(view_count__range=[3, 5]) + + +``EmptySearchQuerySet`` +======================= + +Also included in Haystack is an ``EmptySearchQuerySet`` class. It behaves just +like ``SearchQuerySet`` but will always return zero results. This is useful for +places where you want no query to occur or results to be returned. + + +``RelatedSearchQuerySet`` +========================= + +Sometimes you need to filter results based on relations in the database that are +not present in the search index or are difficult to express that way. To this +end, ``RelatedSearchQuerySet`` allows you to post-process the search results by +calling ``load_all_queryset``. + +.. warning:: + + ``RelatedSearchQuerySet`` can have negative performance implications. + Because results are excluded based on the database after the search query + has been run, you can't guarantee offsets within the cache. Therefore, the + entire cache that appears before the offset you request must be filled in + order to produce consistent results. On large result sets and at higher + slices, this can take time. + + This is the old behavior of ``SearchQuerySet``, so performance is no worse + than the early days of Haystack. + +It supports all other methods that the standard ``SearchQuerySet`` does, with +the addition of the ``load_all_queryset`` method and paying attention to the +``load_all_queryset`` method of ``SearchIndex`` objects when populating the +cache. + +``load_all_queryset`` +--------------------- + +.. method:: RelatedSearchQuerySet.load_all_queryset(self, model_class, queryset) + +Allows for specifying a custom ``QuerySet`` that changes how ``load_all`` will +fetch records for the provided model. This is useful for post-processing the +results from the query, enabling things like adding ``select_related`` or +filtering certain data. + +Example:: + + sqs = RelatedSearchQuerySet().filter(content='foo').load_all() + # For the Entry model, we want to include related models directly associated + # with the Entry to save on DB queries. + sqs = sqs.load_all_queryset(Entry, Entry.objects.all().select_related(depth=1)) + +This method chains indefinitely, so you can specify ``QuerySets`` for as many +models as you wish, one per model. The ``SearchQuerySet`` appends on a call to +``in_bulk``, so be sure that the ``QuerySet`` you provide can accommodate this +and that the ids passed to ``in_bulk`` will map to the model in question. + +If you need to do this frequently and have one ``QuerySet`` you'd like to apply +everywhere, you can specify this at the ``SearchIndex`` level using the +``load_all_queryset`` method. See :doc:`searchindex_api` for usage. diff --git a/docs/searchresult_api.rst b/docs/searchresult_api.rst new file mode 100644 index 0000000..ea506f2 --- /dev/null +++ b/docs/searchresult_api.rst @@ -0,0 +1,62 @@ +.. 
_ref-searchresult-api: + +==================== +``SearchResult`` API +==================== + +.. class:: SearchResult(app_label, model_name, pk, score, **kwargs) + +The ``SearchResult`` class provides structure to the results that come back from +the search index. These objects are what a ``SearchQuerySet`` will return when +evaluated. + + +Attribute Reference +=================== + +The class exposes the following useful attributes/properties: + +* ``app_label`` - The application the model is attached to. +* ``model_name`` - The model's name. +* ``pk`` - The primary key of the model. +* ``score`` - The score provided by the search engine. +* ``object`` - The actual model instance (lazy loaded). +* ``model`` - The model class. +* ``verbose_name`` - A prettier version of the model's class name for display. +* ``verbose_name_plural`` - A prettier version of the model's *plural* class name for display. +* ``searchindex`` - Returns the ``SearchIndex`` class associated with this + result. +* ``distance`` - On geo-spatial queries, this returns a ``Distance`` object + representing the distance the result was from the focused point. + + +Method Reference +================ + +``content_type`` +---------------- + +.. method:: SearchResult.content_type(self) + +Returns the content type for the result's model instance. + +``get_additional_fields`` +------------------------- + +.. method:: SearchResult.get_additional_fields(self) + +Returns a dictionary of all of the fields from the raw result. + +Useful for serializing results. Only returns what was seen from the +search engine, so it may have extra fields Haystack's indexes aren't +aware of. + +``get_stored_fields`` +--------------------- + +.. method:: SearchResult.get_stored_fields(self) + +Returns a dictionary of all of the stored fields from the SearchIndex. + +Useful for serializing results. Only returns the fields Haystack's +indexes are aware of as being 'stored'. diff --git a/docs/settings.rst b/docs/settings.rst new file mode 100644 index 0000000..c60752e --- /dev/null +++ b/docs/settings.rst @@ -0,0 +1,289 @@ +.. _ref-settings: + +================= +Haystack Settings +================= + +As a way to extend/change the default behavior within Haystack, there are +several settings you can alter within your ``settings.py``. This is a +comprehensive list of the settings Haystack recognizes. + + +``HAYSTACK_DEFAULT_OPERATOR`` +============================= + +**Optional** + +This setting controls what the default behavior for chaining ``SearchQuerySet`` +filters together is. + +Valid options are:: + + HAYSTACK_DEFAULT_OPERATOR = 'AND' + HAYSTACK_DEFAULT_OPERATOR = 'OR' + +Defaults to ``AND``. + + +``HAYSTACK_CONNECTIONS`` +======================== + +**Required** + +This setting controls which backends should be available. 
It should be a +dictionary of dictionaries resembling the following (complete) example:: + + HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.solr_backend.SolrEngine', + 'URL': 'http://localhost:9001/solr/default', + 'TIMEOUT': 60 * 5, + 'INCLUDE_SPELLING': True, + 'BATCH_SIZE': 100, + 'EXCLUDED_INDEXES': ['thirdpartyapp.search_indexes.BarIndex'], + }, + 'autocomplete': { + 'ENGINE': 'haystack.backends.whoosh_backend.WhooshEngine', + 'PATH': '/home/search/whoosh_index', + 'STORAGE': 'file', + 'POST_LIMIT': 128 * 1024 * 1024, + 'INCLUDE_SPELLING': True, + 'BATCH_SIZE': 100, + 'EXCLUDED_INDEXES': ['thirdpartyapp.search_indexes.BarIndex'], + }, + 'slave': { + 'ENGINE': 'xapian_backend.XapianEngine', + 'PATH': '/home/search/xapian_index', + 'INCLUDE_SPELLING': True, + 'BATCH_SIZE': 100, + 'EXCLUDED_INDEXES': ['thirdpartyapp.search_indexes.BarIndex'], + }, + 'db': { + 'ENGINE': 'haystack.backends.simple_backend.SimpleEngine', + 'EXCLUDED_INDEXES': ['thirdpartyapp.search_indexes.BarIndex'], + } + } + +No default for this setting is provided. + +The main keys (``default`` & friends) are identifiers for your application. +You can use them any place the API exposes ``using`` as a method or kwarg. + +There must always be at least a ``default`` key within this setting. + +The ``ENGINE`` option is required for all backends & should point to the +``BaseEngine`` subclass for the backend. + +Additionally, each backend may have additional options it requires: + +* Solr + + * ``URL`` - The URL to the Solr core. + +* Whoosh + + * ``PATH`` - The filesystem path to where the index data is located. + +* Xapian + + * ``PATH`` - The filesystem path to where the index data is located. + +The following options are optional: + +* ``INCLUDE_SPELLING`` - Include spelling suggestions. Default is ``False`` +* ``BATCH_SIZE`` - How many records should be updated at once via the management + commands. Default is ``1000``. +* ``TIMEOUT`` - (Solr and ElasticSearch) How long to wait (in seconds) before + the connection times out. Default is ``10``. +* ``STORAGE`` - (Whoosh-only) Which storage engine to use. Accepts ``file`` or + ``ram``. Default is ``file``. +* ``POST_LIMIT`` - (Whoosh-only) How large the file sizes can be. Default is + ``128 * 1024 * 1024``. +* ``FLAGS`` - (Xapian-only) A list of flags to use when querying the index. +* ``EXCLUDED_INDEXES`` - A list of strings (as Python import paths) to indexes + you do **NOT** want included. Useful for omitting third-party things you + don't want indexed or for when you want to replace an index. +* ``KWARGS`` - (Solr and ElasticSearch) Any additional keyword arguments that + should be passed on to the underlying client library. + + +``HAYSTACK_ROUTERS`` +==================== + +**Optional** + +This setting controls how routing is performed to allow different backends to +handle updates/deletes/reads. + +An example:: + + HAYSTACK_ROUTERS = ['search_routers.MasterSlaveRouter', 'haystack.routers.DefaultRouter'] + +Defaults to ``['haystack.routers.DefaultRouter']``. + + +``HAYSTACK_SIGNAL_PROCESSOR`` +============================= + +**Optional** + +This setting controls what ``SignalProcessor`` class is used to handle Django's +signals & keep the search index up-to-date. + +An example:: + + HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor' + +Defaults to ``'haystack.signals.BaseSignalProcessor'``. 
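+
+The ``HAYSTACK_ROUTERS`` example above references a custom
+``search_routers.MasterSlaveRouter``. As a rough, hypothetical sketch (the
+``'default'`` and ``'slave'`` aliases are assumed to exist in your
+``HAYSTACK_CONNECTIONS``), such a router subclasses ``BaseRouter`` and returns
+a connection alias from ``for_read``/``for_write``::
+
+    # search_routers.py
+    from haystack import routers
+
+
+    class MasterSlaveRouter(routers.BaseRouter):
+        def for_read(self, **hints):
+            # Send queries to the read-only replica.
+            return 'slave'
+
+        def for_write(self, **hints):
+            # Send updates/deletes to the primary index.
+            return 'default'
+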
+ + +``HAYSTACK_DOCUMENT_FIELD`` +=========================== + +**Optional** + +This setting controls what fieldname Haystack relies on as the default field +for searching within. + +An example:: + + HAYSTACK_DOCUMENT_FIELD = 'wall_o_text' + +Defaults to ``text``. + + +``HAYSTACK_SEARCH_RESULTS_PER_PAGE`` +==================================== + +**Optional** + +This setting controls how many results are shown per page when using the +included ``SearchView`` and its subclasses. + +An example:: + + HAYSTACK_SEARCH_RESULTS_PER_PAGE = 50 + +Defaults to ``20``. + + +``HAYSTACK_CUSTOM_HIGHLIGHTER`` +=============================== + +**Optional** + +This setting allows you to specify your own custom ``Highlighter`` +implementation for use with the ``{% highlight %}`` template tag. It should be +the full path to the class. + +An example:: + + HAYSTACK_CUSTOM_HIGHLIGHTER = 'myapp.utils.BorkHighlighter' + +No default is provided. Haystack automatically falls back to the default +implementation. + + +``HAYSTACK_ITERATOR_LOAD_PER_QUERY`` +==================================== + +**Optional** + +This setting controls the number of results that are pulled at once when +iterating through a ``SearchQuerySet``. If you generally consume large portions +at a time, you can bump this up for better performance. + +.. note:: + + This is not used in the case of a slice on a ``SearchQuerySet``, which + already overrides the number of results pulled at once. + +An example:: + + HAYSTACK_ITERATOR_LOAD_PER_QUERY = 100 + +The default is 10 results at a time. + + +``HAYSTACK_LIMIT_TO_REGISTERED_MODELS`` +======================================= + +**Optional** + +This setting allows you to control whether or not Haystack will limit the +search results seen to just the models registered. It should be a boolean. + +If your search index is never used for anything other than the models +registered with Haystack, you can turn this off and get a small to moderate +performance boost. + +An example:: + + HAYSTACK_LIMIT_TO_REGISTERED_MODELS = False + +Default is ``True``. + + +``HAYSTACK_ID_FIELD`` +===================== + +**Optional** + +This setting allows you to control what the unique field name used internally +by Haystack is called. Rarely needed unless your field names collide with +Haystack's defaults. + +An example:: + + HAYSTACK_ID_FIELD = 'my_id' + +Default is ``id``. + + +``HAYSTACK_DJANGO_CT_FIELD`` +============================ + +**Optional** + +This setting allows you to control what the content type field name used +internally by Haystack is called. Rarely needed unless your field names +collide with Haystack's defaults. + +An example:: + + HAYSTACK_DJANGO_CT_FIELD = 'my_django_ct' + +Default is ``django_ct``. + + +``HAYSTACK_DJANGO_ID_FIELD`` +============================ + +**Optional** + +This setting allows you to control what the primary key field name used +internally by Haystack is called. Rarely needed unless your field names +collide with Haystack's defaults. + +An example:: + + HAYSTACK_DJANGO_ID_FIELD = 'my_django_id' + +Default is ``django_id``. + + +``HAYSTACK_IDENTIFIER_METHOD`` +============================== + +**Optional** + +This setting allows you to provide a custom method for +``haystack.utils.get_identifier``. Useful when the default identifier +pattern of .. isn't suited to your +needs. + +An example:: + + HAYSTACK_IDENTIFIER_METHOD = 'my_app.module.get_identifier' + +Default is ``haystack.utils.default_get_identifier``. 
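+
+A minimal, hypothetical sketch of such a method, assuming your models expose a
+globally unique ``uuid`` field you would rather key the index on (the module
+path and field name here are illustrative only)::
+
+    # my_app/module.py
+    from django.utils import six
+
+
+    def get_identifier(obj_or_string):
+        if isinstance(obj_or_string, six.string_types):
+            # Already an identifier string; pass it through unchanged.
+            return obj_or_string
+        # Build the identifier from a (hypothetical) ``uuid`` field instead
+        # of the primary key.
+        return '%s.%s.%s' % (obj_or_string._meta.app_label,
+                             obj_or_string._meta.model_name,
+                             obj_or_string.uuid)
+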
diff --git a/docs/signal_processors.rst b/docs/signal_processors.rst new file mode 100644 index 0000000..3865b72 --- /dev/null +++ b/docs/signal_processors.rst @@ -0,0 +1,117 @@ +.. _ref-signal_processors: + +================= +Signal Processors +================= + +Keeping data in sync between the (authoritative) database & the +(non-authoritative) search index is one of the more difficult problems when +using Haystack. Even frequently running the ``update_index`` management command +still introduces lag between when the data is stored & when it's available +for searching. + +A solution to this is to incorporate Django's signals (specifically +``models.db.signals.post_save`` & ``models.db.signals.post_delete``), which then +trigger *individual* updates to the search index, keeping them in near-perfect +sync. + +Older versions of Haystack (pre-v2.0) tied the ``SearchIndex`` directly to the +signals, which caused occasional conflicts of interest with third-party +applications. + +To solve this, starting with Haystack v2.0, the concept of a ``SignalProcessor`` +has been introduced. In it's simplest form, the ``SignalProcessor`` listens +to whatever signals are setup & can be configured to then trigger the updates +without having to change any ``SearchIndex`` code. + +.. warning:: + + Incorporating Haystack's ``SignalProcessor`` into your setup **will** + increase the overall load (CPU & perhaps I/O depending on configuration). + You will need to capacity plan for this & ensure you can make the tradeoff + of more real-time results for increased load. + + +Default - ``BaseSignalProcessor`` +================================= + +The default setup is configured to use the +``haystack.signals.BaseSignalProcessor`` class, which includes all the +underlying code necessary to handle individual updates/deletes, **BUT DOES NOT +HOOK UP THE SIGNALS**. + +This means that, by default, **NO ACTION IS TAKEN BY HAYSTACK** when a model is +saved or deleted. The ``BaseSignalProcessor.setup`` & +``BaseSignalProcessor.teardown`` methods are both empty to prevent anything +from being setup at initialization time. + +This usage is configured very simply (again, by default) with the +``HAYSTACK_SIGNAL_PROCESSOR`` setting. An example of manually setting this +would look like:: + + HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.BaseSignalProcessor' + +This class forms an excellent base if you'd like to override/extend for more +advanced behavior. Which leads us to... + + +Realtime - ``RealtimeSignalProcessor`` +====================================== + +The other included ``SignalProcessor`` is the +``haystack.signals.RealtimeSignalProcessor`` class. It is an extremely thin +extension of the ``BaseSignalProcessor`` class, differing only in that +in implements the ``setup/teardown`` methods, tying **ANY** Model +``save/delete`` to the signal processor. + +If the model has an associated ``SearchIndex``, the ``RealtimeSignalProcessor`` +will then trigger an update/delete of that model instance within the search +index proper. + +Configuration looks like:: + + HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor' + +This causes **all** ``SearchIndex`` classes to work in a realtime fashion. + +.. note:: + + These updates happen in-process, which if a request-response cycle is + involved, may cause the user with the browser to sit & wait for indexing to + be completed. Since this wait can be undesirable, especially under load, + you may wish to look into queued search options. 
See the + :ref:`ref-other_apps` documentation for existing options. + + +Custom ``SignalProcessors`` +=========================== + +The ``BaseSignalProcessor`` & ``RealtimeSignalProcessor`` classes are fairly +simple/straightforward to customize or extend. Rather than forking Haystack to +implement your modifications, you should create your own subclass within your +codebase (anywhere that's importable is usually fine, though you should avoid +``models.py`` files). + +For instance, if you only wanted ``User`` saves to be realtime, deferring all +other updates to the management commands, you'd implement the following code:: + + from django.contrib.auth.models import User + from django.db import models + from haystack import signals + + + class UserOnlySignalProcessor(signals.BaseSignalProcessor): + def setup(self): + # Listen only to the ``User`` model. + models.signals.post_save.connect(self.handle_save, sender=User) + models.signals.post_delete.connect(self.handle_delete, sender=User) + + def teardown(self): + # Disconnect only for the ``User`` model. + models.signals.post_save.disconnect(self.handle_save, sender=User) + models.signals.post_delete.disconnect(self.handle_delete, sender=User) + +For other customizations (modifying how saves/deletes should work), you'll need +to override/extend the ``handle_save/handle_delete`` methods. The source code +is your best option for referring to how things currently work on your version +of Haystack. diff --git a/docs/spatial.rst b/docs/spatial.rst new file mode 100644 index 0000000..07dda7e --- /dev/null +++ b/docs/spatial.rst @@ -0,0 +1,412 @@ +.. _ref-spatial: + +============== +Spatial Search +============== + +Spatial search (also called geospatial search) allows you to take data that +has a geographic location & enhance the search results by limiting them to a +physical area. Haystack, combined with the latest versions of a couple engines, +can provide this type of search. + +In addition, Haystack tries to implement these features in a way that is as +close to GeoDjango_ as possible. There are some differences, which we'll +highlight throughout this guide. Additionally, while the support isn't as +comprehensive as PostGIS (for example), it is still quite useful. + +.. _GeoDjango: http://geodjango.org/ + + +Additional Requirements +======================= + +The spatial functionality has only one non-included, non-available-in-Django +dependency: + +* ``geopy`` - ``pip install geopy`` + +If you do not ever need distance information, you may be able to skip +installing ``geopy``. + + +Support +======= + +You need the latest & greatest of either Solr or Elasticsearch. None of the +other backends (specifially the engines) support this kind of search. + +For Solr_, you'll need at least **v3.5+**. In addition, if you have an existing +install of Haystack & Solr, you'll need to upgrade the schema & reindex your +data. If you're adding geospatial data, you would have to reindex anyhow. + +For Elasticsearch, you'll need at least v0.17.7, preferably v0.18.6 or better. +If you're adding geospatial data, you'll have to reindex as well. + +.. 
_Solr: http://lucene.apache.org/solr/ + +====================== ====== =============== ======== ======== ====== +Lookup Type Solr Elasticsearch Whoosh Xapian Simple +====================== ====== =============== ======== ======== ====== +`within` X X +`dwithin` X X +`distance` X X +`order_by('distance')` X X +`polygon` X +====================== ====== =============== ======== ======== ====== + +For more details, you can inspect http://wiki.apache.org/solr/SpatialSearch +or http://www.elasticsearch.org/guide/reference/query-dsl/geo-bounding-box-filter.html. + + +Geospatial Assumptions +====================== + +``Points`` +---------- + +Haystack prefers to work with ``Point`` objects, which are located in +``django.contrib.gis.geos.Point`` but conviently importable out of +``haystack.utils.geo.Point``. + +``Point`` objects use **LONGITUDE, LATITUDE** for their construction, regardless +if you use the parameters to instantiate them or WKT_/``GEOSGeometry``. + +.. _WKT: http://en.wikipedia.org/wiki/Well-known_text + +Examples:: + + # Using positional arguments. + from haystack.utils.geo import Point + pnt = Point(-95.23592948913574, 38.97127105172941) + + # Using WKT. + from django.contrib.gis.geos import GEOSGeometry + pnt = GEOSGeometry('POINT(-95.23592948913574 38.97127105172941)') + +They are preferred over just providing ``latitude, longitude`` because they are +more intelligent, have a spatial reference system attached & are more consistent +with GeoDjango's use. + + +``Distance`` +------------ + +Haystack also uses the ``D`` (or ``Distance``) objects from GeoDjango, +implemented in ``django.contrib.gis.measure.Distance`` but conveniently +importable out of ``haystack.utils.geo.D`` (or ``haystack.utils.geo.Distance``). + +``Distance`` objects accept a very flexible set of measurements during +instantiaton and can convert amongst them freely. This is important, because +the engines rely on measurements being in kilometers but you're free to use +whatever units you want. + +Examples:: + + from haystack.utils.geo import D + + # Start at 5 miles. + imperial_d = D(mi=5) + + # Convert to fathoms... + fathom_d = imperial_d.fathom + + # Now to kilometers... + km_d = imperial_d.km + + # And back to miles. + mi = imperial_d.mi + +They are preferred over just providing a raw distance because they are +more intelligent, have a well-defined unit system attached & are consistent +with GeoDjango's use. + + +``WGS-84`` +---------- + +All engines assume WGS-84 (SRID 4326). At the time of writing, there does **not** +appear to be a way to switch this. Haystack will transform all points into this +coordinate system for you. + + +Indexing +======== + +Indexing is relatively simple. Simply add a ``LocationField`` (or several) +onto your ``SearchIndex`` class(es) & provide them a ``Point`` object. For +example:: + + from haystack import indexes + from shops.models import Shop + + + class ShopIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + # ... the usual, then... + location = indexes.LocationField(model_attr='coordinates') + + def get_model(self): + return Shop + +If you must manually prepare the data, you have to do something slightly less +convenient, returning a string-ified version of the coordinates in WGS-84 as +``lat,long``:: + + from haystack import indexes + from shops.models import Shop + + + class ShopIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + # ... the usual, then... 
+ location = indexes.LocationField() + + def get_model(self): + return Shop + + def prepare_location(self, obj): + # If you're just storing the floats... + return "%s,%s" % (obj.latitude, obj.longitude) + +Alternatively, you could build a method/property onto the ``Shop`` model that +returns a ``Point`` based on those coordinates:: + + # shops/models.py + from django.contrib.gis.geos import Point + from django.db import models + + + class Shop(models.Model): + # ... the usual, then... + latitude = models.FloatField() + longitude = models.FloatField() + + # Usual methods, then... + def get_location(self): + # Remember, longitude FIRST! + return Point(self.longitude, self.latitude) + + + # shops/search_indexes.py + from haystack import indexes + from shops.models import Shop + + + class ShopIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + location = indexes.LocationField(model_attr='get_location') + + def get_model(self): + return Shop + + +Querying +======== + +There are two types of geospatial queries you can run, ``within`` & ``dwithin``. +Like their GeoDjango counterparts (within_ & dwithin_), these methods focus on +finding results within an area. + +.. _within: https://docs.djangoproject.com/en/dev/ref/contrib/gis/geoquerysets/#within +.. _dwithin: https://docs.djangoproject.com/en/dev/ref/contrib/gis/geoquerysets/#dwithin + + +``within`` +---------- + +.. method:: SearchQuerySet.within(self, field, point_1, point_2) + +``within`` is a bounding box comparison. A bounding box is a rectangular area +within which to search. It's composed of a bottom-left point & a top-right +point. It is faster but slighty sloppier than its counterpart. + +Examples:: + + from haystack.query import SearchQuerySet + from haystack.utils.geo import Point + + downtown_bottom_left = Point(-95.23947, 38.9637903) + downtown_top_right = Point(-95.23362278938293, 38.973081081164715) + + # 'location' is the fieldname from our ``SearchIndex``... + + # Do the bounding box query. + sqs = SearchQuerySet().within('location', downtown_bottom_left, downtown_top_right) + + # Can be chained with other Haystack calls. + sqs = SearchQuerySet().auto_query('coffee').within('location', downtown_bottom_left, downtown_top_right).order_by('-popularity') + +.. note:: + + In GeoDjango, assuming the ``Shop`` model had been properly geo-ified, this + would have been implemented as:: + + from shops.models import Shop + Shop.objects.filter(location__within=(downtown_bottom_left, downtown_top_right)) + + Haystack's form differs because it yielded a cleaner implementation, was + no more typing than the GeoDjango version & tried to maintain the same + terminology/similar signature. + + +``dwithin`` +----------- + +.. method:: SearchQuerySet.dwithin(self, field, point, distance) + +``dwithin`` is a radius-based search. A radius-based search is a circular area +within which to search. It's composed of a center point & a radius (in +kilometers, though Haystack will use the ``D`` object's conversion utilities to +get it there). It is slower than``within`` but very exact & can involve fewer +calculations on your part. + +Examples:: + + from haystack.query import SearchQuerySet + from haystack.utils.geo import Point, D + + ninth_and_mass = Point(-95.23592948913574, 38.96753407043678) + # Within a two miles. + max_dist = D(mi=2) + + # 'location' is the fieldname from our ``SearchIndex``... + + # Do the radius query. 
+ sqs = SearchQuerySet().dwithin('location', ninth_and_mass, max_dist) + + # Can be chained with other Haystack calls. + sqs = SearchQuerySet().auto_query('coffee').dwithin('location', ninth_and_mass, max_dist).order_by('-popularity') + +.. note:: + + In GeoDjango, assuming the ``Shop`` model had been properly geo-ified, this + would have been implemented as:: + + from shops.models import Shop + Shop.objects.filter(location__dwithin=(ninth_and_mass, D(mi=2))) + + Haystack's form differs because it yielded a cleaner implementation, was + no more typing than the GeoDjango version & tried to maintain the same + terminology/similar signature. + + +``distance`` +------------ + +.. method:: SearchQuerySet.distance(self, field, point) + +By default, search results will come back without distance information attached +to them. In the concept of a bounding box, it would be ambiguous what the +distances would be calculated against. And it is more calculation that may not +be necessary. + +So like GeoDjango, Haystack exposes a method to signify that you want to +include these calculated distances on results. + +Examples:: + + from haystack.query import SearchQuerySet + from haystack.utils.geo import Point, D + + ninth_and_mass = Point(-95.23592948913574, 38.96753407043678) + + # On a bounding box... + downtown_bottom_left = Point(-95.23947, 38.9637903) + downtown_top_right = Point(-95.23362278938293, 38.973081081164715) + + sqs = SearchQuerySet().within('location', downtown_bottom_left, downtown_top_right).distance('location', ninth_and_mass) + + # ...Or on a radius query. + sqs = SearchQuerySet().dwithin('location', ninth_and_mass, D(mi=2)).distance('location', ninth_and_mass) + +You can even apply a different field, for instance if you calculate results of +key, well-cached hotspots in town but want distances from the user's current +position:: + + from haystack.query import SearchQuerySet + from haystack.utils.geo import Point, D + + ninth_and_mass = Point(-95.23592948913574, 38.96753407043678) + user_loc = Point(-95.23455619812012, 38.97240128290697) + + sqs = SearchQuerySet().dwithin('location', ninth_and_mass, D(mi=2)).distance('location', user_loc) + +.. note:: + + The astute will notice this is Haystack's biggest departure from GeoDjango. + In GeoDjango, this would have been implemented as:: + + from shops.models import Shop + Shop.objects.filter(location__dwithin=(ninth_and_mass, D(mi=2))).distance(user_loc) + + Note that, by default, the GeoDjango form leaves *out* the field to be + calculating against (though it's possible to override it & specify the + field). + + Haystack's form differs because the same assumptions are difficult to make. + GeoDjango deals with a single model at a time, where Haystack deals with + a broad mix of models. Additionally, accessing ``Model`` information is a + couple hops away, so Haystack favors the explicit (if slightly more typing) + approach. + + +Ordering +======== + +Because you're dealing with search, even with geospatial queries, results still +come back in **RELEVANCE** order. If you want to offer the user ordering +results by distance, there's a simple way to enable this ordering. + +Using the standard Haystack ``order_by`` method, if you specify ``distance`` or +``-distance`` **ONLY**, you'll get geographic ordering. Additionally, you must +have a call to ``.distance()`` somewhere in the chain, otherwise there is no +distance information on the results & nothing to sort by. 
+ +Examples:: + + from haystack.query import SearchQuerySet + from haystack.utils.geo import Point, D + + ninth_and_mass = Point(-95.23592948913574, 38.96753407043678) + downtown_bottom_left = Point(-95.23947, 38.9637903) + downtown_top_right = Point(-95.23362278938293, 38.973081081164715) + + # Non-geo ordering. + sqs = SearchQuerySet().within('location', downtown_bottom_left, downtown_top_right).order_by('title') + sqs = SearchQuerySet().within('location', downtown_bottom_left, downtown_top_right).distance('location', ninth_and_mass).order_by('-created') + + # Geo ordering, closest to farthest. + sqs = SearchQuerySet().within('location', downtown_bottom_left, downtown_top_right).distance('location', ninth_and_mass).order_by('distance') + # Geo ordering, farthest to closest. + sqs = SearchQuerySet().dwithin('location', ninth_and_mass, D(mi=2)).distance('location', ninth_and_mass).order_by('-distance') + +.. note:: + + This call is identical to the GeoDjango usage. + +.. warning:: + + You can not specify both a distance & lexicographic ordering. If you specify + more than just ``distance`` or ``-distance``, Haystack assumes ``distance`` + is a field in the index & tries to sort on it. Example:: + + # May blow up! + sqs = SearchQuerySet().dwithin('location', ninth_and_mass, D(mi=2)).distance('location', ninth_and_mass).order_by('distance', 'title') + + This is a limitation in the engine's implementation. + + If you actually **have** a field called ``distance`` (& aren't using + calculated distance information), Haystack will do the right thing in + these circumstances. + + +Caveats +======= + +In all cases, you may call the ``within/dwithin/distance`` methods as many times +as you like. However, the **LAST** call is the information that will be used. +No combination logic is available, as this is largely a backend limitation. + +Combining calls to both ``within`` & ``dwithin`` may yield unexpected or broken +results. They don't overlap when performing queries, so it may be possible to +construct queries that work. Your Mileage May Vary. diff --git a/docs/templatetags.rst b/docs/templatetags.rst new file mode 100644 index 0000000..71d6e08 --- /dev/null +++ b/docs/templatetags.rst @@ -0,0 +1,68 @@ +.. _ref-templatetags: + +============= +Template Tags +============= + +Haystack comes with a couple common template tags to make using some of its +special features available to templates. + + +``highlight`` +============= + +Takes a block of text and highlights words from a provided query within that +block of text. Optionally accepts arguments to provide the HTML tag to wrap +highlighted word in, a CSS class to use with the tag and a maximum length of +the blurb in characters. + +The defaults are ``span`` for the HTML tag, ``highlighted`` for the CSS class +and 200 characters for the excerpt. + +Syntax:: + + {% highlight with [css_class "class_name"] [html_tag "span"] [max_length 200] %} + +Example:: + + # Highlight summary with default behavior. + {% highlight result.summary with query %} + + # Highlight summary but wrap highlighted words with a div and the + # following CSS class. + {% highlight result.summary with query html_tag "div" css_class "highlight_me_please" %} + + # Highlight summary but only show 40 words. + {% highlight result.summary with query max_length 40 %} + +The highlighter used by this tag can be overridden as needed. See the +:doc:`highlighting` documentation for more information. 
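+
+Note that, as with any custom template tag library, it must be loaded before
+the tag can be used. A minimal usage sketch (assuming a ``result`` object and
+a ``query`` variable are already present in the template context)::
+
+    {% load highlight %}
+
+    {% highlight result.summary with query css_class "highlighted" max_length 200 %}
+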
+ + +``more_like_this`` +================== + +Fetches similar items from the search index to find content that is similar +to the provided model's content. + +.. note:: + + This requires a backend that has More Like This built-in. + +Syntax:: + + {% more_like_this model_instance as varname [for app_label.model_name,app_label.model_name,...] [limit n] %} + +Example:: + + # Pull a full SearchQuerySet (lazy loaded) of similar content. + {% more_like_this entry as related_content %} + + # Pull just the top 5 similar pieces of content. + {% more_like_this entry as related_content limit 5 %} + + # Pull just the top 5 similar entries or comments. + {% more_like_this entry as related_content for "blog.entry,comments.comment" limit 5 %} + +This tag behaves exactly like ``SearchQuerySet.more_like_this``, so all notes in +that regard apply here as well. diff --git a/docs/toc.rst b/docs/toc.rst new file mode 100644 index 0000000..46ed9bb --- /dev/null +++ b/docs/toc.rst @@ -0,0 +1,53 @@ +Table Of Contents +================= + +.. toctree:: + :maxdepth: 2 + + index + tutorial + glossary + views_and_forms + templatetags + management_commands + architecture_overview + backend_support + installing_search_engines + settings + faq + who_uses + other_apps + debugging + + migration_from_1_to_2 + python3 + contributing + + best_practices + highlighting + faceting + autocomplete + boost + signal_processors + multiple_index + rich_content_extraction + spatial + + searchqueryset_api + searchindex_api + inputtypes + searchfield_api + searchresult_api + searchquery_api + searchbackend_api + + running_tests + creating_new_backends + utils + + +Indices and tables +================== + +* :ref:`search` + diff --git a/docs/tutorial.rst b/docs/tutorial.rst new file mode 100644 index 0000000..76b0388 --- /dev/null +++ b/docs/tutorial.rst @@ -0,0 +1,398 @@ +.. _ref-tutorial: + +============================= +Getting Started with Haystack +============================= + +Search is a topic of ever increasing importance. Users increasing rely on search +to separate signal from noise and find what they're looking for quickly. In +addition, search can provide insight into what things are popular (many +searches), what things are difficult to find on the site and ways you can +improve the site better. + +To this end, Haystack tries to make integrating custom search as easy as +possible while being flexible/powerful enough to handle more advanced use cases. + +Haystack is a reusable app (that is, it relies only on its own code and focuses +on providing just search) that plays nicely with both apps you control as well as +third-party apps (such as ``django.contrib.*``) without having to modify the +sources. + +Haystack also does pluggable backends (much like Django's database +layer), so virtually all of the code you write ought to be portable between +whichever search engine you choose. + +.. note:: + + If you hit a stumbling block, there is both a `mailing list`_ and + `#haystack on irc.freenode.net`_ to get help. + +.. note:: + + You can participate in and/or track the development of Haystack by + subscribing to the `development mailing list`_. + +.. _mailing list: http://groups.google.com/group/django-haystack +.. _#haystack on irc.freenode.net: irc://irc.freenode.net/haystack +.. 
_development mailing list: http://groups.google.com/group/django-haystack-dev + +This tutorial assumes that you have a basic familiarity with the various major +parts of Django (models/forms/views/settings/URLconfs) and tailored to the +typical use case. There are shortcuts available as well as hooks for much +more advanced setups, but those will not be covered here. + +For example purposes, we'll be adding search functionality to a simple +note-taking application. Here is ``myapp/models.py``:: + + from django.db import models + from django.contrib.auth.models import User + + + class Note(models.Model): + user = models.ForeignKey(User) + pub_date = models.DateTimeField() + title = models.CharField(max_length=200) + body = models.TextField() + + def __unicode__(self): + return self.title + +Finally, before starting with Haystack, you will want to choose a search +backend to get started. There is a quick-start guide to +:doc:`installing_search_engines`, though you may want to defer to each engine's +official instructions. + + +Installation +============= + +Use your favorite Python package manager to install the app from PyPI, e.g. + +Example:: + + pip install django-haystack + + +Configuration +============= + +Add Haystack To ``INSTALLED_APPS`` +---------------------------------- + +As with most Django applications, you should add Haystack to the +``INSTALLED_APPS`` within your settings file (usually ``settings.py``). + +Example:: + + INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.sites', + + # Added. + 'haystack', + + # Then your usual apps... + 'blog', + ] + + +Modify Your ``settings.py`` +--------------------------- + +Within your ``settings.py``, you'll need to add a setting to indicate where your +site configuration file will live and which backend to use, as well as other +settings for that backend. + +``HAYSTACK_CONNECTIONS`` is a required setting and should be at least one of +the following: + +Solr +~~~~ + +Example:: + + HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.solr_backend.SolrEngine', + 'URL': 'http://127.0.0.1:8983/solr' + # ...or for multicore... + # 'URL': 'http://127.0.0.1:8983/solr/mysite', + }, + } + + +Elasticsearch +~~~~~~~~~~~~~ + +Example:: + + HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.elasticsearch_backend.ElasticsearchSearchEngine', + 'URL': 'http://127.0.0.1:9200/', + 'INDEX_NAME': 'haystack', + }, + } + + +Whoosh +~~~~~~ + +Requires setting ``PATH`` to the place on your filesystem where the +Whoosh index should be located. Standard warnings about permissions and keeping +it out of a place your webserver may serve documents out of apply. + +Example:: + + import os + HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.whoosh_backend.WhooshEngine', + 'PATH': os.path.join(os.path.dirname(__file__), 'whoosh_index'), + }, + } + + +Xapian +~~~~~~ + +First, install the Xapian backend (via +http://github.com/notanumber/xapian-haystack/tree/master) per the instructions +included with the backend. + +Requires setting ``PATH`` to the place on your filesystem where the +Xapian index should be located. Standard warnings about permissions and keeping +it out of a place your webserver may serve documents out of apply. 
+ +Example:: + + import os + HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'xapian_backend.XapianEngine', + 'PATH': os.path.join(os.path.dirname(__file__), 'xapian_index'), + }, + } + + +Simple +~~~~~~ + +The ``simple`` backend using very basic matching via the database itself. It's +not recommended for production use but it will return results. + +.. warning:: + + This backend does *NOT* work like the other backends do. Data preparation + does nothing & advanced filtering calls do not work. You really probably + don't want this unless you're in an environment where you just want to + silence Haystack. + +Example:: + + HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.simple_backend.SimpleEngine', + }, + } + + +Handling Data +============= + +Creating ``SearchIndexes`` +-------------------------- + +``SearchIndex`` objects are the way Haystack determines what data should be +placed in the search index and handles the flow of data in. You can think of +them as being similar to Django ``Models`` or ``Forms`` in that they are +field-based and manipulate/store data. + +You generally create a unique ``SearchIndex`` for each type of ``Model`` you +wish to index, though you can reuse the same ``SearchIndex`` between different +models if you take care in doing so and your field names are very standardized. + +To build a ``SearchIndex``, all that's necessary is to subclass both +``indexes.SearchIndex`` & ``indexes.Indexable``, +define the fields you want to store data with and define a ``get_model`` method. + +We'll create the following ``NoteIndex`` to correspond to our ``Note`` +model. This code generally goes in a ``search_indexes.py`` file within the app +it applies to, though that is not required. This allows +Haystack to automatically pick it up. The ``NoteIndex`` should look like:: + + import datetime + from haystack import indexes + from myapp.models import Note + + + class NoteIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + author = indexes.CharField(model_attr='user') + pub_date = indexes.DateTimeField(model_attr='pub_date') + + def get_model(self): + return Note + + def index_queryset(self, using=None): + """Used when the entire index for model is updated.""" + return self.get_model().objects.filter(pub_date__lte=datetime.datetime.now()) + +Every ``SearchIndex`` requires there be one (and only one) field with +``document=True``. This indicates to both Haystack and the search engine about +which field is the primary field for searching within. + +.. warning:: + + When you choose a ``document=True`` field, it should be consistently named + across all of your ``SearchIndex`` classes to avoid confusing the backend. + The convention is to name this field ``text``. + + There is nothing special about the ``text`` field name used in all of the + examples. It could be anything; you could call it ``pink_polka_dot`` and + it won't matter. It's simply a convention to call it ``text``. + +Additionally, we're providing ``use_template=True`` on the ``text`` field. This +allows us to use a data template (rather than error-prone concatenation) to +build the document the search engine will index. You’ll need to +create a new template inside your template directory called +``search/indexes/myapp/note_text.txt`` and place the following inside:: + + {{ object.title }} + {{ object.user.get_full_name }} + {{ object.body }} + +In addition, we added several other fields (``author`` and ``pub_date``). 
These +are useful when you want to provide additional filtering options. Haystack comes +with a variety of ``SearchField`` classes to handle most types of data. + +A common theme is to allow admin users to add future content but have it not +display on the site until that future date is reached. We specify a custom +``index_queryset`` method to prevent those future items from being indexed. + +.. _Django admin site: http://docs.djangoproject.com/en/dev/ref/contrib/admin/ + + +Setting Up The Views +==================== + +Add The ``SearchView`` To Your URLconf +-------------------------------------- + +Within your URLconf, add the following line:: + + (r'^search/', include('haystack.urls')), + +This will pull in the default URLconf for Haystack. It consists of a single +URLconf that points to a ``SearchView`` instance. You can change this class's +behavior by passing it any of several keyword arguments or override it entirely +with your own view. + + +Search Template +--------------- + +Your search template (``search/search.html`` for the default case) will likely +be very simple. The following is enough to get going (your template/block names +will likely differ):: + + {% extends 'base.html' %} + + {% block content %} +

+    <h2>Search</h2>
+
+    <form method="get" action=".">
+        <table>
+            {{ form.as_table }}
+            <tr>
+                <td>&nbsp;</td>
+                <td>
+                    <input type="submit" value="Search">
+                </td>
+            </tr>
+        </table>
+
+        {% if query %}
+            <h3>Results</h3>
+
+            {% for result in page.object_list %}
+                <p>
+                    <a href="{{ result.object.get_absolute_url }}">{{ result.object.title }}</a>
+                </p>
+            {% empty %}
+                <p>No results found.</p>
+            {% endfor %}
+
+            {% if page.has_previous or page.has_next %}
+                <div>
+                    {% if page.has_previous %}<a href="?q={{ query }}&amp;page={{ page.previous_page_number }}">{% endif %}« Previous{% if page.has_previous %}</a>{% endif %}
+                    |
+                    {% if page.has_next %}<a href="?q={{ query }}&amp;page={{ page.next_page_number }}">{% endif %}Next »{% if page.has_next %}</a>{% endif %}
+                </div>
+            {% endif %}
+        {% else %}
+            {# Show some example queries to run, maybe query syntax, something else? #}
+        {% endif %}
+ {% endblock %} + +Note that the ``page.object_list`` is actually a list of ``SearchResult`` +objects instead of individual models. These objects have all the data returned +from that record within the search index as well as score. They can also +directly access the model for the result via ``{{ result.object }}``. So the +``{{ result.object.title }}`` uses the actual ``Note`` object in the database +and accesses its ``title`` field. + + +Reindex +------- + +The final step, now that you have everything setup, is to put your data in +from your database into the search index. Haystack ships with a management +command to make this process easy. + +.. note:: + + If you're using the Solr backend, you have an extra step. Solr's + configuration is XML-based, so you'll need to manually regenerate the + schema. You should run + ``./manage.py build_solr_schema`` first, drop the XML output in your + Solr's ``schema.xml`` file and restart your Solr server. + +Simply run ``./manage.py rebuild_index``. You'll get some totals of how many +models were processed and placed in the index. + +.. note:: + + Using the standard ``SearchIndex``, your search index content is only + updated whenever you run either ``./manage.py update_index`` or start + afresh with ``./manage.py rebuild_index``. + + You should cron up a ``./manage.py update_index`` job at whatever interval + works best for your site (using ``--age=`` reduces the number of + things to update). + + Alternatively, if you have low traffic and/or your search engine can handle + it, the ``RealtimeSignalProcessor`` automatically handles updates/deletes + for you. + + +Complete! +========= + +You can now visit the search section of your site, enter a search query and +receive search results back for the query! Congratulations! + + +What's Next? +============ + +This tutorial just scratches the surface of what Haystack provides. The +``SearchQuerySet`` is the underpinning of all search in Haystack and provides +a powerful, ``QuerySet``-like API (see :ref:`ref-searchqueryset-api`). You can +use much more complicated ``SearchForms``/``SearchViews`` to give users a better +UI (see :ref:`ref-views-and_forms`). And the :ref:`ref-best-practices` provides +insight into non-obvious or advanced usages of Haystack. diff --git a/docs/utils.rst b/docs/utils.rst new file mode 100644 index 0000000..7d42fc5 --- /dev/null +++ b/docs/utils.rst @@ -0,0 +1,18 @@ +.. _ref-utils: + +========= +Utilities +========= + +Included here are some of the general use bits included with Haystack. + + +``get_identifier`` +------------------ + +.. function:: get_identifier(obj_or_string) + +Gets an unique identifier for the object or a string representing the +object. + +If not overridden, uses ``..``. diff --git a/docs/views_and_forms.rst b/docs/views_and_forms.rst new file mode 100644 index 0000000..b4c7697 --- /dev/null +++ b/docs/views_and_forms.rst @@ -0,0 +1,408 @@ +.. _ref-views-and_forms: + +============= +Views & Forms +============= + +.. note:: + + As of version 2.4 the views in ``haystack.views.SearchView`` are deprecated in + favor of the new generic views in ``haystack.generic_views.SearchView`` + which use the standard Django `class-based views`_ which are available in + every version of Django which is supported by Haystack. + +.. _class-based views: https://docs.djangoproject.com/en/1.7/topics/class-based-views/ + +Haystack comes with some default, simple views & forms as well as some +django-style views to help you get started and to cover the common cases. 
+Included is a way to provide: + + * Basic, query-only search. + * Search by models. + * Search with basic highlighted results. + * Faceted search. + * Search by models with basic highlighted results. + +Most processing is done by the forms provided by Haystack via the ``search`` +method. As a result, all but the faceted types (see :doc:`faceting`) use the +standard ``SearchView``. + +There is very little coupling between the forms & the views (other than relying +on the existence of a ``search`` method on the form), so you may interchangeably +use forms and/or views anywhere within your own code. + +Forms +===== + +.. currentmodule:: haystack.forms + +``SearchForm`` +-------------- + +The most basic of the form types, this form consists of a single field, the +``q`` field (for query). Upon searching, the form will take the cleaned contents +of the ``q`` field and perform an ``auto_query`` on either the custom +``SearchQuerySet`` you provide or off a default ``SearchQuerySet``. + +To customize the ``SearchQuerySet`` the form will use, pass it a +``searchqueryset`` parameter to the constructor with the ``SearchQuerySet`` +you'd like to use. If using this form in conjunction with a ``SearchView``, +the form will receive whatever ``SearchQuerySet`` you provide to the view with +no additional work needed. + +The ``SearchForm`` also accepts a ``load_all`` parameter (``True`` or +``False``), which determines how the database is queried when iterating through +the results. This also is received automatically from the ``SearchView``. + +All other forms in Haystack inherit (either directly or indirectly) from this +form. + +``HighlightedSearchForm`` +------------------------- + +Identical to the ``SearchForm`` except that it tags the ``highlight`` method on +to the end of the ``SearchQuerySet`` to enable highlighted results. + +``ModelSearchForm`` +------------------- + +This form adds new fields to form. It iterates through all registered models for +the current ``SearchSite`` and provides a checkbox for each one. If no models +are selected, all types will show up in the results. + +``HighlightedModelSearchForm`` +------------------------------ + +Identical to the ``ModelSearchForm`` except that it tags the ``highlight`` +method on to the end of the ``SearchQuerySet`` to enable highlighted results on +the selected models. + +``FacetedSearchForm`` +--------------------- + +Identical to the ``SearchForm`` except that it adds a hidden ``selected_facets`` +field onto the form, allowing the form to narrow the results based on the facets +chosen by the user. + +Creating Your Own Form +---------------------- + +The simplest way to go about creating your own form is to inherit from +``SearchForm`` (or the desired parent) and extend the ``search`` method. By +doing this, you save yourself most of the work of handling data correctly and +stay API compatible with the ``SearchView``. + +For example, let's say you're providing search with a user-selectable date range +associated with it. You might create a form that looked as follows:: + + from django import forms + from haystack.forms import SearchForm + + + class DateRangeSearchForm(SearchForm): + start_date = forms.DateField(required=False) + end_date = forms.DateField(required=False) + + def search(self): + # First, store the SearchQuerySet received from other processing. + sqs = super(DateRangeSearchForm, self).search() + + if not self.is_valid(): + return self.no_query_found() + + # Check to see if a start_date was chosen. 
+ if self.cleaned_data['start_date']: + sqs = sqs.filter(pub_date__gte=self.cleaned_data['start_date']) + + # Check to see if an end_date was chosen. + if self.cleaned_data['end_date']: + sqs = sqs.filter(pub_date__lte=self.cleaned_data['end_date']) + + return sqs + +This form adds two new fields for (optionally) choosing the start and end dates. +Within the ``search`` method, we grab the results from the parent form's +processing. Then, if a user has selected a start and/or end date, we apply that +filtering. Finally, we simply return the ``SearchQuerySet``. + +Views +===== + +.. currentmodule:: haystack.views + +.. note:: + + As of version 2.4 the views in ``haystack.views.SearchView`` are deprecated in + favor of the new generic views in ``haystack.generic_views.SearchView`` + which use the standard Django `class-based views`_ which are available in + every version of Django which is supported by Haystack. + +.. _class-based views: https://docs.djangoproject.com/en/1.7/topics/class-based-views/ + +New Django Class Based Views +---------------------------- + + .. versionadded:: 2.4.0 + +The views in ``haystack.generic_views.SearchView`` inherit from Django’s standard +`FormView `_. +The example views can be customized like any other Django class-based view as +demonstrated in this example which filters the search results in ``get_queryset``:: + + # views.py + from datetime import date + + from haystack.generic_views import SearchView + + class MySearchView(SearchView): + """My custom search view.""" + + def get_queryset(self): + queryset = super(MySearchView, self).get_queryset() + # further filter queryset based on some set of criteria + return queryset.filter(pub_date__gte=date(2015, 1, 1)) + + def get_context_data(self, *args, **kwargs): + context = super(MySearchView, self).get_context_data(*args, **kwargs) + # do something + return context + + # urls.py + + urlpatterns = patterns('', + url(r'^/search/?$', MySearchView.as_view(), name='search_view'), + ) + + +Upgrading +~~~~~~~~~ + +Upgrading from basic usage of the old-style views to new-style views is usually as simple as: + +#. Create new views under ``views.py`` subclassing ``haystack.generic_views.SearchView`` + or ``haystack.generic_views.FacetedSearchView`` +#. Move all parameters of your old-style views from your ``urls.py`` to attributes on + your new views. This will require renaming ``searchqueryset`` to ``queryset`` and + ``template`` to ``template_name`` +#. Review your templates and replace the ``page`` variable with ``page_object`` + +Here's an example:: + + ### old-style views... + # urls.py + + sqs = SearchQuerySet().filter(author='john') + + urlpatterns = patterns('haystack.views', + url(r'^$', SearchView( + template='my/special/path/john_search.html', + searchqueryset=sqs, + form_class=SearchForm + ), name='haystack_search'), + ) + + ### new-style views... + # views.py + + class JohnSearchView(SearchView): + template_name = 'my/special/path/john_search.html' + queryset = SearchQuerySet().filter(author='john') + form_class = SearchForm + + # urls.py + from myapp.views import JohnSearchView + + urlpatterns = patterns('', + url(r'^$', JohnSearchView.as_view(), name='haystack_search'), + ) + + +If your views overrode methods on the old-style SearchView, you will need to +refactor those methods to the equivalents on Django's generic views. 
For example, +if you previously used ``extra_context()`` to add additional template variables or +preprocess the values returned by Haystack, that code would move to ``get_context_data`` + ++-----------------------+-------------------------------------------+ +| Old Method | New Method | ++=======================+===========================================+ +| ``extra_context()`` | `get_context_data()`_ | ++-----------------------+-------------------------------------------+ +| ``create_response()`` | `dispatch()`_ or ``get()`` and ``post()`` | ++-----------------------+-------------------------------------------+ +| ``get_query()`` | `get_queryset()`_ | ++-----------------------+-------------------------------------------+ + +.. _get_context_data(): https://docs.djangoproject.com/en/1.7/ref/class-based-views/mixins-simple/#django.views.generic.base.ContextMixin.get_context_data +.. _dispatch(): https://docs.djangoproject.com/en/1.7/ref/class-based-views/base/#django.views.generic.base.View.dispatch +.. _get_queryset(): https://docs.djangoproject.com/en/1.7/ref/class-based-views/mixins-multiple-object/#django.views.generic.list.MultipleObjectMixin.get_queryset + + +Old-Style Views +--------------- + + .. deprecated:: 2.4.0 + +Haystack comes bundled with three views, the class-based views (``SearchView`` & +``FacetedSearchView``) and a traditional functional view (``basic_search``). + +The class-based views provide for easy extension should you need to alter the +way a view works. Except in the case of faceting (again, see :doc:`faceting`), +the ``SearchView`` works interchangeably with all other forms provided by +Haystack. + +The functional view provides an example of how Haystack can be used in more +traditional settings or as an example of how to write a more complex custom +view. It is also thread-safe. + +``SearchView(template=None, load_all=True, form_class=None, searchqueryset=None, context_class=RequestContext, results_per_page=None)`` +--------------------------------------------------------------------------------------------------------------------------------------- + +The ``SearchView`` is designed to be easy/flexible enough to override common +changes as well as being internally abstracted so that only altering a specific +portion of the code should be easy to do. + +Without touching any of the internals of the ``SearchView``, you can modify +which template is used, which form class should be instantiated to search with, +what ``SearchQuerySet`` to use in the event you wish to pre-filter the results. +what ``Context``-style object to use in the response and the ``load_all`` +performance optimization to reduce hits on the database. These options can (and +generally should) be overridden at the URLconf level. For example, to have a +custom search limited to the 'John' author, displaying all models to search by +and specifying a custom template (``my/special/path/john_search.html``), your +URLconf should look something like:: + + from django.conf.urls.defaults import * + from haystack.forms import ModelSearchForm + from haystack.query import SearchQuerySet + from haystack.views import SearchView + + sqs = SearchQuerySet().filter(author='john') + + # Without threading... + urlpatterns = patterns('haystack.views', + url(r'^$', SearchView( + template='my/special/path/john_search.html', + searchqueryset=sqs, + form_class=SearchForm + ), name='haystack_search'), + ) + + # With threading... 
+ from haystack.views import SearchView, search_view_factory + + urlpatterns = patterns('haystack.views', + url(r'^$', search_view_factory( + view_class=SearchView, + template='my/special/path/john_search.html', + searchqueryset=sqs, + form_class=ModelSearchForm + ), name='haystack_search'), + ) + +.. warning:: + + The standard ``SearchView`` is not thread-safe. Use the + ``search_view_factory`` function, which returns thread-safe instances of + ``SearchView``. + +By default, if you don't specify a ``form_class``, the view will use the +``haystack.forms.ModelSearchForm`` form. + +Beyond this customizations, you can create your own ``SearchView`` and +extend/override the following methods to change the functionality. + +``__call__(self, request)`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Generates the actual response to the search. + +Relies on internal, overridable methods to construct the response. You generally +should avoid altering this method unless you need to change the flow of the +methods or to add a new method into the processing. + +``build_form(self, form_kwargs=None)`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instantiates the form the class should use to process the search query. + +Optionally accepts a dictionary of parameters that are passed on to the +form's ``__init__``. You can use this to lightly customize the form. + +You should override this if you write a custom form that needs special +parameters for instantiation. + +``get_query(self)`` +~~~~~~~~~~~~~~~~~~~ + +Returns the query provided by the user. + +Returns an empty string if the query is invalid. This pulls the cleaned query +from the form, via the ``q`` field, for use elsewhere within the ``SearchView``. +This is used to populate the ``query`` context variable. + +``get_results(self)`` +~~~~~~~~~~~~~~~~~~~~~ + +Fetches the results via the form. + +Returns an empty list if there's no query to search with. This method relies on +the form to do the heavy lifting as much as possible. + +``build_page(self)`` +~~~~~~~~~~~~~~~~~~~~ + +Paginates the results appropriately. + +In case someone does not want to use Django's built-in pagination, it +should be a simple matter to override this method to do what they would +like. + +``extra_context(self)`` +~~~~~~~~~~~~~~~~~~~~~~~ + +Allows the addition of more context variables as needed. Must return a +dictionary whose contents will add to or overwrite the other variables in the +context. + +``create_response(self)`` +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Generates the actual HttpResponse to send back to the user. It builds the page, +creates the context and renders the response for all the aforementioned +processing. + + +``basic_search(request, template='search/search.html', load_all=True, form_class=ModelSearchForm, searchqueryset=None, context_class=RequestContext, extra_context=None, results_per_page=None)`` +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +The ``basic_search`` tries to provide most of the same functionality as the +class-based views but resembles a more traditional generic view. It's both a +working view if you prefer not to use the class-based views as well as a good +starting point for writing highly custom views. + +Since it is all one function, the only means of extension are passing in +kwargs, similar to the way generic views work. 
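+
+As a sketch of that kwargs-based customization (the URL pattern and the
+specific values shown are illustrative only)::
+
+    from django.conf.urls.defaults import *
+    from haystack.forms import ModelSearchForm
+
+    urlpatterns = patterns('haystack.views',
+        url(r'^$', 'basic_search', {
+            'template': 'search/search.html',
+            'form_class': ModelSearchForm,
+            'results_per_page': 25,
+        }, name='haystack_basic_search'),
+    )
+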
+ + +Creating Your Own View +---------------------- + +As with the forms, inheritance is likely your best bet. In this case, the +``FacetedSearchView`` is a perfect example of how to extend the existing +``SearchView``. The complete code for the ``FacetedSearchView`` looks like:: + + class FacetedSearchView(SearchView): + def extra_context(self): + extra = super(FacetedSearchView, self).extra_context() + + if self.results == []: + extra['facets'] = self.form.search().facet_counts() + else: + extra['facets'] = self.results.facet_counts() + + return extra + +It updates the name of the class (generally for documentation purposes) and +adds the facets from the ``SearchQuerySet`` to the context as the ``facets`` +variable. As with the custom form example above, it relies on the parent class +to handle most of the processing and extends that only where needed. diff --git a/docs/who_uses.rst b/docs/who_uses.rst new file mode 100644 index 0000000..9419213 --- /dev/null +++ b/docs/who_uses.rst @@ -0,0 +1,357 @@ +.. _ref-who-uses: + +Sites Using Haystack +==================== + +The following sites are a partial list of people using Haystack. I'm always +interested in adding more sites, so please find me (``daniellindsley``) via +IRC or the mailing list thread. + + +LJWorld/Lawrence.com/KUSports +----------------------------- + +For all things search-related. + +Using: Solr + +* http://www2.ljworld.com/search/ +* http://www2.ljworld.com/search/vertical/news.story/ +* http://www2.ljworld.com/marketplace/ +* http://www.lawrence.com/search/ +* http://www.kusports.com/search/ + + +AltWeeklies +----------- + +Providing an API to story aggregation. + +Using: Whoosh + +* http://www.northcoastjournal.com/altweeklies/documentation/ + + +Trapeze +------- + +Various projects. + +Using: Xapian + +* http://www.trapeze.com/ +* http://www.windmobile.ca/ +* http://www.bonefishgrill.com/ +* http://www.canadiantire.ca/ (Portions of) + + +Vickerey.com +------------ + +For (really well done) search & faceting. + +Using: Solr + +* http://store.vickerey.com/products/search/ + + +Eldarion +-------- + +Various projects. + +Using: Solr + +* http://eldarion.com/ + + +Sunlight Labs +------------- + +For general search. + +Using: Whoosh & Solr + +* http://sunlightlabs.com/ +* http://subsidyscope.com/ + + +NASA +---- + +For general search. + +Using: Solr + +* An internal site called SMD Spacebook 1.1. +* http://science.nasa.gov/ + + +AllForLocal +----------- + +For general search. + +* http://www.allforlocal.com/ + + +HUGE +---- + +Various projects. + +Using: Solr + +* http://hugeinc.com/ +* http://houselogic.com/ + + +Brick Design +------------ + +For search on Explore. + +Using: Solr + +* http://bricksf.com/ +* http://explore.org/ + + +Winding Road +------------ + +For general search. + +Using: Solr + +* http://www.windingroad.com/ + + +Reddit +------ + +For Reddit Gifts. + +Using: Whoosh + +* http://redditgifts.com/ + + +Pegasus News +------------ + +For general search. + +Using: Xapian + +* http://www.pegasusnews.com/ + + +Rampframe +--------- + +For general search. + +Using: Xapian + +* http://www.rampframe.com/ + + +Forkinit +-------- + +For general search, model-specific search and suggestions via MLT. + +Using: Solr + +* http://forkinit.com/ + + +Structured Abstraction +---------------------- + +For general search. + +Using: Xapian + +* http://www.structuredabstraction.com/ +* http://www.delivergood.org/ + + +CustomMade +---------- + +For general search. 
+ +Using: Solr + +* http://www.custommade.com/ + + +University of the Andes, Dept. of Political Science +--------------------------------------------------- + +For general search & section-specific search. Developed by Monoku. + +Using: Solr + +* http://www.congresovisible.org/ +* http://www.monoku.com/ + + +Christchurch Art Gallery +------------------------ + +For general search & section-specific search. + +Using: Solr + +* http://christchurchartgallery.org.nz/search/ +* http://christchurchartgallery.org.nz/collection/browse/ + + +DevCheatSheet.com +----------------- + +For general search. + +Using: Xapian + +* http://devcheatsheet.com/ + + +TodasLasRecetas +--------------- + +For search, faceting & More Like This. + +Using: Solr + +* http://www.todaslasrecetas.es/receta/s/?q=langostinos +* http://www.todaslasrecetas.es/receta/9526/brochetas-de-langostinos + + +AstroBin +-------- + +For general search. + +Using: Solr + +* http://www.astrobin.com/ + + +European Paper Company +---------------------- + +For general search. + +Using: ??? + +* http://europeanpaper.com/ + + +mtn-op +------ + +For general search. + +Using: ??? + +* http://mountain-op.com/ + + +Crate +----- + +Crate is a PyPI mirror/replacement. It's using Haystack to power all search & +faceted navigation on the site. + +Using: Elasticsearch + +* https://crate.io/ + + +Pix Populi +---------- + +Pix Populi is a popular French photo sharing site. + +Using: Solr + +* http://www.pix-populi.fr/ + + +LocalWiki +---------- + +LocalWiki is a tool for collaborating in local, geographic communities. +It's using Haystack to power search on every LocalWiki instance. + +Using: Solr + +* http://localwiki.org/ + + +Pitchup +------- + +For faceting, geo and autocomplete. + +Using: ??? + +* http://www.pitchup.com/search/ + + +Gidsy +----- + +Gidsy makes it easy for anyone to organize and find exciting things +to do everywhere in the world. + +For activity search, area pages, forums and private messages. + +Using: Elasticsearch + +* https://gidsy.com/ +* https://gidsy.com/search/ +* https://gidsy.com/forum/ + + +GroundCity +---------- + +Groundcity is a Romanian dynamic real estate site. + +For real estate, forums and comments. + +Using: Whoosh + +* http://groundcity.ro/cautare/ + + +Docket Alarm +------------ + +Docket Alarm allows people to search court dockets across +the country. With it, you can search court dockets in the International Trade +Commission (ITC), the Patent Trial and Appeal Board (PTAB) and All Federal +Courts. + +Using: Elasticsearch + +* https://www.docketalarm.com/search/ITC +* https://www.docketalarm.com/search/PTAB +* https://www.docketalarm.com/search/dockets + + +Educreations +------------- + +Educreations makes it easy for anyone to teach what they know and learn +what they don't with a recordable whiteboard. Haystack is used to +provide search across users and lessons. + +Using: Solr + +* http://www.educreations.com/browse/ diff --git a/haystack/__init__.py b/haystack/__init__.py new file mode 100644 index 0000000..a02c845 --- /dev/null +++ b/haystack/__init__.py @@ -0,0 +1,71 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging + +from django.conf import settings +from django.core.exceptions import ImproperlyConfigured + +from haystack.constants import DEFAULT_ALIAS +from haystack import signals +from haystack.utils import loading + + +__author__ = 'Daniel Lindsley' +__version__ = (2, 4, 0) + + +# Setup default logging. 
+log = logging.getLogger('haystack') +stream = logging.StreamHandler() +stream.setLevel(logging.INFO) +log.addHandler(stream) + + +# Help people clean up from 1.X. +if hasattr(settings, 'HAYSTACK_SITECONF'): + raise ImproperlyConfigured('The HAYSTACK_SITECONF setting is no longer used & can be removed.') +if hasattr(settings, 'HAYSTACK_SEARCH_ENGINE'): + raise ImproperlyConfigured('The HAYSTACK_SEARCH_ENGINE setting has been replaced with HAYSTACK_CONNECTIONS.') +if hasattr(settings, 'HAYSTACK_ENABLE_REGISTRATIONS'): + raise ImproperlyConfigured('The HAYSTACK_ENABLE_REGISTRATIONS setting is no longer used & can be removed.') +if hasattr(settings, 'HAYSTACK_INCLUDE_SPELLING'): + raise ImproperlyConfigured('The HAYSTACK_INCLUDE_SPELLING setting is now a per-backend setting & belongs in HAYSTACK_CONNECTIONS.') + + +# Check the 2.X+ bits. +if not hasattr(settings, 'HAYSTACK_CONNECTIONS'): + raise ImproperlyConfigured('The HAYSTACK_CONNECTIONS setting is required.') +if DEFAULT_ALIAS not in settings.HAYSTACK_CONNECTIONS: + raise ImproperlyConfigured("The default alias '%s' must be included in the HAYSTACK_CONNECTIONS setting." % DEFAULT_ALIAS) + +# Load the connections. +connections = loading.ConnectionHandler(settings.HAYSTACK_CONNECTIONS) + +# Load the router(s). +connection_router = loading.ConnectionRouter() + +if hasattr(settings, 'HAYSTACK_ROUTERS'): + if not isinstance(settings.HAYSTACK_ROUTERS, (list, tuple)): + raise ImproperlyConfigured("The HAYSTACK_ROUTERS setting must be either a list or tuple.") + + connection_router = loading.ConnectionRouter(settings.HAYSTACK_ROUTERS) + +# Setup the signal processor. +signal_processor_path = getattr(settings, 'HAYSTACK_SIGNAL_PROCESSOR', 'haystack.signals.BaseSignalProcessor') +signal_processor_class = loading.import_class(signal_processor_path) +signal_processor = signal_processor_class(connections, connection_router) + + +# Per-request, reset the ghetto query log. +# Probably not extraordinarily thread-safe but should only matter when +# DEBUG = True. +def reset_search_queries(**kwargs): + for conn in connections.all(): + conn.reset_queries() + + +if settings.DEBUG: + from django.core import signals as django_signals + django_signals.request_started.connect(reset_search_queries) diff --git a/haystack/admin.py b/haystack/admin.py new file mode 100644 index 0000000..806991a --- /dev/null +++ b/haystack/admin.py @@ -0,0 +1,163 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django import template +from django.contrib.admin.options import csrf_protect_m, ModelAdmin +from django.contrib.admin.views.main import ChangeList, SEARCH_VAR +from django.core.exceptions import PermissionDenied +from django.core.paginator import InvalidPage, Paginator +from django.shortcuts import render_to_response +from django.utils.translation import ungettext + +from haystack import connections +from haystack.query import SearchQuerySet +from haystack.utils import get_model_ct_tuple + +try: + from django.utils.encoding import force_text +except ImportError: + from django.utils.encoding import force_unicode as force_text + + +def list_max_show_all(changelist): + """ + Returns the maximum amount of results a changelist can have for the + "Show all" link to be displayed in a manner compatible with both Django + 1.4 and 1.3. See Django ticket #15997 for details. 
+ """ + try: + # This import is available in Django 1.3 and below + from django.contrib.admin.views.main import MAX_SHOW_ALL_ALLOWED + return MAX_SHOW_ALL_ALLOWED + except ImportError: + return changelist.list_max_show_all + + +class SearchChangeList(ChangeList): + def __init__(self, **kwargs): + self.haystack_connection = kwargs.pop('haystack_connection', 'default') + super(SearchChangeList, self).__init__(**kwargs) + + def get_results(self, request): + if not SEARCH_VAR in request.GET: + return super(SearchChangeList, self).get_results(request) + + # Note that pagination is 0-based, not 1-based. + sqs = SearchQuerySet(self.haystack_connection).models(self.model).auto_query(request.GET[SEARCH_VAR]).load_all() + + paginator = Paginator(sqs, self.list_per_page) + # Get the number of objects, with admin filters applied. + result_count = paginator.count + full_result_count = SearchQuerySet(self.haystack_connection).models(self.model).all().count() + + can_show_all = result_count <= list_max_show_all(self) + multi_page = result_count > self.list_per_page + + # Get the list of objects to display on this page. + try: + result_list = paginator.page(self.page_num + 1).object_list + # Grab just the Django models, since that's what everything else is + # expecting. + result_list = [result.object for result in result_list] + except InvalidPage: + result_list = () + + self.result_count = result_count + self.full_result_count = full_result_count + self.result_list = result_list + self.can_show_all = can_show_all + self.multi_page = multi_page + self.paginator = paginator + + +class SearchModelAdminMixin(object): + # haystack connection to use for searching + haystack_connection = 'default' + + @csrf_protect_m + def changelist_view(self, request, extra_context=None): + if not self.has_change_permission(request, None): + raise PermissionDenied + + if not SEARCH_VAR in request.GET: + # Do the usual song and dance. + return super(SearchModelAdminMixin, self).changelist_view(request, extra_context) + + # Do a search of just this model and populate a Changelist with the + # returned bits. + if not self.model in connections[self.haystack_connection].get_unified_index().get_indexed_models(): + # Oops. That model isn't being indexed. Return the usual + # behavior instead. + return super(SearchModelAdminMixin, self).changelist_view(request, extra_context) + + # So. Much. Boilerplate. + # Why copy-paste a few lines when you can copy-paste TONS of lines? + list_display = list(self.list_display) + + kwargs = { + 'haystack_connection': self.haystack_connection, + 'request': request, + 'model': self.model, + 'list_display': list_display, + 'list_display_links': self.list_display_links, + 'list_filter': self.list_filter, + 'date_hierarchy': self.date_hierarchy, + 'search_fields': self.search_fields, + 'list_select_related': self.list_select_related, + 'list_per_page': self.list_per_page, + 'list_editable': self.list_editable, + 'model_admin': self + } + + # Django 1.4 compatibility. + if hasattr(self, 'list_max_show_all'): + kwargs['list_max_show_all'] = self.list_max_show_all + + changelist = SearchChangeList(**kwargs) + formset = changelist.formset = None + media = self.media + + # Build the action form and populate it with available actions. 
+ # Check actions to see if any are available on this changelist + actions = self.get_actions(request) + if actions: + action_form = self.action_form(auto_id=None) + action_form.fields['action'].choices = self.get_action_choices(request) + else: + action_form = None + + selection_note = ungettext('0 of %(count)d selected', + 'of %(count)d selected', len(changelist.result_list)) + selection_note_all = ungettext('%(total_count)s selected', + 'All %(total_count)s selected', changelist.result_count) + + context = { + 'module_name': force_text(self.model._meta.verbose_name_plural), + 'selection_note': selection_note % {'count': len(changelist.result_list)}, + 'selection_note_all': selection_note_all % {'total_count': changelist.result_count}, + 'title': changelist.title, + 'is_popup': changelist.is_popup, + 'cl': changelist, + 'media': media, + 'has_add_permission': self.has_add_permission(request), + # More Django 1.4 compatibility + 'root_path': getattr(self.admin_site, 'root_path', None), + 'app_label': self.model._meta.app_label, + 'action_form': action_form, + 'actions_on_top': self.actions_on_top, + 'actions_on_bottom': self.actions_on_bottom, + 'actions_selection_counter': getattr(self, 'actions_selection_counter', 0), + } + context.update(extra_context or {}) + context_instance = template.RequestContext(request, current_app=self.admin_site.name) + app_name, model_name = get_model_ct_tuple(self.model) + return render_to_response(self.change_list_template or [ + 'admin/%s/%s/change_list.html' % (app_name, model_name), + 'admin/%s/change_list.html' % app_name, + 'admin/change_list.html' + ], context, context_instance=context_instance) + + +class SearchModelAdmin(SearchModelAdminMixin, ModelAdmin): + pass diff --git a/haystack/backends/__init__.py b/haystack/backends/__init__.py new file mode 100644 index 0000000..bcf8554 --- /dev/null +++ b/haystack/backends/__init__.py @@ -0,0 +1,1041 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals +import copy +from copy import deepcopy +from time import time +from django.conf import settings +from django.db.models import Q +from django.db.models.base import ModelBase +from django.utils import six +from django.utils import tree +from haystack.constants import VALID_FILTERS, FILTER_SEPARATOR, DEFAULT_ALIAS +from haystack.exceptions import MoreLikeThisError, FacetingError +from haystack.models import SearchResult +from haystack.utils.loading import UnifiedIndex +from haystack.utils import get_model_ct + +try: + from django.utils.encoding import force_text +except ImportError: + from django.utils.encoding import force_unicode as force_text + + +VALID_GAPS = ['year', 'month', 'day', 'hour', 'minute', 'second'] + + +def log_query(func): + """ + A decorator for pseudo-logging search queries. Used in the ``SearchBackend`` + to wrap the ``search`` method. 
+ """ + def wrapper(obj, query_string, *args, **kwargs): + start = time() + + try: + return func(obj, query_string, *args, **kwargs) + finally: + stop = time() + + if settings.DEBUG: + from haystack import connections + connections[obj.connection_alias].queries.append({ + 'query_string': query_string, + 'additional_args': args, + 'additional_kwargs': kwargs, + 'time': "%.3f" % (stop - start), + 'start': start, + 'stop': stop, + }) + + return wrapper + + +class EmptyResults(object): + hits = 0 + docs = [] + + def __len__(self): + return 0 + + def __getitem__(self, k): + if isinstance(k, slice): + return [] + else: + raise IndexError("It's not here.") + + +class BaseSearchBackend(object): + """ + Abstract search engine base class. + """ + # Backends should include their own reserved words/characters. + RESERVED_WORDS = [] + RESERVED_CHARACTERS = [] + + def __init__(self, connection_alias, **connection_options): + self.connection_alias = connection_alias + self.timeout = connection_options.get('TIMEOUT', 10) + self.include_spelling = connection_options.get('INCLUDE_SPELLING', False) + self.batch_size = connection_options.get('BATCH_SIZE', 1000) + self.silently_fail = connection_options.get('SILENTLY_FAIL', True) + self.distance_available = connection_options.get('DISTANCE_AVAILABLE', False) + + def update(self, index, iterable): + """ + Updates the backend when given a SearchIndex and a collection of + documents. + + This method MUST be implemented by each backend, as it will be highly + specific to each one. + """ + raise NotImplementedError + + def remove(self, obj_or_string): + """ + Removes a document/object from the backend. Can be either a model + instance or the identifier (i.e. ``app_name.model_name.id``) in the + event the object no longer exists. + + This method MUST be implemented by each backend, as it will be highly + specific to each one. + """ + raise NotImplementedError + + def clear(self, models=[], commit=True): + """ + Clears the backend of all documents/objects for a collection of models. + + This method MUST be implemented by each backend, as it will be highly + specific to each one. + """ + raise NotImplementedError + + @log_query + def search(self, query_string, **kwargs): + """ + Takes a query to search on and returns dictionary. + + The query should be a string that is appropriate syntax for the backend. + + The returned dictionary should contain the keys 'results' and 'hits'. + The 'results' value should be an iterable of populated SearchResult + objects. The 'hits' should be an integer count of the number of matched + results the search backend found. + + This method MUST be implemented by each backend, as it will be highly + specific to each one. + """ + raise NotImplementedError + + def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None, + fields='', highlight=False, facets=None, + date_facets=None, query_facets=None, + narrow_queries=None, spelling_query=None, + within=None, dwithin=None, distance_point=None, + models=None, limit_to_registered_models=None, + result_class=None): + # A convenience method most backends should include in order to make + # extension easier. + raise NotImplementedError + + def prep_value(self, value): + """ + Hook to give the backend a chance to prep an attribute value before + sending it to the search engine. By default, just force it to unicode. 
+ """ + return force_text(value) + + def more_like_this(self, model_instance, additional_query_string=None, result_class=None): + """ + Takes a model object and returns results the backend thinks are similar. + + This method MUST be implemented by each backend, as it will be highly + specific to each one. + """ + raise NotImplementedError("Subclasses must provide a way to fetch similar record via the 'more_like_this' method if supported by the backend.") + + def extract_file_contents(self, file_obj): + """ + Hook to allow backends which support rich-content types such as PDF, + Word, etc. extraction to process the provided file object and return + the contents for indexing + + Returns None if metadata cannot be extracted; otherwise returns a + dictionary containing at least two keys: + + :contents: + Extracted full-text content, if applicable + :metadata: + key:value pairs of text strings + """ + + raise NotImplementedError("Subclasses must provide a way to extract metadata via the 'extract' method if supported by the backend.") + + def build_schema(self, fields): + """ + Takes a dictionary of fields and returns schema information. + + This method MUST be implemented by each backend, as it will be highly + specific to each one. + """ + raise NotImplementedError("Subclasses must provide a way to build their schema.") + + def build_models_list(self): + """ + Builds a list of models for searching. + + The ``search`` method should use this and the ``django_ct`` field to + narrow the results (unless the user indicates not to). This helps ignore + any results that are not currently handled models and ensures + consistent caching. + """ + from haystack import connections + models = [] + + for model in connections[self.connection_alias].get_unified_index().get_indexed_models(): + models.append(get_model_ct(model)) + + return models + + +# Alias for easy loading within SearchQuery objects. +SearchBackend = BaseSearchBackend + + +class SearchNode(tree.Node): + """ + Manages an individual condition within a query. + + Most often, this will be a lookup to ensure that a certain word or phrase + appears in the documents being indexed. However, it also supports filtering + types (such as 'lt', 'gt', 'in' and others) for more complex lookups. + + This object creates a tree, with children being a list of either more + ``SQ`` objects or the expressions/values themselves. + """ + AND = 'AND' + OR = 'OR' + default = AND + + # Start compat. Django 1.6 changed how ``tree.Node`` works, so we're going + # to patch back in the original implementation until time to rewrite this + # presents itself. + # See https://github.com/django/django/commit/d3f00bd. + + def __init__(self, children=None, connector=None, negated=False): + """ + Constructs a new Node. If no connector is given, the default will be + used. + + Warning: You probably don't want to pass in the 'negated' parameter. It + is NOT the same as constructing a node and calling negate() on the + result. + """ + self.children = children and children[:] or [] + self.connector = connector or self.default + self.subtree_parents = [] + self.negated = negated + + # We need this because of django.db.models.query_utils.Q. Q. __init__() is + # problematic, but it is a natural Node subclass in all other respects. + def _new_instance(cls, children=None, connector=None, negated=False): + """ + This is called to create a new instance of this class when we need new + Nodes (or subclasses) in the internal code in this class. Normally, it + just shadows __init__(). 
However, subclasses with an __init__ signature + that is not an extension of Node.__init__ might need to implement this + method to allow a Node to create a new instance of them (if they have + any extra setting up to do). + """ + obj = SearchNode(children, connector, negated) + obj.__class__ = cls + return obj + _new_instance = classmethod(_new_instance) + + def __str__(self): + if self.negated: + return '(NOT (%s: %s))' % (self.connector, ', '.join([str(c) for c + in self.children])) + return '(%s: %s)' % (self.connector, ', '.join([str(c) for c in + self.children])) + + def __deepcopy__(self, memodict): + """ + Utility method used by copy.deepcopy(). + """ + obj = SearchNode(connector=self.connector, negated=self.negated) + obj.__class__ = self.__class__ + obj.children = copy.deepcopy(self.children, memodict) + obj.subtree_parents = copy.deepcopy(self.subtree_parents, memodict) + return obj + + def __len__(self): + """ + The size of a node if the number of children it has. + """ + return len(self.children) + + def __bool__(self): + """ + For truth value testing. + """ + return bool(self.children) + + def __nonzero__(self): # Python 2 compatibility + return type(self).__bool__(self) + + def __contains__(self, other): + """ + Returns True is 'other' is a direct child of this instance. + """ + return other in self.children + + def add(self, node, conn_type): + """ + Adds a new node to the tree. If the conn_type is the same as the root's + current connector type, the node is added to the first level. + Otherwise, the whole tree is pushed down one level and a new root + connector is created, connecting the existing tree and the new node. + """ + if node in self.children and conn_type == self.connector: + return + if len(self.children) < 2: + self.connector = conn_type + if self.connector == conn_type: + if isinstance(node, SearchNode) and (node.connector == conn_type or + len(node) == 1): + self.children.extend(node.children) + else: + self.children.append(node) + else: + obj = self._new_instance(self.children, self.connector, + self.negated) + self.connector = conn_type + self.children = [obj, node] + + def negate(self): + """ + Negate the sense of the root connector. This reorganises the children + so that the current node has a single child: a negated node containing + all the previous children. This slightly odd construction makes adding + new children behave more intuitively. + + Interpreting the meaning of this negate is up to client code. This + method is useful for implementing "not" arrangements. + """ + self.children = [self._new_instance(self.children, self.connector, + not self.negated)] + self.connector = self.default + + def start_subtree(self, conn_type): + """ + Sets up internal state so that new nodes are added to a subtree of the + current node. The conn_type specifies how the sub-tree is joined to the + existing children. + """ + if len(self.children) == 1: + self.connector = conn_type + elif self.connector != conn_type: + self.children = [self._new_instance(self.children, self.connector, + self.negated)] + self.connector = conn_type + self.negated = False + + self.subtree_parents.append(self.__class__(self.children, + self.connector, self.negated)) + self.connector = self.default + self.negated = False + self.children = [] + + def end_subtree(self): + """ + Closes off the most recently unmatched start_subtree() call. + + This puts the current state into a node of the parent tree and returns + the current instances state to be the parent. 
+ """ + obj = self.subtree_parents.pop() + node = self.__class__(self.children, self.connector) + self.connector = obj.connector + self.negated = obj.negated + self.children = obj.children + self.children.append(node) + + # End compat. + + def __repr__(self): + return '' % (self.connector, self.as_query_string(self._repr_query_fragment_callback)) + + def _repr_query_fragment_callback(self, field, filter_type, value): + if six.PY3: + value = force_text(value) + else: + value = force_text(value).encode('utf8') + + return "%s%s%s=%s" % (field, FILTER_SEPARATOR, filter_type, value) + + def as_query_string(self, query_fragment_callback): + """ + Produces a portion of the search query from the current SQ and its + children. + """ + result = [] + + for child in self.children: + if hasattr(child, 'as_query_string'): + result.append(child.as_query_string(query_fragment_callback)) + else: + expression, value = child + field, filter_type = self.split_expression(expression) + result.append(query_fragment_callback(field, filter_type, value)) + + conn = ' %s ' % self.connector + query_string = conn.join(result) + + if query_string: + if self.negated: + query_string = 'NOT (%s)' % query_string + elif len(self.children) != 1: + query_string = '(%s)' % query_string + + return query_string + + def split_expression(self, expression): + """Parses an expression and determines the field and filter type.""" + parts = expression.split(FILTER_SEPARATOR) + field = parts[0] + + if len(parts) == 1 or parts[-1] not in VALID_FILTERS: + filter_type = 'contains' + else: + filter_type = parts.pop() + + return (field, filter_type) + + +class SQ(Q, SearchNode): + """ + Manages an individual condition within a query. + + Most often, this will be a lookup to ensure that a certain word or phrase + appears in the documents being indexed. However, it also supports filtering + types (such as 'lt', 'gt', 'in' and others) for more complex lookups. + """ + pass + + +class BaseSearchQuery(object): + """ + A base class for handling the query itself. + + This class acts as an intermediary between the ``SearchQuerySet`` and the + ``SearchBackend`` itself. + + The ``SearchQuery`` object maintains a tree of ``SQ`` objects. Each ``SQ`` + object supports what field it looks up against, what kind of lookup (i.e. + the __'s), what value it's looking for, if it's a AND/OR/NOT and tracks + any children it may have. The ``SearchQuery.build_query`` method starts with + the root of the tree, building part of the final query at each node until + the full final query is ready for the ``SearchBackend``. + + Backends should extend this class and provide implementations for + ``build_query_fragment``, ``clean`` and ``run``. See the ``solr`` backend for an example + implementation. + """ + + def __init__(self, using=DEFAULT_ALIAS): + self.query_filter = SearchNode() + self.order_by = [] + self.models = set() + self.boost = {} + self.start_offset = 0 + self.end_offset = None + self.highlight = False + self.facets = {} + self.date_facets = {} + self.query_facets = [] + self.narrow_queries = set() + #: If defined, fields should be a list of field names - no other values + #: will be retrieved so the caller must be careful to include django_ct + #: and django_id when using code which expects those to be included in + #: the results + self.fields = [] + # Geospatial-related information + self.within = {} + self.dwithin = {} + self.distance_point = {} + # Internal. 
+ self._raw_query = None + self._raw_query_params = {} + self._more_like_this = False + self._mlt_instance = None + self._results = None + self._hit_count = None + self._facet_counts = None + self._stats = None + self._spelling_suggestion = None + self.result_class = SearchResult + self.stats = {} + from haystack import connections + self._using = using + self.backend = connections[self._using].get_backend() + + def __str__(self): + return self.build_query() + + def __getstate__(self): + """For pickling.""" + obj_dict = self.__dict__.copy() + del(obj_dict['backend']) + return obj_dict + + def __setstate__(self, obj_dict): + """For unpickling.""" + from haystack import connections + self.__dict__.update(obj_dict) + self.backend = connections[self._using].get_backend() + + def has_run(self): + """Indicates if any query has been been run.""" + return None not in (self._results, self._hit_count) + + def build_params(self, spelling_query=None): + """Generates a list of params to use when searching.""" + kwargs = { + 'start_offset': self.start_offset, + } + + if self.order_by: + kwargs['sort_by'] = self.order_by + + if self.end_offset is not None: + kwargs['end_offset'] = self.end_offset + + if self.highlight: + kwargs['highlight'] = self.highlight + + if self.facets: + kwargs['facets'] = self.facets + + if self.date_facets: + kwargs['date_facets'] = self.date_facets + + if self.query_facets: + kwargs['query_facets'] = self.query_facets + + if self.narrow_queries: + kwargs['narrow_queries'] = self.narrow_queries + + if spelling_query: + kwargs['spelling_query'] = spelling_query + + if self.boost: + kwargs['boost'] = self.boost + + if self.within: + kwargs['within'] = self.within + + if self.dwithin: + kwargs['dwithin'] = self.dwithin + + if self.distance_point: + kwargs['distance_point'] = self.distance_point + + if self.result_class: + kwargs['result_class'] = self.result_class + + if self.fields: + kwargs['fields'] = self.fields + + if self.models: + kwargs['models'] = self.models + + return kwargs + + def run(self, spelling_query=None, **kwargs): + """Builds and executes the query. Returns a list of search results.""" + final_query = self.build_query() + search_kwargs = self.build_params(spelling_query=spelling_query) + + if kwargs: + search_kwargs.update(kwargs) + + results = self.backend.search(final_query, **search_kwargs) + self._results = results.get('results', []) + self._hit_count = results.get('hits', 0) + self._facet_counts = self.post_process_facets(results) + self._spelling_suggestion = results.get('spelling_suggestion', None) + + def run_mlt(self, **kwargs): + """ + Executes the More Like This. Returns a list of search results similar + to the provided document (and optionally query). + """ + if self._more_like_this is False or self._mlt_instance is None: + raise MoreLikeThisError("No instance was provided to determine 'More Like This' results.") + + search_kwargs = { + 'result_class': self.result_class, + } + + if self.models: + search_kwargs['models'] = self.models + + if kwargs: + search_kwargs.update(kwargs) + + additional_query_string = self.build_query() + results = self.backend.more_like_this(self._mlt_instance, additional_query_string, **search_kwargs) + self._results = results.get('results', []) + self._hit_count = results.get('hits', 0) + + def run_raw(self, **kwargs): + """Executes a raw query. 
Returns a list of search results.""" + search_kwargs = self.build_params() + search_kwargs.update(self._raw_query_params) + + if kwargs: + search_kwargs.update(kwargs) + + results = self.backend.search(self._raw_query, **search_kwargs) + self._results = results.get('results', []) + self._hit_count = results.get('hits', 0) + self._facet_counts = results.get('facets', {}) + self._spelling_suggestion = results.get('spelling_suggestion', None) + + def get_count(self): + """ + Returns the number of results the backend found for the query. + + If the query has not been run, this will execute the query and store + the results. + """ + if self._hit_count is None: + # Limit the slice to 1 so we get a count without consuming + # everything. + if not self.end_offset: + self.end_offset = 1 + + if self._more_like_this: + # Special case for MLT. + self.run_mlt() + elif self._raw_query: + # Special case for raw queries. + self.run_raw() + else: + self.run() + + return self._hit_count + + def get_results(self, **kwargs): + """ + Returns the results received from the backend. + + If the query has not been run, this will execute the query and store + the results. + """ + if self._results is None: + if self._more_like_this: + # Special case for MLT. + self.run_mlt(**kwargs) + elif self._raw_query: + # Special case for raw queries. + self.run_raw(**kwargs) + else: + self.run(**kwargs) + + return self._results + + def get_facet_counts(self): + """ + Returns the facet counts received from the backend. + + If the query has not been run, this will execute the query and store + the results. + """ + if self._facet_counts is None: + self.run() + + return self._facet_counts + + def get_stats(self): + """ + Returns the stats received from the backend. + + If the query has not been run, this will execute the query and store + the results + """ + if self._stats is None: + self.run() + return self._stats + + def get_spelling_suggestion(self, preferred_query=None): + """ + Returns the spelling suggestion received from the backend. + + If the query has not been run, this will execute the query and store + the results. + """ + if self._spelling_suggestion is None: + self.run(spelling_query=preferred_query) + + return self._spelling_suggestion + + def boost_fragment(self, boost_word, boost_value): + """Generates query fragment for boosting a single word/value pair.""" + return "%s^%s" % (boost_word, boost_value) + + def matching_all_fragment(self): + """Generates the query that matches all documents.""" + return '*' + + def build_query(self): + """ + Interprets the collected query metadata and builds the final query to + be sent to the backend. + """ + final_query = self.query_filter.as_query_string(self.build_query_fragment) + + if not final_query: + # Match all. + final_query = self.matching_all_fragment() + + if self.boost: + boost_list = [] + + for boost_word, boost_value in self.boost.items(): + boost_list.append(self.boost_fragment(boost_word, boost_value)) + + final_query = "%s %s" % (final_query, " ".join(boost_list)) + + return final_query + + def combine(self, rhs, connector=SQ.AND): + if connector == SQ.AND: + self.add_filter(rhs.query_filter) + elif connector == SQ.OR: + self.add_filter(rhs.query_filter, use_or=True) + + # Methods for backends to implement. + + def build_query_fragment(self, field, filter_type, value): + """ + Generates a query fragment from a field, filter type and a value. + + Must be implemented in backends as this will be highly backend specific. 
+ """ + raise NotImplementedError("Subclasses must provide a way to generate query fragments via the 'build_query_fragment' method.") + + + # Standard methods to alter the query. + + def clean(self, query_fragment): + """ + Provides a mechanism for sanitizing user input before presenting the + value to the backend. + + A basic (override-able) implementation is provided. + """ + if not isinstance(query_fragment, six.string_types): + return query_fragment + + words = query_fragment.split() + cleaned_words = [] + + for word in words: + if word in self.backend.RESERVED_WORDS: + word = word.replace(word, word.lower()) + + for char in self.backend.RESERVED_CHARACTERS: + word = word.replace(char, '\\%s' % char) + + cleaned_words.append(word) + + return ' '.join(cleaned_words) + + def build_not_query(self, query_string): + if ' ' in query_string: + query_string = "(%s)" % query_string + + return u"NOT %s" % query_string + + def build_exact_query(self, query_string): + return u'"%s"' % query_string + + def add_filter(self, query_filter, use_or=False): + """ + Adds a SQ to the current query. + """ + if use_or: + connector = SQ.OR + else: + connector = SQ.AND + + if self.query_filter and query_filter.connector != connector and len(query_filter) > 1: + self.query_filter.start_subtree(connector) + subtree = True + else: + subtree = False + + for child in query_filter.children: + if isinstance(child, tree.Node): + self.query_filter.start_subtree(connector) + self.add_filter(child) + self.query_filter.end_subtree() + else: + expression, value = child + self.query_filter.add((expression, value), connector) + + connector = query_filter.connector + + if query_filter.negated: + self.query_filter.negate() + + if subtree: + self.query_filter.end_subtree() + + def add_order_by(self, field): + """Orders the search result by a field.""" + self.order_by.append(field) + + def clear_order_by(self): + """ + Clears out all ordering that has been already added, reverting the + query to relevancy. + """ + self.order_by = [] + + def add_model(self, model): + """ + Restricts the query requiring matches in the given model. + + This builds upon previous additions, so you can limit to multiple models + by chaining this method several times. + """ + if not isinstance(model, ModelBase): + raise AttributeError('The model being added to the query must derive from Model.') + + self.models.add(model) + + def set_limits(self, low=None, high=None): + """Restricts the query by altering either the start, end or both offsets.""" + if low is not None: + self.start_offset = int(low) + + if high is not None: + self.end_offset = int(high) + + def clear_limits(self): + """Clears any existing limits.""" + self.start_offset, self.end_offset = 0, None + + def add_boost(self, term, boost_value): + """Adds a boosted term and the amount to boost it to the query.""" + self.boost[term] = boost_value + + def raw_search(self, query_string, **kwargs): + """ + Runs a raw query (no parsing) against the backend. + + This method causes the SearchQuery to ignore the standard query + generating facilities, running only what was provided instead. + + Note that any kwargs passed along will override anything provided + to the rest of the ``SearchQuerySet``. + """ + self._raw_query = query_string + self._raw_query_params = kwargs + + def more_like_this(self, model_instance): + """ + Allows backends with support for "More Like This" to return results + similar to the provided instance. 
+ """ + self._more_like_this = True + self._mlt_instance = model_instance + + def add_stats_query(self,stats_field,stats_facets): + """Adds stats and stats_facets queries for the Solr backend.""" + self.stats[stats_field] = stats_facets + + def add_highlight(self): + """Adds highlighting to the search results.""" + self.highlight = True + + def add_within(self, field, point_1, point_2): + """Adds bounding box parameters to search query.""" + from haystack.utils.geo import ensure_point + self.within = { + 'field': field, + 'point_1': ensure_point(point_1), + 'point_2': ensure_point(point_2), + } + + def add_dwithin(self, field, point, distance): + """Adds radius-based parameters to search query.""" + from haystack.utils.geo import ensure_point, ensure_distance + self.dwithin = { + 'field': field, + 'point': ensure_point(point), + 'distance': ensure_distance(distance), + } + + def add_distance(self, field, point): + """ + Denotes that results should include distance measurements from the + point passed in. + """ + from haystack.utils.geo import ensure_point + self.distance_point = { + 'field': field, + 'point': ensure_point(point), + } + + def add_field_facet(self, field, **options): + """Adds a regular facet on a field.""" + from haystack import connections + field_name = connections[self._using].get_unified_index().get_facet_fieldname(field) + self.facets[field_name] = options.copy() + + def add_date_facet(self, field, start_date, end_date, gap_by, gap_amount=1): + """Adds a date-based facet on a field.""" + from haystack import connections + if not gap_by in VALID_GAPS: + raise FacetingError("The gap_by ('%s') must be one of the following: %s." % (gap_by, ', '.join(VALID_GAPS))) + + details = { + 'start_date': start_date, + 'end_date': end_date, + 'gap_by': gap_by, + 'gap_amount': gap_amount, + } + self.date_facets[connections[self._using].get_unified_index().get_facet_fieldname(field)] = details + + def add_query_facet(self, field, query): + """Adds a query facet on a field.""" + from haystack import connections + self.query_facets.append((connections[self._using].get_unified_index().get_facet_fieldname(field), query)) + + def add_narrow_query(self, query): + """ + Narrows a search to a subset of all documents per the query. + + Generally used in conjunction with faceting. + """ + self.narrow_queries.add(query) + + def set_result_class(self, klass): + """ + Sets the result class to use for results. + + Overrides any previous usages. If ``None`` is provided, Haystack will + revert back to the default ``SearchResult`` object. + """ + if klass is None: + klass = SearchResult + + self.result_class = klass + + def post_process_facets(self, results): + # Handle renaming the facet fields. Undecorate and all that. + from haystack import connections + revised_facets = {} + field_data = connections[self._using].get_unified_index().all_searchfields() + + for facet_type, field_details in results.get('facets', {}).items(): + temp_facets = {} + + for field, field_facets in field_details.items(): + fieldname = field + if field in field_data and hasattr(field_data[field], 'get_facet_for_name'): + fieldname = field_data[field].get_facet_for_name() + + temp_facets[fieldname] = field_facets + + revised_facets[facet_type] = temp_facets + + return revised_facets + + def using(self, using=None): + """ + Allows for overriding which connection should be used. This + disables the use of routers when performing the query. + + If ``None`` is provided, it has no effect on what backend is used. 
+ """ + return self._clone(using=using) + + def _reset(self): + """ + Resets the instance's internal state to appear as though no query has + been run before. Only need to tweak a few variables we check. + """ + self._results = None + self._hit_count = None + self._facet_counts = None + self._spelling_suggestion = None + + def _clone(self, klass=None, using=None): + if using is None: + using = self._using + else: + from haystack import connections + klass = connections[using].query + + if klass is None: + klass = self.__class__ + + clone = klass(using=using) + clone.query_filter = deepcopy(self.query_filter) + clone.order_by = self.order_by[:] + clone.models = self.models.copy() + clone.boost = self.boost.copy() + clone.highlight = self.highlight + clone.stats = self.stats.copy() + clone.facets = self.facets.copy() + clone.date_facets = self.date_facets.copy() + clone.query_facets = self.query_facets[:] + clone.narrow_queries = self.narrow_queries.copy() + clone.start_offset = self.start_offset + clone.end_offset = self.end_offset + clone.result_class = self.result_class + clone.within = self.within.copy() + clone.dwithin = self.dwithin.copy() + clone.distance_point = self.distance_point.copy() + clone._raw_query = self._raw_query + clone._raw_query_params = self._raw_query_params + + return clone + + +class BaseEngine(object): + backend = BaseSearchBackend + query = BaseSearchQuery + unified_index = UnifiedIndex + + def __init__(self, using=None): + if using is None: + using = DEFAULT_ALIAS + + self.using = using + self.options = settings.HAYSTACK_CONNECTIONS.get(self.using, {}) + self.queries = [] + self._index = None + self._backend = None + + def get_backend(self): + if self._backend is None: + self._backend = self.backend(self.using, **self.options) + return self._backend + + def get_query(self): + return self.query(using=self.using) + + def reset_queries(self): + self.queries = [] + + def get_unified_index(self): + if self._index is None: + self._index = self.unified_index(self.options.get('EXCLUDED_INDEXES', [])) + + return self._index diff --git a/haystack/backends/elasticsearch_backend.py b/haystack/backends/elasticsearch_backend.py new file mode 100644 index 0000000..a57bb12 --- /dev/null +++ b/haystack/backends/elasticsearch_backend.py @@ -0,0 +1,944 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import datetime +import re +import warnings + +from django.conf import settings +from django.core.exceptions import ImproperlyConfigured +from django.utils import six + +import haystack +from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, log_query +from haystack.constants import DEFAULT_OPERATOR, DJANGO_CT, DJANGO_ID, ID +from haystack.exceptions import MissingDependency, MoreLikeThisError, SkipDocument +from haystack.inputs import Clean, Exact, PythonData, Raw +from haystack.models import SearchResult +from haystack.utils import log as logging +from haystack.utils import get_identifier, get_model_ct +from haystack.utils.app_loading import haystack_get_model + +try: + import elasticsearch + from elasticsearch.helpers import bulk_index + from elasticsearch.exceptions import NotFoundError +except ImportError: + raise MissingDependency("The 'elasticsearch' backend requires the installation of 'elasticsearch'. 
Please refer to the documentation.")
+
+
+DATETIME_REGEX = re.compile(
+    r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T'
+    r'(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d+)?$')
+
+
+class ElasticsearchSearchBackend(BaseSearchBackend):
+    # Word reserved by Elasticsearch for special use.
+    RESERVED_WORDS = (
+        'AND',
+        'NOT',
+        'OR',
+        'TO',
+    )
+
+    # Characters reserved by Elasticsearch for special use.
+    # The '\\' must come first, so as not to overwrite the other slash replacements.
+    RESERVED_CHARACTERS = (
+        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
+        '[', ']', '^', '"', '~', '*', '?', ':', '/',
+    )
+
+    # Settings to add an n-gram & edge n-gram analyzer.
+    DEFAULT_SETTINGS = {
+        'settings': {
+            "analysis": {
+                "analyzer": {
+                    "ngram_analyzer": {
+                        "type": "custom",
+                        "tokenizer": "standard",
+                        "filter": ["haystack_ngram", "lowercase"]
+                    },
+                    "edgengram_analyzer": {
+                        "type": "custom",
+                        "tokenizer": "standard",
+                        "filter": ["haystack_edgengram", "lowercase"]
+                    }
+                },
+                "tokenizer": {
+                    "haystack_ngram_tokenizer": {
+                        "type": "nGram",
+                        "min_gram": 3,
+                        "max_gram": 15,
+                    },
+                    "haystack_edgengram_tokenizer": {
+                        "type": "edgeNGram",
+                        "min_gram": 2,
+                        "max_gram": 15,
+                        "side": "front"
+                    }
+                },
+                "filter": {
+                    "haystack_ngram": {
+                        "type": "nGram",
+                        "min_gram": 3,
+                        "max_gram": 15
+                    },
+                    "haystack_edgengram": {
+                        "type": "edgeNGram",
+                        "min_gram": 2,
+                        "max_gram": 15
+                    }
+                }
+            }
+        }
+    }
+
+    def __init__(self, connection_alias, **connection_options):
+        super(ElasticsearchSearchBackend, self).__init__(connection_alias, **connection_options)
+
+        if not 'URL' in connection_options:
+            raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias)
+
+        if not 'INDEX_NAME' in connection_options:
+            raise ImproperlyConfigured("You must specify a 'INDEX_NAME' in your settings for connection '%s'." % connection_alias)
+
+        self.conn = elasticsearch.Elasticsearch(connection_options['URL'], timeout=self.timeout, **connection_options.get('KWARGS', {}))
+        self.index_name = connection_options['INDEX_NAME']
+        self.log = logging.getLogger('haystack')
+        self.setup_complete = False
+        self.existing_mapping = {}
+
+    def setup(self):
+        """
+        Defers loading until needed.
+        """
+        # Get the existing mapping & cache it. We'll compare it
+        # during the ``update`` & if it doesn't match, we'll put the new
+        # mapping.
+        try:
+            self.existing_mapping = self.conn.indices.get_mapping(index=self.index_name)
+        except NotFoundError:
+            pass
+        except Exception:
+            if not self.silently_fail:
+                raise
+
+        unified_index = haystack.connections[self.connection_alias].get_unified_index()
+        self.content_field_name, field_mapping = self.build_schema(unified_index.all_searchfields())
+        current_mapping = {
+            'modelresult': {
+                'properties': field_mapping,
+                '_boost': {
+                    'name': 'boost',
+                    'null_value': 1.0
+                }
+            }
+        }
+
+        if current_mapping != self.existing_mapping:
+            try:
+                # Make sure the index is there first.
+ self.conn.indices.create(index=self.index_name, body=self.DEFAULT_SETTINGS, ignore=400) + self.conn.indices.put_mapping(index=self.index_name, doc_type='modelresult', body=current_mapping) + self.existing_mapping = current_mapping + except Exception: + if not self.silently_fail: + raise + + self.setup_complete = True + + def update(self, index, iterable, commit=True): + if not self.setup_complete: + try: + self.setup() + except elasticsearch.TransportError as e: + if not self.silently_fail: + raise + + self.log.error("Failed to add documents to Elasticsearch: %s", e) + return + + prepped_docs = [] + + for obj in iterable: + try: + prepped_data = index.full_prepare(obj) + final_data = {} + + # Convert the data to make sure it's happy. + for key, value in prepped_data.items(): + final_data[key] = self._from_python(value) + final_data['_id'] = final_data[ID] + + prepped_docs.append(final_data) + except SkipDocument: + self.log.debug(u"Indexing for object `%s` skipped", obj) + except elasticsearch.TransportError as e: + if not self.silently_fail: + raise + + # We'll log the object identifier but won't include the actual object + # to avoid the possibility of that generating encoding errors while + # processing the log message: + self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={ + "data": { + "index": index, + "object": get_identifier(obj) + } + }) + + bulk_index(self.conn, prepped_docs, index=self.index_name, doc_type='modelresult') + + if commit: + self.conn.indices.refresh(index=self.index_name) + + def remove(self, obj_or_string, commit=True): + doc_id = get_identifier(obj_or_string) + + if not self.setup_complete: + try: + self.setup() + except elasticsearch.TransportError as e: + if not self.silently_fail: + raise + + self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e) + return + + try: + self.conn.delete(index=self.index_name, doc_type='modelresult', id=doc_id, ignore=404) + + if commit: + self.conn.indices.refresh(index=self.index_name) + except elasticsearch.TransportError as e: + if not self.silently_fail: + raise + + self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e) + + def clear(self, models=[], commit=True): + # We actually don't want to do this here, as mappings could be + # very different. + # if not self.setup_complete: + # self.setup() + + try: + if not models: + self.conn.indices.delete(index=self.index_name, ignore=404) + self.setup_complete = False + self.existing_mapping = {} + else: + models_to_delete = [] + + for model in models: + models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model))) + + # Delete by query in Elasticsearch asssumes you're dealing with + # a ``query`` root object. 
:/ + query = {'query': {'query_string': {'query': " OR ".join(models_to_delete)}}} + self.conn.delete_by_query(index=self.index_name, doc_type='modelresult', body=query) + except elasticsearch.TransportError as e: + if not self.silently_fail: + raise + + if len(models): + self.log.error("Failed to clear Elasticsearch index of models '%s': %s", ','.join(models_to_delete), e) + else: + self.log.error("Failed to clear Elasticsearch index: %s", e) + + def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None, + fields='', highlight=False, facets=None, + date_facets=None, query_facets=None, + narrow_queries=None, spelling_query=None, + within=None, dwithin=None, distance_point=None, + models=None, limit_to_registered_models=None, + result_class=None): + index = haystack.connections[self.connection_alias].get_unified_index() + content_field = index.document_field + + if query_string == '*:*': + kwargs = { + 'query': { + "match_all": {} + }, + } + else: + kwargs = { + 'query': { + 'query_string': { + 'default_field': content_field, + 'default_operator': DEFAULT_OPERATOR, + 'query': query_string, + 'analyze_wildcard': True, + 'auto_generate_phrase_queries': True, + }, + }, + } + + # so far, no filters + filters = [] + + if fields: + if isinstance(fields, (list, set)): + fields = " ".join(fields) + + kwargs['fields'] = fields + + if sort_by is not None: + order_list = [] + for field, direction in sort_by: + if field == 'distance' and distance_point: + # Do the geo-enabled sort. + lng, lat = distance_point['point'].get_coords() + sort_kwargs = { + "_geo_distance": { + distance_point['field']: [lng, lat], + "order": direction, + "unit": "km" + } + } + else: + if field == 'distance': + warnings.warn("In order to sort by distance, you must call the '.distance(...)' method.") + + # Regular sorting. + sort_kwargs = {field: {'order': direction}} + + order_list.append(sort_kwargs) + + kwargs['sort'] = order_list + + # From/size offsets don't seem to work right in Elasticsearch's DSL. :/ + # if start_offset is not None: + # kwargs['from'] = start_offset + + # if end_offset is not None: + # kwargs['size'] = end_offset - start_offset + + if highlight is True: + kwargs['highlight'] = { + 'fields': { + content_field: {'store': 'yes'}, + } + } + + if self.include_spelling: + kwargs['suggest'] = { + 'suggest': { + 'text': spelling_query or query_string, + 'term': { + # Using content_field here will result in suggestions of stemmed words. + 'field': '_all', + }, + }, + } + + if narrow_queries is None: + narrow_queries = set() + + if facets is not None: + kwargs.setdefault('facets', {}) + + for facet_fieldname, extra_options in facets.items(): + facet_options = { + 'terms': { + 'field': facet_fieldname, + 'size': 100, + }, + } + # Special cases for options applied at the facet level (not the terms level). + if extra_options.pop('global_scope', False): + # Renamed "global_scope" since "global" is a python keyword. + facet_options['global'] = True + if 'facet_filter' in extra_options: + facet_options['facet_filter'] = extra_options.pop('facet_filter') + facet_options['terms'].update(extra_options) + kwargs['facets'][facet_fieldname] = facet_options + + if date_facets is not None: + kwargs.setdefault('facets', {}) + + for facet_fieldname, value in date_facets.items(): + # Need to detect on gap_by & only add amount if it's more than one. + interval = value.get('gap_by').lower() + + # Need to detect on amount (can't be applied on months or years). 
+ if value.get('gap_amount', 1) != 1 and interval not in ('month', 'year'): + # Just the first character is valid for use. + interval = "%s%s" % (value['gap_amount'], interval[:1]) + + kwargs['facets'][facet_fieldname] = { + 'date_histogram': { + 'field': facet_fieldname, + 'interval': interval, + }, + 'facet_filter': { + "range": { + facet_fieldname: { + 'from': self._from_python(value.get('start_date')), + 'to': self._from_python(value.get('end_date')), + } + } + } + } + + if query_facets is not None: + kwargs.setdefault('facets', {}) + + for facet_fieldname, value in query_facets: + kwargs['facets'][facet_fieldname] = { + 'query': { + 'query_string': { + 'query': value, + } + }, + } + + if limit_to_registered_models is None: + limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) + + if models and len(models): + model_choices = sorted(get_model_ct(model) for model in models) + elif limit_to_registered_models: + # Using narrow queries, limit the results to only models handled + # with the current routers. + model_choices = self.build_models_list() + else: + model_choices = [] + + if len(model_choices) > 0: + filters.append({"terms": {DJANGO_CT: model_choices}}) + + for q in narrow_queries: + filters.append({ + 'fquery': { + 'query': { + 'query_string': { + 'query': q + }, + }, + '_cache': True, + } + }) + + if within is not None: + from haystack.utils.geo import generate_bounding_box + + ((south, west), (north, east)) = generate_bounding_box(within['point_1'], within['point_2']) + within_filter = { + "geo_bounding_box": { + within['field']: { + "top_left": { + "lat": north, + "lon": west + }, + "bottom_right": { + "lat": south, + "lon": east + } + } + }, + } + filters.append(within_filter) + + if dwithin is not None: + lng, lat = dwithin['point'].get_coords() + + # NB: the 1.0.0 release of elasticsearch introduce an + # incompatible change on the distance filter formating + if elasticsearch.VERSION >= (1, 0, 0): + distance = "%(dist).6f%(unit)s" % { + 'dist': dwithin['distance'].km, + 'unit': "km" + } + else: + distance = dwithin['distance'].km + + dwithin_filter = { + "geo_distance": { + "distance": distance, + dwithin['field']: { + "lat": lat, + "lon": lng + } + } + } + filters.append(dwithin_filter) + + # if we want to filter, change the query type to filteres + if filters: + kwargs["query"] = {"filtered": {"query": kwargs.pop("query")}} + if len(filters) == 1: + kwargs['query']['filtered']["filter"] = filters[0] + else: + kwargs['query']['filtered']["filter"] = {"bool": {"must": filters}} + + return kwargs + + @log_query + def search(self, query_string, **kwargs): + if len(query_string) == 0: + return { + 'results': [], + 'hits': 0, + } + + if not self.setup_complete: + self.setup() + + search_kwargs = self.build_search_kwargs(query_string, **kwargs) + search_kwargs['from'] = kwargs.get('start_offset', 0) + + order_fields = set() + for order in search_kwargs.get('sort', []): + for key in order.keys(): + order_fields.add(key) + + geo_sort = '_geo_distance' in order_fields + + end_offset = kwargs.get('end_offset') + start_offset = kwargs.get('start_offset', 0) + if end_offset is not None and end_offset > start_offset: + search_kwargs['size'] = end_offset - start_offset + + try: + raw_results = self.conn.search(body=search_kwargs, + index=self.index_name, + doc_type='modelresult', + _source=True) + except elasticsearch.TransportError as e: + if not self.silently_fail: + raise + + self.log.error("Failed to query Elasticsearch using '%s': %s", 
query_string, e) + raw_results = {} + + return self._process_results(raw_results, + highlight=kwargs.get('highlight'), + result_class=kwargs.get('result_class', SearchResult), + distance_point=kwargs.get('distance_point'), + geo_sort=geo_sort) + + def more_like_this(self, model_instance, additional_query_string=None, + start_offset=0, end_offset=None, models=None, + limit_to_registered_models=None, result_class=None, **kwargs): + from haystack import connections + + if not self.setup_complete: + self.setup() + + # Deferred models will have a different class ("RealClass_Deferred_fieldname") + # which won't be in our registry: + model_klass = model_instance._meta.concrete_model + + index = connections[self.connection_alias].get_unified_index().get_index(model_klass) + field_name = index.get_content_field() + params = {} + + if start_offset is not None: + params['search_from'] = start_offset + + if end_offset is not None: + params['search_size'] = end_offset - start_offset + + doc_id = get_identifier(model_instance) + + try: + raw_results = self.conn.mlt(index=self.index_name, doc_type='modelresult', id=doc_id, mlt_fields=[field_name], **params) + except elasticsearch.TransportError as e: + if not self.silently_fail: + raise + + self.log.error("Failed to fetch More Like This from Elasticsearch for document '%s': %s", doc_id, e) + raw_results = {} + + return self._process_results(raw_results, result_class=result_class) + + def _process_results(self, raw_results, highlight=False, + result_class=None, distance_point=None, + geo_sort=False): + from haystack import connections + results = [] + hits = raw_results.get('hits', {}).get('total', 0) + facets = {} + spelling_suggestion = None + + if result_class is None: + result_class = SearchResult + + if self.include_spelling and 'suggest' in raw_results: + raw_suggest = raw_results['suggest'].get('suggest') + if raw_suggest: + spelling_suggestion = ' '.join([word['text'] if len(word['options']) == 0 else word['options'][0]['text'] for word in raw_suggest]) + + if 'facets' in raw_results: + facets = { + 'fields': {}, + 'dates': {}, + 'queries': {}, + } + + for facet_fieldname, facet_info in raw_results['facets'].items(): + if facet_info.get('_type', 'terms') == 'terms': + facets['fields'][facet_fieldname] = [(individual['term'], individual['count']) for individual in facet_info['terms']] + elif facet_info.get('_type', 'terms') == 'date_histogram': + # Elasticsearch provides UTC timestamps with an extra three + # decimals of precision, which datetime barfs on. 
+ facets['dates'][facet_fieldname] = [(datetime.datetime.utcfromtimestamp(individual['time'] / 1000), individual['count']) for individual in facet_info['entries']] + elif facet_info.get('_type', 'terms') == 'query': + facets['queries'][facet_fieldname] = facet_info['count'] + + unified_index = connections[self.connection_alias].get_unified_index() + indexed_models = unified_index.get_indexed_models() + content_field = unified_index.document_field + + for raw_result in raw_results.get('hits', {}).get('hits', []): + source = raw_result['_source'] + app_label, model_name = source[DJANGO_CT].split('.') + additional_fields = {} + model = haystack_get_model(app_label, model_name) + + if model and model in indexed_models: + for key, value in source.items(): + index = unified_index.get_index(model) + string_key = str(key) + + if string_key in index.fields and hasattr(index.fields[string_key], 'convert'): + additional_fields[string_key] = index.fields[string_key].convert(value) + else: + additional_fields[string_key] = self._to_python(value) + + del(additional_fields[DJANGO_CT]) + del(additional_fields[DJANGO_ID]) + + if 'highlight' in raw_result: + additional_fields['highlighted'] = raw_result['highlight'].get(content_field, '') + + if distance_point: + additional_fields['_point_of_origin'] = distance_point + + if geo_sort and raw_result.get('sort'): + from haystack.utils.geo import Distance + additional_fields['_distance'] = Distance(km=float(raw_result['sort'][0])) + else: + additional_fields['_distance'] = None + + result = result_class(app_label, model_name, source[DJANGO_ID], raw_result['_score'], **additional_fields) + results.append(result) + else: + hits -= 1 + + return { + 'results': results, + 'hits': hits, + 'facets': facets, + 'spelling_suggestion': spelling_suggestion, + } + + def build_schema(self, fields): + content_field_name = '' + mapping = { + DJANGO_CT: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False}, + DJANGO_ID: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False}, + } + + for field_name, field_class in fields.items(): + field_mapping = FIELD_MAPPINGS.get(field_class.field_type, DEFAULT_FIELD_MAPPING).copy() + if field_class.boost != 1.0: + field_mapping['boost'] = field_class.boost + + if field_class.document is True: + content_field_name = field_class.index_fieldname + + # Do this last to override `text` fields. + if field_mapping['type'] == 'string': + if field_class.indexed is False or hasattr(field_class, 'facet_for'): + field_mapping['index'] = 'not_analyzed' + del field_mapping['analyzer'] + + mapping[field_class.index_fieldname] = field_mapping + + return (content_field_name, mapping) + + def _iso_datetime(self, value): + """ + If value appears to be something datetime-like, return it in ISO format. + + Otherwise, return None. + """ + if hasattr(value, 'strftime'): + if hasattr(value, 'hour'): + return value.isoformat() + else: + return '%sT00:00:00' % value.isoformat() + + def _from_python(self, value): + """Convert more Python data types to ES-understandable JSON.""" + iso = self._iso_datetime(value) + if iso: + return iso + elif isinstance(value, six.binary_type): + # TODO: Be stricter. 
+ return six.text_type(value, errors='replace') + elif isinstance(value, set): + return list(value) + return value + + def _to_python(self, value): + """Convert values from ElasticSearch to native Python values.""" + if isinstance(value, (int, float, complex, list, tuple, bool)): + return value + + if isinstance(value, six.string_types): + possible_datetime = DATETIME_REGEX.search(value) + + if possible_datetime: + date_values = possible_datetime.groupdict() + + for dk, dv in date_values.items(): + date_values[dk] = int(dv) + + return datetime.datetime( + date_values['year'], date_values['month'], + date_values['day'], date_values['hour'], + date_values['minute'], date_values['second']) + + try: + # This is slightly gross but it's hard to tell otherwise what the + # string's original type might have been. Be careful who you trust. + converted_value = eval(value) + + # Try to handle most built-in types. + if isinstance( + converted_value, + (int, list, tuple, set, dict, float, complex)): + return converted_value + except Exception: + # If it fails (SyntaxError or its ilk) or we don't trust it, + # continue on. + pass + + return value + +# DRL_FIXME: Perhaps move to something where, if none of these +# match, call a custom method on the form that returns, per-backend, +# the right type of storage? +DEFAULT_FIELD_MAPPING = {'type': 'string', 'analyzer': 'snowball'} +FIELD_MAPPINGS = { + 'edge_ngram': {'type': 'string', 'analyzer': 'edgengram_analyzer'}, + 'ngram': {'type': 'string', 'analyzer': 'ngram_analyzer'}, + 'date': {'type': 'date'}, + 'datetime': {'type': 'date'}, + + 'location': {'type': 'geo_point'}, + 'boolean': {'type': 'boolean'}, + 'float': {'type': 'float'}, + 'long': {'type': 'long'}, + 'integer': {'type': 'long'}, +} + + +# Sucks that this is almost an exact copy of what's in the Solr backend, +# but we can't import due to dependencies. +class ElasticsearchSearchQuery(BaseSearchQuery): + def matching_all_fragment(self): + return '*:*' + + def build_query_fragment(self, field, filter_type, value): + from haystack import connections + query_frag = '' + + if not hasattr(value, 'input_type_name'): + # Handle when we've got a ``ValuesListQuerySet``... + if hasattr(value, 'values_list'): + value = list(value) + + if isinstance(value, six.string_types): + # It's not an ``InputType``. Assume ``Clean``. + value = Clean(value) + else: + value = PythonData(value) + + # Prepare the query using the InputType. + prepared_value = value.prepare(self) + + if not isinstance(prepared_value, (set, list, tuple)): + # Then convert whatever we get back to what pysolr wants if needed. + prepared_value = self.backend._from_python(prepared_value) + + # 'content' is a special reserved word, much like 'pk' in + # Django's ORM layer. It indicates 'no special field'. + if field == 'content': + index_fieldname = '' + else: + index_fieldname = u'%s:' % connections[self._using].get_unified_index().get_index_fieldname(field) + + filter_types = { + 'contains': u'%s', + 'startswith': u'%s*', + 'exact': u'%s', + 'gt': u'{%s TO *}', + 'gte': u'[%s TO *]', + 'lt': u'{* TO %s}', + 'lte': u'[* TO %s]', + } + + if value.post_process is False: + query_frag = prepared_value + else: + if filter_type in ['contains', 'startswith']: + if value.input_type_name == 'exact': + query_frag = prepared_value + else: + # Iterate over terms & incorportate the converted form of each into the query. 
+ terms = [] + + if isinstance(prepared_value, six.string_types): + for possible_value in prepared_value.split(' '): + terms.append(filter_types[filter_type] % self.backend._from_python(possible_value)) + else: + terms.append(filter_types[filter_type] % self.backend._from_python(prepared_value)) + + if len(terms) == 1: + query_frag = terms[0] + else: + query_frag = u"(%s)" % " AND ".join(terms) + elif filter_type == 'in': + in_options = [] + + for possible_value in prepared_value: + in_options.append(u'"%s"' % self.backend._from_python(possible_value)) + + query_frag = u"(%s)" % " OR ".join(in_options) + elif filter_type == 'range': + start = self.backend._from_python(prepared_value[0]) + end = self.backend._from_python(prepared_value[1]) + query_frag = u'["%s" TO "%s"]' % (start, end) + elif filter_type == 'exact': + if value.input_type_name == 'exact': + query_frag = prepared_value + else: + prepared_value = Exact(prepared_value).prepare(self) + query_frag = filter_types[filter_type] % prepared_value + else: + if value.input_type_name != 'exact': + prepared_value = Exact(prepared_value).prepare(self) + + query_frag = filter_types[filter_type] % prepared_value + + if len(query_frag) and not isinstance(value, Raw): + if not query_frag.startswith('(') and not query_frag.endswith(')'): + query_frag = "(%s)" % query_frag + + return u"%s%s" % (index_fieldname, query_frag) + + def build_alt_parser_query(self, parser_name, query_string='', **kwargs): + if query_string: + kwargs['v'] = query_string + + kwarg_bits = [] + + for key in sorted(kwargs.keys()): + if isinstance(kwargs[key], six.string_types) and ' ' in kwargs[key]: + kwarg_bits.append(u"%s='%s'" % (key, kwargs[key])) + else: + kwarg_bits.append(u"%s=%s" % (key, kwargs[key])) + + return u"{!%s %s}" % (parser_name, ' '.join(kwarg_bits)) + + def build_params(self, spelling_query=None, **kwargs): + search_kwargs = { + 'start_offset': self.start_offset, + 'result_class': self.result_class + } + order_by_list = None + + if self.order_by: + if order_by_list is None: + order_by_list = [] + + for field in self.order_by: + direction = 'asc' + if field.startswith('-'): + direction = 'desc' + field = field[1:] + order_by_list.append((field, direction)) + + search_kwargs['sort_by'] = order_by_list + + if self.date_facets: + search_kwargs['date_facets'] = self.date_facets + + if self.distance_point: + search_kwargs['distance_point'] = self.distance_point + + if self.dwithin: + search_kwargs['dwithin'] = self.dwithin + + if self.end_offset is not None: + search_kwargs['end_offset'] = self.end_offset + + if self.facets: + search_kwargs['facets'] = self.facets + + if self.fields: + search_kwargs['fields'] = self.fields + + if self.highlight: + search_kwargs['highlight'] = self.highlight + + if self.models: + search_kwargs['models'] = self.models + + if self.narrow_queries: + search_kwargs['narrow_queries'] = self.narrow_queries + + if self.query_facets: + search_kwargs['query_facets'] = self.query_facets + + if self.within: + search_kwargs['within'] = self.within + + if spelling_query: + search_kwargs['spelling_query'] = spelling_query + + return search_kwargs + + def run(self, spelling_query=None, **kwargs): + """Builds and executes the query. 
Returns a list of search results.""" + final_query = self.build_query() + search_kwargs = self.build_params(spelling_query, **kwargs) + + if kwargs: + search_kwargs.update(kwargs) + + results = self.backend.search(final_query, **search_kwargs) + self._results = results.get('results', []) + self._hit_count = results.get('hits', 0) + self._facet_counts = self.post_process_facets(results) + self._spelling_suggestion = results.get('spelling_suggestion', None) + + def run_mlt(self, **kwargs): + """Builds and executes the query. Returns a list of search results.""" + if self._more_like_this is False or self._mlt_instance is None: + raise MoreLikeThisError("No instance was provided to determine 'More Like This' results.") + + additional_query_string = self.build_query() + search_kwargs = { + 'start_offset': self.start_offset, + 'result_class': self.result_class, + 'models': self.models + } + + if self.end_offset is not None: + search_kwargs['end_offset'] = self.end_offset - self.start_offset + + results = self.backend.more_like_this(self._mlt_instance, additional_query_string, **search_kwargs) + self._results = results.get('results', []) + self._hit_count = results.get('hits', 0) + + +class ElasticsearchSearchEngine(BaseEngine): + backend = ElasticsearchSearchBackend + query = ElasticsearchSearchQuery diff --git a/haystack/backends/simple_backend.py b/haystack/backends/simple_backend.py new file mode 100644 index 0000000..ff7402e --- /dev/null +++ b/haystack/backends/simple_backend.py @@ -0,0 +1,135 @@ +# encoding: utf-8 +""" +A very basic, ORM-based backend for simple search during tests. +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +from warnings import warn + +from django.conf import settings +from django.db.models import Q +from django.utils import six + +from haystack import connections +from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, log_query, SearchNode +from haystack.inputs import PythonData +from haystack.models import SearchResult +from haystack.utils import get_model_ct_tuple + +if settings.DEBUG: + import logging + + class NullHandler(logging.Handler): + def emit(self, record): + pass + + ch = logging.StreamHandler() + ch.setLevel(logging.WARNING) + ch.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + + logger = logging.getLogger('haystack.simple_backend') + logger.setLevel(logging.WARNING) + logger.addHandler(NullHandler()) + logger.addHandler(ch) +else: + logger = None + + +class SimpleSearchBackend(BaseSearchBackend): + def update(self, indexer, iterable, commit=True): + warn('update is not implemented in this backend') + + def remove(self, obj, commit=True): + warn('remove is not implemented in this backend') + + def clear(self, models=[], commit=True): + warn('clear is not implemented in this backend') + + @log_query + def search(self, query_string, **kwargs): + hits = 0 + results = [] + result_class = SearchResult + models = connections[self.connection_alias].get_unified_index().get_indexed_models() + + if kwargs.get('result_class'): + result_class = kwargs['result_class'] + + if kwargs.get('models'): + models = kwargs['models'] + + if query_string: + for model in models: + if query_string == '*': + qs = model.objects.all() + else: + for term in query_string.split(): + queries = [] + + for field in model._meta.fields: + if hasattr(field, 'related'): + continue + + if not field.get_internal_type() in ('TextField', 'CharField', 'SlugField'): + continue + + 
queries.append(Q(**{'%s__icontains' % field.name: term})) + + qs = model.objects.filter(six.moves.reduce(lambda x, y: x | y, queries)) + + hits += len(qs) + + for match in qs: + match.__dict__.pop('score', None) + app_label, model_name = get_model_ct_tuple(match) + result = result_class(app_label, model_name, match.pk, 0, **match.__dict__) + # For efficiency. + result._model = match.__class__ + result._object = match + results.append(result) + + return { + 'results': results, + 'hits': hits, + } + + def prep_value(self, db_field, value): + return value + + def more_like_this(self, model_instance, additional_query_string=None, + start_offset=0, end_offset=None, + limit_to_registered_models=None, result_class=None, **kwargs): + return { + 'results': [], + 'hits': 0 + } + + +class SimpleSearchQuery(BaseSearchQuery): + def build_query(self): + if not self.query_filter: + return '*' + + return self._build_sub_query(self.query_filter) + + def _build_sub_query(self, search_node): + term_list = [] + + for child in search_node.children: + if isinstance(child, SearchNode): + term_list.append(self._build_sub_query(child)) + else: + value = child[1] + + if not hasattr(value, 'input_type_name'): + value = PythonData(value) + + term_list.append(value.prepare(self)) + + return (' ').join(map(six.text_type, term_list)) + + +class SimpleEngine(BaseEngine): + backend = SimpleSearchBackend + query = SimpleSearchQuery diff --git a/haystack/backends/solr_backend.py b/haystack/backends/solr_backend.py new file mode 100644 index 0000000..4f301c9 --- /dev/null +++ b/haystack/backends/solr_backend.py @@ -0,0 +1,718 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import warnings + +from django.conf import settings +from django.core.exceptions import ImproperlyConfigured +from django.utils import six + +from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query +from haystack.constants import DJANGO_CT, DJANGO_ID, ID +from haystack.exceptions import MissingDependency, MoreLikeThisError, SkipDocument +from haystack.inputs import Clean, Exact, PythonData, Raw +from haystack.models import SearchResult +from haystack.utils import log as logging +from haystack.utils import get_identifier, get_model_ct +from haystack.utils.app_loading import haystack_get_model + +try: + from pysolr import Solr, SolrError +except ImportError: + raise MissingDependency("The 'solr' backend requires the installation of 'pysolr'. Please refer to the documentation.") + + +class SolrSearchBackend(BaseSearchBackend): + # Word reserved by Solr for special use. + RESERVED_WORDS = ( + 'AND', + 'NOT', + 'OR', + 'TO', + ) + + # Characters reserved by Solr for special use. + # The '\\' must come first, so as not to overwrite the other slash replacements. + RESERVED_CHARACTERS = ( + '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}', + '[', ']', '^', '"', '~', '*', '?', ':', '/', + ) + + def __init__(self, connection_alias, **connection_options): + super(SolrSearchBackend, self).__init__(connection_alias, **connection_options) + + if not 'URL' in connection_options: + raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." 
% connection_alias) + + self.conn = Solr(connection_options['URL'], timeout=self.timeout, **connection_options.get('KWARGS', {})) + self.log = logging.getLogger('haystack') + + def update(self, index, iterable, commit=True): + docs = [] + + for obj in iterable: + try: + docs.append(index.full_prepare(obj)) + except SkipDocument: + self.log.debug(u"Indexing for object `%s` skipped", obj) + except UnicodeDecodeError: + if not self.silently_fail: + raise + + # We'll log the object identifier but won't include the actual object + # to avoid the possibility of that generating encoding errors while + # processing the log message: + self.log.error(u"UnicodeDecodeError while preparing object for update", exc_info=True, extra={ + "data": { + "index": index, + "object": get_identifier(obj) + } + }) + + if len(docs) > 0: + try: + self.conn.add(docs, commit=commit, boost=index.get_field_weights()) + except (IOError, SolrError) as e: + if not self.silently_fail: + raise + + self.log.error("Failed to add documents to Solr: %s", e) + + def remove(self, obj_or_string, commit=True): + solr_id = get_identifier(obj_or_string) + + try: + kwargs = { + 'commit': commit, + 'id': solr_id + } + self.conn.delete(**kwargs) + except (IOError, SolrError) as e: + if not self.silently_fail: + raise + + self.log.error("Failed to remove document '%s' from Solr: %s", solr_id, e) + + def clear(self, models=[], commit=True): + try: + if not models: + # *:* matches all docs in Solr + self.conn.delete(q='*:*', commit=commit) + else: + models_to_delete = [] + + for model in models: + models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model))) + + self.conn.delete(q=" OR ".join(models_to_delete), commit=commit) + + if commit: + # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99 + self.conn.optimize() + except (IOError, SolrError) as e: + if not self.silently_fail: + raise + + if len(models): + self.log.error("Failed to clear Solr index of models '%s': %s", ','.join(models_to_delete), e) + else: + self.log.error("Failed to clear Solr index: %s", e) + + @log_query + def search(self, query_string, **kwargs): + if len(query_string) == 0: + return { + 'results': [], + 'hits': 0, + } + + search_kwargs = self.build_search_kwargs(query_string, **kwargs) + + try: + raw_results = self.conn.search(query_string, **search_kwargs) + except (IOError, SolrError) as e: + if not self.silently_fail: + raise + + self.log.error("Failed to query Solr using '%s': %s", query_string, e) + raw_results = EmptyResults() + + return self._process_results(raw_results, highlight=kwargs.get('highlight'), result_class=kwargs.get('result_class', SearchResult), distance_point=kwargs.get('distance_point')) + + def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None, + fields='', highlight=False, facets=None, + date_facets=None, query_facets=None, + narrow_queries=None, spelling_query=None, + within=None, dwithin=None, distance_point=None, + models=None, limit_to_registered_models=None, + result_class=None, stats=None): + kwargs = {'fl': '* score'} + + if fields: + if isinstance(fields, (list, set)): + fields = " ".join(fields) + + kwargs['fl'] = fields + + if sort_by is not None: + if sort_by in ['distance asc', 'distance desc'] and distance_point: + # Do the geo-enabled sort. 
+ lng, lat = distance_point['point'].get_coords() + kwargs['sfield'] = distance_point['field'] + kwargs['pt'] = '%s,%s' % (lat, lng) + + if sort_by == 'distance asc': + kwargs['sort'] = 'geodist() asc' + else: + kwargs['sort'] = 'geodist() desc' + else: + if sort_by.startswith('distance '): + warnings.warn("In order to sort by distance, you must call the '.distance(...)' method.") + + # Regular sorting. + kwargs['sort'] = sort_by + + if start_offset is not None: + kwargs['start'] = start_offset + + if end_offset is not None: + kwargs['rows'] = end_offset - start_offset + + if highlight is True: + kwargs['hl'] = 'true' + kwargs['hl.fragsize'] = '200' + + if self.include_spelling is True: + kwargs['spellcheck'] = 'true' + kwargs['spellcheck.collate'] = 'true' + kwargs['spellcheck.count'] = 1 + + if spelling_query: + kwargs['spellcheck.q'] = spelling_query + + if facets is not None: + kwargs['facet'] = 'on' + kwargs['facet.field'] = facets.keys() + + for facet_field, options in facets.items(): + for key, value in options.items(): + kwargs['f.%s.facet.%s' % (facet_field, key)] = self.conn._from_python(value) + + if date_facets is not None: + kwargs['facet'] = 'on' + kwargs['facet.date'] = date_facets.keys() + kwargs['facet.date.other'] = 'none' + + for key, value in date_facets.items(): + kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date')) + kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date')) + gap_by_string = value.get('gap_by').upper() + gap_string = "%d%s" % (value.get('gap_amount'), gap_by_string) + + if value.get('gap_amount') != 1: + gap_string += "S" + + kwargs["f.%s.facet.date.gap" % key] = '+%s/%s' % (gap_string, gap_by_string) + + if query_facets is not None: + kwargs['facet'] = 'on' + kwargs['facet.query'] = ["%s:%s" % (field, value) for field, value in query_facets] + + if limit_to_registered_models is None: + limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) + + if models and len(models): + model_choices = sorted(get_model_ct(model) for model in models) + elif limit_to_registered_models: + # Using narrow queries, limit the results to only models handled + # with the current routers. + model_choices = self.build_models_list() + else: + model_choices = [] + + if len(model_choices) > 0: + if narrow_queries is None: + narrow_queries = set() + + narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices))) + + if narrow_queries is not None: + kwargs['fq'] = list(narrow_queries) + + if stats: + kwargs['stats'] = "true" + + for k in stats.keys(): + kwargs['stats.field'] = k + + for facet in stats[k]: + kwargs['f.%s.stats.facet' % k] = facet + + if within is not None: + from haystack.utils.geo import generate_bounding_box + + kwargs.setdefault('fq', []) + ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(within['point_1'], within['point_2']) + # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT* + # very clear on this. + bbox = '%s:[%s,%s TO %s,%s]' % (within['field'], min_lat, min_lng, max_lat, max_lng) + kwargs['fq'].append(bbox) + + if dwithin is not None: + kwargs.setdefault('fq', []) + lng, lat = dwithin['point'].get_coords() + geofilt = '{!geofilt pt=%s,%s sfield=%s d=%s}' % (lat, lng, dwithin['field'], dwithin['distance'].km) + kwargs['fq'].append(geofilt) + + # Check to see if the backend should try to include distances + # (Solr 4.X+) in the results. 
+ if self.distance_available and distance_point: + # In early testing, you can't just hand Solr 4.X a proper bounding box + # & request distances. To enable native distance would take calculating + # a center point & a radius off the user-provided box, which kinda + # sucks. We'll avoid it for now, since Solr 4.x's release will be some + # time yet. + # kwargs['fl'] += ' _dist_:geodist()' + pass + + return kwargs + + def more_like_this(self, model_instance, additional_query_string=None, + start_offset=0, end_offset=None, models=None, + limit_to_registered_models=None, result_class=None, **kwargs): + from haystack import connections + + # Deferred models will have a different class ("RealClass_Deferred_fieldname") + # which won't be in our registry: + model_klass = model_instance._meta.concrete_model + + index = connections[self.connection_alias].get_unified_index().get_index(model_klass) + field_name = index.get_content_field() + params = { + 'fl': '*,score', + } + + if start_offset is not None: + params['start'] = start_offset + + if end_offset is not None: + params['rows'] = end_offset + + narrow_queries = set() + + if limit_to_registered_models is None: + limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) + + if models and len(models): + model_choices = sorted(get_model_ct(model) for model in models) + elif limit_to_registered_models: + # Using narrow queries, limit the results to only models handled + # with the current routers. + model_choices = self.build_models_list() + else: + model_choices = [] + + if len(model_choices) > 0: + if narrow_queries is None: + narrow_queries = set() + + narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices))) + + if additional_query_string: + narrow_queries.add(additional_query_string) + + if narrow_queries: + params['fq'] = list(narrow_queries) + + query = "%s:%s" % (ID, get_identifier(model_instance)) + + try: + raw_results = self.conn.more_like_this(query, field_name, **params) + except (IOError, SolrError) as e: + if not self.silently_fail: + raise + + self.log.error("Failed to fetch More Like This from Solr for document '%s': %s", query, e) + raw_results = EmptyResults() + + return self._process_results(raw_results, result_class=result_class) + + def _process_results(self, raw_results, highlight=False, result_class=None, distance_point=None): + from haystack import connections + results = [] + hits = raw_results.hits + facets = {} + stats = {} + spelling_suggestion = None + + if result_class is None: + result_class = SearchResult + + if hasattr(raw_results,'stats'): + stats = raw_results.stats.get('stats_fields',{}) + + if hasattr(raw_results, 'facets'): + facets = { + 'fields': raw_results.facets.get('facet_fields', {}), + 'dates': raw_results.facets.get('facet_dates', {}), + 'queries': raw_results.facets.get('facet_queries', {}), + } + + for key in ['fields']: + for facet_field in facets[key]: + # Convert to a two-tuple, as Solr's json format returns a list of + # pairs. + facets[key][facet_field] = list(zip(facets[key][facet_field][::2], facets[key][facet_field][1::2])) + + if self.include_spelling is True: + if hasattr(raw_results, 'spellcheck'): + if len(raw_results.spellcheck.get('suggestions', [])): + # For some reason, it's an array of pairs. Pull off the + # collated result from the end. 
+ spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1] + + unified_index = connections[self.connection_alias].get_unified_index() + indexed_models = unified_index.get_indexed_models() + + for raw_result in raw_results.docs: + app_label, model_name = raw_result[DJANGO_CT].split('.') + additional_fields = {} + model = haystack_get_model(app_label, model_name) + + if model and model in indexed_models: + index = unified_index.get_index(model) + index_field_map = index.field_map + for key, value in raw_result.items(): + string_key = str(key) + # re-map key if alternate name used + if string_key in index_field_map: + string_key = index_field_map[key] + + if string_key in index.fields and hasattr(index.fields[string_key], 'convert'): + additional_fields[string_key] = index.fields[string_key].convert(value) + else: + additional_fields[string_key] = self.conn._to_python(value) + + del(additional_fields[DJANGO_CT]) + del(additional_fields[DJANGO_ID]) + del(additional_fields['score']) + + if raw_result[ID] in getattr(raw_results, 'highlighting', {}): + additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]] + + if distance_point: + additional_fields['_point_of_origin'] = distance_point + + if raw_result.get('__dist__'): + from haystack.utils.geo import Distance + additional_fields['_distance'] = Distance(km=float(raw_result['__dist__'])) + else: + additional_fields['_distance'] = None + + result = result_class(app_label, model_name, raw_result[DJANGO_ID], raw_result['score'], **additional_fields) + results.append(result) + else: + hits -= 1 + + return { + 'results': results, + 'hits': hits, + 'stats': stats, + 'facets': facets, + 'spelling_suggestion': spelling_suggestion, + } + + def build_schema(self, fields): + content_field_name = '' + schema_fields = [] + + for field_name, field_class in fields.items(): + field_data = { + 'field_name': field_class.index_fieldname, + 'type': 'text_en', + 'indexed': 'true', + 'stored': 'true', + 'multi_valued': 'false', + } + + if field_class.document is True: + content_field_name = field_class.index_fieldname + + # DRL_FIXME: Perhaps move to something where, if none of these + # checks succeed, call a custom method on the form that + # returns, per-backend, the right type of storage? + if field_class.field_type in ['date', 'datetime']: + field_data['type'] = 'date' + elif field_class.field_type == 'integer': + field_data['type'] = 'long' + elif field_class.field_type == 'float': + field_data['type'] = 'float' + elif field_class.field_type == 'boolean': + field_data['type'] = 'boolean' + elif field_class.field_type == 'ngram': + field_data['type'] = 'ngram' + elif field_class.field_type == 'edge_ngram': + field_data['type'] = 'edge_ngram' + elif field_class.field_type == 'location': + field_data['type'] = 'location' + + if field_class.is_multivalued: + field_data['multi_valued'] = 'true' + + if field_class.stored is False: + field_data['stored'] = 'false' + + # Do this last to override `text` fields. + if field_class.indexed is False: + field_data['indexed'] = 'false' + + # If it's text and not being indexed, we probably don't want + # to do the normal lowercase/tokenize/stemming/etc. dance. + if field_data['type'] == 'text_en': + field_data['type'] = 'string' + + # If it's a ``FacetField``, make sure we don't postprocess it. + if hasattr(field_class, 'facet_for'): + # If it's text, it ought to be a string. 
+ if field_data['type'] == 'text_en': + field_data['type'] = 'string' + + schema_fields.append(field_data) + + return (content_field_name, schema_fields) + + def extract_file_contents(self, file_obj): + """Extract text and metadata from a structured file (PDF, MS Word, etc.) + + Uses the Solr ExtractingRequestHandler, which is based on Apache Tika. + See the Solr wiki for details: + + http://wiki.apache.org/solr/ExtractingRequestHandler + + Due to the way the ExtractingRequestHandler is implemented it completely + replaces the normal Haystack indexing process with several unfortunate + restrictions: only one file per request, the extracted data is added to + the index with no ability to modify it, etc. To simplify the process and + allow for more advanced use we'll run using the extract-only mode to + return the extracted data without adding it to the index so we can then + use it within Haystack's normal templating process. + + Returns None if metadata cannot be extracted; otherwise returns a + dictionary containing at least two keys: + + :contents: + Extracted full-text content, if applicable + :metadata: + key:value pairs of text strings + """ + + try: + return self.conn.extract(file_obj) + except Exception as e: + self.log.warning(u"Unable to extract file contents: %s", e, + exc_info=True, extra={"data": {"file": file_obj}}) + return None + + +class SolrSearchQuery(BaseSearchQuery): + def matching_all_fragment(self): + return '*:*' + + def build_query_fragment(self, field, filter_type, value): + from haystack import connections + query_frag = '' + + if not hasattr(value, 'input_type_name'): + # Handle when we've got a ``ValuesListQuerySet``... + if hasattr(value, 'values_list'): + value = list(value) + + if isinstance(value, six.string_types): + # It's not an ``InputType``. Assume ``Clean``. + value = Clean(value) + else: + value = PythonData(value) + + # Prepare the query using the InputType. + prepared_value = value.prepare(self) + + if not isinstance(prepared_value, (set, list, tuple)): + # Then convert whatever we get back to what pysolr wants if needed. + prepared_value = self.backend.conn._from_python(prepared_value) + + # 'content' is a special reserved word, much like 'pk' in + # Django's ORM layer. It indicates 'no special field'. + if field == 'content': + index_fieldname = '' + else: + index_fieldname = u'%s:' % connections[self._using].get_unified_index().get_index_fieldname(field) + + filter_types = { + 'contains': u'%s', + 'startswith': u'%s*', + 'exact': u'%s', + 'gt': u'{%s TO *}', + 'gte': u'[%s TO *]', + 'lt': u'{* TO %s}', + 'lte': u'[* TO %s]', + } + + if value.post_process is False: + query_frag = prepared_value + else: + if filter_type in ['contains', 'startswith']: + if value.input_type_name == 'exact': + query_frag = prepared_value + else: + # Iterate over terms & incorportate the converted form of each into the query. 
+ terms = [] + + for possible_value in prepared_value.split(' '): + terms.append(filter_types[filter_type] % self.backend.conn._from_python(possible_value)) + + if len(terms) == 1: + query_frag = terms[0] + else: + query_frag = u"(%s)" % " AND ".join(terms) + elif filter_type == 'in': + in_options = [] + + for possible_value in prepared_value: + in_options.append(u'"%s"' % self.backend.conn._from_python(possible_value)) + + query_frag = u"(%s)" % " OR ".join(in_options) + elif filter_type == 'range': + start = self.backend.conn._from_python(prepared_value[0]) + end = self.backend.conn._from_python(prepared_value[1]) + query_frag = u'["%s" TO "%s"]' % (start, end) + elif filter_type == 'exact': + if value.input_type_name == 'exact': + query_frag = prepared_value + else: + prepared_value = Exact(prepared_value).prepare(self) + query_frag = filter_types[filter_type] % prepared_value + else: + if value.input_type_name != 'exact': + prepared_value = Exact(prepared_value).prepare(self) + + query_frag = filter_types[filter_type] % prepared_value + + if len(query_frag) and not isinstance(value, Raw): + if not query_frag.startswith('(') and not query_frag.endswith(')'): + query_frag = "(%s)" % query_frag + + return u"%s%s" % (index_fieldname, query_frag) + + def build_alt_parser_query(self, parser_name, query_string='', **kwargs): + if query_string: + query_string = Clean(query_string).prepare(self) + + kwarg_bits = [] + + for key in sorted(kwargs.keys()): + if isinstance(kwargs[key], six.string_types) and ' ' in kwargs[key]: + kwarg_bits.append(u"%s='%s'" % (key, kwargs[key])) + else: + kwarg_bits.append(u"%s=%s" % (key, kwargs[key])) + + return u'_query_:"{!%s %s}%s"' % (parser_name, Clean(' '.join(kwarg_bits)), query_string) + + def build_params(self, spelling_query=None, **kwargs): + search_kwargs = { + 'start_offset': self.start_offset, + 'result_class': self.result_class + } + order_by_list = None + + if self.order_by: + if order_by_list is None: + order_by_list = [] + + for order_by in self.order_by: + if order_by.startswith('-'): + order_by_list.append('%s desc' % order_by[1:]) + else: + order_by_list.append('%s asc' % order_by) + + search_kwargs['sort_by'] = ", ".join(order_by_list) + + if self.date_facets: + search_kwargs['date_facets'] = self.date_facets + + if self.distance_point: + search_kwargs['distance_point'] = self.distance_point + + if self.dwithin: + search_kwargs['dwithin'] = self.dwithin + + if self.end_offset is not None: + search_kwargs['end_offset'] = self.end_offset + + if self.facets: + search_kwargs['facets'] = self.facets + + if self.fields: + search_kwargs['fields'] = self.fields + + if self.highlight: + search_kwargs['highlight'] = self.highlight + + if self.models: + search_kwargs['models'] = self.models + + if self.narrow_queries: + search_kwargs['narrow_queries'] = self.narrow_queries + + if self.query_facets: + search_kwargs['query_facets'] = self.query_facets + + if self.within: + search_kwargs['within'] = self.within + + if spelling_query: + search_kwargs['spelling_query'] = spelling_query + + if self.stats: + search_kwargs['stats'] = self.stats + + return search_kwargs + + def run(self, spelling_query=None, **kwargs): + """Builds and executes the query. 
Returns a list of search results.""" + final_query = self.build_query() + search_kwargs = self.build_params(spelling_query, **kwargs) + + if kwargs: + search_kwargs.update(kwargs) + + results = self.backend.search(final_query, **search_kwargs) + self._results = results.get('results', []) + self._hit_count = results.get('hits', 0) + self._facet_counts = self.post_process_facets(results) + self._stats = results.get('stats',{}) + self._spelling_suggestion = results.get('spelling_suggestion', None) + + def run_mlt(self, **kwargs): + """Builds and executes the query. Returns a list of search results.""" + if self._more_like_this is False or self._mlt_instance is None: + raise MoreLikeThisError("No instance was provided to determine 'More Like This' results.") + + additional_query_string = self.build_query() + search_kwargs = { + 'start_offset': self.start_offset, + 'result_class': self.result_class, + 'models': self.models + } + + if self.end_offset is not None: + search_kwargs['end_offset'] = self.end_offset - self.start_offset + + results = self.backend.more_like_this(self._mlt_instance, additional_query_string, **search_kwargs) + self._results = results.get('results', []) + self._hit_count = results.get('hits', 0) + + +class SolrEngine(BaseEngine): + backend = SolrSearchBackend + query = SolrSearchQuery diff --git a/haystack/backends/whoosh_backend.py b/haystack/backends/whoosh_backend.py new file mode 100644 index 0000000..bf26adc --- /dev/null +++ b/haystack/backends/whoosh_backend.py @@ -0,0 +1,916 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import re +import shutil +import threading +import warnings + +from django.conf import settings +from django.core.exceptions import ImproperlyConfigured +from django.utils import six +from django.utils.datetime_safe import datetime + +from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query +from haystack.constants import DJANGO_CT, DJANGO_ID, ID +from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument +from haystack.inputs import Clean, Exact, PythonData, Raw +from haystack.models import SearchResult +from haystack.utils import log as logging +from haystack.utils import get_identifier, get_model_ct +from haystack.utils.app_loading import haystack_get_model + +try: + import json +except ImportError: + try: + import simplejson as json + except ImportError: + from django.utils import simplejson as json + +try: + from django.utils.encoding import force_text +except ImportError: + from django.utils.encoding import force_unicode as force_text + +try: + import whoosh +except ImportError: + raise MissingDependency("The 'whoosh' backend requires the installation of 'Whoosh'. Please refer to the documentation.") + +# Handle minimum requirement. +if not hasattr(whoosh, '__version__') or whoosh.__version__ < (2, 5, 0): + raise MissingDependency("The 'whoosh' backend requires version 2.5.0 or greater.") + +# Bubble up the correct error. 
+from whoosh import index
+from whoosh.analysis import StemmingAnalyzer
+from whoosh.fields import ID as WHOOSH_ID
+from whoosh.fields import BOOLEAN, DATETIME, IDLIST, KEYWORD, NGRAM, NGRAMWORDS, NUMERIC, Schema, TEXT
+from whoosh.filedb.filestore import FileStorage, RamStorage
+from whoosh.highlight import highlight as whoosh_highlight
+from whoosh.highlight import ContextFragmenter, HtmlFormatter
+from whoosh.qparser import QueryParser
+from whoosh.searching import ResultsPage
+from whoosh.writing import AsyncWriter
+
+
+DATETIME_REGEX = re.compile('^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')
+LOCALS = threading.local()
+LOCALS.RAM_STORE = None
+
+
+class WhooshHtmlFormatter(HtmlFormatter):
+    """
+    This is a HtmlFormatter simpler than the whoosh.HtmlFormatter.
+    We use it to have consistent results across backends. Specifically,
+    Solr, Xapian and Elasticsearch are using this formatting.
+    """
+    template = '<%(tag)s>%(t)s</%(tag)s>'
+
+
+class WhooshSearchBackend(BaseSearchBackend):
+    # Word reserved by Whoosh for special use.
+    RESERVED_WORDS = (
+        'AND',
+        'NOT',
+        'OR',
+        'TO',
+    )
+
+    # Characters reserved by Whoosh for special use.
+    # The '\\' must come first, so as not to overwrite the other slash replacements.
+    RESERVED_CHARACTERS = (
+        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
+        '[', ']', '^', '"', '~', '*', '?', ':', '.',
+    )
+
+    def __init__(self, connection_alias, **connection_options):
+        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
+        self.setup_complete = False
+        self.use_file_storage = True
+        self.post_limit = getattr(connection_options, 'POST_LIMIT', 128 * 1024 * 1024)
+        self.path = connection_options.get('PATH')
+
+        if connection_options.get('STORAGE', 'file') != 'file':
+            self.use_file_storage = False
+
+        if self.use_file_storage and not self.path:
+            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)
+
+        self.log = logging.getLogger('haystack')
+
+    def setup(self):
+        """
+        Defers loading until needed.
+        """
+        from haystack import connections
+        new_index = False
+
+        # Make sure the index is there.
+        if self.use_file_storage and not os.path.exists(self.path):
+            os.makedirs(self.path)
+            new_index = True
+
+        if self.use_file_storage and not os.access(self.path, os.W_OK):
+            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)
+
+        if self.use_file_storage:
+            self.storage = FileStorage(self.path)
+        else:
+            global LOCALS
+
+            if LOCALS.RAM_STORE is None:
+                LOCALS.RAM_STORE = RamStorage()
+
+            self.storage = LOCALS.RAM_STORE
+
+        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
+        self.parser = QueryParser(self.content_field_name, schema=self.schema)
+
+        if new_index is True:
+            self.index = self.storage.create_index(self.schema)
+        else:
+            try:
+                self.index = self.storage.open_index(schema=self.schema)
+            except index.EmptyIndexError:
+                self.index = self.storage.create_index(self.schema)
+
+        self.setup_complete = True
+
+    def build_schema(self, fields):
+        schema_fields = {
+            ID: WHOOSH_ID(stored=True, unique=True),
+            DJANGO_CT: WHOOSH_ID(stored=True),
+            DJANGO_ID: WHOOSH_ID(stored=True),
+        }
+        # Grab the number of keys that are hard-coded into Haystack.
+        # We'll use this to (possibly) fail slightly more gracefully later.
+ initial_key_count = len(schema_fields) + content_field_name = '' + + for field_name, field_class in fields.items(): + if field_class.is_multivalued: + if field_class.indexed is False: + schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost) + else: + schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost) + elif field_class.field_type in ['date', 'datetime']: + schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored, sortable=True) + elif field_class.field_type == 'integer': + schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=int, field_boost=field_class.boost) + elif field_class.field_type == 'float': + schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=float, field_boost=field_class.boost) + elif field_class.field_type == 'boolean': + # Field boost isn't supported on BOOLEAN as of 1.8.2. + schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored) + elif field_class.field_type == 'ngram': + schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost) + elif field_class.field_type == 'edge_ngram': + schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost) + else: + schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True) + + if field_class.document is True: + content_field_name = field_class.index_fieldname + schema_fields[field_class.index_fieldname].spelling = True + + # Fail more gracefully than relying on the backend to die if no fields + # are found. + if len(schema_fields) <= initial_key_count: + raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.") + + return (content_field_name, Schema(**schema_fields)) + + def update(self, index, iterable, commit=True): + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() + writer = AsyncWriter(self.index) + + for obj in iterable: + try: + doc = index.full_prepare(obj) + except SkipDocument: + self.log.debug(u"Indexing for object `%s` skipped", obj) + else: + # Really make sure it's unicode, because Whoosh won't have it any + # other way. + for key in doc: + doc[key] = self._from_python(doc[key]) + + # Document boosts aren't supported in Whoosh 2.5.0+. + if 'boost' in doc: + del doc['boost'] + + try: + writer.update_document(**doc) + except Exception as e: + if not self.silently_fail: + raise + + # We'll log the object identifier but won't include the actual object + # to avoid the possibility of that generating encoding errors while + # processing the log message: + self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={ + "data": { + "index": index, + "object": get_identifier(obj) + } + }) + + if len(iterable) > 0: + # For now, commit no matter what, as we run into locking issues otherwise. 
+ writer.commit() + + def remove(self, obj_or_string, commit=True): + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() + whoosh_id = get_identifier(obj_or_string) + + try: + self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' % (ID, whoosh_id))) + except Exception as e: + if not self.silently_fail: + raise + + self.log.error("Failed to remove document '%s' from Whoosh: %s", whoosh_id, e) + + def clear(self, models=[], commit=True): + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() + + try: + if not models: + self.delete_index() + else: + models_to_delete = [] + + for model in models: + models_to_delete.append(u"%s:%s" % (DJANGO_CT, get_model_ct(model))) + + self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete))) + except Exception as e: + if not self.silently_fail: + raise + + self.log.error("Failed to clear documents from Whoosh: %s", e) + + def delete_index(self): + # Per the Whoosh mailing list, if wiping out everything from the index, + # it's much more efficient to simply delete the index files. + if self.use_file_storage and os.path.exists(self.path): + shutil.rmtree(self.path) + elif not self.use_file_storage: + self.storage.clean() + + # Recreate everything. + self.setup() + + def optimize(self): + if not self.setup_complete: + self.setup() + + self.index = self.index.refresh() + self.index.optimize() + + def calculate_page(self, start_offset=0, end_offset=None): + # Prevent against Whoosh throwing an error. Requires an end_offset + # greater than 0. + if not end_offset is None and end_offset <= 0: + end_offset = 1 + + # Determine the page. + page_num = 0 + + if end_offset is None: + end_offset = 1000000 + + if start_offset is None: + start_offset = 0 + + page_length = end_offset - start_offset + + if page_length and page_length > 0: + page_num = int(start_offset / page_length) + + # Increment because Whoosh uses 1-based page numbers. + page_num += 1 + return page_num, page_length + + @log_query + def search(self, query_string, sort_by=None, start_offset=0, end_offset=None, + fields='', highlight=False, facets=None, date_facets=None, query_facets=None, + narrow_queries=None, spelling_query=None, within=None, + dwithin=None, distance_point=None, models=None, + limit_to_registered_models=None, result_class=None, **kwargs): + if not self.setup_complete: + self.setup() + + # A zero length query should return no results. + if len(query_string) == 0: + return { + 'results': [], + 'hits': 0, + } + + query_string = force_text(query_string) + + # A one-character query (non-wildcard) gets nabbed by a stopwords + # filter and should yield zero results. + if len(query_string) <= 1 and query_string != u'*': + return { + 'results': [], + 'hits': 0, + } + + reverse = False + + if sort_by is not None: + # Determine if we need to reverse the results and if Whoosh can + # handle what it's being asked to sort by. Reversing is an + # all-or-nothing action, unfortunately. 
+ sort_by_list = [] + reverse_counter = 0 + + for order_by in sort_by: + if order_by.startswith('-'): + reverse_counter += 1 + + if reverse_counter and reverse_counter != len(sort_by): + raise SearchBackendError("Whoosh requires all order_by fields" + " to use the same sort direction") + + for order_by in sort_by: + if order_by.startswith('-'): + sort_by_list.append(order_by[1:]) + + if len(sort_by_list) == 1: + reverse = True + else: + sort_by_list.append(order_by) + + if len(sort_by_list) == 1: + reverse = False + + sort_by = sort_by_list[0] + + if facets is not None: + warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2) + + if date_facets is not None: + warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2) + + if query_facets is not None: + warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2) + + narrowed_results = None + self.index = self.index.refresh() + + if limit_to_registered_models is None: + limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) + + if models and len(models): + model_choices = sorted(get_model_ct(model) for model in models) + elif limit_to_registered_models: + # Using narrow queries, limit the results to only models handled + # with the current routers. + model_choices = self.build_models_list() + else: + model_choices = [] + + if len(model_choices) > 0: + if narrow_queries is None: + narrow_queries = set() + + narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices])) + + narrow_searcher = None + + if narrow_queries is not None: + # Potentially expensive? I don't see another way to do it in Whoosh... + narrow_searcher = self.index.searcher() + + for nq in narrow_queries: + recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)), + limit=None) + + if len(recent_narrowed_results) <= 0: + return { + 'results': [], + 'hits': 0, + } + + if narrowed_results: + narrowed_results.filter(recent_narrowed_results) + else: + narrowed_results = recent_narrowed_results + + self.index = self.index.refresh() + + if self.index.doc_count(): + searcher = self.index.searcher() + parsed_query = self.parser.parse(query_string) + + # In the event of an invalid/stopworded query, recover gracefully. + if parsed_query is None: + return { + 'results': [], + 'hits': 0, + } + + page_num, page_length = self.calculate_page(start_offset, end_offset) + + search_kwargs = { + 'pagelen': page_length, + 'sortedby': sort_by, + 'reverse': reverse, + } + + # Handle the case where the results have been narrowed. + if narrowed_results is not None: + search_kwargs['filter'] = narrowed_results + + try: + raw_page = searcher.search_page( + parsed_query, + page_num, + **search_kwargs + ) + except ValueError: + if not self.silently_fail: + raise + + return { + 'results': [], + 'hits': 0, + 'spelling_suggestion': None, + } + + # Because as of Whoosh 2.5.1, it will return the wrong page of + # results if you request something too high. 
:( + if raw_page.pagenum < page_num: + return { + 'results': [], + 'hits': 0, + 'spelling_suggestion': None, + } + + results = self._process_results(raw_page, highlight=highlight, query_string=query_string, spelling_query=spelling_query, result_class=result_class) + searcher.close() + + if hasattr(narrow_searcher, 'close'): + narrow_searcher.close() + + return results + else: + if self.include_spelling: + if spelling_query: + spelling_suggestion = self.create_spelling_suggestion(spelling_query) + else: + spelling_suggestion = self.create_spelling_suggestion(query_string) + else: + spelling_suggestion = None + + return { + 'results': [], + 'hits': 0, + 'spelling_suggestion': spelling_suggestion, + } + + def more_like_this(self, model_instance, additional_query_string=None, + start_offset=0, end_offset=None, models=None, + limit_to_registered_models=None, result_class=None, **kwargs): + if not self.setup_complete: + self.setup() + + # Deferred models will have a different class ("RealClass_Deferred_fieldname") + # which won't be in our registry: + model_klass = model_instance._meta.concrete_model + + field_name = self.content_field_name + narrow_queries = set() + narrowed_results = None + self.index = self.index.refresh() + + if limit_to_registered_models is None: + limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True) + + if models and len(models): + model_choices = sorted(get_model_ct(model) for model in models) + elif limit_to_registered_models: + # Using narrow queries, limit the results to only models handled + # with the current routers. + model_choices = self.build_models_list() + else: + model_choices = [] + + if len(model_choices) > 0: + if narrow_queries is None: + narrow_queries = set() + + narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices])) + + if additional_query_string and additional_query_string != '*': + narrow_queries.add(additional_query_string) + + narrow_searcher = None + + if narrow_queries is not None: + # Potentially expensive? I don't see another way to do it in Whoosh... + narrow_searcher = self.index.searcher() + + for nq in narrow_queries: + recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)), + limit=None) + + if len(recent_narrowed_results) <= 0: + return { + 'results': [], + 'hits': 0, + } + + if narrowed_results: + narrowed_results.filter(recent_narrowed_results) + else: + narrowed_results = recent_narrowed_results + + page_num, page_length = self.calculate_page(start_offset, end_offset) + + self.index = self.index.refresh() + raw_results = EmptyResults() + + if self.index.doc_count(): + query = "%s:%s" % (ID, get_identifier(model_instance)) + searcher = self.index.searcher() + parsed_query = self.parser.parse(query) + results = searcher.search(parsed_query) + + if len(results): + raw_results = results[0].more_like_this(field_name, top=end_offset) + + # Handle the case where the results have been narrowed. + if narrowed_results is not None and hasattr(raw_results, 'filter'): + raw_results.filter(narrowed_results) + + try: + raw_page = ResultsPage(raw_results, page_num, page_length) + except ValueError: + if not self.silently_fail: + raise + + return { + 'results': [], + 'hits': 0, + 'spelling_suggestion': None, + } + + # Because as of Whoosh 2.5.1, it will return the wrong page of + # results if you request something too high. 
:( + if raw_page.pagenum < page_num: + return { + 'results': [], + 'hits': 0, + 'spelling_suggestion': None, + } + + results = self._process_results(raw_page, result_class=result_class) + searcher.close() + + if hasattr(narrow_searcher, 'close'): + narrow_searcher.close() + + return results + + def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None, result_class=None): + from haystack import connections + results = [] + + # It's important to grab the hits first before slicing. Otherwise, this + # can cause pagination failures. + hits = len(raw_page) + + if result_class is None: + result_class = SearchResult + + facets = {} + spelling_suggestion = None + unified_index = connections[self.connection_alias].get_unified_index() + indexed_models = unified_index.get_indexed_models() + + for doc_offset, raw_result in enumerate(raw_page): + score = raw_page.score(doc_offset) or 0 + app_label, model_name = raw_result[DJANGO_CT].split('.') + additional_fields = {} + model = haystack_get_model(app_label, model_name) + + if model and model in indexed_models: + for key, value in raw_result.items(): + index = unified_index.get_index(model) + string_key = str(key) + + if string_key in index.fields and hasattr(index.fields[string_key], 'convert'): + # Special-cased due to the nature of KEYWORD fields. + if index.fields[string_key].is_multivalued: + if value is None or len(value) is 0: + additional_fields[string_key] = [] + else: + additional_fields[string_key] = value.split(',') + else: + additional_fields[string_key] = index.fields[string_key].convert(value) + else: + additional_fields[string_key] = self._to_python(value) + + del(additional_fields[DJANGO_CT]) + del(additional_fields[DJANGO_ID]) + + if highlight: + sa = StemmingAnalyzer() + formatter = WhooshHtmlFormatter('em') + terms = [token.text for token in sa(query_string)] + + whoosh_result = whoosh_highlight( + additional_fields.get(self.content_field_name), + terms, + sa, + ContextFragmenter(), + formatter + ) + additional_fields['highlighted'] = { + self.content_field_name: [whoosh_result], + } + + result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields) + results.append(result) + else: + hits -= 1 + + if self.include_spelling: + if spelling_query: + spelling_suggestion = self.create_spelling_suggestion(spelling_query) + else: + spelling_suggestion = self.create_spelling_suggestion(query_string) + + return { + 'results': results, + 'hits': hits, + 'facets': facets, + 'spelling_suggestion': spelling_suggestion, + } + + def create_spelling_suggestion(self, query_string): + spelling_suggestion = None + reader = self.index.reader() + corrector = reader.corrector(self.content_field_name) + cleaned_query = force_text(query_string) + + if not query_string: + return spelling_suggestion + + # Clean the string. + for rev_word in self.RESERVED_WORDS: + cleaned_query = cleaned_query.replace(rev_word, '') + + for rev_char in self.RESERVED_CHARACTERS: + cleaned_query = cleaned_query.replace(rev_char, '') + + # Break it down. + query_words = cleaned_query.split() + suggested_words = [] + + for word in query_words: + suggestions = corrector.suggest(word, limit=1) + + if len(suggestions) > 0: + suggested_words.append(suggestions[0]) + + spelling_suggestion = ' '.join(suggested_words) + return spelling_suggestion + + def _from_python(self, value): + """ + Converts Python values to a string for Whoosh. + + Code courtesy of pysolr. 
+ """ + if hasattr(value, 'strftime'): + if not hasattr(value, 'hour'): + value = datetime(value.year, value.month, value.day, 0, 0, 0) + elif isinstance(value, bool): + if value: + value = 'true' + else: + value = 'false' + elif isinstance(value, (list, tuple)): + value = u','.join([force_text(v) for v in value]) + elif isinstance(value, (six.integer_types, float)): + # Leave it alone. + pass + else: + value = force_text(value) + return value + + def _to_python(self, value): + """ + Converts values from Whoosh to native Python values. + + A port of the same method in pysolr, as they deal with data the same way. + """ + if value == 'true': + return True + elif value == 'false': + return False + + if value and isinstance(value, six.string_types): + possible_datetime = DATETIME_REGEX.search(value) + + if possible_datetime: + date_values = possible_datetime.groupdict() + + for dk, dv in date_values.items(): + date_values[dk] = int(dv) + + return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second']) + + try: + # Attempt to use json to load the values. + converted_value = json.loads(value) + + # Try to handle most built-in types. + if isinstance(converted_value, (list, tuple, set, dict, six.integer_types, float, complex)): + return converted_value + except: + # If it fails (SyntaxError or its ilk) or we don't trust it, + # continue on. + pass + + return value + + +class WhooshSearchQuery(BaseSearchQuery): + def _convert_datetime(self, date): + if hasattr(date, 'hour'): + return force_text(date.strftime('%Y%m%d%H%M%S')) + else: + return force_text(date.strftime('%Y%m%d000000')) + + def clean(self, query_fragment): + """ + Provides a mechanism for sanitizing user input before presenting the + value to the backend. + + Whoosh 1.X differs here in that you can no longer use a backslash + to escape reserved characters. Instead, the whole word should be + quoted. + """ + words = query_fragment.split() + cleaned_words = [] + + for word in words: + if word in self.backend.RESERVED_WORDS: + word = word.replace(word, word.lower()) + + for char in self.backend.RESERVED_CHARACTERS: + if char in word: + word = "'%s'" % word + break + + cleaned_words.append(word) + + return ' '.join(cleaned_words) + + def build_query_fragment(self, field, filter_type, value): + from haystack import connections + query_frag = '' + is_datetime = False + + if not hasattr(value, 'input_type_name'): + # Handle when we've got a ``ValuesListQuerySet``... + if hasattr(value, 'values_list'): + value = list(value) + + if hasattr(value, 'strftime'): + is_datetime = True + + if isinstance(value, six.string_types) and value != ' ': + # It's not an ``InputType``. Assume ``Clean``. + value = Clean(value) + else: + value = PythonData(value) + + # Prepare the query using the InputType. + prepared_value = value.prepare(self) + + if not isinstance(prepared_value, (set, list, tuple)): + # Then convert whatever we get back to what pysolr wants if needed. + prepared_value = self.backend._from_python(prepared_value) + + # 'content' is a special reserved word, much like 'pk' in + # Django's ORM layer. It indicates 'no special field'. 
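A rough sketch of how the two conversion helpers above behave, assuming ``backend`` is an already set-up ``WhooshSearchBackend`` instance and the values are hypothetical::

    from datetime import date

    # _from_python() flattens Python values into what gets stored in Whoosh.
    backend._from_python(True)               # 'true'
    backend._from_python(['red', 'green'])   # u'red,green'
    backend._from_python(date(2015, 7, 7))   # datetime(2015, 7, 7, 0, 0, 0)

    # _to_python() reverses the process for values coming back out.
    backend._to_python('false')                  # False
    backend._to_python('2015-07-07T22:12:20')    # datetime(2015, 7, 7, 22, 12, 20)
    backend._to_python('[1, 2, 3]')              # [1, 2, 3], via json.loads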
+ if field == 'content': + index_fieldname = '' + else: + index_fieldname = u'%s:' % connections[self._using].get_unified_index().get_index_fieldname(field) + + filter_types = { + 'contains': '%s', + 'startswith': "%s*", + 'exact': '%s', + 'gt': "{%s to}", + 'gte': "[%s to]", + 'lt': "{to %s}", + 'lte': "[to %s]", + } + + if value.post_process is False: + query_frag = prepared_value + else: + if filter_type in ['contains', 'startswith']: + if value.input_type_name == 'exact': + query_frag = prepared_value + else: + # Iterate over terms & incorportate the converted form of each into the query. + terms = [] + + if isinstance(prepared_value, six.string_types): + possible_values = prepared_value.split(' ') + else: + if is_datetime is True: + prepared_value = self._convert_datetime(prepared_value) + + possible_values = [prepared_value] + + for possible_value in possible_values: + terms.append(filter_types[filter_type] % self.backend._from_python(possible_value)) + + if len(terms) == 1: + query_frag = terms[0] + else: + query_frag = u"(%s)" % " AND ".join(terms) + elif filter_type == 'in': + in_options = [] + + for possible_value in prepared_value: + is_datetime = False + + if hasattr(possible_value, 'strftime'): + is_datetime = True + + pv = self.backend._from_python(possible_value) + + if is_datetime is True: + pv = self._convert_datetime(pv) + + if isinstance(pv, six.string_types) and not is_datetime: + in_options.append('"%s"' % pv) + else: + in_options.append('%s' % pv) + + query_frag = "(%s)" % " OR ".join(in_options) + elif filter_type == 'range': + start = self.backend._from_python(prepared_value[0]) + end = self.backend._from_python(prepared_value[1]) + + if hasattr(prepared_value[0], 'strftime'): + start = self._convert_datetime(start) + + if hasattr(prepared_value[1], 'strftime'): + end = self._convert_datetime(end) + + query_frag = u"[%s to %s]" % (start, end) + elif filter_type == 'exact': + if value.input_type_name == 'exact': + query_frag = prepared_value + else: + prepared_value = Exact(prepared_value).prepare(self) + query_frag = filter_types[filter_type] % prepared_value + else: + if is_datetime is True: + prepared_value = self._convert_datetime(prepared_value) + + query_frag = filter_types[filter_type] % prepared_value + + if len(query_frag) and not isinstance(value, Raw): + if not query_frag.startswith('(') and not query_frag.endswith(')'): + query_frag = "(%s)" % query_frag + + return u"%s%s" % (index_fieldname, query_frag) + + + # if not filter_type in ('in', 'range'): + # # 'in' is a bit of a special case, as we don't want to + # # convert a valid list/tuple to string. Defer handling it + # # until later... + # value = self.backend._from_python(value) + + +class WhooshEngine(BaseEngine): + backend = WhooshSearchBackend + query = WhooshSearchQuery diff --git a/haystack/constants.py b/haystack/constants.py new file mode 100644 index 0000000..202280c --- /dev/null +++ b/haystack/constants.py @@ -0,0 +1,33 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django.conf import settings + +DEFAULT_ALIAS = 'default' + +# Reserved field names +ID = getattr(settings, 'HAYSTACK_ID_FIELD', 'id') +DJANGO_CT = getattr(settings, 'HAYSTACK_DJANGO_CT_FIELD', 'django_ct') +DJANGO_ID = getattr(settings, 'HAYSTACK_DJANGO_ID_FIELD', 'django_id') + +# Default operator. Valid options are AND/OR. +DEFAULT_OPERATOR = getattr(settings, 'HAYSTACK_DEFAULT_OPERATOR', 'AND') + +# Valid expression extensions. 
+VALID_FILTERS = set(['contains', 'exact', 'gt', 'gte', 'lt', 'lte', 'in', 'startswith', 'range'])
+FILTER_SEPARATOR = '__'
+
+# The maximum number of items to display in a SearchQuerySet.__repr__
+REPR_OUTPUT_SIZE = 20
+
+# Number of SearchResults to load at a time.
+ITERATOR_LOAD_PER_QUERY = getattr(settings, 'HAYSTACK_ITERATOR_LOAD_PER_QUERY', 10)
+
+# A marker class in the hierarchy to indicate that it handles search data.
+class Indexable(object):
+    haystack_use_for_indexing = True
+
+# For the geo bits, since that's what Solr & Elasticsearch seem to silently
+# assume...
+WGS_84_SRID = 4326
diff --git a/haystack/exceptions.py b/haystack/exceptions.py
new file mode 100644
index 0000000..305bd11
--- /dev/null
+++ b/haystack/exceptions.py
@@ -0,0 +1,53 @@
+# encoding: utf-8
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+
+class HaystackError(Exception):
+    """A generic exception for all others to extend."""
+    pass
+
+
+class SearchBackendError(HaystackError):
+    """Raised when a backend can not be found."""
+    pass
+
+
+class SearchFieldError(HaystackError):
+    """Raised when a field encounters an error."""
+    pass
+
+
+class MissingDependency(HaystackError):
+    """Raised when a library a backend depends on can not be found."""
+    pass
+
+
+class NotHandled(HaystackError):
+    """Raised when a model is not handled by the router setup."""
+    pass
+
+
+class MoreLikeThisError(HaystackError):
+    """Raised when a model instance has not been provided for More Like This."""
+    pass
+
+
+class FacetingError(HaystackError):
+    """Raised when incorrect arguments have been provided for faceting."""
+    pass
+
+
+class SpatialError(HaystackError):
+    """Raised when incorrect arguments have been provided for spatial."""
+    pass
+
+
+class StatsError(HaystackError):
+    "Raised when incorrect arguments have been provided for stats"
+    pass
+
+
+class SkipDocument(HaystackError):
+    """Raised when a document should be skipped while updating"""
+    pass
diff --git a/haystack/fields.py b/haystack/fields.py
new file mode 100644
index 0000000..5fd769d
--- /dev/null
+++ b/haystack/fields.py
@@ -0,0 +1,441 @@
+# encoding: utf-8
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import re
+
+from django.template import Context, loader
+from django.utils import datetime_safe, six
+
+from haystack.exceptions import SearchFieldError
+from haystack.utils import get_model_ct_tuple
+
+
+class NOT_PROVIDED:
+    pass
+
+
+DATETIME_REGEX = re.compile('^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})(T|\s+)(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2}).*?$')
+
+
+# All the SearchFields variants.
+
+class SearchField(object):
+    """The base implementation of a search field."""
+    field_type = None
+
+    def __init__(self, model_attr=None, use_template=False, template_name=None,
+                 document=False, indexed=True, stored=True, faceted=False,
+                 default=NOT_PROVIDED, null=False, index_fieldname=None,
+                 facet_class=None, boost=1.0, weight=None):
+        # Track what the index thinks this field is called.
+        self.instance_name = None
+        self.model_attr = model_attr
+        self.use_template = use_template
+        self.template_name = template_name
+        self.document = document
+        self.indexed = indexed
+        self.stored = stored
+        self.faceted = faceted
+        self._default = default
+        self.null = null
+        self.index_fieldname = index_fieldname
+        self.boost = weight or boost
+        self.is_multivalued = False
+
+        # We supply the facet_class for making it easy to create a faceted
+        # field based off of this field.
+ self.facet_class = facet_class + + if self.facet_class is None: + self.facet_class = FacetCharField + + self.set_instance_name(None) + + def set_instance_name(self, instance_name): + self.instance_name = instance_name + + if self.index_fieldname is None: + self.index_fieldname = self.instance_name + + def has_default(self): + """Returns a boolean of whether this field has a default value.""" + return self._default is not NOT_PROVIDED + + @property + def default(self): + """Returns the default value for the field.""" + if callable(self._default): + return self._default() + + return self._default + + def prepare(self, obj): + """ + Takes data from the provided object and prepares it for storage in the + index. + """ + # Give priority to a template. + if self.use_template: + return self.prepare_template(obj) + elif self.model_attr is not None: + # Check for `__` in the field for looking through the relation. + attrs = self.model_attr.split('__') + current_object = obj + + for attr in attrs: + if not hasattr(current_object, attr): + raise SearchFieldError("The model '%s' does not have a model_attr '%s'." % (repr(current_object), attr)) + + current_object = getattr(current_object, attr, None) + + if current_object is None: + if self.has_default(): + current_object = self._default + # Fall out of the loop, given any further attempts at + # accesses will fail miserably. + break + elif self.null: + current_object = None + # Fall out of the loop, given any further attempts at + # accesses will fail miserably. + break + else: + raise SearchFieldError("The model '%s' combined with model_attr '%s' returned None, but doesn't allow a default or null value." % (repr(obj), self.model_attr)) + + if callable(current_object): + return current_object() + + return current_object + + if self.has_default(): + return self.default + else: + return None + + def prepare_template(self, obj): + """ + Flattens an object for indexing. + + This loads a template + (``search/indexes/{app_label}/{model_name}_{field_name}.txt``) and + returns the result of rendering that template. ``object`` will be in + its context. + """ + if self.instance_name is None and self.template_name is None: + raise SearchFieldError("This field requires either its instance_name variable to be populated or an explicit template_name in order to load the correct template.") + + if self.template_name is not None: + template_names = self.template_name + + if not isinstance(template_names, (list, tuple)): + template_names = [template_names] + else: + app_label, model_name = get_model_ct_tuple(obj) + template_names = ['search/indexes/%s/%s_%s.txt' % (app_label, model_name, self.instance_name)] + + t = loader.select_template(template_names) + return t.render(Context({'object': obj})) + + def convert(self, value): + """ + Handles conversion between the data found and the type of the field. + + Extending classes should override this method and provide correct + data coercion. 
+ """ + return value + + +class CharField(SearchField): + field_type = 'string' + + def __init__(self, **kwargs): + if kwargs.get('facet_class') is None: + kwargs['facet_class'] = FacetCharField + + super(CharField, self).__init__(**kwargs) + + def prepare(self, obj): + return self.convert(super(CharField, self).prepare(obj)) + + def convert(self, value): + if value is None: + return None + + return six.text_type(value) + + +class LocationField(SearchField): + field_type = 'location' + + def prepare(self, obj): + from haystack.utils.geo import ensure_point + + value = super(LocationField, self).prepare(obj) + + if value is None: + return None + + pnt = ensure_point(value) + pnt_lng, pnt_lat = pnt.get_coords() + return "%s,%s" % (pnt_lat, pnt_lng) + + def convert(self, value): + from haystack.utils.geo import ensure_point, Point + + if value is None: + return None + + if hasattr(value, 'geom_type'): + value = ensure_point(value) + return value + + if isinstance(value, six.string_types): + lat, lng = value.split(',') + elif isinstance(value, (list, tuple)): + # GeoJSON-alike + lat, lng = value[1], value[0] + elif isinstance(value, dict): + lat = value.get('lat', 0) + lng = value.get('lon', 0) + + value = Point(float(lng), float(lat)) + return value + + +class NgramField(CharField): + field_type = 'ngram' + + def __init__(self, **kwargs): + if kwargs.get('faceted') is True: + raise SearchFieldError("%s can not be faceted." % self.__class__.__name__) + + super(NgramField, self).__init__(**kwargs) + + +class EdgeNgramField(NgramField): + field_type = 'edge_ngram' + + +class IntegerField(SearchField): + field_type = 'integer' + + def __init__(self, **kwargs): + if kwargs.get('facet_class') is None: + kwargs['facet_class'] = FacetIntegerField + + super(IntegerField, self).__init__(**kwargs) + + def prepare(self, obj): + return self.convert(super(IntegerField, self).prepare(obj)) + + def convert(self, value): + if value is None: + return None + + return int(value) + + +class FloatField(SearchField): + field_type = 'float' + + def __init__(self, **kwargs): + if kwargs.get('facet_class') is None: + kwargs['facet_class'] = FacetFloatField + + super(FloatField, self).__init__(**kwargs) + + def prepare(self, obj): + return self.convert(super(FloatField, self).prepare(obj)) + + def convert(self, value): + if value is None: + return None + + return float(value) + + +class DecimalField(SearchField): + field_type = 'string' + + def __init__(self, **kwargs): + if kwargs.get('facet_class') is None: + kwargs['facet_class'] = FacetDecimalField + + super(DecimalField, self).__init__(**kwargs) + + def prepare(self, obj): + return self.convert(super(DecimalField, self).prepare(obj)) + + def convert(self, value): + if value is None: + return None + + return six.text_type(value) + + +class BooleanField(SearchField): + field_type = 'boolean' + + def __init__(self, **kwargs): + if kwargs.get('facet_class') is None: + kwargs['facet_class'] = FacetBooleanField + + super(BooleanField, self).__init__(**kwargs) + + def prepare(self, obj): + return self.convert(super(BooleanField, self).prepare(obj)) + + def convert(self, value): + if value is None: + return None + + return bool(value) + + +class DateField(SearchField): + field_type = 'date' + + def __init__(self, **kwargs): + if kwargs.get('facet_class') is None: + kwargs['facet_class'] = FacetDateField + + super(DateField, self).__init__(**kwargs) + + def convert(self, value): + if value is None: + return None + + if isinstance(value, six.string_types): + match = 
DATETIME_REGEX.search(value) + + if match: + data = match.groupdict() + return datetime_safe.date(int(data['year']), int(data['month']), int(data['day'])) + else: + raise SearchFieldError("Date provided to '%s' field doesn't appear to be a valid date string: '%s'" % (self.instance_name, value)) + + return value + + +class DateTimeField(SearchField): + field_type = 'datetime' + + def __init__(self, **kwargs): + if kwargs.get('facet_class') is None: + kwargs['facet_class'] = FacetDateTimeField + + super(DateTimeField, self).__init__(**kwargs) + + def convert(self, value): + if value is None: + return None + + if isinstance(value, six.string_types): + match = DATETIME_REGEX.search(value) + + if match: + data = match.groupdict() + return datetime_safe.datetime(int(data['year']), int(data['month']), int(data['day']), int(data['hour']), int(data['minute']), int(data['second'])) + else: + raise SearchFieldError("Datetime provided to '%s' field doesn't appear to be a valid datetime string: '%s'" % (self.instance_name, value)) + + return value + + +class MultiValueField(SearchField): + field_type = 'string' + + def __init__(self, **kwargs): + if kwargs.get('facet_class') is None: + kwargs['facet_class'] = FacetMultiValueField + + if kwargs.get('use_template') is True: + raise SearchFieldError("'%s' fields can not use templates to prepare their data." % self.__class__.__name__) + + super(MultiValueField, self).__init__(**kwargs) + self.is_multivalued = True + + def prepare(self, obj): + return self.convert(super(MultiValueField, self).prepare(obj)) + + def convert(self, value): + if value is None: + return None + + return list(value) + + +class FacetField(SearchField): + """ + ``FacetField`` is slightly different than the other fields because it can + work in conjunction with other fields as its data source. + + Accepts an optional ``facet_for`` kwarg, which should be the field name + (not ``index_fieldname``) of the field it should pull data from. + """ + instance_name = None + + def __init__(self, **kwargs): + handled_kwargs = self.handle_facet_parameters(kwargs) + super(FacetField, self).__init__(**handled_kwargs) + + def handle_facet_parameters(self, kwargs): + if kwargs.get('faceted', False): + raise SearchFieldError("FacetField (%s) does not accept the 'faceted' argument." % self.instance_name) + + if not kwargs.get('null', True): + raise SearchFieldError("FacetField (%s) does not accept False for the 'null' argument." % self.instance_name) + + if not kwargs.get('indexed', True): + raise SearchFieldError("FacetField (%s) does not accept False for the 'indexed' argument." % self.instance_name) + + if kwargs.get('facet_class'): + raise SearchFieldError("FacetField (%s) does not accept the 'facet_class' argument." % self.instance_name) + + self.facet_for = None + self.facet_class = None + + # Make sure the field is nullable. 
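For illustration, the converters above can be exercised directly from a project where Haystack is configured (the input values here are made up)::

    from haystack.fields import DateTimeField, MultiValueField

    DateTimeField().convert('2015-07-07T22:12:20')
    # -> datetime_safe.datetime(2015, 7, 7, 22, 12, 20)

    MultiValueField().convert(('solr', 'whoosh', 'elasticsearch'))
    # -> ['solr', 'whoosh', 'elasticsearch']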
+ kwargs['null'] = True + + if 'facet_for' in kwargs: + self.facet_for = kwargs['facet_for'] + del(kwargs['facet_for']) + + return kwargs + + def get_facet_for_name(self): + return self.facet_for or self.instance_name + + +class FacetCharField(FacetField, CharField): + pass + + +class FacetIntegerField(FacetField, IntegerField): + pass + + +class FacetFloatField(FacetField, FloatField): + pass + + +class FacetDecimalField(FacetField, DecimalField): + pass + + +class FacetBooleanField(FacetField, BooleanField): + pass + + +class FacetDateField(FacetField, DateField): + pass + + +class FacetDateTimeField(FacetField, DateTimeField): + pass + + +class FacetMultiValueField(FacetField, MultiValueField): + pass diff --git a/haystack/forms.py b/haystack/forms.py new file mode 100644 index 0000000..c68f91b --- /dev/null +++ b/haystack/forms.py @@ -0,0 +1,133 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django import forms +from django.db import models +from django.utils.text import capfirst +from django.utils.translation import ugettext_lazy as _ + +from haystack import connections +from haystack.constants import DEFAULT_ALIAS +from haystack.query import EmptySearchQuerySet, SearchQuerySet +from haystack.utils import get_model_ct + +try: + from django.utils.encoding import smart_text +except ImportError: + from django.utils.encoding import smart_unicode as smart_text + + +def model_choices(using=DEFAULT_ALIAS): + choices = [(get_model_ct(m), capfirst(smart_text(m._meta.verbose_name_plural))) + for m in connections[using].get_unified_index().get_indexed_models()] + return sorted(choices, key=lambda x: x[1]) + + +class SearchForm(forms.Form): + q = forms.CharField(required=False, label=_('Search'), + widget=forms.TextInput(attrs={'type': 'search'})) + + def __init__(self, *args, **kwargs): + self.searchqueryset = kwargs.pop('searchqueryset', None) + self.load_all = kwargs.pop('load_all', False) + + if self.searchqueryset is None: + self.searchqueryset = SearchQuerySet() + + super(SearchForm, self).__init__(*args, **kwargs) + + def no_query_found(self): + """ + Determines the behavior when no query was found. + + By default, no results are returned (``EmptySearchQuerySet``). + + Should you want to show all results, override this method in your + own ``SearchForm`` subclass and do ``return self.searchqueryset.all()``. 
+ """ + return EmptySearchQuerySet() + + def search(self): + if not self.is_valid(): + return self.no_query_found() + + if not self.cleaned_data.get('q'): + return self.no_query_found() + + sqs = self.searchqueryset.auto_query(self.cleaned_data['q']) + + if self.load_all: + sqs = sqs.load_all() + + return sqs + + def get_suggestion(self): + if not self.is_valid(): + return None + + return self.searchqueryset.spelling_suggestion(self.cleaned_data['q']) + + +class HighlightedSearchForm(SearchForm): + def search(self): + return super(HighlightedSearchForm, self).search().highlight() + + +class FacetedSearchForm(SearchForm): + def __init__(self, *args, **kwargs): + self.selected_facets = kwargs.pop("selected_facets", []) + super(FacetedSearchForm, self).__init__(*args, **kwargs) + + def search(self): + sqs = super(FacetedSearchForm, self).search() + + # We need to process each facet to ensure that the field name and the + # value are quoted correctly and separately: + for facet in self.selected_facets: + if ":" not in facet: + continue + + field, value = facet.split(":", 1) + + if value: + sqs = sqs.narrow(u'%s:"%s"' % (field, sqs.query.clean(value))) + + return sqs + + +class ModelSearchForm(SearchForm): + def __init__(self, *args, **kwargs): + super(ModelSearchForm, self).__init__(*args, **kwargs) + self.fields['models'] = forms.MultipleChoiceField(choices=model_choices(), required=False, label=_('Search In'), widget=forms.CheckboxSelectMultiple) + + def get_models(self): + """Return an alphabetical list of model classes in the index.""" + search_models = [] + + if self.is_valid(): + for model in self.cleaned_data['models']: + search_models.append(models.get_model(*model.split('.'))) + + return search_models + + def search(self): + sqs = super(ModelSearchForm, self).search() + return sqs.models(*self.get_models()) + + +class HighlightedModelSearchForm(ModelSearchForm): + def search(self): + return super(HighlightedModelSearchForm, self).search().highlight() + + +class FacetedModelSearchForm(ModelSearchForm): + selected_facets = forms.CharField(required=False, widget=forms.HiddenInput) + + def search(self): + sqs = super(FacetedModelSearchForm, self).search() + + if hasattr(self, 'cleaned_data') and self.cleaned_data['selected_facets']: + sqs = sqs.narrow(self.cleaned_data['selected_facets']) + + return sqs.models(*self.get_models()) diff --git a/haystack/generic_views.py b/haystack/generic_views.py new file mode 100644 index 0000000..2dad515 --- /dev/null +++ b/haystack/generic_views.py @@ -0,0 +1,126 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django.conf import settings +from django.core.paginator import Paginator +from django.views.generic import FormView +from django.views.generic.edit import FormMixin +from django.views.generic.list import MultipleObjectMixin + +from .forms import FacetedSearchForm, ModelSearchForm +from .query import SearchQuerySet + +RESULTS_PER_PAGE = getattr(settings, 'HAYSTACK_SEARCH_RESULTS_PER_PAGE', 20) + + +class SearchMixin(MultipleObjectMixin, FormMixin): + """ + A mixin that allows adding in Haystacks search functionality into + another view class. + + This mixin exhibits similar end functionality as the base Haystack search + view, but with some important distinctions oriented around greater + compatibility with Django's built-in class based views and mixins. 
+ + Normal flow: + + self.request = request + + self.form = self.build_form() + self.query = self.get_query() + self.results = self.get_results() + + return self.create_response() + + This mixin should: + + 1. Make the form + 2. Get the queryset + 3. Return the paginated queryset + + """ + template_name = 'search/search.html' + load_all = True + form_class = ModelSearchForm + queryset = SearchQuerySet() + context_object_name = None + paginate_by = RESULTS_PER_PAGE + paginate_orphans = 0 + paginator_class = Paginator + page_kwarg = 'page' + form_name = 'form' + search_field = 'q' + object_list = None + + def get_form_kwargs(self): + """ + Returns the keyword arguments for instantiating the form. + """ + kwargs = {'initial': self.get_initial()} + if self.request.method == 'GET': + kwargs.update({ + 'data': self.request.GET, + }) + kwargs.update({'searchqueryset': self.get_queryset()}) + return kwargs + + def form_invalid(self, form): + context = self.get_context_data(**{ + self.form_name: form, + 'object_list': self.get_queryset() + }) + return self.render_to_response(context) + + def form_valid(self, form): + self.queryset = form.search() + context = self.get_context_data(**{ + self.form_name: form, + 'query': form.cleaned_data.get(self.search_field), + 'object_list': self.queryset + }) + return self.render_to_response(context) + + +class FacetedSearchMixin(SearchMixin): + """ + A mixin that allows adding in a Haystack search functionality with search + faceting. + """ + form_class = FacetedSearchForm + + def get_form_kwargs(self): + kwargs = super(SearchMixin, self).get_form_kwargs() + kwargs.update({ + 'selected_facets': self.request.GET.getlist("selected_facets") + }) + return kwargs + + def get_context_data(self, **kwargs): + context = super(FacetedSearchMixin, self).get_context_data(**kwargs) + context.update({'facets': self.results.facet_counts()}) + return context + + +class SearchView(SearchMixin, FormView): + """A view class for searching a Haystack managed search index""" + + def get(self, request, *args, **kwargs): + """ + Handles GET requests and instantiates a blank version of the form. + """ + form_class = self.get_form_class() + form = self.get_form(form_class) + + if form.is_valid(): + return self.form_valid(form) + else: + return self.form_invalid(form) + + +class FacetedSearchView(FacetedSearchMixin, SearchView): + """ + A view class for searching a Haystack managed search index with + facets + """ + pass diff --git a/haystack/indexes.py b/haystack/indexes.py new file mode 100644 index 0000000..d3d53ec --- /dev/null +++ b/haystack/indexes.py @@ -0,0 +1,497 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import copy +import threading +import warnings + +from django.core.exceptions import ImproperlyConfigured +from django.utils.six import with_metaclass + +from haystack import connection_router, connections +from haystack.constants import DEFAULT_ALIAS, DJANGO_CT, DJANGO_ID, ID, Indexable +from haystack.fields import * +from haystack.manager import SearchIndexManager +from haystack.utils import get_facet_field_name, get_identifier, get_model_ct + +try: + from django.utils.encoding import force_text +except ImportError: + from django.utils.encoding import force_unicode as force_text + + +class DeclarativeMetaclass(type): + def __new__(cls, name, bases, attrs): + attrs['fields'] = {} + + # Inherit any fields from parent(s). + try: + parents = [b for b in bases if issubclass(b, SearchIndex)] + # Simulate the MRO. 
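One plausible way to hook the generic ``SearchView`` above into a project's URLconf (the URL pattern and name are illustrative, not something this patch ships)::

    # urls.py (sketch)
    from django.conf.urls import url

    from haystack.generic_views import SearchView

    urlpatterns = [
        url(r'^search/$', SearchView.as_view(), name='haystack_search'),
    ]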
+ parents.reverse() + + for p in parents: + fields = getattr(p, 'fields', None) + + if fields: + attrs['fields'].update(fields) + except NameError: + pass + + # Build a dictionary of faceted fields for cross-referencing. + facet_fields = {} + + for field_name, obj in attrs.items(): + # Only need to check the FacetFields. + if hasattr(obj, 'facet_for'): + if not obj.facet_for in facet_fields: + facet_fields[obj.facet_for] = [] + + facet_fields[obj.facet_for].append(field_name) + + built_fields = {} + + for field_name, obj in attrs.items(): + if isinstance(obj, SearchField): + field = attrs[field_name] + field.set_instance_name(field_name) + built_fields[field_name] = field + + # Only check non-faceted fields for the following info. + if not hasattr(field, 'facet_for'): + if field.faceted == True: + # If no other field is claiming this field as + # ``facet_for``, create a shadow ``FacetField``. + if not field_name in facet_fields: + shadow_facet_name = get_facet_field_name(field_name) + shadow_facet_field = field.facet_class(facet_for=field_name) + shadow_facet_field.set_instance_name(shadow_facet_name) + built_fields[shadow_facet_name] = shadow_facet_field + + attrs['fields'].update(built_fields) + + # Assigning default 'objects' query manager if it does not already exist + if not 'objects' in attrs: + try: + attrs['objects'] = SearchIndexManager(attrs['Meta'].index_label) + except (KeyError, AttributeError): + attrs['objects'] = SearchIndexManager(DEFAULT_ALIAS) + + return super(DeclarativeMetaclass, cls).__new__(cls, name, bases, attrs) + + +class SearchIndex(with_metaclass(DeclarativeMetaclass, threading.local)): + """ + Base class for building indexes. + + An example might look like this:: + + import datetime + from haystack import indexes + from myapp.models import Note + + class NoteIndex(indexes.SearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + author = indexes.CharField(model_attr='user') + pub_date = indexes.DateTimeField(model_attr='pub_date') + + def get_model(self): + return Note + + def index_queryset(self, using=None): + return self.get_model().objects.filter(pub_date__lte=datetime.datetime.now()) + + """ + def __init__(self): + self.prepared_data = None + content_fields = [] + + self.field_map = dict() + for field_name, field in self.fields.items(): + #form field map + self.field_map[field.index_fieldname] = field_name + if field.document is True: + content_fields.append(field_name) + + if not len(content_fields) == 1: + raise SearchFieldError("The index '%s' must have one (and only one) SearchField with document=True." % self.__class__.__name__) + + def get_model(self): + """ + Should return the ``Model`` class (not an instance) that the rest of the + ``SearchIndex`` should use. + + This method is required & you must override it to return the correct class. + """ + raise NotImplementedError("You must provide a 'model' method for the '%r' index." % self) + + def index_queryset(self, using=None): + """ + Get the default QuerySet to index when doing a full update. + + Subclasses can override this method to avoid indexing certain objects. + """ + return self.get_model()._default_manager.all() + + def read_queryset(self, using=None): + """ + Get the default QuerySet for read actions. + + Subclasses can override this method to work with other managers. + Useful when working with default managers that filter some objects. 
+ """ + return self.index_queryset(using=using) + + def build_queryset(self, using=None, start_date=None, end_date=None): + """ + Get the default QuerySet to index when doing an index update. + + Subclasses can override this method to take into account related + model modification times. + + The default is to use ``SearchIndex.index_queryset`` and filter + based on ``SearchIndex.get_updated_field`` + """ + extra_lookup_kwargs = {} + model = self.get_model() + updated_field = self.get_updated_field() + + update_field_msg = ("No updated date field found for '%s' " + "- not restricting by age.") % model.__name__ + + if start_date: + if updated_field: + extra_lookup_kwargs['%s__gte' % updated_field] = start_date + else: + warnings.warn(update_field_msg) + + if end_date: + if updated_field: + extra_lookup_kwargs['%s__lte' % updated_field] = end_date + else: + warnings.warn(update_field_msg) + + index_qs = None + + if hasattr(self, 'get_queryset'): + warnings.warn("'SearchIndex.get_queryset' was deprecated in Haystack v2. Please rename the method 'index_queryset'.") + index_qs = self.get_queryset() + else: + index_qs = self.index_queryset(using=using) + + if not hasattr(index_qs, 'filter'): + raise ImproperlyConfigured("The '%r' class must return a 'QuerySet' in the 'index_queryset' method." % self) + + # `.select_related()` seems like a good idea here but can fail on + # nullable `ForeignKey` as well as what seems like other cases. + return index_qs.filter(**extra_lookup_kwargs).order_by(model._meta.pk.name) + + def prepare(self, obj): + """ + Fetches and adds/alters data before indexing. + """ + self.prepared_data = { + ID: get_identifier(obj), + DJANGO_CT: get_model_ct(obj), + DJANGO_ID: force_text(obj.pk), + } + + for field_name, field in self.fields.items(): + # Use the possibly overridden name, which will default to the + # variable name of the field. + self.prepared_data[field.index_fieldname] = field.prepare(obj) + + if hasattr(self, "prepare_%s" % field_name): + value = getattr(self, "prepare_%s" % field_name)(obj) + self.prepared_data[field.index_fieldname] = value + + return self.prepared_data + + def full_prepare(self, obj): + self.prepared_data = self.prepare(obj) + + for field_name, field in self.fields.items(): + # Duplicate data for faceted fields. + if getattr(field, 'facet_for', None): + source_field_name = self.fields[field.facet_for].index_fieldname + + # If there's data there, leave it alone. Otherwise, populate it + # with whatever the related field has. + if self.prepared_data[field_name] is None and source_field_name in self.prepared_data: + self.prepared_data[field.index_fieldname] = self.prepared_data[source_field_name] + + # Remove any fields that lack a value and are ``null=True``. + if field.null is True: + if self.prepared_data[field.index_fieldname] is None: + del(self.prepared_data[field.index_fieldname]) + + return self.prepared_data + + def get_content_field(self): + """Returns the field that supplies the primary document to be indexed.""" + for field_name, field in self.fields.items(): + if field.document is True: + return field.index_fieldname + + def get_field_weights(self): + """Returns a dict of fields with weight values""" + weights = {} + for field_name, field in self.fields.items(): + if field.boost: + weights[field_name] = field.boost + return weights + + def _get_backend(self, using): + if using is None: + try: + using = connection_router.for_write(index=self)[0] + except IndexError: + # There's no backend to handle it. Bomb out. 
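To make the ``prepare_%s`` convention and the ``get_updated_field`` hook above concrete, a hypothetical index (the model and field names are invented for the example)::

    from haystack import indexes
    from myapp.models import Note

    class NoteIndex(indexes.SearchIndex, indexes.Indexable):
        text = indexes.CharField(document=True, use_template=True)
        author = indexes.CharField(model_attr='user')

        def get_model(self):
            return Note

        def prepare_author(self, obj):
            # Takes precedence over the value pulled in via model_attr='user'.
            return obj.user.get_full_name()

        def get_updated_field(self):
            # Lets date-restricted reindexing filter on this column.
            return 'updated_at'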
+ return None + + return connections[using].get_backend() + + def update(self, using=None): + """ + Updates the entire index. + + If ``using`` is provided, it specifies which connection should be + used. Default relies on the routers to decide which backend should + be used. + """ + backend = self._get_backend(using) + + if backend is not None: + backend.update(self, self.index_queryset(using=using)) + + def update_object(self, instance, using=None, **kwargs): + """ + Update the index for a single object. Attached to the class's + post-save hook. + + If ``using`` is provided, it specifies which connection should be + used. Default relies on the routers to decide which backend should + be used. + """ + # Check to make sure we want to index this first. + if self.should_update(instance, **kwargs): + backend = self._get_backend(using) + + if backend is not None: + backend.update(self, [instance]) + + def remove_object(self, instance, using=None, **kwargs): + """ + Remove an object from the index. Attached to the class's + post-delete hook. + + If ``using`` is provided, it specifies which connection should be + used. Default relies on the routers to decide which backend should + be used. + """ + backend = self._get_backend(using) + + if backend is not None: + backend.remove(instance, **kwargs) + + def clear(self, using=None): + """ + Clears the entire index. + + If ``using`` is provided, it specifies which connection should be + used. Default relies on the routers to decide which backend should + be used. + """ + backend = self._get_backend(using) + + if backend is not None: + backend.clear(models=[self.get_model()]) + + def reindex(self, using=None): + """ + Completely clear the index for this model and rebuild it. + + If ``using`` is provided, it specifies which connection should be + used. Default relies on the routers to decide which backend should + be used. + """ + self.clear(using=using) + self.update(using=using) + + def get_updated_field(self): + """ + Get the field name that represents the updated date for the model. + + If specified, this is used by the reindex command to filter out results + from the QuerySet, enabling you to reindex only recent records. This + method should either return None (reindex everything always) or a + string of the Model's DateField/DateTimeField name. + """ + return None + + def should_update(self, instance, **kwargs): + """ + Determine if an object should be updated in the index. + + It's useful to override this when an object may save frequently and + cause excessive reindexing. You should check conditions on the instance + and return False if it is not to be indexed. + + By default, returns True (always reindex). + """ + return True + + def load_all_queryset(self): + """ + Provides the ability to override how objects get loaded in conjunction + with ``SearchQuerySet.load_all``. + + This is useful for post-processing the results from the query, enabling + things like adding ``select_related`` or filtering certain data. + + By default, returns ``all()`` on the model's default manager. + """ + return self.get_model()._default_manager.all() + + +class BasicSearchIndex(SearchIndex): + text = CharField(document=True, use_template=True) + + +# End SearchIndexes +# Begin ModelSearchIndexes + + +def index_field_from_django_field(f, default=CharField): + """ + Returns the Haystack field type that would likely be associated with each + Django type. 
+ """ + result = default + + if f.get_internal_type() in ('DateField', 'DateTimeField'): + result = DateTimeField + elif f.get_internal_type() in ('BooleanField', 'NullBooleanField'): + result = BooleanField + elif f.get_internal_type() in ('CommaSeparatedIntegerField',): + result = MultiValueField + elif f.get_internal_type() in ('DecimalField', 'FloatField'): + result = FloatField + elif f.get_internal_type() in ('IntegerField', 'PositiveIntegerField', 'PositiveSmallIntegerField', 'SmallIntegerField'): + result = IntegerField + + return result + + +class ModelSearchIndex(SearchIndex): + """ + Introspects the model assigned to it and generates a `SearchIndex` based on + the fields of that model. + + In addition, it adds a `text` field that is the `document=True` field and + has `use_template=True` option set, just like the `BasicSearchIndex`. + + Usage of this class might result in inferior `SearchIndex` objects, which + can directly affect your search results. Use this to establish basic + functionality and move to custom `SearchIndex` objects for better control. + + At this time, it does not handle related fields. + """ + text = CharField(document=True, use_template=True) + # list of reserved field names + fields_to_skip = (ID, DJANGO_CT, DJANGO_ID, 'content', 'text') + + def __init__(self, extra_field_kwargs=None): + self.model = None + + self.prepared_data = None + content_fields = [] + self.extra_field_kwargs = extra_field_kwargs or {} + + # Introspect the model, adding/removing fields as needed. + # Adds/Excludes should happen only if the fields are not already + # defined in `self.fields`. + self._meta = getattr(self, 'Meta', None) + + if self._meta: + self.model = getattr(self._meta, 'model', None) + fields = getattr(self._meta, 'fields', []) + excludes = getattr(self._meta, 'excludes', []) + + # Add in the new fields. + self.fields.update(self.get_fields(fields, excludes)) + + for field_name, field in self.fields.items(): + if field.document is True: + content_fields.append(field_name) + + if not len(content_fields) == 1: + raise SearchFieldError("The index '%s' must have one (and only one) SearchField with document=True." % self.__class__.__name__) + + def should_skip_field(self, field): + """ + Given a Django model field, return if it should be included in the + contributed SearchFields. + """ + # Skip fields in skip list + if field.name in self.fields_to_skip: + return True + + # Ignore certain fields (AutoField, related fields). + if field.primary_key or getattr(field, 'rel'): + return True + + return False + + def get_model(self): + return self.model + + def get_index_fieldname(self, f): + """ + Given a Django field, return the appropriate index fieldname. + """ + return f.name + + def get_fields(self, fields=None, excludes=None): + """ + Given any explicit fields to include and fields to exclude, add + additional fields based on the associated model. 
+ """ + final_fields = {} + fields = fields or [] + excludes = excludes or [] + + for f in self.model._meta.fields: + # If the field name is already present, skip + if f.name in self.fields: + continue + + # If field is not present in explicit field listing, skip + if fields and f.name not in fields: + continue + + # If field is in exclude list, skip + if excludes and f.name in excludes: + continue + + if self.should_skip_field(f): + continue + + index_field_class = index_field_from_django_field(f) + + kwargs = copy.copy(self.extra_field_kwargs) + kwargs.update({ + 'model_attr': f.name, + }) + + if f.null is True: + kwargs['null'] = True + + if f.has_default(): + kwargs['default'] = f.default + + final_fields[f.name] = index_field_class(**kwargs) + final_fields[f.name].set_instance_name(self.get_index_fieldname(f)) + + return final_fields diff --git a/haystack/inputs.py b/haystack/inputs.py new file mode 100644 index 0000000..ef0a929 --- /dev/null +++ b/haystack/inputs.py @@ -0,0 +1,159 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import re +import warnings + +from django.utils.encoding import python_2_unicode_compatible + +try: + from django.utils.encoding import force_text +except ImportError: + from django.utils.encoding import force_unicode as force_text + + +@python_2_unicode_compatible +class BaseInput(object): + """ + The base input type. Doesn't do much. You want ``Raw`` instead. + """ + input_type_name = 'base' + post_process = True + + def __init__(self, query_string, **kwargs): + self.query_string = query_string + self.kwargs = kwargs + + def __repr__(self): + return u"<%s '%s'>" % (self.__class__.__name__, self.__unicode__().encode('utf8')) + + def __str__(self): + return force_text(self.query_string) + + def prepare(self, query_obj): + return self.query_string + + +class Raw(BaseInput): + """ + An input type for passing a query directly to the backend. + + Prone to not being very portable. + """ + input_type_name = 'raw' + post_process = False + + +class PythonData(BaseInput): + """ + Represents a bare Python non-string type. + + Largely only for internal use. + """ + input_type_name = 'python_data' + + +class Clean(BaseInput): + """ + An input type for sanitizing user/untrusted input. + """ + input_type_name = 'clean' + + def prepare(self, query_obj): + query_string = super(Clean, self).prepare(query_obj) + return query_obj.clean(query_string) + + +class Exact(BaseInput): + """ + An input type for making exact matches. + """ + input_type_name = 'exact' + + def prepare(self, query_obj): + query_string = super(Exact, self).prepare(query_obj) + + if self.kwargs.get('clean', False): + # We need to clean each part of the exact match. + exact_bits = [Clean(bit).prepare(query_obj) for bit in query_string.split(' ') if bit] + query_string = u' '.join(exact_bits) + + return query_obj.build_exact_query(query_string) + + +class Not(Clean): + """ + An input type for negating a query. + """ + input_type_name = 'not' + + def prepare(self, query_obj): + query_string = super(Not, self).prepare(query_obj) + return query_obj.build_not_query(query_string) + + +class AutoQuery(BaseInput): + """ + A convenience class that handles common user queries. + + In addition to cleaning all tokens, it handles double quote bits as + exact matches & terms with '-' in front as NOT queries. 
+    """
+    input_type_name = 'auto_query'
+    post_process = False
+    exact_match_re = re.compile(r'"(?P<phrase>.*?)"')
+
+    def prepare(self, query_obj):
+        query_string = super(AutoQuery, self).prepare(query_obj)
+        exacts = self.exact_match_re.findall(query_string)
+        tokens = []
+        query_bits = []
+
+        for rough_token in self.exact_match_re.split(query_string):
+            if not rough_token:
+                continue
+            elif not rough_token in exacts:
+                # We have something that's not an exact match but may have more
+                # than one word in it.
+                tokens.extend(rough_token.split(' '))
+            else:
+                tokens.append(rough_token)
+
+        for token in tokens:
+            if not token:
+                continue
+            if token in exacts:
+                query_bits.append(Exact(token, clean=True).prepare(query_obj))
+            elif token.startswith('-') and len(token) > 1:
+                # This might break Xapian. Check on this.
+                query_bits.append(Not(token[1:]).prepare(query_obj))
+            else:
+                query_bits.append(Clean(token).prepare(query_obj))
+
+        return u' '.join(query_bits)
+
+
+class AltParser(BaseInput):
+    """
+    If the engine supports it, this input type allows for submitting a query
+    that uses a different parser.
+    """
+    input_type_name = 'alt_parser'
+    post_process = False
+    use_parens = False
+
+    def __init__(self, parser_name, query_string='', **kwargs):
+        self.parser_name = parser_name
+        self.query_string = query_string
+        self.kwargs = kwargs
+
+    def __repr__(self):
+        return u"<%s '%s' '%s' '%s'>" % (self.__class__.__name__, self.parser_name, self.query_string, self.kwargs)
+
+    def prepare(self, query_obj):
+        if not hasattr(query_obj, 'build_alt_parser_query'):
+            warnings.warn("Use of 'AltParser' input type is being ignored, as the '%s' backend doesn't support them." % query_obj)
+            return ''
+
+        return query_obj.build_alt_parser_query(self.parser_name, self.query_string, **self.kwargs)
diff --git a/haystack/management/__init__.py b/haystack/management/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/haystack/management/commands/__init__.py b/haystack/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/haystack/management/commands/build_solr_schema.py b/haystack/management/commands/build_solr_schema.py
new file mode 100644
index 0000000..6465e37
--- /dev/null
+++ b/haystack/management/commands/build_solr_schema.py
@@ -0,0 +1,70 @@
+# encoding: utf-8
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import sys
+from optparse import make_option
+
+from django.core.exceptions import ImproperlyConfigured
+from django.core.management.base import BaseCommand
+from django.template import Context, loader
+
+from haystack import constants
+from haystack.backends.solr_backend import SolrSearchBackend
+
+
+class Command(BaseCommand):
+    help = "Generates a Solr schema that reflects the indexes."
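Typical use of the command, sketched here with ``call_command`` (equivalent to running ``manage.py build_solr_schema --filename=schema.xml`` from the shell; the filename is only an example)::

    from django.core.management import call_command

    # Write the generated schema straight to a file for the default connection.
    call_command('build_solr_schema', filename='schema.xml', using='default')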
+ base_options = ( + make_option("-f", "--filename", action="store", type="string", dest="filename", + help='If provided, directs output to a file instead of stdout.'), + make_option("-u", "--using", action="store", type="string", dest="using", default=constants.DEFAULT_ALIAS, + help='If provided, chooses a connection to work with.'), + ) + option_list = BaseCommand.option_list + base_options + + def handle(self, **options): + """Generates a Solr schema that reflects the indexes.""" + using = options.get('using') + schema_xml = self.build_template(using=using) + + if options.get('filename'): + self.write_file(options.get('filename'), schema_xml) + else: + self.print_stdout(schema_xml) + + def build_context(self, using): + from haystack import connections, connection_router + backend = connections[using].get_backend() + + if not isinstance(backend, SolrSearchBackend): + raise ImproperlyConfigured("'%s' isn't configured as a SolrEngine)." % backend.connection_alias) + + content_field_name, fields = backend.build_schema(connections[using].get_unified_index().all_searchfields()) + return Context({ + 'content_field_name': content_field_name, + 'fields': fields, + 'default_operator': constants.DEFAULT_OPERATOR, + 'ID': constants.ID, + 'DJANGO_CT': constants.DJANGO_CT, + 'DJANGO_ID': constants.DJANGO_ID, + }) + + def build_template(self, using): + t = loader.get_template('search_configuration/solr.xml') + c = self.build_context(using=using) + return t.render(c) + + def print_stdout(self, schema_xml): + sys.stderr.write("\n") + sys.stderr.write("\n") + sys.stderr.write("\n") + sys.stderr.write("Save the following output to 'schema.xml' and place it in your Solr configuration directory.\n") + sys.stderr.write("--------------------------------------------------------------------------------------------\n") + sys.stderr.write("\n") + print(schema_xml) + + def write_file(self, filename, schema_xml): + schema_file = open(filename, 'w') + schema_file.write(schema_xml) + schema_file.close() diff --git a/haystack/management/commands/clear_index.py b/haystack/management/commands/clear_index.py new file mode 100644 index 0000000..e9803e6 --- /dev/null +++ b/haystack/management/commands/clear_index.py @@ -0,0 +1,59 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import sys +from optparse import make_option + +from django.core.management.base import BaseCommand +from django.utils import six + + +class Command(BaseCommand): + help = "Clears out the search index completely." + base_options = ( + make_option('--noinput', action='store_false', dest='interactive', default=True, + help='If provided, no prompts will be issued to the user and the data will be wiped out.' + ), + make_option("-u", "--using", action="append", dest="using", + default=[], + help='Update only the named backend (can be used multiple times). ' + 'By default all backends will be updated.' + ), + make_option('--nocommit', action='store_false', dest='commit', + default=True, help='Will pass commit=False to the backend.' 
+ ), + ) + option_list = BaseCommand.option_list + base_options + + def handle(self, **options): + """Clears out the search index completely.""" + from haystack import connections + self.verbosity = int(options.get('verbosity', 1)) + self.commit = options.get('commit', True) + + using = options.get('using') + if not using: + using = connections.connections_info.keys() + + if options.get('interactive', True): + print() + print("WARNING: This will irreparably remove EVERYTHING from your search index in connection '%s'." % "', '".join(using)) + print("Your choices after this are to restore from backups or rebuild via the `rebuild_index` command.") + + yes_or_no = six.moves.input("Are you sure you wish to continue? [y/N] ") + print + + if not yes_or_no.lower().startswith('y'): + print("No action taken.") + sys.exit() + + if self.verbosity >= 1: + print("Removing all documents from your index because you said so.") + + for backend_name in using: + backend = connections[backend_name].get_backend() + backend.clear(commit=self.commit) + + if self.verbosity >= 1: + print("All documents removed.") diff --git a/haystack/management/commands/haystack_info.py b/haystack/management/commands/haystack_info.py new file mode 100644 index 0000000..9e478a6 --- /dev/null +++ b/haystack/management/commands/haystack_info.py @@ -0,0 +1,21 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django.core.management.base import NoArgsCommand + + +class Command(NoArgsCommand): + help = "Provides feedback about the current Haystack setup." + + def handle_noargs(self, **options): + """Provides feedback about the current Haystack setup.""" + from haystack import connections + + unified_index = connections['default'].get_unified_index() + indexed = unified_index.get_indexed_models() + index_count = len(indexed) + print("Number of handled %s index(es)." % index_count) + + for index in indexed: + print(" - Model: %s by Index: %s" % (index.__name__, unified_index.get_indexes()[index])) diff --git a/haystack/management/commands/rebuild_index.py b/haystack/management/commands/rebuild_index.py new file mode 100644 index 0000000..58c1096 --- /dev/null +++ b/haystack/management/commands/rebuild_index.py @@ -0,0 +1,26 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django.core.management import call_command +from django.core.management.base import BaseCommand + +from haystack.management.commands.clear_index import Command as ClearCommand +from haystack.management.commands.update_index import Command as UpdateCommand + +__all__ = ['Command'] + +_combined_options = list(BaseCommand.option_list) +_combined_options.extend(option for option in UpdateCommand.base_options + if option.get_opt_string() not in [i.get_opt_string() for i in _combined_options]) +_combined_options.extend(option for option in ClearCommand.base_options + if option.get_opt_string() not in [i.get_opt_string() for i in _combined_options]) + + +class Command(BaseCommand): + help = "Completely rebuilds the search index by removing the old data and then updating." 
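A hedged sketch of how the management commands added in this patch (build_solr_schema, clear_index, update_index and rebuild_index) can be driven from Python; the connection alias, filename and option values are placeholders.

    from django.core.management import call_command

    # Write the generated Solr schema to a file instead of stdout.
    call_command('build_solr_schema', using='default', filename='schema.xml')

    # Wipe the 'default' backend without prompting, then re-index everything
    # touched in the last 24 hours using two worker processes.
    call_command('clear_index', interactive=False, using=['default'])
    call_command('update_index', age=24, workers=2, using=['default'])

    # Or clear and update in one step.
    call_command('rebuild_index', interactive=False)
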
+ option_list = _combined_options + + def handle(self, **options): + call_command('clear_index', **options) + call_command('update_index', **options) diff --git a/haystack/management/commands/update_index.py b/haystack/management/commands/update_index.py new file mode 100755 index 0000000..81981e5 --- /dev/null +++ b/haystack/management/commands/update_index.py @@ -0,0 +1,289 @@ +# encoding: utf-8 +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging +import os +import sys +import warnings +from datetime import timedelta +from optparse import make_option + +try: + from django.db import close_old_connections +except ImportError: + # This can be removed when we drop support for Django 1.7 and earlier: + from django.db import close_connection as close_old_connections + +from django.core.management.base import LabelCommand +from django.db import reset_queries + +from haystack import connections as haystack_connections +from haystack.query import SearchQuerySet +from haystack.utils.app_loading import haystack_get_models, haystack_load_apps + +try: + from django.utils.encoding import force_text +except ImportError: + from django.utils.encoding import force_unicode as force_text + +try: + from django.utils.encoding import smart_bytes +except ImportError: + from django.utils.encoding import smart_str as smart_bytes + +try: + from django.utils.timezone import now +except ImportError: + from datetime import datetime + now = datetime.now + + +DEFAULT_BATCH_SIZE = None +DEFAULT_AGE = None +APP = 'app' +MODEL = 'model' + + +def worker(bits): + # We need to reset the connections, otherwise the different processes + # will try to share the connection, which causes things to blow up. + from django.db import connections + + for alias, info in connections.databases.items(): + # We need to also tread lightly with SQLite, because blindly wiping + # out connections (via ``... = {}``) destroys in-memory DBs. + if 'sqlite3' not in info['ENGINE']: + try: + close_old_connections() + if isinstance(connections._connections, dict): + del(connections._connections[alias]) + else: + delattr(connections._connections, alias) + except KeyError: + pass + + if bits[0] == 'do_update': + func, model, start, end, total, using, start_date, end_date, verbosity, commit = bits + elif bits[0] == 'do_remove': + func, model, pks_seen, start, upper_bound, using, verbosity, commit = bits + else: + return + + unified_index = haystack_connections[using].get_unified_index() + index = unified_index.get_index(model) + backend = haystack_connections[using].get_backend() + + if func == 'do_update': + qs = index.build_queryset(start_date=start_date, end_date=end_date) + do_update(backend, index, qs, start, end, total, verbosity=verbosity, commit=commit) + else: + raise NotImplementedError('Unknown function %s' % func) + + +def do_update(backend, index, qs, start, end, total, verbosity=1, commit=True): + # Get a clone of the QuerySet so that the cache doesn't bloat up + # in memory. Useful when reindexing large amounts of data. + small_cache_qs = qs.all() + current_qs = small_cache_qs[start:end] + + if verbosity >= 2: + if hasattr(os, 'getppid') and os.getpid() == os.getppid(): + print(" indexed %s - %d of %d." % (start + 1, end, total)) + else: + print(" indexed %s - %d of %d (by %s)." % (start + 1, end, total, os.getpid())) + + # FIXME: Get the right backend. + backend.update(index, current_qs, commit=commit) + + # Clear out the DB connections queries because it bloats up RAM. 
+ reset_queries() + + +class Command(LabelCommand): + help = "Freshens the index for the given app(s)." + base_options = ( + make_option('-a', '--age', action='store', dest='age', + default=DEFAULT_AGE, type='int', + help='Number of hours back to consider objects new.' + ), + make_option('-s', '--start', action='store', dest='start_date', + default=None, type='string', + help='The start date for indexing within. Can be any dateutil-parsable string, recommended to be YYYY-MM-DDTHH:MM:SS.' + ), + make_option('-e', '--end', action='store', dest='end_date', + default=None, type='string', + help='The end date for indexing within. Can be any dateutil-parsable string, recommended to be YYYY-MM-DDTHH:MM:SS.' + ), + make_option('-b', '--batch-size', action='store', dest='batchsize', + default=None, type='int', + help='Number of items to index at once.' + ), + make_option('-r', '--remove', action='store_true', dest='remove', + default=False, help='Remove objects from the index that are no longer present in the database.' + ), + make_option("-u", "--using", action="append", dest="using", + default=[], + help='Update only the named backend (can be used multiple times). ' + 'By default all backends will be updated.' + ), + make_option('-k', '--workers', action='store', dest='workers', + default=0, type='int', + help='Allows for the use multiple workers to parallelize indexing. Requires multiprocessing.' + ), + make_option('--nocommit', action='store_false', dest='commit', + default=True, help='Will pass commit=False to the backend.' + ), + ) + option_list = LabelCommand.option_list + base_options + + def handle(self, *items, **options): + self.verbosity = int(options.get('verbosity', 1)) + self.batchsize = options.get('batchsize', DEFAULT_BATCH_SIZE) + self.start_date = None + self.end_date = None + self.remove = options.get('remove', False) + self.workers = int(options.get('workers', 0)) + self.commit = options.get('commit', True) + + if sys.version_info < (2, 7): + warnings.warn('multiprocessing is disabled on Python 2.6 and earlier. ' + 'See https://github.com/toastdriven/django-haystack/issues/1001') + self.workers = 0 + + self.backends = options.get('using') + if not self.backends: + self.backends = haystack_connections.connections_info.keys() + + age = options.get('age', DEFAULT_AGE) + start_date = options.get('start_date') + end_date = options.get('end_date') + + if age is not None: + self.start_date = now() - timedelta(hours=int(age)) + + if start_date is not None: + from dateutil.parser import parse as dateutil_parse + + try: + self.start_date = dateutil_parse(start_date) + except ValueError: + pass + + if end_date is not None: + from dateutil.parser import parse as dateutil_parse + + try: + self.end_date = dateutil_parse(end_date) + except ValueError: + pass + + if not items: + items = haystack_load_apps() + + return super(Command, self).handle(*items, **options) + + def handle_label(self, label, **options): + for using in self.backends: + try: + self.update_backend(label, using) + except: + logging.exception("Error updating %s using %s ", label, using) + raise + + def update_backend(self, label, using): + from haystack.exceptions import NotHandled + + backend = haystack_connections[using].get_backend() + unified_index = haystack_connections[using].get_unified_index() + + if self.workers > 0: + import multiprocessing + + for model in haystack_get_models(label): + try: + index = unified_index.get_index(model) + except NotHandled: + if self.verbosity >= 2: + print("Skipping '%s' - no index." 
% model) + continue + + if self.workers > 0: + # workers resetting connections leads to references to models / connections getting + # stale and having their connection disconnected from under them. Resetting before + # the loop continues and it accesses the ORM makes it better. + close_old_connections() + + qs = index.build_queryset(using=using, start_date=self.start_date, + end_date=self.end_date) + + total = qs.count() + + if self.verbosity >= 1: + print(u"Indexing %d %s" % (total, force_text(model._meta.verbose_name_plural))) + + batch_size = self.batchsize or backend.batch_size + + if self.workers > 0: + ghetto_queue = [] + + for start in range(0, total, batch_size): + end = min(start + batch_size, total) + + if self.workers == 0: + do_update(backend, index, qs, start, end, total, verbosity=self.verbosity, commit=self.commit) + else: + ghetto_queue.append(('do_update', model, start, end, total, using, self.start_date, self.end_date, self.verbosity, self.commit)) + + if self.workers > 0: + pool = multiprocessing.Pool(self.workers) + pool.map(worker, ghetto_queue) + pool.close() + pool.join() + + if self.remove: + if self.start_date or self.end_date or total <= 0: + # They're using a reduced set, which may not incorporate + # all pks. Rebuild the list with everything. + qs = index.index_queryset().values_list('pk', flat=True) + database_pks = set(smart_bytes(pk) for pk in qs) + + total = len(database_pks) + else: + database_pks = set(smart_bytes(pk) for pk in qs.values_list('pk', flat=True)) + + # Since records may still be in the search index but not the local database + # we'll use that to create batches for processing. + # See https://github.com/django-haystack/django-haystack/issues/1186 + index_total = SearchQuerySet(using=backend.connection_alias).models(model).count() + + # Retrieve PKs from the index. Note that this cannot be a numeric range query because although + # pks are normally numeric they can be non-numeric UUIDs or other custom values. To reduce + # load on the search engine, we only retrieve the pk field, which will be checked against the + # full list obtained from the database, and the id field, which will be used to delete the + # record should it be found to be stale. + index_pks = SearchQuerySet(using=backend.connection_alias).models(model) + index_pks = index_pks.values_list('pk', 'id') + + # We'll collect all of the record IDs which are no longer present in the database and delete + # them after walking the entire index. This uses more memory than the incremental approach but + # avoids needing the pagination logic below to account for both commit modes: + stale_records = set() + + for start in range(0, index_total, batch_size): + upper_bound = start + batch_size + + # If the database pk is no longer present, queue the index key for removal: + for pk, rec_id in index_pks[start:upper_bound]: + if smart_bytes(pk) not in database_pks: + stale_records.add(rec_id) + + if stale_records: + if self.verbosity >= 1: + print(" removing %d stale records." % len(stale_records)) + + for rec_id in stale_records: + # Since the PK was not in the database list, we'll delete the record from the search index: + if self.verbosity >= 2: + print(" removing %s." 
% rec_id) + + backend.remove(rec_id, commit=self.commit) diff --git a/haystack/manager.py b/haystack/manager.py new file mode 100644 index 0000000..b7588d2 --- /dev/null +++ b/haystack/manager.py @@ -0,0 +1,107 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from haystack.query import EmptySearchQuerySet, SearchQuerySet + + +class SearchIndexManager(object): + def __init__(self, using=None): + super(SearchIndexManager, self).__init__() + self.using = using + + def get_search_queryset(self): + """Returns a new SearchQuerySet object. Subclasses can override this method + to easily customize the behavior of the Manager. + """ + return SearchQuerySet(using=self.using) + + def get_empty_query_set(self): + return EmptySearchQuerySet(using=self.using) + + def all(self): + return self.get_search_queryset() + + def none(self): + return self.get_empty_query_set() + + def filter(self, *args, **kwargs): + return self.get_search_queryset().filter(*args, **kwargs) + + def exclude(self, *args, **kwargs): + return self.get_search_queryset().exclude(*args, **kwargs) + + def filter_and(self, *args, **kwargs): + return self.get_search_queryset().filter_and(*args, **kwargs) + + def filter_or(self, *args, **kwargs): + return self.get_search_queryset().filter_or(*args, **kwargs) + + def order_by(self, *args): + return self.get_search_queryset().order_by(*args) + + def highlight(self): + return self.get_search_queryset().highlight() + + def boost(self, term, boost): + return self.get_search_queryset().boost(term, boost) + + def facet(self, field): + return self.get_search_queryset().facet(field) + + def within(self, field, point_1, point_2): + return self.get_search_queryset().within(field, point_1, point_2) + + def dwithin(self, field, point, distance): + return self.get_search_queryset().dwithin(field, point, distance) + + def distance(self, field, point): + return self.get_search_queryset().distance(field, point) + + def date_facet(self, field, start_date, end_date, gap_by, gap_amount=1): + return self.get_search_queryset().date_facet(field, start_date, end_date, gap_by, gap_amount=1) + + def query_facet(self, field, query): + return self.get_search_queryset().query_facet(field, query) + + def narrow(self, query): + return self.get_search_queryset().narrow(query) + + def raw_search(self, query_string, **kwargs): + return self.get_search_queryset().raw_search(query_string, **kwargs) + + def load_all(self): + return self.get_search_queryset().load_all() + + def auto_query(self, query_string, fieldname='content'): + return self.get_search_queryset().auto_query(query_string, fieldname=fieldname) + + def autocomplete(self, **kwargs): + return self.get_search_queryset().autocomplete(**kwargs) + + def using(self, connection_name): + return self.get_search_queryset().using(connection_name) + + def count(self): + return self.get_search_queryset().count() + + def best_match(self): + return self.get_search_queryset().best_match() + + def latest(self, date_field): + return self.get_search_queryset().latest(date_field) + + def more_like_this(self, model_instance): + return self.get_search_queryset().more_like_this(model_instance) + + def facet_counts(self): + return self.get_search_queryset().facet_counts() + + def spelling_suggestion(self, preferred_query=None): + return self.get_search_queryset().spelling_suggestion(preferred_query=None) + + def values(self, *fields): + return self.get_search_queryset().values(*fields) + + def values_list(self, *fields, 
**kwargs): + return self.get_search_queryset().values_list(*fields, **kwargs) diff --git a/haystack/models.py b/haystack/models.py new file mode 100644 index 0000000..a121207 --- /dev/null +++ b/haystack/models.py @@ -0,0 +1,247 @@ +# encoding: utf-8 + +# "Hey, Django! Look at me, I'm an app! For Serious!" + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist +from django.db import models +from django.utils import six +from django.utils.text import capfirst + +from haystack.exceptions import NotHandled, SpatialError +from haystack.utils import log as logging + +try: + from django.utils.encoding import force_text +except ImportError: + from django.utils.encoding import force_unicode as force_text + +try: + from geopy import distance as geopy_distance +except ImportError: + geopy_distance = None + + +# Not a Django model, but tightly tied to them and there doesn't seem to be a +# better spot in the tree. +class SearchResult(object): + """ + A single search result. The actual object is loaded lazily by accessing + object; until then this object only stores the model, pk, and score. + + Note that iterating over SearchResults and getting the object for each + result will do O(N) database queries, which may not fit your needs for + performance. + """ + def __init__(self, app_label, model_name, pk, score, **kwargs): + self.app_label, self.model_name = app_label, model_name + self.pk = pk + self.score = score + self._object = None + self._model = None + self._verbose_name = None + self._additional_fields = [] + self._point_of_origin = kwargs.pop('_point_of_origin', None) + self._distance = kwargs.pop('_distance', None) + self.stored_fields = None + self.log = self._get_log() + + for key, value in kwargs.items(): + if not key in self.__dict__: + self.__dict__[key] = value + self._additional_fields.append(key) + + def _get_log(self): + return logging.getLogger('haystack') + + def __repr__(self): + return "" % (self.app_label, self.model_name, self.pk) + + def __unicode__(self): + return force_text(self.__repr__()) + + def __getattr__(self, attr): + if attr == '__getnewargs__': + raise AttributeError + + return self.__dict__.get(attr, None) + + def _get_searchindex(self): + from haystack import connections + return connections['default'].get_unified_index().get_index(self.model) + + searchindex = property(_get_searchindex) + + def _get_object(self): + if self._object is None: + if self.model is None: + self.log.error("Model could not be found for SearchResult '%s'.", self) + return None + + try: + try: + self._object = self.searchindex.read_queryset().get(pk=self.pk) + except NotHandled: + self.log.warning("Model '%s.%s' not handled by the routers.", self.app_label, self.model_name) + # Revert to old behaviour + self._object = self.model._default_manager.get(pk=self.pk) + except ObjectDoesNotExist: + self.log.error("Object could not be found in database for SearchResult '%s'.", self) + self._object = None + + return self._object + + def _set_object(self, obj): + self._object = obj + + object = property(_get_object, _set_object) + + def _get_model(self): + if self._model is None: + try: + self._model = models.get_model(self.app_label, self.model_name) + except LookupError: + # this changed in change 1.7 to throw an error instead of + # returning None when the model isn't found. So catch the + # lookup error and keep self._model == None. 
+ pass + + return self._model + + def _set_model(self, obj): + self._model = obj + + model = property(_get_model, _set_model) + + def _get_distance(self): + from haystack.utils.geo import Distance + + if self._distance is None: + # We didn't get it from the backend & we haven't tried calculating + # it yet. Check if geopy is available to do it the "slow" way + # (even though slow meant 100 distance calculations in 0.004 seconds + # in my testing). + if geopy_distance is None: + raise SpatialError("The backend doesn't have 'DISTANCE_AVAILABLE' enabled & the 'geopy' library could not be imported, so distance information is not available.") + + if not self._point_of_origin: + raise SpatialError("The original point is not available.") + + if not hasattr(self, self._point_of_origin['field']): + raise SpatialError("The field '%s' was not included in search results, so the distance could not be calculated." % self._point_of_origin['field']) + + po_lng, po_lat = self._point_of_origin['point'].get_coords() + location_field = getattr(self, self._point_of_origin['field']) + + if location_field is None: + return None + + lf_lng, lf_lat = location_field.get_coords() + self._distance = Distance(km=geopy_distance.distance((po_lat, po_lng), (lf_lat, lf_lng)).km) + + # We've either already calculated it or the backend returned it, so + # let's use that. + return self._distance + + def _set_distance(self, dist): + self._distance = dist + + distance = property(_get_distance, _set_distance) + + def _get_verbose_name(self): + if self.model is None: + self.log.error("Model could not be found for SearchResult '%s'.", self) + return u'' + + return force_text(capfirst(self.model._meta.verbose_name)) + + verbose_name = property(_get_verbose_name) + + def _get_verbose_name_plural(self): + if self.model is None: + self.log.error("Model could not be found for SearchResult '%s'.", self) + return u'' + + return force_text(capfirst(self.model._meta.verbose_name_plural)) + + verbose_name_plural = property(_get_verbose_name_plural) + + def content_type(self): + """Returns the content type for the result's model instance.""" + if self.model is None: + self.log.error("Model could not be found for SearchResult '%s'.", self) + return u'' + + return six.text_type(self.model._meta) + + def get_additional_fields(self): + """ + Returns a dictionary of all of the fields from the raw result. + + Useful for serializing results. Only returns what was seen from the + search engine, so it may have extra fields Haystack's indexes aren't + aware of. + """ + additional_fields = {} + + for fieldname in self._additional_fields: + additional_fields[fieldname] = getattr(self, fieldname) + + return additional_fields + + def get_stored_fields(self): + """ + Returns a dictionary of all of the stored fields from the SearchIndex. + + Useful for serializing results. Only returns the fields Haystack's + indexes are aware of as being 'stored'. + """ + if self._stored_fields is None: + from haystack import connections + from haystack.exceptions import NotHandled + + try: + index = connections['default'].get_unified_index().get_index(self.model) + except NotHandled: + # Not found? Return nothing. + return {} + + self._stored_fields = {} + + # Iterate through the index's fields, pulling out the fields that + # are stored. 
+ for fieldname, field in index.fields.items(): + if field.stored is True: + self._stored_fields[fieldname] = getattr(self, fieldname, u'') + + return self._stored_fields + + def __getstate__(self): + """ + Returns a dictionary representing the ``SearchResult`` in order to + make it pickleable. + """ + # The ``log`` is excluded because, under the hood, ``logging`` uses + # ``threading.Lock``, which doesn't pickle well. + ret_dict = self.__dict__.copy() + del(ret_dict['log']) + return ret_dict + + def __setstate__(self, data_dict): + """ + Updates the object's attributes according to data passed by pickle. + """ + self.__dict__.update(data_dict) + self.log = self._get_log() + + +def reload_indexes(sender, *args, **kwargs): + from haystack import connections + + for conn in connections.all(): + ui = conn.get_unified_index() + # Note: Unlike above, we're resetting the ``UnifiedIndex`` here. + # Thi gives us a clean slate. + ui.reset() diff --git a/haystack/panels.py b/haystack/panels.py new file mode 100644 index 0000000..058c7ad --- /dev/null +++ b/haystack/panels.py @@ -0,0 +1,86 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import datetime + +from debug_toolbar.panels import DebugPanel +from django.template.loader import render_to_string +from django.utils import six +from django.utils.translation import ugettext_lazy as _ + +from haystack import connections + + +class HaystackDebugPanel(DebugPanel): + """ + Panel that displays information about the Haystack queries run while + processing the request. + """ + name = 'Haystack' + has_content = True + + def __init__(self, *args, **kwargs): + super(self.__class__, self).__init__(*args, **kwargs) + self._offset = dict((alias, len(connections[alias].queries)) for alias in connections.connections_info.keys()) + self._search_time = 0 + self._queries = [] + self._backends = {} + + def nav_title(self): + return _('Haystack') + + def nav_subtitle(self): + self._queries = [] + self._backends = {} + + for alias in connections.connections_info.keys(): + search_queries = connections[alias].queries[self._offset[alias]:] + self._backends[alias] = { + 'time_spent': sum(float(q['time']) for q in search_queries), + 'queries': len(search_queries), + } + self._queries.extend([(alias, q) for q in search_queries]) + + self._queries.sort(key=lambda x: x[1]['start']) + self._search_time = sum([d['time_spent'] for d in self._backends.itervalues()]) + num_queries = len(self._queries) + return "%d %s in %.2fms" % ( + num_queries, + (num_queries == 1) and 'query' or 'queries', + self._search_time + ) + + def title(self): + return _('Search Queries') + + def url(self): + return '' + + def content(self): + width_ratio_tally = 0 + + for alias, query in self._queries: + query['alias'] = alias + query['query'] = query['query_string'] + + if query.get('additional_kwargs'): + if query['additional_kwargs'].get('result_class'): + query['additional_kwargs']['result_class'] = six.text_type(query['additional_kwargs']['result_class']) + + try: + query['width_ratio'] = (float(query['time']) / self._search_time) * 100 + except ZeroDivisionError: + query['width_ratio'] = 0 + + query['start_offset'] = width_ratio_tally + width_ratio_tally += query['width_ratio'] + + context = self.context.copy() + context.update({ + 'backends': sorted(self._backends.items(), key=lambda x: -x[1]['time_spent']), + 'queries': [q for a, q in self._queries], + 'sql_time': self._search_time, + }) + + return 
render_to_string('panels/haystack.html', context) diff --git a/haystack/query.py b/haystack/query.py new file mode 100644 index 0000000..096a4b3 --- /dev/null +++ b/haystack/query.py @@ -0,0 +1,841 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import operator +import warnings + +from django.utils import six + +from haystack import connection_router, connections +from haystack.backends import SQ +from haystack.constants import DEFAULT_OPERATOR, ITERATOR_LOAD_PER_QUERY, REPR_OUTPUT_SIZE +from haystack.exceptions import NotHandled +from haystack.inputs import AutoQuery, Clean, Raw +from haystack.utils import log as logging + + +class SearchQuerySet(object): + """ + Provides a way to specify search parameters and lazily load results. + + Supports chaining (a la QuerySet) to narrow the search. + """ + def __init__(self, using=None, query=None): + # ``_using`` should only ever be a value other than ``None`` if it's + # been forced with the ``.using`` method. + self._using = using + self.query = None + self._determine_backend() + + # If ``query`` is present, it should override even what the routers + # think. + if query is not None: + self.query = query + + self._result_cache = [] + self._result_count = None + self._cache_full = False + self._load_all = False + self._ignored_result_count = 0 + self.log = logging.getLogger('haystack') + + def _determine_backend(self): + from haystack import connections + # A backend has been manually selected. Use it instead. + if self._using is not None: + self.query = connections[self._using].get_query() + return + + # No backend, so rely on the routers to figure out what's right. + hints = {} + + if self.query: + hints['models'] = self.query.models + + backend_alias = connection_router.for_read(**hints) + + if isinstance(backend_alias, (list, tuple)) and len(backend_alias): + # We can only effectively read from one engine. + backend_alias = backend_alias[0] + + # The ``SearchQuery`` might swap itself out for a different variant + # here. + if self.query: + self.query = self.query.using(backend_alias) + else: + self.query = connections[backend_alias].get_query() + + def __getstate__(self): + """ + For pickling. + """ + len(self) + obj_dict = self.__dict__.copy() + obj_dict['_iter'] = None + obj_dict['log'] = None + return obj_dict + + def __setstate__(self, data_dict): + """ + For unpickling. + """ + self.__dict__ = data_dict + self.log = logging.getLogger('haystack') + + def __repr__(self): + data = list(self[:REPR_OUTPUT_SIZE]) + + if len(self) > REPR_OUTPUT_SIZE: + data[-1] = "...(remaining elements truncated)..." + + return repr(data) + + def __len__(self): + if not self._result_count: + self._result_count = self.query.get_count() + + # Some backends give weird, false-y values here. Convert to zero. + if not self._result_count: + self._result_count = 0 + + # This needs to return the actual number of hits, not what's in the cache. + return self._result_count - self._ignored_result_count + + def __iter__(self): + if self._cache_is_full(): + # We've got a fully populated cache. Let Python do the hard work. 
+ return iter(self._result_cache) + + return self._manual_iter() + + def __and__(self, other): + if isinstance(other, EmptySearchQuerySet): + return other._clone() + combined = self._clone() + combined.query.combine(other.query, SQ.AND) + return combined + + def __or__(self, other): + combined = self._clone() + if isinstance(other, EmptySearchQuerySet): + return combined + combined.query.combine(other.query, SQ.OR) + return combined + + def _cache_is_full(self): + if not self.query.has_run(): + return False + + if len(self) <= 0: + return True + + try: + self._result_cache.index(None) + return False + except ValueError: + # No ``None``s found in the results. Check the length of the cache. + return len(self._result_cache) > 0 + + def _manual_iter(self): + # If we're here, our cache isn't fully populated. + # For efficiency, fill the cache as we go if we run out of results. + # Also, this can't be part of the __iter__ method due to Python's rules + # about generator functions. + current_position = 0 + current_cache_max = 0 + + while True: + if len(self._result_cache) > 0: + try: + current_cache_max = self._result_cache.index(None) + except ValueError: + current_cache_max = len(self._result_cache) + + while current_position < current_cache_max: + yield self._result_cache[current_position] + current_position += 1 + + if self._cache_is_full(): + raise StopIteration + + # We've run out of results and haven't hit our limit. + # Fill more of the cache. + if not self._fill_cache(current_position, current_position + ITERATOR_LOAD_PER_QUERY): + raise StopIteration + + def _fill_cache(self, start, end, **kwargs): + # Tell the query where to start from and how many we'd like. + self.query._reset() + self.query.set_limits(start, end) + results = self.query.get_results(**kwargs) + + if results == None or len(results) == 0: + return False + + # Setup the full cache now that we know how many results there are. + # We need the ``None``s as placeholders to know what parts of the + # cache we have/haven't filled. + # Using ``None`` like this takes up very little memory. In testing, + # an array of 100,000 ``None``s consumed less than .5 Mb, which ought + # to be an acceptable loss for consistent and more efficient caching. + if len(self._result_cache) == 0: + self._result_cache = [None for i in range(self.query.get_count())] + + if start is None: + start = 0 + + if end is None: + end = self.query.get_count() + + to_cache = self.post_process_results(results) + + # Assign by slice. + self._result_cache[start:start + len(to_cache)] = to_cache + return True + + def post_process_results(self, results): + to_cache = [] + + # Check if we wish to load all objects. + if self._load_all: + models_pks = {} + loaded_objects = {} + + # Remember the search position for each result so we don't have to resort later. + for result in results: + models_pks.setdefault(result.model, []).append(result.pk) + + # Load the objects for each model in turn. 
+ for model in models_pks: + try: + ui = connections[self.query._using].get_unified_index() + index = ui.get_index(model) + objects = index.read_queryset(using=self.query._using) + loaded_objects[model] = objects.in_bulk(models_pks[model]) + except NotHandled: + self.log.warning("Model '%s' not handled by the routers", model) + # Revert to old behaviour + loaded_objects[model] = model._default_manager.in_bulk(models_pks[model]) + + for result in results: + if self._load_all: + # We have to deal with integer keys being cast from strings + model_objects = loaded_objects.get(result.model, {}) + if not result.pk in model_objects: + try: + result.pk = int(result.pk) + except ValueError: + pass + try: + result._object = model_objects[result.pk] + except KeyError: + # The object was either deleted since we indexed or should + # be ignored; fail silently. + self._ignored_result_count += 1 + continue + + to_cache.append(result) + + return to_cache + + def __getitem__(self, k): + """ + Retrieves an item or slice from the set of results. + """ + if not isinstance(k, (slice, six.integer_types)): + raise TypeError + assert ((not isinstance(k, slice) and (k >= 0)) + or (isinstance(k, slice) and (k.start is None or k.start >= 0) + and (k.stop is None or k.stop >= 0))), \ + "Negative indexing is not supported." + + # Remember if it's a slice or not. We're going to treat everything as + # a slice to simply the logic and will `.pop()` at the end as needed. + if isinstance(k, slice): + is_slice = True + start = k.start + + if k.stop is not None: + bound = int(k.stop) + else: + bound = None + else: + is_slice = False + start = k + bound = k + 1 + + # We need check to see if we need to populate more of the cache. + if len(self._result_cache) <= 0 or (None in self._result_cache[start:bound] and not self._cache_is_full()): + try: + self._fill_cache(start, bound) + except StopIteration: + # There's nothing left, even though the bound is higher. + pass + + # Cache should be full enough for our needs. + if is_slice: + return self._result_cache[start:bound] + else: + return self._result_cache[start] + + # Methods that return a SearchQuerySet. 
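A rough end-to-end sketch of how the chaining methods of SearchQuerySet combine in practice; the Note model and its title/pub_date fields are invented for the example.

    from haystack.query import SearchQuerySet
    from myapp.models import Note   # hypothetical indexed model

    sqs = (SearchQuerySet()
           .models(Note)
           .filter(content='django')
           .exclude(title='draft')
           .order_by('-pub_date')
           .load_all())

    for result in sqs[:10]:
        # Each hit is a SearchResult; .object loads the database row lazily
        # (or reuses the bulk fetch performed by load_all()).
        print(result.score, result.object)
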
+ def all(self): + """Returns all results for the query.""" + return self._clone() + + def none(self): + """Returns an empty result list for the query.""" + return self._clone(klass=EmptySearchQuerySet) + + def filter(self, *args, **kwargs): + """Narrows the search based on certain attributes and the default operator.""" + if DEFAULT_OPERATOR == 'OR': + return self.filter_or(*args, **kwargs) + else: + return self.filter_and(*args, **kwargs) + + def exclude(self, *args, **kwargs): + """Narrows the search by ensuring certain attributes are not included.""" + clone = self._clone() + clone.query.add_filter(~SQ(*args, **kwargs)) + return clone + + def filter_and(self, *args, **kwargs): + """Narrows the search by looking for (and including) certain attributes.""" + clone = self._clone() + clone.query.add_filter(SQ(*args, **kwargs)) + return clone + + def filter_or(self, *args, **kwargs): + """Narrows the search by ensuring certain attributes are not included.""" + clone = self._clone() + clone.query.add_filter(SQ(*args, **kwargs), use_or=True) + return clone + + def order_by(self, *args): + """Alters the order in which the results should appear.""" + clone = self._clone() + + for field in args: + clone.query.add_order_by(field) + + return clone + + def highlight(self): + """Adds highlighting to the results.""" + clone = self._clone() + clone.query.add_highlight() + return clone + + def models(self, *models): + """Accepts an arbitrary number of Model classes to include in the search.""" + clone = self._clone() + + for model in models: + if not model in connections[self.query._using].get_unified_index().get_indexed_models(): + warnings.warn('The model %r is not registered for search.' % (model,)) + + clone.query.add_model(model) + + return clone + + def result_class(self, klass): + """ + Allows specifying a different class to use for results. + + Overrides any previous usages. If ``None`` is provided, Haystack will + revert back to the default ``SearchResult`` object. + """ + clone = self._clone() + clone.query.set_result_class(klass) + return clone + + def boost(self, term, boost): + """Boosts a certain aspect of the query.""" + clone = self._clone() + clone.query.add_boost(term, boost) + return clone + + def facet(self, field, **options): + """Adds faceting to a query for the provided field.""" + clone = self._clone() + clone.query.add_field_facet(field, **options) + return clone + + def within(self, field, point_1, point_2): + """Spatial: Adds a bounding box search to the query.""" + clone = self._clone() + clone.query.add_within(field, point_1, point_2) + return clone + + def dwithin(self, field, point, distance): + """Spatial: Adds a distance-based search to the query.""" + clone = self._clone() + clone.query.add_dwithin(field, point, distance) + return clone + + def stats(self, field): + """Adds stats to a query for the provided field.""" + return self.stats_facet(field, facet_fields=None) + + def stats_facet(self, field, facet_fields=None): + """Adds stats facet for the given field and facet_fields represents + the faceted fields.""" + clone = self._clone() + stats_facets = [] + try: + stats_facets.append(sum(facet_fields,[])) + except TypeError: + if facet_fields: stats_facets.append(facet_fields) + clone.query.add_stats_query(field,stats_facets) + return clone + + def distance(self, field, point): + """ + Spatial: Denotes results must have distance measurements from the + provided point. 
+ """ + clone = self._clone() + clone.query.add_distance(field, point) + return clone + + def date_facet(self, field, start_date, end_date, gap_by, gap_amount=1): + """Adds faceting to a query for the provided field by date.""" + clone = self._clone() + clone.query.add_date_facet(field, start_date, end_date, gap_by, gap_amount=gap_amount) + return clone + + def query_facet(self, field, query): + """Adds faceting to a query for the provided field with a custom query.""" + clone = self._clone() + clone.query.add_query_facet(field, query) + return clone + + def narrow(self, query): + """Pushes existing facet choices into the search.""" + + if isinstance(query, SQ): + # produce query string using empty query of the same class + empty_query = self.query._clone() + empty_query._reset() + query = query.as_query_string(empty_query.build_query_fragment) + + clone = self._clone() + clone.query.add_narrow_query(query) + return clone + + def raw_search(self, query_string, **kwargs): + """Passes a raw query directly to the backend.""" + return self.filter(content=Raw(query_string, **kwargs)) + + def load_all(self): + """Efficiently populates the objects in the search results.""" + clone = self._clone() + clone._load_all = True + return clone + + def auto_query(self, query_string, fieldname='content'): + """ + Performs a best guess constructing the search query. + + This method is somewhat naive but works well enough for the simple, + common cases. + """ + kwargs = { + fieldname: AutoQuery(query_string) + } + return self.filter(**kwargs) + + def autocomplete(self, **kwargs): + """ + A shortcut method to perform an autocomplete search. + + Must be run against fields that are either ``NgramField`` or + ``EdgeNgramField``. + """ + clone = self._clone() + query_bits = [] + + for field_name, query in kwargs.items(): + for word in query.split(' '): + bit = clone.query.clean(word.strip()) + if bit: + kwargs = { + field_name: bit, + } + query_bits.append(SQ(**kwargs)) + + return clone.filter(six.moves.reduce(operator.__and__, query_bits)) + + def using(self, connection_name): + """ + Allows switching which connection the ``SearchQuerySet`` uses to + search in. + """ + clone = self._clone() + clone.query = self.query.using(connection_name) + clone._using = connection_name + return clone + + # Methods that do not return a SearchQuerySet. + + def count(self): + """Returns the total number of matching results.""" + return len(self) + + def best_match(self): + """Returns the best/top search result that matches the query.""" + return self[0] + + def latest(self, date_field): + """Returns the most recent search result that matches the query.""" + clone = self._clone() + clone.query.clear_order_by() + clone.query.add_order_by("-%s" % date_field) + return clone.best_match() + + def more_like_this(self, model_instance): + """Finds similar results to the object passed in.""" + clone = self._clone() + clone.query.more_like_this(model_instance) + return clone + + def facet_counts(self): + """ + Returns the facet counts found by the query. + + This will cause the query to execute and should generally be used when + presenting the data. + """ + if self.query.has_run(): + return self.query.get_facet_counts() + else: + clone = self._clone() + return clone.query.get_facet_counts() + + def stats_results(self): + """ + Returns the stats results found by the query. 
+ """ + if self.query.has_run(): + return self.query.get_stats() + else: + clone = self._clone() + return clone.query.get_stats() + + def spelling_suggestion(self, preferred_query=None): + """ + Returns the spelling suggestion found by the query. + + To work, you must set ``INCLUDE_SPELLING`` within your connection's + settings dictionary to ``True``. Otherwise, ``None`` will be returned. + + This will cause the query to execute and should generally be used when + presenting the data. + """ + if self.query.has_run(): + return self.query.get_spelling_suggestion(preferred_query) + else: + clone = self._clone() + return clone.query.get_spelling_suggestion(preferred_query) + + def values(self, *fields): + """ + Returns a list of dictionaries, each containing the key/value pairs for + the result, exactly like Django's ``ValuesQuerySet``. + """ + qs = self._clone(klass=ValuesSearchQuerySet) + qs._fields.extend(fields) + return qs + + def values_list(self, *fields, **kwargs): + """ + Returns a list of field values as tuples, exactly like Django's + ``QuerySet.values``. + + Optionally accepts a ``flat=True`` kwarg, which in the case of a + single field being provided, will return a flat list of that field + rather than a list of tuples. + """ + flat = kwargs.pop("flat", False) + + if flat and len(fields) > 1: + raise TypeError("'flat' is not valid when values_list is called with more than one field.") + + qs = self._clone(klass=ValuesListSearchQuerySet) + qs._fields.extend(fields) + qs._flat = flat + return qs + + # Utility methods. + + def _clone(self, klass=None): + if klass is None: + klass = self.__class__ + + query = self.query._clone() + clone = klass(query=query) + clone._load_all = self._load_all + return clone + + +class EmptySearchQuerySet(SearchQuerySet): + """ + A stubbed SearchQuerySet that behaves as normal but always returns no + results. + """ + def __len__(self): + return 0 + + def _cache_is_full(self): + # Pretend the cache is always full with no results. + return True + + def _clone(self, klass=None): + clone = super(EmptySearchQuerySet, self)._clone(klass=klass) + clone._result_cache = [] + return clone + + def _fill_cache(self, start, end): + return False + + def facet_counts(self): + return {} + + +class ValuesListSearchQuerySet(SearchQuerySet): + """ + A ``SearchQuerySet`` which returns a list of field values as tuples, exactly + like Django's ``ValuesListQuerySet``. + """ + def __init__(self, *args, **kwargs): + super(ValuesListSearchQuerySet, self).__init__(*args, **kwargs) + self._flat = False + self._fields = [] + + # Removing this dependency would require refactoring much of the backend + # code (_process_results, etc.) 
and these aren't large enough to make it + # an immediate priority: + self._internal_fields = ['id', 'django_ct', 'django_id', 'score'] + + def _clone(self, klass=None): + clone = super(ValuesListSearchQuerySet, self)._clone(klass=klass) + clone._fields = self._fields + clone._flat = self._flat + return clone + + def _fill_cache(self, start, end): + query_fields = set(self._internal_fields) + query_fields.update(self._fields) + kwargs = { + 'fields': query_fields + } + return super(ValuesListSearchQuerySet, self)._fill_cache(start, end, **kwargs) + + def post_process_results(self, results): + to_cache = [] + + if self._flat: + accum = to_cache.extend + else: + accum = to_cache.append + + for result in results: + accum([getattr(result, i, None) for i in self._fields]) + + return to_cache + + +class ValuesSearchQuerySet(ValuesListSearchQuerySet): + """ + A ``SearchQuerySet`` which returns a list of dictionaries, each containing + the key/value pairs for the result, exactly like Django's + ``ValuesQuerySet``. + """ + def _fill_cache(self, start, end): + query_fields = set(self._internal_fields) + query_fields.update(self._fields) + kwargs = { + 'fields': query_fields + } + return super(ValuesListSearchQuerySet, self)._fill_cache(start, end, **kwargs) + + def post_process_results(self, results): + to_cache = [] + + for result in results: + to_cache.append(dict((i, getattr(result, i, None)) for i in self._fields)) + + return to_cache + + +class RelatedSearchQuerySet(SearchQuerySet): + """ + A variant of the SearchQuerySet that can handle `load_all_queryset`s. + + This is predominantly different in the `_fill_cache` method, as it is + far less efficient but needs to fill the cache before it to maintain + consistency. + """ + + def __init__(self, *args, **kwargs): + super(RelatedSearchQuerySet, self).__init__(*args, **kwargs) + self._load_all_querysets = {} + self._result_cache = [] + + def _cache_is_full(self): + return len(self._result_cache) >= len(self) + + def _manual_iter(self): + # If we're here, our cache isn't fully populated. + # For efficiency, fill the cache as we go if we run out of results. + # Also, this can't be part of the __iter__ method due to Python's rules + # about generator functions. + current_position = 0 + current_cache_max = 0 + + while True: + current_cache_max = len(self._result_cache) + + while current_position < current_cache_max: + yield self._result_cache[current_position] + current_position += 1 + + if self._cache_is_full(): + raise StopIteration + + # We've run out of results and haven't hit our limit. + # Fill more of the cache. + start = current_position + self._ignored_result_count + + if not self._fill_cache(start, start + ITERATOR_LOAD_PER_QUERY): + raise StopIteration + + def _fill_cache(self, start, end): + # Tell the query where to start from and how many we'd like. + self.query._reset() + self.query.set_limits(start, end) + results = self.query.get_results() + + if len(results) == 0: + return False + + if start is None: + start = 0 + + if end is None: + end = self.query.get_count() + + # Check if we wish to load all objects. + if self._load_all: + models_pks = {} + loaded_objects = {} + + # Remember the search position for each result so we don't have to resort later. + for result in results: + models_pks.setdefault(result.model, []).append(result.pk) + + # Load the objects for each model in turn. + for model in models_pks: + if model in self._load_all_querysets: + # Use the overriding queryset. 
+ loaded_objects[model] = self._load_all_querysets[model].in_bulk(models_pks[model]) + else: + # Check the SearchIndex for the model for an override. + try: + index = connections[self.query._using].get_unified_index().get_index(model) + qs = index.load_all_queryset() + loaded_objects[model] = qs.in_bulk(models_pks[model]) + except NotHandled: + # The model returned doesn't seem to be handled by the + # routers. We should silently fail and populate + # nothing for those objects. + loaded_objects[model] = [] + + if len(results) + len(self._result_cache) < len(self) and len(results) < ITERATOR_LOAD_PER_QUERY: + self._ignored_result_count += ITERATOR_LOAD_PER_QUERY - len(results) + + for result in results: + if self._load_all: + # We have to deal with integer keys being cast from strings; if this + # fails we've got a character pk. + try: + result.pk = int(result.pk) + except ValueError: + pass + try: + result._object = loaded_objects[result.model][result.pk] + except (KeyError, IndexError): + # The object was either deleted since we indexed or should + # be ignored; fail silently. + self._ignored_result_count += 1 + continue + + self._result_cache.append(result) + + return True + + def __getitem__(self, k): + """ + Retrieves an item or slice from the set of results. + """ + if not isinstance(k, (slice, six.integer_types)): + raise TypeError + assert ((not isinstance(k, slice) and (k >= 0)) + or (isinstance(k, slice) and (k.start is None or k.start >= 0) + and (k.stop is None or k.stop >= 0))), \ + "Negative indexing is not supported." + + # Remember if it's a slice or not. We're going to treat everything as + # a slice to simply the logic and will `.pop()` at the end as needed. + if isinstance(k, slice): + is_slice = True + start = k.start + + if k.stop is not None: + bound = int(k.stop) + else: + bound = None + else: + is_slice = False + start = k + bound = k + 1 + + # We need check to see if we need to populate more of the cache. + if len(self._result_cache) <= 0 or not self._cache_is_full(): + try: + while len(self._result_cache) < bound and not self._cache_is_full(): + current_max = len(self._result_cache) + self._ignored_result_count + self._fill_cache(current_max, current_max + ITERATOR_LOAD_PER_QUERY) + except StopIteration: + # There's nothing left, even though the bound is higher. + pass + + # Cache should be full enough for our needs. + if is_slice: + return self._result_cache[start:bound] + else: + return self._result_cache[start] + + def load_all_queryset(self, model, queryset): + """ + Allows for specifying a custom ``QuerySet`` that changes how ``load_all`` + will fetch records for the provided model. + + This is useful for post-processing the results from the query, enabling + things like adding ``select_related`` or filtering certain data. + """ + clone = self._clone() + clone._load_all_querysets[model] = queryset + return clone + + def _clone(self, klass=None): + if klass is None: + klass = self.__class__ + + query = self.query._clone() + clone = klass(query=query) + clone._load_all = self._load_all + clone._load_all_querysets = self._load_all_querysets + return clone diff --git a/haystack/routers.py b/haystack/routers.py new file mode 100644 index 0000000..0a77e17 --- /dev/null +++ b/haystack/routers.py @@ -0,0 +1,18 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from haystack.constants import DEFAULT_ALIAS + + +class BaseRouter(object): + # Reserved for future extension. 
+ pass + + +class DefaultRouter(BaseRouter): + def for_read(self, **hints): + return DEFAULT_ALIAS + + def for_write(self, **hints): + return DEFAULT_ALIAS diff --git a/haystack/signals.py b/haystack/signals.py new file mode 100644 index 0000000..63a6c5f --- /dev/null +++ b/haystack/signals.py @@ -0,0 +1,90 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django.db import models + +from haystack.exceptions import NotHandled + + +class BaseSignalProcessor(object): + """ + A convenient way to attach Haystack to Django's signals & cause things to + index. + + By default, does nothing with signals but provides underlying functionality. + """ + def __init__(self, connections, connection_router): + self.connections = connections + self.connection_router = connection_router + self.setup() + + def setup(self): + """ + A hook for setting up anything necessary for + ``handle_save/handle_delete`` to be executed. + + Default behavior is to do nothing (``pass``). + """ + # Do nothing. + pass + + def teardown(self): + """ + A hook for tearing down anything necessary for + ``handle_save/handle_delete`` to no longer be executed. + + Default behavior is to do nothing (``pass``). + """ + # Do nothing. + pass + + def handle_save(self, sender, instance, **kwargs): + """ + Given an individual model instance, determine which backends the + update should be sent to & update the object on those backends. + """ + using_backends = self.connection_router.for_write(instance=instance) + + for using in using_backends: + try: + index = self.connections[using].get_unified_index().get_index(sender) + index.update_object(instance, using=using) + except NotHandled: + # TODO: Maybe log it or let the exception bubble? + pass + + def handle_delete(self, sender, instance, **kwargs): + """ + Given an individual model instance, determine which backends the + delete should be sent to & delete the object on those backends. + """ + using_backends = self.connection_router.for_write(instance=instance) + + for using in using_backends: + try: + index = self.connections[using].get_unified_index().get_index(sender) + index.remove_object(instance, using=using) + except NotHandled: + # TODO: Maybe log it or let the exception bubble? + pass + + +class RealtimeSignalProcessor(BaseSignalProcessor): + """ + Allows for observing when saves/deletes fire & automatically updates the + search engine appropriately. + """ + def setup(self): + # Naive (listen to all model saves). + models.signals.post_save.connect(self.handle_save) + models.signals.post_delete.connect(self.handle_delete) + # Efficient would be going through all backends & collecting all models + # being used, then hooking up signals only for those. + + def teardown(self): + # Naive (listen to all model saves). + models.signals.post_save.disconnect(self.handle_save) + models.signals.post_delete.disconnect(self.handle_delete) + # Efficient would be going through all backends & collecting all models + # being used, then disconnecting signals only for those. diff --git a/haystack/templates/panels/haystack.html b/haystack/templates/panels/haystack.html new file mode 100644 index 0000000..1896658 --- /dev/null +++ b/haystack/templates/panels/haystack.html @@ -0,0 +1,33 @@ +{% load i18n %} + + + + + + + + + + + + {% for query in queries %} + + + + + + + + {% endfor %} + +
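A hedged configuration sketch for the routers and signal processors defined above. HAYSTACK_ROUTERS and HAYSTACK_SIGNAL_PROCESSOR are standard Haystack settings; the NoteOnlySignalProcessor class and the Note model are illustrative.

    # settings.py (illustrative): route reads/writes and enable realtime indexing.
    HAYSTACK_ROUTERS = ['haystack.routers.DefaultRouter']
    HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'

    # A narrower processor that only listens to one model's signals:
    from django.db import models
    from haystack.signals import BaseSignalProcessor
    from myapp.models import Note

    class NoteOnlySignalProcessor(BaseSignalProcessor):
        def setup(self):
            models.signals.post_save.connect(self.handle_save, sender=Note)
            models.signals.post_delete.connect(self.handle_delete, sender=Note)

        def teardown(self):
            models.signals.post_save.disconnect(self.handle_save, sender=Note)
            models.signals.post_delete.disconnect(self.handle_delete, sender=Note)
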
+ [panel table rows condensed: columns {% trans 'Query' %}, {% trans 'Backend Alias' %}, {% trans 'Timeline' %}, {% trans 'Time' %} (ms), {% trans 'Kwargs' %}; each row renders {{ query.query_string|safe }}, {{ query.alias }}, a timeline bar, {{ query.time }} and, via {% for key, value in query.additional_kwargs.items %}, each '{{ key }}': {{ value|stringformat:"r" }} pair {% endfor %}]
diff --git a/haystack/templates/search_configuration/solr.xml b/haystack/templates/search_configuration/solr.xml new file mode 100644 index 0000000..03fed3b --- /dev/null +++ b/haystack/templates/search_configuration/solr.xml @@ -0,0 +1,166 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +{% for field in fields %} + +{% endfor %} + + + + {{ ID }} + + + {{ content_field_name }} + + + + diff --git a/haystack/templatetags/__init__.py b/haystack/templatetags/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/haystack/templatetags/highlight.py b/haystack/templatetags/highlight.py new file mode 100644 index 0000000..f1ca697 --- /dev/null +++ b/haystack/templatetags/highlight.py @@ -0,0 +1,119 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django import template +from django.conf import settings +from django.core.exceptions import ImproperlyConfigured +from django.utils import six + +from haystack.utils import importlib + + +register = template.Library() + + +class HighlightNode(template.Node): + def __init__(self, text_block, query, html_tag=None, css_class=None, max_length=None): + self.text_block = template.Variable(text_block) + self.query = template.Variable(query) + self.html_tag = html_tag + self.css_class = css_class + self.max_length = max_length + + if html_tag is not None: + self.html_tag = template.Variable(html_tag) + + if css_class is not None: + self.css_class = template.Variable(css_class) + + if max_length is not None: + self.max_length = template.Variable(max_length) + + def render(self, context): + text_block = self.text_block.resolve(context) + query = self.query.resolve(context) + kwargs = {} + + if self.html_tag is not None: + kwargs['html_tag'] = self.html_tag.resolve(context) + + if self.css_class is not None: + kwargs['css_class'] = self.css_class.resolve(context) + + if self.max_length is not None: + kwargs['max_length'] = self.max_length.resolve(context) + + # Handle a user-defined highlighting function. + if hasattr(settings, 'HAYSTACK_CUSTOM_HIGHLIGHTER') and settings.HAYSTACK_CUSTOM_HIGHLIGHTER: + # Do the import dance. + try: + path_bits = settings.HAYSTACK_CUSTOM_HIGHLIGHTER.split('.') + highlighter_path, highlighter_classname = '.'.join(path_bits[:-1]), path_bits[-1] + highlighter_module = importlib.import_module(highlighter_path) + highlighter_class = getattr(highlighter_module, highlighter_classname) + except (ImportError, AttributeError) as e: + raise ImproperlyConfigured("The highlighter '%s' could not be imported: %s" % (settings.HAYSTACK_CUSTOM_HIGHLIGHTER, e)) + else: + from haystack.utils import Highlighter + highlighter_class = Highlighter + + highlighter = highlighter_class(query, **kwargs) + highlighted_text = highlighter.highlight(text_block) + return highlighted_text + + +@register.tag +def highlight(parser, token): + """ + Takes a block of text and highlights words from a provided query within that + block of text. Optionally accepts arguments to provide the HTML tag to wrap + highlighted word in, a CSS class to use with the tag and a maximum length of + the blurb in characters. + + Syntax:: + + {% highlight with [css_class "class_name"] [html_tag "span"] [max_length 200] %} + + Example:: + + # Highlight summary with default behavior. 
+ {% highlight result.summary with request.query %} + + # Highlight summary but wrap highlighted words with a div and the + # following CSS class. + {% highlight result.summary with request.query html_tag "div" css_class "highlight_me_please" %} + + # Highlight summary but only show 40 characters. + {% highlight result.summary with request.query max_length 40 %} + """ + bits = token.split_contents() + tag_name = bits[0] + + if not len(bits) % 2 == 0: + raise template.TemplateSyntaxError(u"'%s' tag requires valid pairings arguments." % tag_name) + + text_block = bits[1] + + if len(bits) < 4: + raise template.TemplateSyntaxError(u"'%s' tag requires an object and a query provided by 'with'." % tag_name) + + if bits[2] != 'with': + raise template.TemplateSyntaxError(u"'%s' tag's second argument should be 'with'." % tag_name) + + query = bits[3] + + arg_bits = iter(bits[4:]) + kwargs = {} + + for bit in arg_bits: + if bit == 'css_class': + kwargs['css_class'] = six.next(arg_bits) + + if bit == 'html_tag': + kwargs['html_tag'] = six.next(arg_bits) + + if bit == 'max_length': + kwargs['max_length'] = six.next(arg_bits) + + return HighlightNode(text_block, query, **kwargs) diff --git a/haystack/templatetags/more_like_this.py b/haystack/templatetags/more_like_this.py new file mode 100644 index 0000000..edad111 --- /dev/null +++ b/haystack/templatetags/more_like_this.py @@ -0,0 +1,108 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django import template +from django.db import models + +from haystack.query import SearchQuerySet + +register = template.Library() + + +class MoreLikeThisNode(template.Node): + def __init__(self, model, varname, for_types=None, limit=None): + self.model = template.Variable(model) + self.varname = varname + self.for_types = for_types + self.limit = limit + + if not self.limit is None: + self.limit = int(self.limit) + + def render(self, context): + try: + model_instance = self.model.resolve(context) + sqs = SearchQuerySet() + + if not self.for_types is None: + intermediate = template.Variable(self.for_types) + for_types = intermediate.resolve(context).split(',') + search_models = [] + + for model in for_types: + model_class = models.get_model(*model.split('.')) + + if model_class: + search_models.append(model_class) + + sqs = sqs.models(*search_models) + + sqs = sqs.more_like_this(model_instance) + + if not self.limit is None: + sqs = sqs[:self.limit] + + context[self.varname] = sqs + except: + pass + + return '' + + +@register.tag +def more_like_this(parser, token): + """ + Fetches similar items from the search index to find content that is similar + to the provided model's content. + + Syntax:: + + {% more_like_this model_instance as varname [for app_label.model_name,app_label.model_name,...] [limit n] %} + + Example:: + + # Pull a full SearchQuerySet (lazy loaded) of similar content. + {% more_like_this entry as related_content %} + + # Pull just the top 5 similar pieces of content. + {% more_like_this entry as related_content limit 5 %} + + # Pull just the top 5 similar entries or comments. + {% more_like_this entry as related_content for "blog.entry,comments.comment" limit 5 %} + """ + bits = token.split_contents() + + if not len(bits) in (4, 6, 8): + raise template.TemplateSyntaxError(u"'%s' tag requires either 3, 5 or 7 arguments." % bits[0]) + + model = bits[1] + + if bits[2] != 'as': + raise template.TemplateSyntaxError(u"'%s' tag's second argument should be 'as'." 
% bits[0]) + + varname = bits[3] + limit = None + for_types = None + + if len(bits) == 6: + if bits[4] != 'limit' and bits[4] != 'for': + raise template.TemplateSyntaxError(u"'%s' tag's fourth argument should be either 'limit' or 'for'." % bits[0]) + + if bits[4] == 'limit': + limit = bits[5] + else: + for_types = bits[5] + + if len(bits) == 8: + if bits[4] != 'for': + raise template.TemplateSyntaxError(u"'%s' tag's fourth argument should be 'for'." % bits[0]) + + for_types = bits[5] + + if bits[6] != 'limit': + raise template.TemplateSyntaxError(u"'%s' tag's sixth argument should be 'limit'." % bits[0]) + + limit = bits[7] + + return MoreLikeThisNode(model, varname, for_types, limit) diff --git a/haystack/urls.py b/haystack/urls.py new file mode 100644 index 0000000..4b96615 --- /dev/null +++ b/haystack/urls.py @@ -0,0 +1,16 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from haystack.views import SearchView + +try: + from django.conf.urls import patterns, url +except ImportError: + from django.conf.urls.defaults import patterns, url + + + +urlpatterns = patterns('haystack.views', + url(r'^$', SearchView(), name='haystack_search'), +) diff --git a/haystack/utils/__init__.py b/haystack/utils/__init__.py new file mode 100644 index 0000000..659f2f5 --- /dev/null +++ b/haystack/utils/__init__.py @@ -0,0 +1,88 @@ +# encoding: utf-8 + +from __future__ import unicode_literals +import re + +import django +from django.conf import settings +from django.utils import six + +from haystack.constants import ID, DJANGO_CT, DJANGO_ID +from haystack.utils.highlighting import Highlighter + +try: + # Introduced in Python 2.7 + import importlib +except ImportError: + # Deprecated in Django 1.8; removed in Django 1.9 (both of which require + # at least Python 2.7) + from django.utils import importlib + +IDENTIFIER_REGEX = re.compile('^[\w\d_]+\.[\w\d_]+\.\d+$') + + +def default_get_identifier(obj_or_string): + """ + Get an unique identifier for the object or a string representing the + object. + + If not overridden, uses ... + """ + if isinstance(obj_or_string, six.string_types): + if not IDENTIFIER_REGEX.match(obj_or_string): + raise AttributeError(u"Provided string '%s' is not a valid identifier." % obj_or_string) + + return obj_or_string + + return u"%s.%s" % (get_model_ct(obj_or_string), + obj_or_string._get_pk_val()) + + +def _lookup_identifier_method(): + """ + If the user has set HAYSTACK_IDENTIFIER_METHOD, import it and return the method uncalled. + If HAYSTACK_IDENTIFIER_METHOD is not defined, return haystack.utils.default_get_identifier. + + This always runs at module import time. We keep the code in a function + so that it can be called from unit tests, in order to simulate the re-loading + of this module. + """ + if not hasattr(settings, 'HAYSTACK_IDENTIFIER_METHOD'): + return default_get_identifier + + module_path, method_name = settings.HAYSTACK_IDENTIFIER_METHOD.rsplit(".", 1) + + try: + module = importlib.import_module(module_path) + except ImportError: + raise ImportError(u"Unable to import module '%s' provided for HAYSTACK_IDENTIFIER_METHOD." % module_path) + + identifier_method = getattr(module, method_name, None) + + if not identifier_method: + raise AttributeError( + u"Provided method '%s' for HAYSTACK_IDENTIFIER_METHOD does not exist in '%s'." 
% (method_name, module_path) + ) + + return identifier_method + + +get_identifier = _lookup_identifier_method() + + +if django.VERSION >= (1, 6): + def get_model_ct_tuple(model): + return (model._meta.app_label, model._meta.model_name) +else: + def get_model_ct_tuple(model): + return (model._meta.app_label, model._meta.module_name) + +def get_model_ct(model): + return "%s.%s" % get_model_ct_tuple(model) + + +def get_facet_field_name(fieldname): + if fieldname in [ID, DJANGO_ID, DJANGO_CT]: + return fieldname + + return "%s_exact" % fieldname diff --git a/haystack/utils/app_loading.py b/haystack/utils/app_loading.py new file mode 100755 index 0000000..efba6ad --- /dev/null +++ b/haystack/utils/app_loading.py @@ -0,0 +1,90 @@ +# encoding: utf-8 +from __future__ import absolute_import, division, print_function, unicode_literals + +from django import VERSION as DJANGO_VERSION +from django.conf import settings +from django.core.exceptions import ImproperlyConfigured + +from haystack.utils import importlib + +__all__ = ['haystack_get_models', 'haystack_load_apps'] + +APP = 'app' +MODEL = 'model' + +if DJANGO_VERSION >= (1, 7): + from django.apps import apps + + def haystack_get_app_modules(): + """Return the Python module for each installed app""" + return [i.module for i in apps.get_app_configs()] + + def haystack_load_apps(): + """Return a list of app labels for all installed applications which have models""" + return [i.label for i in apps.get_app_configs() if i.models_module is not None] + + def haystack_get_models(label): + try: + app_mod = apps.get_app_config(label) + return app_mod.get_models() + except LookupError: + if '.' not in label: + raise ImproperlyConfigured('Unknown application label {}'.format(label)) + app_label, model_name = label.rsplit('.', 1) + return [apps.get_model(app_label, model_name)] + except ImproperlyConfigured: + pass + + def haystack_get_model(app_label, model_name): + return apps.get_model(app_label, model_name) + +else: + from django.db.models.loading import get_app, get_model, get_models + + def is_app_or_model(label): + label_bits = label.split('.') + + if len(label_bits) == 1: + return APP + elif len(label_bits) == 2: + try: + get_model(*label_bits) + except LookupError: + return APP + return MODEL + else: + raise ImproperlyConfigured( + "'%s' isn't recognized as an app () or model (.)." % label) + + def haystack_get_app_modules(): + """Return the Python module for each installed app""" + return [importlib.import_module(i) for i in settings.INSTALLED_APPS] + + def haystack_load_apps(): + # Do all, in an INSTALLED_APPS sorted order. + items = [] + + for app in settings.INSTALLED_APPS: + app_label = app.split('.')[-1] + + try: + get_app(app_label) + except ImproperlyConfigured: + continue # Intentionally allow e.g. 
apps without models.py + + items.append(app_label) + + return items + + def haystack_get_models(label): + app_or_model = is_app_or_model(label) + + if app_or_model == APP: + app_mod = get_app(label) + return get_models(app_mod) + else: + app_label, model_name = label.rsplit('.', 1) + return [get_model(app_label, model_name)] + + def haystack_get_model(app_label, model_name): + return get_model(app_label, model_name) diff --git a/haystack/utils/geo.py b/haystack/utils/geo.py new file mode 100644 index 0000000..d3b87da --- /dev/null +++ b/haystack/utils/geo.py @@ -0,0 +1,78 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django.contrib.gis.geos import Point +from django.contrib.gis.measure import D, Distance + +from haystack.constants import WGS_84_SRID +from haystack.exceptions import SpatialError + + +def ensure_geometry(geom): + """ + Makes sure the parameter passed in looks like a GEOS ``GEOSGeometry``. + """ + if not hasattr(geom, 'geom_type'): + raise SpatialError("Point '%s' doesn't appear to be a GEOS geometry." % geom) + + return geom + + +def ensure_point(geom): + """ + Makes sure the parameter passed in looks like a GEOS ``Point``. + """ + ensure_geometry(geom) + + if geom.geom_type != 'Point': + raise SpatialError("Provided geometry '%s' is not a 'Point'." % geom) + + return geom + + +def ensure_wgs84(point): + """ + Ensures the point passed in is a GEOS ``Point`` & returns that point's + data is in the WGS-84 spatial reference. + """ + ensure_point(point) + # Clone it so we don't alter the original, in case they're using it for + # something else. + new_point = point.clone() + + if not new_point.srid: + # It has no spatial reference id. Assume WGS-84. + new_point.set_srid(WGS_84_SRID) + elif new_point.srid != WGS_84_SRID: + # Transform it to get to the right system. + new_point.transform(WGS_84_SRID) + + return new_point + + +def ensure_distance(dist): + """ + Makes sure the parameter passed in is a 'Distance' object. + """ + try: + # Since we mostly only care about the ``.km`` attribute, make sure + # it's there. + km = dist.km + except AttributeError: + raise SpatialError("'%s' does not appear to be a 'Distance' object." % dist) + + return dist + + +def generate_bounding_box(bottom_left, top_right): + """ + Takes two opposite corners of a bounding box (order matters!) & generates + a two-tuple of the correct coordinates for the bounding box. + + The two-tuple is in the form ``((min_lat, min_lng), (max_lat, max_lng))``. 
+ """ + west, lat_1 = bottom_left.get_coords() + east, lat_2 = top_right.get_coords() + min_lat, max_lat = min(lat_1, lat_2), max(lat_1, lat_2) + return ((min_lat, west), (max_lat, east)) diff --git a/haystack/utils/highlighting.py b/haystack/utils/highlighting.py new file mode 100644 index 0000000..71ccb98 --- /dev/null +++ b/haystack/utils/highlighting.py @@ -0,0 +1,165 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django.utils.html import strip_tags + + +class Highlighter(object): + css_class = 'highlighted' + html_tag = 'span' + max_length = 200 + text_block = '' + + def __init__(self, query, **kwargs): + self.query = query + + if 'max_length' in kwargs: + self.max_length = int(kwargs['max_length']) + + if 'html_tag' in kwargs: + self.html_tag = kwargs['html_tag'] + + if 'css_class' in kwargs: + self.css_class = kwargs['css_class'] + + self.query_words = set([word.lower() for word in self.query.split() if not word.startswith('-')]) + + def highlight(self, text_block): + self.text_block = strip_tags(text_block) + highlight_locations = self.find_highlightable_words() + start_offset, end_offset = self.find_window(highlight_locations) + return self.render_html(highlight_locations, start_offset, end_offset) + + def find_highlightable_words(self): + # Use a set so we only do this once per unique word. + word_positions = {} + + # Pre-compute the length. + end_offset = len(self.text_block) + lower_text_block = self.text_block.lower() + + for word in self.query_words: + if not word in word_positions: + word_positions[word] = [] + + start_offset = 0 + + while start_offset < end_offset: + next_offset = lower_text_block.find(word, start_offset, end_offset) + + # If we get a -1 out of find, it wasn't found. Bomb out and + # start the next word. + if next_offset == -1: + break + + word_positions[word].append(next_offset) + start_offset = next_offset + len(word) + + return word_positions + + def find_window(self, highlight_locations): + best_start = 0 + best_end = self.max_length + + # First, make sure we have words. + if not len(highlight_locations): + return (best_start, best_end) + + words_found = [] + + # Next, make sure we found any words at all. + for word, offset_list in highlight_locations.items(): + if len(offset_list): + # Add all of the locations to the list. + words_found.extend(offset_list) + + if not len(words_found): + return (best_start, best_end) + + if len(words_found) == 1: + return (words_found[0], words_found[0] + self.max_length) + + # Sort the list so it's in ascending order. + words_found = sorted(words_found) + + # We now have a denormalized list of all positions were a word was + # found. We'll iterate through and find the densest window we can by + # counting the number of found offsets (-1 to fit in the window). + highest_density = 0 + + if words_found[:-1][0] > self.max_length: + best_start = words_found[:-1][0] + best_end = best_start + self.max_length + + for count, start in enumerate(words_found[:-1]): + current_density = 1 + + for end in words_found[count + 1:]: + if end - start < self.max_length: + current_density += 1 + else: + current_density = 0 + + # Only replace if we have a bigger (not equal density) so we + # give deference to windows earlier in the document. 
+ if current_density > highest_density: + best_start = start + best_end = start + self.max_length + highest_density = current_density + + return (best_start, best_end) + + def render_html(self, highlight_locations=None, start_offset=None, end_offset=None): + # Start by chopping the block down to the proper window. + text = self.text_block[start_offset:end_offset] + + # Invert highlight_locations to a location -> term list + term_list = [] + + for term, locations in highlight_locations.items(): + term_list += [(loc - start_offset, term) for loc in locations] + + loc_to_term = sorted(term_list) + + # Prepare the highlight template + if self.css_class: + hl_start = '<%s class="%s">' % (self.html_tag, self.css_class) + else: + hl_start = '<%s>' % (self.html_tag) + + hl_end = '' % self.html_tag + + # Copy the part from the start of the string to the first match, + # and there replace the match with a highlighted version. + highlighted_chunk = "" + matched_so_far = 0 + prev = 0 + prev_str = "" + + for cur, cur_str in loc_to_term: + # This can be in a different case than cur_str + actual_term = text[cur:cur + len(cur_str)] + + # Handle incorrect highlight_locations by first checking for the term + if actual_term.lower() == cur_str: + if cur < prev + len(prev_str): + continue + + highlighted_chunk += text[prev + len(prev_str):cur] + hl_start + actual_term + hl_end + prev = cur + prev_str = cur_str + + # Keep track of how far we've copied so far, for the last step + matched_so_far = cur + len(actual_term) + + # Don't forget the chunk after the last term + highlighted_chunk += text[matched_so_far:] + + if start_offset > 0: + highlighted_chunk = '...%s' % highlighted_chunk + + if end_offset < len(self.text_block): + highlighted_chunk = '%s...' % highlighted_chunk + + return highlighted_chunk diff --git a/haystack/utils/loading.py b/haystack/utils/loading.py new file mode 100644 index 0000000..1547804 --- /dev/null +++ b/haystack/utils/loading.py @@ -0,0 +1,334 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import copy +import inspect +import warnings + +from django.conf import settings +from django.core.exceptions import ImproperlyConfigured +from django.utils.module_loading import module_has_submodule + +from haystack.exceptions import NotHandled, SearchFieldError +from haystack.utils import importlib +from haystack.utils.app_loading import haystack_get_app_modules + + +try: + # Introduced in Python 2.7 + from collections import OrderedDict +except ImportError: + # Deprecated in Django 1.8; removed in Django 1.9 (both of which require + # at least Python 2.7) + from django.utils.datastructures import SortedDict as OrderedDict + + +def import_class(path): + path_bits = path.split('.') + # Cut off the class name at the end. + class_name = path_bits.pop() + module_path = '.'.join(path_bits) + module_itself = importlib.import_module(module_path) + + if not hasattr(module_itself, class_name): + raise ImportError("The Python module '%s' has no '%s' class." % (module_path, class_name)) + + return getattr(module_itself, class_name) + + +# Load the search backend. +def load_backend(full_backend_path): + """ + Loads a backend for interacting with the search engine. + + Requires a ``backend_path``. It should be a string resembling a Python + import path, pointing to a ``BaseEngine`` subclass. 
The built-in options + available include:: + + * haystack.backends.solr.SolrEngine + * haystack.backends.xapian.XapianEngine (third-party) + * haystack.backends.whoosh.WhooshEngine + * haystack.backends.simple.SimpleEngine + + If you've implemented a custom backend, you can provide the path to + your backend & matching ``Engine`` class. For example:: + + ``myapp.search_backends.CustomSolrEngine`` + + """ + path_bits = full_backend_path.split('.') + + if len(path_bits) < 2: + raise ImproperlyConfigured("The provided backend '%s' is not a complete Python path to a BaseEngine subclass." % full_backend_path) + + return import_class(full_backend_path) + + +def load_router(full_router_path): + """ + Loads a router for choosing which connection to use. + + Requires a ``full_router_path``. It should be a string resembling a Python + import path, pointing to a ``BaseRouter`` subclass. The built-in options + available include:: + + * haystack.routers.DefaultRouter + + If you've implemented a custom backend, you can provide the path to + your backend & matching ``Engine`` class. For example:: + + ``myapp.search_routers.MasterSlaveRouter`` + + """ + path_bits = full_router_path.split('.') + + if len(path_bits) < 2: + raise ImproperlyConfigured("The provided router '%s' is not a complete Python path to a BaseRouter subclass." % full_router_path) + + return import_class(full_router_path) + + +class ConnectionHandler(object): + def __init__(self, connections_info): + self.connections_info = connections_info + self._connections = {} + self._index = None + + def ensure_defaults(self, alias): + try: + conn = self.connections_info[alias] + except KeyError: + raise ImproperlyConfigured("The key '%s' isn't an available connection." % alias) + + if not conn.get('ENGINE'): + conn['ENGINE'] = 'haystack.backends.simple_backend.SimpleEngine' + + def __getitem__(self, key): + if key in self._connections: + return self._connections[key] + + self.ensure_defaults(key) + self._connections[key] = load_backend(self.connections_info[key]['ENGINE'])(using=key) + return self._connections[key] + + def reload(self, key): + try: + del self._connections[key] + except KeyError: + pass + + return self.__getitem__(key) + + def all(self): + return [self[alias] for alias in self.connections_info] + + +class ConnectionRouter(object): + def __init__(self, routers_list=None): + self.routers_list = routers_list + self.routers = [] + + if self.routers_list is None: + self.routers_list = ['haystack.routers.DefaultRouter'] + + for router_path in self.routers_list: + router_class = load_router(router_path) + self.routers.append(router_class()) + + def for_action(self, action, **hints): + conns = [] + + for router in self.routers: + if hasattr(router, action): + action_callable = getattr(router, action) + connection_to_use = action_callable(**hints) + + if connection_to_use is not None: + conns.append(connection_to_use) + + return conns + + def for_write(self, **hints): + return self.for_action('for_write', **hints) + + def for_read(self, **hints): + return self.for_action('for_read', **hints) + + +class UnifiedIndex(object): + # Used to collect all the indexes into a cohesive whole. 
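+ # Each model may have at most one SearchIndex. collect_indexes() scans every
+ # installed app's search_indexes module (honouring excluded_indexes), and
+ # build() merges the collected field definitions into a single self.fields schema.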
+ def __init__(self, excluded_indexes=None): + self._indexes = {} + self.fields = OrderedDict() + self._built = False + self.excluded_indexes = excluded_indexes or [] + self.excluded_indexes_ids = {} + self.document_field = getattr(settings, 'HAYSTACK_DOCUMENT_FIELD', 'text') + self._fieldnames = {} + self._facet_fieldnames = {} + + @property + def indexes(self): + warnings.warn("'UnifiedIndex.indexes' was deprecated in Haystack v2.3.0. Please use UnifiedIndex.get_indexes().") + return self._indexes + + def collect_indexes(self): + indexes = [] + + for app_mod in haystack_get_app_modules(): + try: + search_index_module = importlib.import_module("%s.search_indexes" % app_mod.__name__) + except ImportError: + if module_has_submodule(app_mod, 'search_indexes'): + raise + + continue + + for item_name, item in inspect.getmembers(search_index_module, inspect.isclass): + if getattr(item, 'haystack_use_for_indexing', False) and getattr(item, 'get_model', None): + # We've got an index. Check if we should be ignoring it. + class_path = "%s.search_indexes.%s" % (app_mod.__name__, item_name) + + if class_path in self.excluded_indexes or self.excluded_indexes_ids.get(item_name) == id(item): + self.excluded_indexes_ids[str(item_name)] = id(item) + continue + + indexes.append(item()) + + return indexes + + def reset(self): + self._indexes = {} + self.fields = OrderedDict() + self._built = False + self._fieldnames = {} + self._facet_fieldnames = {} + + def build(self, indexes=None): + self.reset() + + if indexes is None: + indexes = self.collect_indexes() + + for index in indexes: + model = index.get_model() + + if model in self._indexes: + raise ImproperlyConfigured( + "Model '%s' has more than one 'SearchIndex`` handling it. " + "Please exclude either '%s' or '%s' using the 'EXCLUDED_INDEXES' " + "setting defined in 'settings.HAYSTACK_CONNECTIONS'." % ( + model, self._indexes[model], index + ) + ) + + self._indexes[model] = index + self.collect_fields(index) + + self._built = True + + def collect_fields(self, index): + for fieldname, field_object in index.fields.items(): + if field_object.document is True: + if field_object.index_fieldname != self.document_field: + raise SearchFieldError("All 'SearchIndex' classes must use the same '%s' fieldname for the 'document=True' field. Offending index is '%s'." % (self.document_field, index)) + + # Stow the index_fieldname so we don't have to get it the hard way again. + if fieldname in self._fieldnames and field_object.index_fieldname != self._fieldnames[fieldname]: + # We've already seen this field in the list. Raise an exception if index_fieldname differs. + raise SearchFieldError("All uses of the '%s' field need to use the same 'index_fieldname' attribute." % fieldname) + + self._fieldnames[fieldname] = field_object.index_fieldname + + # Stow the facet_fieldname so we don't have to look that up either. + if hasattr(field_object, 'facet_for'): + if field_object.facet_for: + self._facet_fieldnames[field_object.facet_for] = fieldname + else: + self._facet_fieldnames[field_object.instance_name] = fieldname + + # Copy the field in so we've got a unified schema. + if field_object.index_fieldname not in self.fields: + self.fields[field_object.index_fieldname] = field_object + self.fields[field_object.index_fieldname] = copy.copy(field_object) + else: + # If the field types are different, we can mostly + # safely ignore this. The exception is ``MultiValueField``, + # in which case we'll use it instead, copying over the + # values. 
+ if field_object.is_multivalued: + old_field = self.fields[field_object.index_fieldname] + self.fields[field_object.index_fieldname] = field_object + self.fields[field_object.index_fieldname] = copy.copy(field_object) + + # Switch it so we don't have to dupe the remaining + # checks. + field_object = old_field + + # We've already got this field in the list. Ensure that + # what we hand back is a superset of all options that + # affect the schema. + if field_object.indexed is True: + self.fields[field_object.index_fieldname].indexed = True + + if field_object.stored is True: + self.fields[field_object.index_fieldname].stored = True + + if field_object.faceted is True: + self.fields[field_object.index_fieldname].faceted = True + + if field_object.use_template is True: + self.fields[field_object.index_fieldname].use_template = True + + if field_object.null is True: + self.fields[field_object.index_fieldname].null = True + + def get_indexes(self): + if not self._built: + self.build() + + return self._indexes + + def get_indexed_models(self): + # Ensuring a list here since Python3 will give us an iterator + return list(self.get_indexes().keys()) + + def get_index_fieldname(self, field): + if not self._built: + self.build() + + return self._fieldnames.get(field) or field + + def get_index(self, model_klass): + + indexes = self.get_indexes() + + if model_klass not in indexes: + raise NotHandled('The model %s is not registered' % model_klass) + + return indexes[model_klass] + + def get_facet_fieldname(self, field): + if not self._built: + self.build() + + for fieldname, field_object in self.fields.items(): + if fieldname != field: + continue + + if hasattr(field_object, 'facet_for'): + if field_object.facet_for: + return field_object.facet_for + else: + return field_object.instance_name + else: + return self._facet_fieldnames.get(field) or field + + return field + + def all_searchfields(self): + if not self._built: + self.build() + + return self.fields diff --git a/haystack/utils/log.py b/haystack/utils/log.py new file mode 100644 index 0000000..50b25bc --- /dev/null +++ b/haystack/utils/log.py @@ -0,0 +1,25 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging + +from django.conf import settings + + +def getLogger(name): + real_logger = logging.getLogger(name) + return LoggingFacade(real_logger) + + +class LoggingFacade(object): + def __init__(self, real_logger): + self.real_logger = real_logger + + def noop(self, *args, **kwargs): + pass + + def __getattr__(self, attr): + if getattr(settings, 'HAYSTACK_LOGGING', True): + return getattr(self.real_logger, attr) + return self.noop diff --git a/haystack/views.py b/haystack/views.py new file mode 100644 index 0000000..6f20dc2 --- /dev/null +++ b/haystack/views.py @@ -0,0 +1,235 @@ +# encoding: utf-8 + +from __future__ import absolute_import, division, print_function, unicode_literals + +from django.conf import settings +from django.core.paginator import InvalidPage, Paginator +from django.http import Http404 +from django.shortcuts import render_to_response +from django.template import RequestContext + +from haystack.forms import FacetedSearchForm, ModelSearchForm +from haystack.query import EmptySearchQuerySet + +RESULTS_PER_PAGE = getattr(settings, 'HAYSTACK_SEARCH_RESULTS_PER_PAGE', 20) + + +class SearchView(object): + template = 'search/search.html' + extra_context = {} + query = '' + results = EmptySearchQuerySet() + request = None + form = None + results_per_page = 
RESULTS_PER_PAGE + + def __init__(self, template=None, load_all=True, form_class=None, searchqueryset=None, context_class=RequestContext, results_per_page=None): + self.load_all = load_all + self.form_class = form_class + self.context_class = context_class + self.searchqueryset = searchqueryset + + if form_class is None: + self.form_class = ModelSearchForm + + if not results_per_page is None: + self.results_per_page = results_per_page + + if template: + self.template = template + + def __call__(self, request): + """ + Generates the actual response to the search. + + Relies on internal, overridable methods to construct the response. + """ + self.request = request + + self.form = self.build_form() + self.query = self.get_query() + self.results = self.get_results() + + return self.create_response() + + def build_form(self, form_kwargs=None): + """ + Instantiates the form the class should use to process the search query. + """ + data = None + kwargs = { + 'load_all': self.load_all, + } + if form_kwargs: + kwargs.update(form_kwargs) + + if len(self.request.GET): + data = self.request.GET + + if self.searchqueryset is not None: + kwargs['searchqueryset'] = self.searchqueryset + + return self.form_class(data, **kwargs) + + def get_query(self): + """ + Returns the query provided by the user. + + Returns an empty string if the query is invalid. + """ + if self.form.is_valid(): + return self.form.cleaned_data['q'] + + return '' + + def get_results(self): + """ + Fetches the results via the form. + + Returns an empty list if there's no query to search with. + """ + return self.form.search() + + def build_page(self): + """ + Paginates the results appropriately. + + In case someone does not want to use Django's built-in pagination, it + should be a simple matter to override this method to do what they would + like. + """ + try: + page_no = int(self.request.GET.get('page', 1)) + except (TypeError, ValueError): + raise Http404("Not a valid number for page.") + + if page_no < 1: + raise Http404("Pages should be 1 or greater.") + + start_offset = (page_no - 1) * self.results_per_page + self.results[start_offset:start_offset + self.results_per_page] + + paginator = Paginator(self.results, self.results_per_page) + + try: + page = paginator.page(page_no) + except InvalidPage: + raise Http404("No such page!") + + return (paginator, page) + + def extra_context(self): + """ + Allows the addition of more context variables as needed. + + Must return a dictionary. + """ + return {} + + def create_response(self): + """ + Generates the actual HttpResponse to send back to the user. + """ + (paginator, page) = self.build_page() + + context = { + 'query': self.query, + 'form': self.form, + 'page': page, + 'paginator': paginator, + 'suggestion': None, + } + + if self.results and hasattr(self.results, 'query') and self.results.query.backend.include_spelling: + context['suggestion'] = self.form.get_suggestion() + + context.update(self.extra_context()) + return render_to_response(self.template, context, context_instance=self.context_class(self.request)) + + +def search_view_factory(view_class=SearchView, *args, **kwargs): + def search_view(request): + return view_class(*args, **kwargs)(request) + return search_view + + +class FacetedSearchView(SearchView): + def __init__(self, *args, **kwargs): + # Needed to switch out the default form class. 
+ if kwargs.get('form_class') is None: + kwargs['form_class'] = FacetedSearchForm + + super(FacetedSearchView, self).__init__(*args, **kwargs) + + def build_form(self, form_kwargs=None): + if form_kwargs is None: + form_kwargs = {} + + # This way the form can always receive a list containing zero or more + # facet expressions: + form_kwargs['selected_facets'] = self.request.GET.getlist("selected_facets") + + return super(FacetedSearchView, self).build_form(form_kwargs) + + def extra_context(self): + extra = super(FacetedSearchView, self).extra_context() + extra['request'] = self.request + extra['facets'] = self.results.facet_counts() + return extra + + +def basic_search(request, template='search/search.html', load_all=True, form_class=ModelSearchForm, searchqueryset=None, context_class=RequestContext, extra_context=None, results_per_page=None): + """ + A more traditional view that also demonstrate an alternative + way to use Haystack. + + Useful as an example of for basing heavily custom views off of. + + Also has the benefit of thread-safety, which the ``SearchView`` class may + not be. + + Template:: ``search/search.html`` + Context:: + * form + An instance of the ``form_class``. (default: ``ModelSearchForm``) + * page + The current page of search results. + * paginator + A paginator instance for the results. + * query + The query received by the form. + """ + query = '' + results = EmptySearchQuerySet() + + if request.GET.get('q'): + form = form_class(request.GET, searchqueryset=searchqueryset, load_all=load_all) + + if form.is_valid(): + query = form.cleaned_data['q'] + results = form.search() + else: + form = form_class(searchqueryset=searchqueryset, load_all=load_all) + + paginator = Paginator(results, results_per_page or RESULTS_PER_PAGE) + + try: + page = paginator.page(int(request.GET.get('page', 1))) + except InvalidPage: + raise Http404("No such page of results!") + + context = { + 'form': form, + 'page': page, + 'paginator': paginator, + 'query': query, + 'suggestion': None, + } + + if results.query.backend.include_spelling: + context['suggestion'] = form.get_suggestion() + + if extra_context: + context.update(extra_context) + + return render_to_response(template, context, context_instance=context_class(request)) diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..1cf4557 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,22 @@ +[pep8] +max-line-length = 110 +exclude = docs + +[flake8] +max-line-length = 110 +exclude = docs + +[frosted] +max-line-length = 110 +exclude = docs + +[isort] +line_length = 110 +default_section = THIRDPARTY +known_first_party = haystack + +[egg_info] +tag_build = +tag_date = 0 +tag_svn_revision = 0 + diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..cb68312 --- /dev/null +++ b/setup.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# encoding: utf-8 + +# n.b. 
we can't have unicode_literals here due to http://bugs.python.org/setuptools/issue152 +from __future__ import absolute_import, division, print_function + +try: + from setuptools import setup +except ImportError: + from ez_setup import use_setuptools + use_setuptools() + from setuptools import setup + +install_requires = [ + 'Django', +] + +tests_require = [ + 'elasticsearch>=1.0.0,<2.0.0', + 'pysolr>=3.3.2', + 'whoosh==2.5.4', + 'python-dateutil', + 'geopy==0.95.1', + + 'nose', + 'mock', + 'coverage', +] + +setup( + name='django-haystack', + version='2.4.0', + description='Pluggable search for Django.', + author='Daniel Lindsley', + author_email='daniel@toastdriven.com', + long_description=open('README.rst', 'r').read(), + url='http://haystacksearch.org/', + packages=[ + 'haystack', + 'haystack.backends', + 'haystack.management', + 'haystack.management.commands', + 'haystack.templatetags', + 'haystack.utils', + ], + package_data={ + 'haystack': [ + 'templates/panels/*', + 'templates/search_configuration/*', + ] + }, + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Environment :: Web Environment', + 'Framework :: Django', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: BSD License', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 3', + 'Topic :: Utilities', + ], + zip_safe=False, + install_requires=install_requires, + tests_require=tests_require, + test_suite="test_haystack.run_tests.run_all", +)
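
With the package metadata above in place, a minimal end-to-end configuration sketch (assuming a Django 1.7/1.8-era project; the ``SimpleEngine`` path is the same default applied by ``ConnectionHandler.ensure_defaults()`` and the URLconf reuses ``haystack/urls.py`` from this patch)::

    # settings.py
    INSTALLED_APPS = [
        # ... project apps ...
        'haystack',
    ]

    HAYSTACK_CONNECTIONS = {
        'default': {
            # Needs no external services; swap in the Solr/Whoosh/Elasticsearch
            # engines shipped in haystack/backends/ for real deployments.
            'ENGINE': 'haystack.backends.simple_backend.SimpleEngine',
        },
    }

    # urls.py
    from django.conf.urls import include, url

    urlpatterns = [
        url(r'^search/', include('haystack.urls')),  # exposes the bundled SearchView
    ]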