debian-django-haystack/haystack/backends/solr_backend.py

719 lines
28 KiB
Python

# encoding: utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
import warnings
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from django.utils import six
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query
from haystack.constants import DJANGO_CT, DJANGO_ID, ID
from haystack.exceptions import MissingDependency, MoreLikeThisError, SkipDocument
from haystack.inputs import Clean, Exact, PythonData, Raw
from haystack.models import SearchResult
from haystack.utils import log as logging
from haystack.utils import get_identifier, get_model_ct
from haystack.utils.app_loading import haystack_get_model
try:
from pysolr import Solr, SolrError
except ImportError:
raise MissingDependency("The 'solr' backend requires the installation of 'pysolr'. Please refer to the documentation.")
class SolrSearchBackend(BaseSearchBackend):
# Word reserved by Solr for special use.
RESERVED_WORDS = (
'AND',
'NOT',
'OR',
'TO',
)
# Characters reserved by Solr for special use.
# The '\\' must come first, so as not to overwrite the other slash replacements.
RESERVED_CHARACTERS = (
'\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
'[', ']', '^', '"', '~', '*', '?', ':', '/',
)
def __init__(self, connection_alias, **connection_options):
super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)
if not 'URL' in connection_options:
raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias)
self.conn = Solr(connection_options['URL'], timeout=self.timeout, **connection_options.get('KWARGS', {}))
self.log = logging.getLogger('haystack')
def update(self, index, iterable, commit=True):
docs = []
for obj in iterable:
try:
docs.append(index.full_prepare(obj))
except SkipDocument:
self.log.debug(u"Indexing for object `%s` skipped", obj)
except UnicodeDecodeError:
if not self.silently_fail:
raise
# We'll log the object identifier but won't include the actual object
# to avoid the possibility of that generating encoding errors while
# processing the log message:
self.log.error(u"UnicodeDecodeError while preparing object for update", exc_info=True, extra={
"data": {
"index": index,
"object": get_identifier(obj)
}
})
if len(docs) > 0:
try:
self.conn.add(docs, commit=commit, boost=index.get_field_weights())
except (IOError, SolrError) as e:
if not self.silently_fail:
raise
self.log.error("Failed to add documents to Solr: %s", e)
def remove(self, obj_or_string, commit=True):
solr_id = get_identifier(obj_or_string)
try:
kwargs = {
'commit': commit,
'id': solr_id
}
self.conn.delete(**kwargs)
except (IOError, SolrError) as e:
if not self.silently_fail:
raise
self.log.error("Failed to remove document '%s' from Solr: %s", solr_id, e)
def clear(self, models=[], commit=True):
try:
if not models:
# *:* matches all docs in Solr
self.conn.delete(q='*:*', commit=commit)
else:
models_to_delete = []
for model in models:
models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model)))
self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)
if commit:
# Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
self.conn.optimize()
except (IOError, SolrError) as e:
if not self.silently_fail:
raise
if len(models):
self.log.error("Failed to clear Solr index of models '%s': %s", ','.join(models_to_delete), e)
else:
self.log.error("Failed to clear Solr index: %s", e)
@log_query
def search(self, query_string, **kwargs):
if len(query_string) == 0:
return {
'results': [],
'hits': 0,
}
search_kwargs = self.build_search_kwargs(query_string, **kwargs)
try:
raw_results = self.conn.search(query_string, **search_kwargs)
except (IOError, SolrError) as e:
if not self.silently_fail:
raise
self.log.error("Failed to query Solr using '%s': %s", query_string, e)
raw_results = EmptyResults()
return self._process_results(raw_results, highlight=kwargs.get('highlight'), result_class=kwargs.get('result_class', SearchResult), distance_point=kwargs.get('distance_point'))
def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None,
fields='', highlight=False, facets=None,
date_facets=None, query_facets=None,
narrow_queries=None, spelling_query=None,
within=None, dwithin=None, distance_point=None,
models=None, limit_to_registered_models=None,
result_class=None, stats=None):
kwargs = {'fl': '* score'}
if fields:
if isinstance(fields, (list, set)):
fields = " ".join(fields)
kwargs['fl'] = fields
if sort_by is not None:
if sort_by in ['distance asc', 'distance desc'] and distance_point:
# Do the geo-enabled sort.
lng, lat = distance_point['point'].get_coords()
kwargs['sfield'] = distance_point['field']
kwargs['pt'] = '%s,%s' % (lat, lng)
if sort_by == 'distance asc':
kwargs['sort'] = 'geodist() asc'
else:
kwargs['sort'] = 'geodist() desc'
else:
if sort_by.startswith('distance '):
warnings.warn("In order to sort by distance, you must call the '.distance(...)' method.")
# Regular sorting.
kwargs['sort'] = sort_by
if start_offset is not None:
kwargs['start'] = start_offset
if end_offset is not None:
kwargs['rows'] = end_offset - start_offset
if highlight is True:
kwargs['hl'] = 'true'
kwargs['hl.fragsize'] = '200'
if self.include_spelling is True:
kwargs['spellcheck'] = 'true'
kwargs['spellcheck.collate'] = 'true'
kwargs['spellcheck.count'] = 1
if spelling_query:
kwargs['spellcheck.q'] = spelling_query
if facets is not None:
kwargs['facet'] = 'on'
kwargs['facet.field'] = facets.keys()
for facet_field, options in facets.items():
for key, value in options.items():
kwargs['f.%s.facet.%s' % (facet_field, key)] = self.conn._from_python(value)
if date_facets is not None:
kwargs['facet'] = 'on'
kwargs['facet.date'] = date_facets.keys()
kwargs['facet.date.other'] = 'none'
for key, value in date_facets.items():
kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date'))
kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date'))
gap_by_string = value.get('gap_by').upper()
gap_string = "%d%s" % (value.get('gap_amount'), gap_by_string)
if value.get('gap_amount') != 1:
gap_string += "S"
kwargs["f.%s.facet.date.gap" % key] = '+%s/%s' % (gap_string, gap_by_string)
if query_facets is not None:
kwargs['facet'] = 'on'
kwargs['facet.query'] = ["%s:%s" % (field, value) for field, value in query_facets]
if limit_to_registered_models is None:
limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)
if models and len(models):
model_choices = sorted(get_model_ct(model) for model in models)
elif limit_to_registered_models:
# Using narrow queries, limit the results to only models handled
# with the current routers.
model_choices = self.build_models_list()
else:
model_choices = []
if len(model_choices) > 0:
if narrow_queries is None:
narrow_queries = set()
narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))
if narrow_queries is not None:
kwargs['fq'] = list(narrow_queries)
if stats:
kwargs['stats'] = "true"
for k in stats.keys():
kwargs['stats.field'] = k
for facet in stats[k]:
kwargs['f.%s.stats.facet' % k] = facet
if within is not None:
from haystack.utils.geo import generate_bounding_box
kwargs.setdefault('fq', [])
((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(within['point_1'], within['point_2'])
# Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
# very clear on this.
bbox = '%s:[%s,%s TO %s,%s]' % (within['field'], min_lat, min_lng, max_lat, max_lng)
kwargs['fq'].append(bbox)
if dwithin is not None:
kwargs.setdefault('fq', [])
lng, lat = dwithin['point'].get_coords()
geofilt = '{!geofilt pt=%s,%s sfield=%s d=%s}' % (lat, lng, dwithin['field'], dwithin['distance'].km)
kwargs['fq'].append(geofilt)
# Check to see if the backend should try to include distances
# (Solr 4.X+) in the results.
if self.distance_available and distance_point:
# In early testing, you can't just hand Solr 4.X a proper bounding box
# & request distances. To enable native distance would take calculating
# a center point & a radius off the user-provided box, which kinda
# sucks. We'll avoid it for now, since Solr 4.x's release will be some
# time yet.
# kwargs['fl'] += ' _dist_:geodist()'
pass
return kwargs
def more_like_this(self, model_instance, additional_query_string=None,
start_offset=0, end_offset=None, models=None,
limit_to_registered_models=None, result_class=None, **kwargs):
from haystack import connections
# Deferred models will have a different class ("RealClass_Deferred_fieldname")
# which won't be in our registry:
model_klass = model_instance._meta.concrete_model
index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
field_name = index.get_content_field()
params = {
'fl': '*,score',
}
if start_offset is not None:
params['start'] = start_offset
if end_offset is not None:
params['rows'] = end_offset
narrow_queries = set()
if limit_to_registered_models is None:
limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)
if models and len(models):
model_choices = sorted(get_model_ct(model) for model in models)
elif limit_to_registered_models:
# Using narrow queries, limit the results to only models handled
# with the current routers.
model_choices = self.build_models_list()
else:
model_choices = []
if len(model_choices) > 0:
if narrow_queries is None:
narrow_queries = set()
narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))
if additional_query_string:
narrow_queries.add(additional_query_string)
if narrow_queries:
params['fq'] = list(narrow_queries)
query = "%s:%s" % (ID, get_identifier(model_instance))
try:
raw_results = self.conn.more_like_this(query, field_name, **params)
except (IOError, SolrError) as e:
if not self.silently_fail:
raise
self.log.error("Failed to fetch More Like This from Solr for document '%s': %s", query, e)
raw_results = EmptyResults()
return self._process_results(raw_results, result_class=result_class)
def _process_results(self, raw_results, highlight=False, result_class=None, distance_point=None):
from haystack import connections
results = []
hits = raw_results.hits
facets = {}
stats = {}
spelling_suggestion = None
if result_class is None:
result_class = SearchResult
if hasattr(raw_results,'stats'):
stats = raw_results.stats.get('stats_fields',{})
if hasattr(raw_results, 'facets'):
facets = {
'fields': raw_results.facets.get('facet_fields', {}),
'dates': raw_results.facets.get('facet_dates', {}),
'queries': raw_results.facets.get('facet_queries', {}),
}
for key in ['fields']:
for facet_field in facets[key]:
# Convert to a two-tuple, as Solr's json format returns a list of
# pairs.
facets[key][facet_field] = list(zip(facets[key][facet_field][::2], facets[key][facet_field][1::2]))
if self.include_spelling is True:
if hasattr(raw_results, 'spellcheck'):
if len(raw_results.spellcheck.get('suggestions', [])):
# For some reason, it's an array of pairs. Pull off the
# collated result from the end.
spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1]
unified_index = connections[self.connection_alias].get_unified_index()
indexed_models = unified_index.get_indexed_models()
for raw_result in raw_results.docs:
app_label, model_name = raw_result[DJANGO_CT].split('.')
additional_fields = {}
model = haystack_get_model(app_label, model_name)
if model and model in indexed_models:
index = unified_index.get_index(model)
index_field_map = index.field_map
for key, value in raw_result.items():
string_key = str(key)
# re-map key if alternate name used
if string_key in index_field_map:
string_key = index_field_map[key]
if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
additional_fields[string_key] = index.fields[string_key].convert(value)
else:
additional_fields[string_key] = self.conn._to_python(value)
del(additional_fields[DJANGO_CT])
del(additional_fields[DJANGO_ID])
del(additional_fields['score'])
if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]
if distance_point:
additional_fields['_point_of_origin'] = distance_point
if raw_result.get('__dist__'):
from haystack.utils.geo import Distance
additional_fields['_distance'] = Distance(km=float(raw_result['__dist__']))
else:
additional_fields['_distance'] = None
result = result_class(app_label, model_name, raw_result[DJANGO_ID], raw_result['score'], **additional_fields)
results.append(result)
else:
hits -= 1
return {
'results': results,
'hits': hits,
'stats': stats,
'facets': facets,
'spelling_suggestion': spelling_suggestion,
}
def build_schema(self, fields):
content_field_name = ''
schema_fields = []
for field_name, field_class in fields.items():
field_data = {
'field_name': field_class.index_fieldname,
'type': 'text_en',
'indexed': 'true',
'stored': 'true',
'multi_valued': 'false',
}
if field_class.document is True:
content_field_name = field_class.index_fieldname
# DRL_FIXME: Perhaps move to something where, if none of these
# checks succeed, call a custom method on the form that
# returns, per-backend, the right type of storage?
if field_class.field_type in ['date', 'datetime']:
field_data['type'] = 'date'
elif field_class.field_type == 'integer':
field_data['type'] = 'long'
elif field_class.field_type == 'float':
field_data['type'] = 'float'
elif field_class.field_type == 'boolean':
field_data['type'] = 'boolean'
elif field_class.field_type == 'ngram':
field_data['type'] = 'ngram'
elif field_class.field_type == 'edge_ngram':
field_data['type'] = 'edge_ngram'
elif field_class.field_type == 'location':
field_data['type'] = 'location'
if field_class.is_multivalued:
field_data['multi_valued'] = 'true'
if field_class.stored is False:
field_data['stored'] = 'false'
# Do this last to override `text` fields.
if field_class.indexed is False:
field_data['indexed'] = 'false'
# If it's text and not being indexed, we probably don't want
# to do the normal lowercase/tokenize/stemming/etc. dance.
if field_data['type'] == 'text_en':
field_data['type'] = 'string'
# If it's a ``FacetField``, make sure we don't postprocess it.
if hasattr(field_class, 'facet_for'):
# If it's text, it ought to be a string.
if field_data['type'] == 'text_en':
field_data['type'] = 'string'
schema_fields.append(field_data)
return (content_field_name, schema_fields)
def extract_file_contents(self, file_obj):
"""Extract text and metadata from a structured file (PDF, MS Word, etc.)
Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
See the Solr wiki for details:
http://wiki.apache.org/solr/ExtractingRequestHandler
Due to the way the ExtractingRequestHandler is implemented it completely
replaces the normal Haystack indexing process with several unfortunate
restrictions: only one file per request, the extracted data is added to
the index with no ability to modify it, etc. To simplify the process and
allow for more advanced use we'll run using the extract-only mode to
return the extracted data without adding it to the index so we can then
use it within Haystack's normal templating process.
Returns None if metadata cannot be extracted; otherwise returns a
dictionary containing at least two keys:
:contents:
Extracted full-text content, if applicable
:metadata:
key:value pairs of text strings
"""
try:
return self.conn.extract(file_obj)
except Exception as e:
self.log.warning(u"Unable to extract file contents: %s", e,
exc_info=True, extra={"data": {"file": file_obj}})
return None
class SolrSearchQuery(BaseSearchQuery):
def matching_all_fragment(self):
return '*:*'
def build_query_fragment(self, field, filter_type, value):
from haystack import connections
query_frag = ''
if not hasattr(value, 'input_type_name'):
# Handle when we've got a ``ValuesListQuerySet``...
if hasattr(value, 'values_list'):
value = list(value)
if isinstance(value, six.string_types):
# It's not an ``InputType``. Assume ``Clean``.
value = Clean(value)
else:
value = PythonData(value)
# Prepare the query using the InputType.
prepared_value = value.prepare(self)
if not isinstance(prepared_value, (set, list, tuple)):
# Then convert whatever we get back to what pysolr wants if needed.
prepared_value = self.backend.conn._from_python(prepared_value)
# 'content' is a special reserved word, much like 'pk' in
# Django's ORM layer. It indicates 'no special field'.
if field == 'content':
index_fieldname = ''
else:
index_fieldname = u'%s:' % connections[self._using].get_unified_index().get_index_fieldname(field)
filter_types = {
'contains': u'%s',
'startswith': u'%s*',
'exact': u'%s',
'gt': u'{%s TO *}',
'gte': u'[%s TO *]',
'lt': u'{* TO %s}',
'lte': u'[* TO %s]',
}
if value.post_process is False:
query_frag = prepared_value
else:
if filter_type in ['contains', 'startswith']:
if value.input_type_name == 'exact':
query_frag = prepared_value
else:
# Iterate over terms & incorportate the converted form of each into the query.
terms = []
for possible_value in prepared_value.split(' '):
terms.append(filter_types[filter_type] % self.backend.conn._from_python(possible_value))
if len(terms) == 1:
query_frag = terms[0]
else:
query_frag = u"(%s)" % " AND ".join(terms)
elif filter_type == 'in':
in_options = []
for possible_value in prepared_value:
in_options.append(u'"%s"' % self.backend.conn._from_python(possible_value))
query_frag = u"(%s)" % " OR ".join(in_options)
elif filter_type == 'range':
start = self.backend.conn._from_python(prepared_value[0])
end = self.backend.conn._from_python(prepared_value[1])
query_frag = u'["%s" TO "%s"]' % (start, end)
elif filter_type == 'exact':
if value.input_type_name == 'exact':
query_frag = prepared_value
else:
prepared_value = Exact(prepared_value).prepare(self)
query_frag = filter_types[filter_type] % prepared_value
else:
if value.input_type_name != 'exact':
prepared_value = Exact(prepared_value).prepare(self)
query_frag = filter_types[filter_type] % prepared_value
if len(query_frag) and not isinstance(value, Raw):
if not query_frag.startswith('(') and not query_frag.endswith(')'):
query_frag = "(%s)" % query_frag
return u"%s%s" % (index_fieldname, query_frag)
def build_alt_parser_query(self, parser_name, query_string='', **kwargs):
if query_string:
query_string = Clean(query_string).prepare(self)
kwarg_bits = []
for key in sorted(kwargs.keys()):
if isinstance(kwargs[key], six.string_types) and ' ' in kwargs[key]:
kwarg_bits.append(u"%s='%s'" % (key, kwargs[key]))
else:
kwarg_bits.append(u"%s=%s" % (key, kwargs[key]))
return u'_query_:"{!%s %s}%s"' % (parser_name, Clean(' '.join(kwarg_bits)), query_string)
def build_params(self, spelling_query=None, **kwargs):
search_kwargs = {
'start_offset': self.start_offset,
'result_class': self.result_class
}
order_by_list = None
if self.order_by:
if order_by_list is None:
order_by_list = []
for order_by in self.order_by:
if order_by.startswith('-'):
order_by_list.append('%s desc' % order_by[1:])
else:
order_by_list.append('%s asc' % order_by)
search_kwargs['sort_by'] = ", ".join(order_by_list)
if self.date_facets:
search_kwargs['date_facets'] = self.date_facets
if self.distance_point:
search_kwargs['distance_point'] = self.distance_point
if self.dwithin:
search_kwargs['dwithin'] = self.dwithin
if self.end_offset is not None:
search_kwargs['end_offset'] = self.end_offset
if self.facets:
search_kwargs['facets'] = self.facets
if self.fields:
search_kwargs['fields'] = self.fields
if self.highlight:
search_kwargs['highlight'] = self.highlight
if self.models:
search_kwargs['models'] = self.models
if self.narrow_queries:
search_kwargs['narrow_queries'] = self.narrow_queries
if self.query_facets:
search_kwargs['query_facets'] = self.query_facets
if self.within:
search_kwargs['within'] = self.within
if spelling_query:
search_kwargs['spelling_query'] = spelling_query
if self.stats:
search_kwargs['stats'] = self.stats
return search_kwargs
def run(self, spelling_query=None, **kwargs):
"""Builds and executes the query. Returns a list of search results."""
final_query = self.build_query()
search_kwargs = self.build_params(spelling_query, **kwargs)
if kwargs:
search_kwargs.update(kwargs)
results = self.backend.search(final_query, **search_kwargs)
self._results = results.get('results', [])
self._hit_count = results.get('hits', 0)
self._facet_counts = self.post_process_facets(results)
self._stats = results.get('stats',{})
self._spelling_suggestion = results.get('spelling_suggestion', None)
def run_mlt(self, **kwargs):
"""Builds and executes the query. Returns a list of search results."""
if self._more_like_this is False or self._mlt_instance is None:
raise MoreLikeThisError("No instance was provided to determine 'More Like This' results.")
additional_query_string = self.build_query()
search_kwargs = {
'start_offset': self.start_offset,
'result_class': self.result_class,
'models': self.models
}
if self.end_offset is not None:
search_kwargs['end_offset'] = self.end_offset - self.start_offset
results = self.backend.more_like_this(self._mlt_instance, additional_query_string, **search_kwargs)
self._results = results.get('results', [])
self._hit_count = results.get('hits', 0)
class SolrEngine(BaseEngine):
backend = SolrSearchBackend
query = SolrSearchQuery