# encoding: utf-8

from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import re
import warnings

from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from django.utils import six

import haystack
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, log_query
from haystack.constants import DEFAULT_OPERATOR, DJANGO_CT, DJANGO_ID, ID
from haystack.exceptions import MissingDependency, MoreLikeThisError, SkipDocument
from haystack.inputs import Clean, Exact, PythonData, Raw
from haystack.models import SearchResult
from haystack.utils import log as logging
from haystack.utils import get_identifier, get_model_ct
from haystack.utils.app_loading import haystack_get_model

try:
    import elasticsearch
    from elasticsearch.helpers import bulk_index
    from elasticsearch.exceptions import NotFoundError
except ImportError:
    raise MissingDependency("The 'elasticsearch' backend requires the installation of 'elasticsearch'. Please refer to the documentation.")


DATETIME_REGEX = re.compile(
    r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T'
    r'(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d+)?$')


class ElasticsearchSearchBackend(BaseSearchBackend):
    # Words reserved by Elasticsearch for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Elasticsearch for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '/',
    )

    # Settings to add an n-gram & edge n-gram analyzer.
    DEFAULT_SETTINGS = {
        'settings': {
            "analysis": {
                "analyzer": {
                    "ngram_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["haystack_ngram", "lowercase"]
                    },
                    "edgengram_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["haystack_edgengram", "lowercase"]
                    }
                },
                "tokenizer": {
                    "haystack_ngram_tokenizer": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15,
                    },
                    "haystack_edgengram_tokenizer": {
                        "type": "edgeNGram",
                        "min_gram": 2,
                        "max_gram": 15,
                        "side": "front"
                    }
                },
                "filter": {
                    "haystack_ngram": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15
                    },
                    "haystack_edgengram": {
                        "type": "edgeNGram",
                        "min_gram": 2,
                        "max_gram": 15
                    }
                }
            }
        }
    }

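    # Note: the analyzer names above are referenced by FIELD_MAPPINGS at the
    # bottom of this module; a subclass that overrides DEFAULT_SETTINGS (e.g.
    # to change the n-gram sizes) should keep those names intact.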
    def __init__(self, connection_alias, **connection_options):
        super(ElasticsearchSearchBackend, self).__init__(connection_alias, **connection_options)

        if 'URL' not in connection_options:
            raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias)

        if 'INDEX_NAME' not in connection_options:
            raise ImproperlyConfigured("You must specify an 'INDEX_NAME' in your settings for connection '%s'." % connection_alias)

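        # For illustration, a minimal connection definition looks roughly like
        # (assuming the stock haystack module path for this backend):
        #
        #   HAYSTACK_CONNECTIONS = {
        #       'default': {
        #           'ENGINE': 'haystack.backends.elasticsearch_backend.ElasticsearchSearchEngine',
        #           'URL': 'http://127.0.0.1:9200/',
        #           'INDEX_NAME': 'haystack',
        #       },
        #   }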
        self.conn = elasticsearch.Elasticsearch(connection_options['URL'], timeout=self.timeout, **connection_options.get('KWARGS', {}))
        self.index_name = connection_options['INDEX_NAME']
        self.log = logging.getLogger('haystack')
        self.setup_complete = False
        self.existing_mapping = {}

    def setup(self):
        """
        Defers loading until needed.
        """
        # Get the existing mapping & cache it. We'll compare it
        # during the ``update`` & if it doesn't match, we'll put the new
        # mapping.
        try:
            self.existing_mapping = self.conn.indices.get_mapping(index=self.index_name)
        except NotFoundError:
            pass
        except Exception:
            if not self.silently_fail:
                raise

        unified_index = haystack.connections[self.connection_alias].get_unified_index()
        self.content_field_name, field_mapping = self.build_schema(unified_index.all_searchfields())
        current_mapping = {
            'modelresult': {
                'properties': field_mapping,
                '_boost': {
                    'name': 'boost',
                    'null_value': 1.0
                }
            }
        }

        if current_mapping != self.existing_mapping:
            try:
                # Make sure the index is there first.
                self.conn.indices.create(index=self.index_name, body=self.DEFAULT_SETTINGS, ignore=400)
                self.conn.indices.put_mapping(index=self.index_name, doc_type='modelresult', body=current_mapping)
                self.existing_mapping = current_mapping
            except Exception:
                if not self.silently_fail:
                    raise

        self.setup_complete = True

    def update(self, index, iterable, commit=True):
        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Elasticsearch: %s", e)
                return

        prepped_docs = []

        for obj in iterable:
            try:
                prepped_data = index.full_prepare(obj)
                final_data = {}

                # Convert the data to make sure it's happy.
                for key, value in prepped_data.items():
                    final_data[key] = self._from_python(value)
                final_data['_id'] = final_data[ID]

                prepped_docs.append(final_data)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })

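        # Each prepped doc is a flat dict keyed by index fieldname; its '_id'
        # (e.g. 'app_label.model_name.pk', per get_identifier) tells the bulk
        # helper which Elasticsearch document to create or replace.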
        bulk_index(self.conn, prepped_docs, index=self.index_name, doc_type='modelresult')

        if commit:
            self.conn.indices.refresh(index=self.index_name)

    def remove(self, obj_or_string, commit=True):
        doc_id = get_identifier(obj_or_string)

        if not self.setup_complete:
            try:
                self.setup()
            except elasticsearch.TransportError as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)
                return

        try:
            self.conn.delete(index=self.index_name, doc_type='modelresult', id=doc_id, ignore=404)

            if commit:
                self.conn.indices.refresh(index=self.index_name)
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Elasticsearch: %s", doc_id, e)

    def clear(self, models=None, commit=True):
        # We actually don't want to do this here, as mappings could be
        # very different.
        # if not self.setup_complete:
        #     self.setup()

        try:
            if not models:
                self.conn.indices.delete(index=self.index_name, ignore=404)
                self.setup_complete = False
                self.existing_mapping = {}
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model)))

                # Delete by query in Elasticsearch assumes you're dealing with
                # a ``query`` root object. :/
                query = {'query': {'query_string': {'query': " OR ".join(models_to_delete)}}}
                self.conn.delete_by_query(index=self.index_name, doc_type='modelresult', body=query)
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            if models:
                self.log.error("Failed to clear Elasticsearch index of models '%s': %s", ','.join(models_to_delete), e)
            else:
                self.log.error("Failed to clear Elasticsearch index: %s", e)

    def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None,
                            fields='', highlight=False, facets=None,
                            date_facets=None, query_facets=None,
                            narrow_queries=None, spelling_query=None,
                            within=None, dwithin=None, distance_point=None,
                            models=None, limit_to_registered_models=None,
                            result_class=None):
        index = haystack.connections[self.connection_alias].get_unified_index()
        content_field = index.document_field

        if query_string == '*:*':
            kwargs = {
                'query': {
                    "match_all": {}
                },
            }
        else:
            kwargs = {
                'query': {
                    'query_string': {
                        'default_field': content_field,
                        'default_operator': DEFAULT_OPERATOR,
                        'query': query_string,
                        'analyze_wildcard': True,
                        'auto_generate_phrase_queries': True,
                    },
                },
            }

        # So far, no filters.
        filters = []

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)

            kwargs['fields'] = fields

        if sort_by is not None:
            order_list = []
            for field, direction in sort_by:
                if field == 'distance' and distance_point:
                    # Do the geo-enabled sort.
                    lng, lat = distance_point['point'].get_coords()
                    sort_kwargs = {
                        "_geo_distance": {
                            distance_point['field']: [lng, lat],
                            "order": direction,
                            "unit": "km"
                        }
                    }
                else:
                    if field == 'distance':
                        warnings.warn("In order to sort by distance, you must call the '.distance(...)' method.")

                    # Regular sorting.
                    sort_kwargs = {field: {'order': direction}}

                order_list.append(sort_kwargs)

            kwargs['sort'] = order_list

        # From/size offsets don't seem to work right in Elasticsearch's DSL. :/
        # if start_offset is not None:
        #     kwargs['from'] = start_offset

        # if end_offset is not None:
        #     kwargs['size'] = end_offset - start_offset

        if highlight is True:
            kwargs['highlight'] = {
                'fields': {
                    content_field: {'store': 'yes'},
                }
            }

        if self.include_spelling:
            kwargs['suggest'] = {
                'suggest': {
                    'text': spelling_query or query_string,
                    'term': {
                        # Using content_field here will result in suggestions of stemmed words.
                        'field': '_all',
                    },
                },
            }

        if narrow_queries is None:
            narrow_queries = set()

        if facets is not None:
            kwargs.setdefault('facets', {})

            for facet_fieldname, extra_options in facets.items():
                facet_options = {
                    'terms': {
                        'field': facet_fieldname,
                        'size': 100,
                    },
                }
                # Special cases for options applied at the facet level (not the terms level).
                if extra_options.pop('global_scope', False):
                    # Renamed "global_scope" since "global" is a python keyword.
                    facet_options['global'] = True
                if 'facet_filter' in extra_options:
                    facet_options['facet_filter'] = extra_options.pop('facet_filter')
                facet_options['terms'].update(extra_options)
                kwargs['facets'][facet_fieldname] = facet_options

        if date_facets is not None:
            kwargs.setdefault('facets', {})

            for facet_fieldname, value in date_facets.items():
                # Need to detect the gap_by interval & only add the amount if it's more than one.
                interval = value.get('gap_by').lower()

                # An amount can't be applied to month or year intervals.
                if value.get('gap_amount', 1) != 1 and interval not in ('month', 'year'):
                    # Just the first character of the interval is valid for use,
                    # e.g. gap_amount=3 with gap_by='day' becomes '3d'.
                    interval = "%s%s" % (value['gap_amount'], interval[:1])

                kwargs['facets'][facet_fieldname] = {
                    'date_histogram': {
                        'field': facet_fieldname,
                        'interval': interval,
                    },
                    'facet_filter': {
                        "range": {
                            facet_fieldname: {
                                'from': self._from_python(value.get('start_date')),
                                'to': self._from_python(value.get('end_date')),
                            }
                        }
                    }
                }

        if query_facets is not None:
            kwargs.setdefault('facets', {})

            for facet_fieldname, value in query_facets:
                kwargs['facets'][facet_fieldname] = {
                    'query': {
                        'query_string': {
                            'query': value,
                        }
                    },
                }

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            filters.append({"terms": {DJANGO_CT: model_choices}})

        for q in narrow_queries:
            filters.append({
                'fquery': {
                    'query': {
                        'query_string': {
                            'query': q
                        },
                    },
                    '_cache': True,
                }
            })

        if within is not None:
            from haystack.utils.geo import generate_bounding_box

            ((south, west), (north, east)) = generate_bounding_box(within['point_1'], within['point_2'])
            within_filter = {
                "geo_bounding_box": {
                    within['field']: {
                        "top_left": {
                            "lat": north,
                            "lon": west
                        },
                        "bottom_right": {
                            "lat": south,
                            "lon": east
                        }
                    }
                },
            }
            filters.append(within_filter)

        if dwithin is not None:
            lng, lat = dwithin['point'].get_coords()

            # NB: the 1.0.0 release of elasticsearch introduced an
            # incompatible change to the distance filter formatting.
            if elasticsearch.VERSION >= (1, 0, 0):
                distance = "%(dist).6f%(unit)s" % {
                    'dist': dwithin['distance'].km,
                    'unit': "km"
                }
            else:
                distance = dwithin['distance'].km

            dwithin_filter = {
                "geo_distance": {
                    "distance": distance,
                    dwithin['field']: {
                        "lat": lat,
                        "lon": lng
                    }
                }
            }
            filters.append(dwithin_filter)

        # If we want to filter, change the query type to filtered.
        if filters:
            kwargs["query"] = {"filtered": {"query": kwargs.pop("query")}}
            if len(filters) == 1:
                kwargs['query']['filtered']["filter"] = filters[0]
            else:
                kwargs['query']['filtered']["filter"] = {"bool": {"must": filters}}

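        # For illustration, a plain keyword search (no filters, facets or
        # sorting) produces roughly:
        #   {'query': {'query_string': {'default_field': <content field>,
        #                               'default_operator': DEFAULT_OPERATOR,
        #                               'query': '(hello)',
        #                               'analyze_wildcard': True,
        #                               'auto_generate_phrase_queries': True}}}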
        return kwargs

    @log_query
    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        if not self.setup_complete:
            self.setup()

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)
        search_kwargs['from'] = kwargs.get('start_offset', 0)

        order_fields = set()
        for order in search_kwargs.get('sort', []):
            for key in order.keys():
                order_fields.add(key)

        geo_sort = '_geo_distance' in order_fields

        end_offset = kwargs.get('end_offset')
        start_offset = kwargs.get('start_offset', 0)
        if end_offset is not None and end_offset > start_offset:
            search_kwargs['size'] = end_offset - start_offset

        try:
            raw_results = self.conn.search(body=search_kwargs,
                                           index=self.index_name,
                                           doc_type='modelresult',
                                           _source=True)
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Elasticsearch using '%s': %s", query_string, e)
            raw_results = {}

        return self._process_results(raw_results,
                                     highlight=kwargs.get('highlight'),
                                     result_class=kwargs.get('result_class', SearchResult),
                                     distance_point=kwargs.get('distance_point'),
                                     geo_sort=geo_sort)

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        from haystack import connections

        if not self.setup_complete:
            self.setup()

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {}

        if start_offset is not None:
            params['search_from'] = start_offset

        if end_offset is not None:
            params['search_size'] = end_offset - start_offset

        doc_id = get_identifier(model_instance)

        try:
            raw_results = self.conn.mlt(index=self.index_name, doc_type='modelresult', id=doc_id, mlt_fields=[field_name], **params)
        except elasticsearch.TransportError as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to fetch More Like This from Elasticsearch for document '%s': %s", doc_id, e)
            raw_results = {}

        return self._process_results(raw_results, result_class=result_class)

    def _process_results(self, raw_results, highlight=False,
                         result_class=None, distance_point=None,
                         geo_sort=False):
        from haystack import connections
        results = []
        hits = raw_results.get('hits', {}).get('total', 0)
        facets = {}
        spelling_suggestion = None

        if result_class is None:
            result_class = SearchResult

        if self.include_spelling and 'suggest' in raw_results:
            raw_suggest = raw_results['suggest'].get('suggest')
            if raw_suggest:
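                # Each suggest entry looks roughly like {'text': ..., 'options': [...]};
                # keep the original word when there are no options, else take the
                # top-ranked option (a sketch of the suggester response shape).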
                spelling_suggestion = ' '.join([word['text'] if len(word['options']) == 0 else word['options'][0]['text'] for word in raw_suggest])

        if 'facets' in raw_results:
            facets = {
                'fields': {},
                'dates': {},
                'queries': {},
            }

            for facet_fieldname, facet_info in raw_results['facets'].items():
                if facet_info.get('_type', 'terms') == 'terms':
                    facets['fields'][facet_fieldname] = [(individual['term'], individual['count']) for individual in facet_info['terms']]
                elif facet_info.get('_type', 'terms') == 'date_histogram':
                    # Elasticsearch provides UTC timestamps with an extra three
                    # decimals of precision, which datetime barfs on.
                    facets['dates'][facet_fieldname] = [(datetime.datetime.utcfromtimestamp(individual['time'] / 1000), individual['count']) for individual in facet_info['entries']]
                elif facet_info.get('_type', 'terms') == 'query':
                    facets['queries'][facet_fieldname] = facet_info['count']

        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()
        content_field = unified_index.document_field

        for raw_result in raw_results.get('hits', {}).get('hits', []):
            source = raw_result['_source']
            app_label, model_name = source[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in source.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if 'highlight' in raw_result:
                    additional_fields['highlighted'] = raw_result['highlight'].get(content_field, '')

                if distance_point:
                    additional_fields['_point_of_origin'] = distance_point

                if geo_sort and raw_result.get('sort'):
                    from haystack.utils.geo import Distance
                    additional_fields['_distance'] = Distance(km=float(raw_result['sort'][0]))
                else:
                    additional_fields['_distance'] = None

                result = result_class(app_label, model_name, source[DJANGO_ID], raw_result['_score'], **additional_fields)
                results.append(result)
            else:
                hits -= 1

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def build_schema(self, fields):
        content_field_name = ''
        mapping = {
            DJANGO_CT: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
            DJANGO_ID: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
        }

        for field_name, field_class in fields.items():
            field_mapping = FIELD_MAPPINGS.get(field_class.field_type, DEFAULT_FIELD_MAPPING).copy()
            if field_class.boost != 1.0:
                field_mapping['boost'] = field_class.boost

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # Do this last to override `text` fields.
            if field_mapping['type'] == 'string':
                if field_class.indexed is False or hasattr(field_class, 'facet_for'):
                    field_mapping['index'] = 'not_analyzed'
                    del field_mapping['analyzer']

            mapping[field_class.index_fieldname] = field_mapping

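        # For illustration, a single CharField(document=True) whose index
        # fieldname is 'text' yields roughly:
        #   ('text', {'django_ct': {...}, 'django_id': {...},
        #             'text': {'type': 'string', 'analyzer': 'snowball'}})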
        return (content_field_name, mapping)

    def _iso_datetime(self, value):
        """
        If value appears to be something datetime-like, return it in ISO format.

        Otherwise, return None.
        """
        if hasattr(value, 'strftime'):
            if hasattr(value, 'hour'):
                return value.isoformat()
            else:
                return '%sT00:00:00' % value.isoformat()

    def _from_python(self, value):
        """Convert more Python data types to ES-understandable JSON."""
        iso = self._iso_datetime(value)
        if iso:
            return iso
        elif isinstance(value, six.binary_type):
            # TODO: Be stricter.
            return six.text_type(value, errors='replace')
        elif isinstance(value, set):
            return list(value)
        return value

    def _to_python(self, value):
        """Convert values from ElasticSearch to native Python values."""
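        # Illustrative round trip with _from_python (assuming a naive datetime):
        #   _from_python(datetime.datetime(2015, 1, 1, 12, 30)) -> '2015-01-01T12:30:00'
        #   _to_python('2015-01-01T12:30:00') -> datetime.datetime(2015, 1, 1, 12, 30)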
        if isinstance(value, (int, float, complex, list, tuple, bool)):
            return value

        if isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime.datetime(
                    date_values['year'], date_values['month'],
                    date_values['day'], date_values['hour'],
                    date_values['minute'], date_values['second'])

            try:
                # This is slightly gross but it's hard to tell otherwise what the
                # string's original type might have been. Be careful who you trust.
                converted_value = eval(value)

                # Try to handle most built-in types.
                if isinstance(
                        converted_value,
                        (int, list, tuple, set, dict, float, complex)):
                    return converted_value
            except Exception:
                # If it fails (SyntaxError or its ilk) or we don't trust it,
                # continue on.
                pass

        return value

# DRL_FIXME: Perhaps move to something where, if none of these
#            match, call a custom method on the form that returns, per-backend,
#            the right type of storage?
DEFAULT_FIELD_MAPPING = {'type': 'string', 'analyzer': 'snowball'}
FIELD_MAPPINGS = {
    'edge_ngram': {'type': 'string', 'analyzer': 'edgengram_analyzer'},
    'ngram': {'type': 'string', 'analyzer': 'ngram_analyzer'},
    'date': {'type': 'date'},
    'datetime': {'type': 'date'},

    'location': {'type': 'geo_point'},
    'boolean': {'type': 'boolean'},
    'float': {'type': 'float'},
    'long': {'type': 'long'},
    'integer': {'type': 'long'},
}


# Sucks that this is almost an exact copy of what's in the Solr backend,
# but we can't import due to dependencies.
class ElasticsearchSearchQuery(BaseSearchQuery):
    def matching_all_fragment(self):
        return '*:*'

    def build_query_fragment(self, field, filter_type, value):
        from haystack import connections
        query_frag = ''

        if not hasattr(value, 'input_type_name'):
            # Handle when we've got a ``ValuesListQuerySet``...
            if hasattr(value, 'values_list'):
                value = list(value)

            if isinstance(value, six.string_types):
                # It's not an ``InputType``. Assume ``Clean``.
                value = Clean(value)
            else:
                value = PythonData(value)

        # Prepare the query using the InputType.
        prepared_value = value.prepare(self)

        if not isinstance(prepared_value, (set, list, tuple)):
            # Then convert whatever we get back to what the backend wants if needed.
            prepared_value = self.backend._from_python(prepared_value)

        # 'content' is a special reserved word, much like 'pk' in
        # Django's ORM layer. It indicates 'no special field'.
        if field == 'content':
            index_fieldname = ''
        else:
            index_fieldname = u'%s:' % connections[self._using].get_unified_index().get_index_fieldname(field)

        filter_types = {
            'contains': u'%s',
            'startswith': u'%s*',
            'exact': u'%s',
            'gt': u'{%s TO *}',
            'gte': u'[%s TO *]',
            'lt': u'{* TO %s}',
            'lte': u'[* TO %s]',
        }

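        # Illustrative fragments this method returns (assuming the index maps
        # the field 'title' to the same fieldname):
        #   build_query_fragment('title', 'contains', 'hello') -> u'title:(hello)'
        #   build_query_fragment('content', 'contains', 'hello') -> u'(hello)'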
        if value.post_process is False:
            query_frag = prepared_value
        else:
            if filter_type in ['contains', 'startswith']:
                if value.input_type_name == 'exact':
                    query_frag = prepared_value
                else:
                    # Iterate over terms & incorporate the converted form of each into the query.
                    terms = []

                    if isinstance(prepared_value, six.string_types):
                        for possible_value in prepared_value.split(' '):
                            terms.append(filter_types[filter_type] % self.backend._from_python(possible_value))
                    else:
                        terms.append(filter_types[filter_type] % self.backend._from_python(prepared_value))

                    if len(terms) == 1:
                        query_frag = terms[0]
                    else:
                        query_frag = u"(%s)" % " AND ".join(terms)
            elif filter_type == 'in':
                in_options = []

                for possible_value in prepared_value:
                    in_options.append(u'"%s"' % self.backend._from_python(possible_value))

                query_frag = u"(%s)" % " OR ".join(in_options)
            elif filter_type == 'range':
                start = self.backend._from_python(prepared_value[0])
                end = self.backend._from_python(prepared_value[1])
                query_frag = u'["%s" TO "%s"]' % (start, end)
            elif filter_type == 'exact':
                if value.input_type_name == 'exact':
                    query_frag = prepared_value
                else:
                    prepared_value = Exact(prepared_value).prepare(self)
                    query_frag = filter_types[filter_type] % prepared_value
            else:
                if value.input_type_name != 'exact':
                    prepared_value = Exact(prepared_value).prepare(self)

                query_frag = filter_types[filter_type] % prepared_value

        if len(query_frag) and not isinstance(value, Raw):
            if not query_frag.startswith('(') and not query_frag.endswith(')'):
                query_frag = "(%s)" % query_frag

        return u"%s%s" % (index_fieldname, query_frag)

    def build_alt_parser_query(self, parser_name, query_string='', **kwargs):
        if query_string:
            kwargs['v'] = query_string

        kwarg_bits = []

        for key in sorted(kwargs.keys()):
            if isinstance(kwargs[key], six.string_types) and ' ' in kwargs[key]:
                kwarg_bits.append(u"%s='%s'" % (key, kwargs[key]))
            else:
                kwarg_bits.append(u"%s=%s" % (key, kwargs[key]))

        return u"{!%s %s}" % (parser_name, ' '.join(kwarg_bits))

    def build_params(self, spelling_query=None, **kwargs):
        search_kwargs = {
            'start_offset': self.start_offset,
            'result_class': self.result_class
        }
        order_by_list = None

        if self.order_by:
            if order_by_list is None:
                order_by_list = []

            for field in self.order_by:
                direction = 'asc'
                if field.startswith('-'):
                    direction = 'desc'
                    field = field[1:]
                order_by_list.append((field, direction))

            search_kwargs['sort_by'] = order_by_list

        if self.date_facets:
            search_kwargs['date_facets'] = self.date_facets

        if self.distance_point:
            search_kwargs['distance_point'] = self.distance_point

        if self.dwithin:
            search_kwargs['dwithin'] = self.dwithin

        if self.end_offset is not None:
            search_kwargs['end_offset'] = self.end_offset

        if self.facets:
            search_kwargs['facets'] = self.facets

        if self.fields:
            search_kwargs['fields'] = self.fields

        if self.highlight:
            search_kwargs['highlight'] = self.highlight

        if self.models:
            search_kwargs['models'] = self.models

        if self.narrow_queries:
            search_kwargs['narrow_queries'] = self.narrow_queries

        if self.query_facets:
            search_kwargs['query_facets'] = self.query_facets

        if self.within:
            search_kwargs['within'] = self.within

        if spelling_query:
            search_kwargs['spelling_query'] = spelling_query

        return search_kwargs

    def run(self, spelling_query=None, **kwargs):
        """Builds and executes the query, storing the results on this instance."""
        final_query = self.build_query()
        search_kwargs = self.build_params(spelling_query, **kwargs)

        if kwargs:
            search_kwargs.update(kwargs)

        results = self.backend.search(final_query, **search_kwargs)
        self._results = results.get('results', [])
        self._hit_count = results.get('hits', 0)
        self._facet_counts = self.post_process_facets(results)
        self._spelling_suggestion = results.get('spelling_suggestion', None)

    def run_mlt(self, **kwargs):
        """Builds and executes a More Like This query, storing the results on this instance."""
        if self._more_like_this is False or self._mlt_instance is None:
            raise MoreLikeThisError("No instance was provided to determine 'More Like This' results.")

        additional_query_string = self.build_query()
        search_kwargs = {
            'start_offset': self.start_offset,
            'result_class': self.result_class,
            'models': self.models
        }

        if self.end_offset is not None:
            search_kwargs['end_offset'] = self.end_offset - self.start_offset

        results = self.backend.more_like_this(self._mlt_instance, additional_query_string, **search_kwargs)
        self._results = results.get('results', [])
        self._hit_count = results.get('hits', 0)


class ElasticsearchSearchEngine(BaseEngine):
    backend = ElasticsearchSearchBackend
    query = ElasticsearchSearchQuery