collective.solr/src/collective/solr/mangler.py

266 lines
9.3 KiB
Python

from zope.component import queryUtility
from AccessControl import getSecurityManager
from DateTime import DateTime
from collective.solr.interfaces import ISolrConnectionConfig
from collective.solr.queryparser import quote
from collective.solr.utils import isSimpleTerm
from collective.solr.utils import isSimpleSearch
from collective.solr.utils import isWildCard
from collective.solr.utils import splitSimpleSearch
from collective.solr.utils import prepare_wildcard
# templates for solr range expressions, keyed by the zope-style range
# specification (`min`, `max` or `min:max`); the placeholders are filled
# with the lower and/or upper bound
ranges = {
    'min': '[%s TO *]',
    'max': '[* TO %s]',
    'min:max': '[%s TO %s]',
}

# alternative field names tried by `cleanupQueryParameters` when the
# requested sort field is not part of the solr schema
sort_aliases = {
    'sortable_title': 'Title',
}

# names of the extra arguments `mangleQuery` collects from object-style
# query parameters
query_args = ('range', 'operator', 'depth')

# query keys which `mangleQuery` silently removes from the query
ignored = ('use_solr', '-C')
def iso8601date(value):
    """ convert a `DateTime` to its iso 8601 representation in UTC;
        seconds are rendered with three decimal places and any value
        which is not a `DateTime` is handed back unchanged """
    if not isinstance(value, DateTime):
        return value
    utc = value.toZone('UTC')
    parts = (
        utc.year(), utc.month(), utc.day(),
        utc.hour(), utc.minute(), utc.second(),
    )
    return '%04d-%02d-%02dT%02d:%02d:%06.3fZ' % parts
def makeSimpleExpressions(term, levenstein_distance):
    '''Build the search expressions for a single term of a simple query.

    Returns a `(value, base_value)` tuple of strings, each wrapped in
    parentheses:

    * terms containing a double quote are used verbatim for both
      expressions, with the fuzzy suffix appended
    * wildcard terms are passed through `prepare_wildcard`; their base
      value is the quoted term without `*` and `?`
    * all other terms match as a prefix or, using the fuzzy suffix, as
      the term itself; their base value is the unchanged term

    The fuzzy suffix is `~<levenstein_distance>`, or empty when the
    distance is falsy.'''
    fuzzy = '~%s' % levenstein_distance if levenstein_distance else ''
    if '"' in term:
        # quoted literals
        value = base_value = '%s%s' % (term, fuzzy)
    elif isWildCard(term):
        value = prepare_wildcard(term)
        base_value = quote(term.replace('*', '').replace('?', ''))
    else:
        value = '%s* OR %s%s' % (prepare_wildcard(term), term, fuzzy)
        base_value = term
    return '(%s)' % value, '(%s)' % base_value
def mangleSearchableText(value, config):
    """ rewrite a simple full-text search into a solr expression

        anything `isSimpleSearch` rejects is returned untouched;
        otherwise every term is expanded via `makeSimpleExpressions`
        and the results are joined with spaces.  if the configuration
        provides a `search_pattern` the joined expressions are inserted
        into it (as `value` and `base_value`) and the result is
        returned wrapped in a set """
    pattern = getattr(config, 'search_pattern', '')
    levenstein_distance = getattr(config, 'levenstein_distance', 0)
    if not isSimpleSearch(value):
        return value
    expressions = [
        makeSimpleExpressions(term, levenstein_distance)
        for term in splitSimpleSearch(value)
    ]
    base_value = ' '.join([base for _, base in expressions])
    value = ' '.join([expr for expr, _ in expressions])
    if not pattern:
        return value
    formatted = pattern.format(value=quote(value), base_value=base_value)
    return set([formatted])    # add literal query parameter
def mangleQuery(keywords, config, schema):
    """ translate / mangle query parameters to replace zope specifics
        with equivalent constructs for solr

        `keywords` is the dict of query parameters; it is modified in
        place and nothing is returned.  nested dict values given by the
        caller are only read, never modified.

        `config` is only accessed via `getattr` (`effective_steps`,
        `exclude_user`) and handed on to `mangleSearchableText`.

        `schema` is a mapping whose keys are the solr field names; when
        it is empty or `None` only `path` is treated as an EPI index.

        an `AssertionError` is raised when extra arguments were given
        for a non-string value which none of the rules could handle """
    # first pass: reduce every parameter to its plain query value (kept
    # in `keywords`) and collect its extra arguments in `extras`
    extras = {}
    # entries are removed along the way, so iterate over a snapshot
    for key, value in list(keywords.items()):
        if key.endswith('_usage'):          # convert old-style parameters
            category, spec = value.split(':', 1)
            extras[key[:-6]] = {category: spec}     # strip `_usage`
            del keywords[key]
        elif isinstance(value, dict):       # unify dict parameters
            # work on a copy: the caller's dict must stay intact so that
            # the very same query can be issued more than once
            extra = dict(value)
            keywords[key] = extra.pop('query')
            extras[key] = extra
        elif hasattr(value, 'query'):       # unify object parameters
            keywords[key] = value.query
            extra = {}
            for arg in query_args:
                arg_val = getattr(value, arg, None)
                if arg_val is not None:
                    extra[arg] = arg_val
            extras[key] = extra
        elif key in ignored:
            del keywords[key]

    # find EPI indexes (presumably "ExtendedPathIndex"), i.e. names for
    # which all of `<name>_string`, `<name>_depth` and `<name>_parents`
    # exist in the schema
    if schema:
        counts = {}
        for name in schema.keys():
            parts = name.split('_')
            if parts[-1] in ('string', 'depth', 'parents'):
                counts[parts[0]] = counts.get(parts[0], 0) + 1
        epi_indexes = [k for k, v in counts.items() if v == 3]
    else:
        epi_indexes = ['path']

    # second pass: rewrite the values; keys are added and removed here
    # as well, hence the snapshot
    for key, value in list(keywords.items()):
        args = extras.get(key, {})
        if key == 'SearchableText':
            keywords[key] = mangleSearchableText(value, config)
            continue
        if key in epi_indexes:
            # path queries are run against the `<key>_parents` field
            path = keywords['%s_parents' % key] = value
            del keywords[key]
            if 'depth' in args:
                depth = int(args['depth'])
                if depth >= 0:
                    if not isinstance(value, (list, tuple)):
                        path = [path]
                    tmpl = '(+%s_depth:[%d TO %d] AND +%s_parents:%s)'
                    params = keywords['%s_parents' % key] = set()
                    for p in path:
                        base = len(p.split('/'))
                        # a depth of 0 matches the level of the path
                        # itself, any other depth the levels below it
                        params.add(tmpl % (
                            key,
                            base + (depth and 1),
                            base + depth,
                            key,
                            p,
                        ))
                del args['depth']
        elif key == 'effectiveRange':
            if isinstance(value, DateTime):
                steps = getattr(config, 'effective_steps', 1)
                if steps > 1:
                    # round down to a multiple of `steps` seconds
                    value = DateTime(value.timeTime() // steps * steps)
                value = iso8601date(value)
            del keywords[key]
            keywords['effective'] = '[* TO %s]' % value
            keywords['expires'] = '[%s TO *]' % value
        elif key == 'show_inactive':
            del keywords[key]               # marker for `effectiveRange`
        elif 'range' in args:
            if not isinstance(value, (list, tuple)):
                value = [value]
            payload = tuple([iso8601date(v) for v in value])
            keywords[key] = ranges[args['range']] % payload
            del args['range']
        elif 'operator' in args:
            if isinstance(value, (list, tuple)) and len(value) > 1:
                sep = ' %s ' % args['operator'].upper()
                value = sep.join([str(iso8601date(v)) for v in value])
                keywords[key] = '(%s)' % value
            del args['operator']
        elif key == 'allowedRolesAndUsers':
            if getattr(config, 'exclude_user', False):
                user_id = getSecurityManager().getUser().getId()
                # `getId` may return `None` (e.g. for zope's special
                # users), in which case there is no token to remove
                if user_id is not None:
                    token = 'user$' + user_id
                    if token in value:
                        value.remove(token)
        elif isinstance(value, DateTime):
            keywords[key] = iso8601date(value)
        elif not isinstance(value, basestring):
            # raised explicitly so the check also works with `python -O`
            if args:
                raise AssertionError('unsupported usage: %r' % (args,))
def extractQueryParameters(args):
    """ extract parameters related to sorting and limiting search results
        from a given set of arguments, also removing them

        `args` is modified in place; the extracted values are returned
        as a new dict using solr's parameter names:

        * `sort_on` / `sort_order` (or `sort-on` / `sort-order`) become
          `sort`, e.g. `Title desc`; the order is only looked at (and
          removed) when a sort index was given
        * `sort_limit` and `b_size` become `rows`, `b_start` becomes
          `start` (all converted with `int`)
        * `fq`, `fl`, `facet` and `hl` are passed on unchanged
        * `facet_*` is renamed to `facet.*`; keys already starting with
          `facet.` keep their name.  for string values of either form
          everything from the first colon on is dropped, lists and
          tuples are handled element-wise and keep their type, and
          non-string values are passed on unchanged """
    def get(name):
        # the first of `sort_<name>` and `sort-<name>` holding a value
        # is removed from `args` and returned
        for prefix in 'sort_', 'sort-':
            key = '%s%s' % (prefix, name)
            value = args.get(key, None)
            if value is not None:
                del args[key]
                return value
        return None

    def name(facet):
        # only strings can carry a `:...` suffix; other values such as
        # numbers are left alone
        if hasattr(facet, 'split'):
            return facet.split(':', 1)[0]
        return facet

    params = {}
    index = get('on')
    if index:
        reverse = (get('order') or '').lower() in ('reverse', 'descending')
        params['sort'] = '%s %s' % (index, 'desc' if reverse else 'asc')
    limit = get('limit')
    if limit:
        params['rows'] = int(limit)
    # entries are removed while looping, so iterate over a snapshot
    for key, value in list(args.items()):
        if key in ('fq', 'fl', 'facet', 'hl'):
            params[key] = value
            del args[key]
        elif key.startswith(('facet.', 'facet_')):
            if isinstance(value, list):
                value = [name(facet) for facet in value]
            elif isinstance(value, tuple):
                value = tuple([name(facet) for facet in value])
            else:
                value = name(value)
            if key.startswith('facet_'):
                # only the prefix is converted, underscores in the
                # remainder of the key are part of the name
                key_name = 'facet.' + key[len('facet_'):]
            else:
                key_name = key
            params[key_name] = value
            del args[key]
        elif key == 'b_start':
            params['start'] = int(value)
            del args[key]
        elif key == 'b_size':
            params['rows'] = int(value)
            del args[key]
    return params
def cleanupQueryParameters(args, schema):
    """ check the given query parameters against the given solr schema
        and fix them up where needed

        a `sort` on a field missing from the schema is retried with the
        name from `sort_aliases`; if the resulting field is unknown or
        not indexed the `sort` parameter is dropped.  faceting is turned
        on when `facet.field` is given without `facet`.  `args` is
        modified in place and also returned """
    sort = args.get('sort')
    if sort is not None:
        field, order = sort.split(' ', 1)
        if field not in schema:
            field = sort_aliases.get(field)
        definition = schema.get(field)
        if definition is None or not definition.indexed:
            del args['sort']
        else:
            args['sort'] = '%s %s' % (field, order)
    if 'facet.field' in args and 'facet' not in args:
        args['facet'] = 'true'
    return args
def optimizeQueryParameters(query, params):
    """ move query terms into filter queries for the indexes configured
        via `filter_queries`

        every configured entry is a space-separated group of index
        names; if the query holds terms for the whole group they are
        removed from `query` and joined into one filter query, which is
        added to `params['fq']`.  both `query` and `params` are
        modified in place """
    config = queryUtility(ISolrConnectionConfig)
    filters = []
    if config is not None:
        for group in config.filter_queries:
            names = set(group.split(' '))
            if names.issubset(query.keys()):
                terms = [query.pop(name) for name in names]
                filters.append(' '.join(terms))
    if 'fq' not in params:
        if filters:
            params['fq'] = filters
    elif isinstance(params['fq'], list):
        params['fq'].extend(filters)
    else:
        # a single existing filter query gets wrapped into a list
        params['fq'] = [params['fq']] + filters
    if not query:
        query['*'] = '*:*'      # catch all if no regular query is left...