240 lines
8.5 KiB
Python
240 lines
8.5 KiB
Python
from re import compile
|
|
|
|
# Solr/Lucene reserved characters/terms: + - && || ! ( ) { } [ ] ^ " ~ * ? : \
# (see http://wiki.apache.org/solr/SolrQuerySyntax)
#
# The tokenizer is one alternation of four capture groups; every match fills
# exactly one of them (``findall`` yields '' for the other three):
#   1) Whitespace                                (\s+)
#   2) Any non reserved characters (normal text) ([^(){}[\]+\-!^"~*?:\\\s]+)
#   3) Any grouping characters                   ([(){}\[\]"])
#   4) Any special operators                     ([+\-!^~*?:\\])
#
# The pattern is a raw string so that regex escapes such as ``\s`` or ``\-``
# reach the regex engine verbatim instead of being treated as (invalid)
# Python string escapes.
query_tokenizer = compile(
    r'(?:(\s+)|([^(){}[\]+\-!^"~*?:\\\s]+)|([(){}\[\]"])|([+\-!^~*?:\\]))'
)
|
|
|
|
|
|
class Whitespace(object):
    """Marker object standing for whitespace between query tokens.

    Instances are always falsy, so truthiness filters such as
    ``[x for x in group if x]`` skip them, and they render as one space.
    """

    def __bool__(self):
        """Always evaluate as false (Python 3 truth protocol)."""
        return False

    # Python 2 consults ``__nonzero__`` rather than ``__bool__``; keep both so
    # the marker is falsy on either interpreter.
    __nonzero__ = __bool__

    def __str__(self):
        """Render the marker as a single space character."""
        return ' '
|
|
|
|
|
|
class Group(list):
    """List of query tokens enclosed by a pair of delimiters.

    Attributes:
        start: opening delimiter string (``None`` when not given).
        end: closing delimiter string (``None`` when not given).
        isgroup: initialised to ``False``; intended to be switched on by the
            code that closes (pops) the group.
    """

    def __init__(self, start=None, end=None):
        super(Group, self).__init__()
        self.start = start
        self.end = end
        self.isgroup = False  # Set on pop

    def __str__(self):
        """Render the group as a query fragment.

        Falsy members (for example whitespace markers or empty strings) do
        not count when deciding how to render:

        * no truthy member   -> empty string
        * one truthy member  -> that member alone, without delimiters
        * otherwise          -> ``start`` + every member (falsy ones
          included, so whitespace is kept) + ``end``

        A delimiter that is ``None`` is rendered as an empty string instead
        of the literal text ``None``.
        """
        significant = [item for item in self if item]
        if not significant:
            return ''
        if len(significant) == 1:
            return str(significant[0])
        # Otherwise, also print whitespace
        return '%s%s%s' % (
            '' if self.start is None else self.start,
            ''.join(str(item) for item in self),
            '' if self.end is None else self.end,
        )
|
|
|
|
|
|
class Quote(Group):
    """Group holding the tokens that follow a double quote character."""

    def __str__(self):
        """Render the quote as ``start`` + members + ``end``.

        When ``end`` is empty (no finishing quote) and the members include a
        ``Whitespace`` marker, the output is additionally wrapped in
        parentheses so the whitespace-separated words stay grouped.

        Rendering does not modify ``self.start`` / ``self.end``; the adjusted
        delimiters are computed locally, so calling ``str()`` has no side
        effects on the object.
        """
        start, end = self.start, self.end
        if not end and any(isinstance(item, Whitespace) for item in self):
            # No finishing quote, we have to add new group if there is
            # whitespace
            start = '(%s' % start
            end = ')'
        return '%s%s%s' % (
            start,
            ''.join(str(item) for item in self),
            end,
        )
|
|
|
|
|
|
class Range(Group):
    """Group rendering a range expression of the form ``<start>a TO b<end>``."""

    def __str__(self):
        """Render the range.

        * empty range            -> empty string
        * no ``'TO'`` member     -> not a valid range: all members are
          emitted between backslash-escaped delimiters
        * otherwise              -> the members before and after the first
          ``'TO'`` (``Whitespace`` markers removed) become the lower and
          upper bound

        A bound that is missing, or that consists only of whitespace
        (e.g. ``[5 TO ]``), is rendered as ``*``.
        """
        if not self:
            return ''
        if 'TO' not in self:
            # Not valid range, quote
            return '\\%s%s\\%s' % (
                self.start,
                ''.join(str(item) for item in self),
                self.end,
            )
        # split on 'TO'
        split = self.index('TO')
        # ``or '*'`` covers both "nothing on that side" and "only whitespace
        # on that side"; either way the bound is open.
        first = ''.join(
            str(item) for item in self[:split]
            if not isinstance(item, Whitespace)) or '*'
        last = ''.join(
            str(item) for item in self[split + 1:]
            if not isinstance(item, Whitespace)) or '*'
        return '%s%s TO %s%s' % (self.start, first, last, self.end)
|
|
|
|
|
|
class Stack(list):
    """Stack of token containers.

    Element 0 is the root container, created on construction; the last
    element is the container currently being filled.
    """

    def __init__(self):
        # Start with the (empty) root container as the only entry.
        root = []
        self.append(root)

    @property
    def current(self):
        """Return the container on top of the stack."""
        top = len(self) - 1
        return self[top]

    def add(self, item):
        """Append ``item`` to the current container, then push it on top."""
        parent = self.current
        parent.append(item)
        self.append(item)

    def __str__(self):
        """Concatenate the string form of every member of the root container."""
        root = self[0]
        return ''.join(map(str, root))
|
|
|
|
|
|
def quote(term, textfield=False):
    """Return ``term`` with query syntax characters escaped where needed.

    The term is stripped, split with ``query_tokenizer`` and the tokens are
    collected on a ``Stack`` of ``Group`` / ``Quote`` / ``Range`` containers;
    the result is the string form of that stack.

    Args:
        term: the query text. On Python 2 a ``unicode`` value is encoded to
            UTF-8 first; on Python 3 a ``bytes`` value is decoded from UTF-8
            so that it can be matched by the text pattern.
        textfield: when true, a ``+`` or ``-`` that directly follows a text
            token (and precedes text or a double quote) is escaped instead of
            being kept as an operator.

    Returns:
        The processed query as a native string.
    """
    if str is bytes:
        # Python 2: operate on UTF-8 encoded byte strings.
        if isinstance(term, unicode):  # noqa: F821 -- Python 2 builtin
            term = term.encode('utf-8')
    elif isinstance(term, bytes):
        # Python 3: ``unicode`` does not exist and the tokenizer is a text
        # pattern, so bytes have to be decoded before matching.
        term = term.decode('utf-8')
    stack = Stack()
    tokens = query_tokenizer.findall(term.strip())
    # Counter enables lookahead
    i = 0
    stop = len(tokens)
    while i < stop:
        whitespace, text, grouping, special = tokens[i]

        if whitespace:
            # Add whitespace if group text, range and group filter on display
            if isinstance(stack.current, Group):
                stack.current.append(Whitespace())
            elif isinstance(stack.current, list):
                # We have whitespace with no grouping, insert group
                new = Group('(', ')')
                new.extend(stack.current)
                new.append(Whitespace())
                stack.current[:] = []
                stack.add(new)

        elif grouping:
            # [] (inclusive range), {} (exclusive range), always with TO inside
            # () group
            # "" for quotes
            if grouping == '"':
                if isinstance(stack.current, Quote):
                    # Handle empty double quote
                    if not stack.current:
                        stack.current.end = '\\"'
                    else:
                        stack.current.start = stack.current.end = '"'
                        stack.current.isgroup = True
                    stack.pop()
                else:
                    # Right now this is just a single quote,
                    # we set proper start and end before popping
                    new = Quote(start='\\"', end='')
                    stack.add(new)
            elif isinstance(stack.current, Quote):
                # If we're in a quote, escape and print
                stack.current.append('\\%s' % grouping)
            elif grouping in '[{':
                new = Range(start=grouping, end={'[': ']', '{': '}'}[grouping])
                stack.add(new)
            elif grouping == '(':
                new = Group(start='(', end=')')
                stack.add(new)
            elif grouping in ']})':
                if (isinstance(stack.current, Group)
                        and stack.current.end == grouping):
                    # Closing delimiter matches the open container
                    stack.current.isgroup = True
                    stack.pop()
                else:
                    # Unbalanced closing delimiter, escape it
                    stack.current.append('\\%s' % grouping)

        elif text:
            stack.current.append(text)

        elif special:
            if special == '\\':
                # Inspect next to see if it's quoted special or quoted group
                if (i + 1) < stop:
                    _, _, g2, s2 = tokens[i + 1]
                    if s2:
                        stack.current.append('%s%s' % (special, s2))
                        # Jump ahead
                        i += 1
                    elif g2:
                        stack.current.append('%s%s' % (special, g2))
                        # Jump ahead
                        i += 1
                    else:
                        # Quote it
                        stack.current.append('\\%s' % special)
                else:
                    # Quote it
                    stack.current.append('\\\\')
            elif isinstance(stack.current, Quote):
                stack.current.append('\\%s' % special)
            elif special in '+-':
                # NOTE(review): when '+' / '-' is the very last token nothing
                # is appended, i.e. the character is dropped from the output.
                # Kept as-is; confirm that this is intentional.
                if (i + 1) < stop:
                    _, t2, g2, _ = tokens[i + 1]
                    # We allow + and - in front of phrase and text
                    if t2 or g2 == '"':
                        if textfield and i > 0 and tokens[i - 1][1]:
                            # Quote intra-word hyphens, so they are normal text
                            # and not syntax
                            stack.current.append('\\%s' % special)
                        else:
                            stack.current.append(special)
                    else:
                        # Quote it
                        stack.current.append('\\%s' % special)
            elif special in '~^':
                # Fuzzy or proximity is always after a term or phrase, and
                # sometimes before int or float like roam~0.8 or
                # "jakarta apache"~10
                if i > 0:
                    _, t0, g0, _ = tokens[i - 1]
                    if t0 or g0 == '"':
                        # Look ahead to check for integer or float
                        if (i + 1) < stop:
                            _, t2, _, _ = tokens[i + 1]
                            try:  # float(t2) might fail
                                if t2 and float(t2):
                                    stack.current.append(
                                        '%s%s' % (special, t2))
                                    # Jump ahead
                                    i += 1
                                else:
                                    stack.current.append(special)
                            except ValueError:
                                stack.current.append(special)
                        else:  # (i+1)<stop
                            stack.current.append(special)
                    else:  # t0 or g0 == '"'
                        stack.current.append('\\%s' % special)
                else:  # i>0
                    stack.current.append('\\%s' % special)
            elif special in '?*':
                # ? and * can not be the first characters of a search.
                # They are kept only after a plain string item that is not
                # the same wildcard again, or anywhere inside a range;
                # otherwise the wildcard is left out of the output.
                follows_text = (
                    stack.current
                    and not getattr(stack.current[-1], 'isgroup', False)
                    and isinstance(stack.current[-1], str)
                    and stack.current[-1] not in special
                )
                if follows_text or isinstance(stack.current, Range):
                    stack.current.append(special)
            elif isinstance(stack.current, Range):
                # Remaining specials are kept verbatim inside a range
                stack.current.append(special)
            elif isinstance(stack.current, Group):
                stack.current.append('\\%s' % special)
            elif isinstance(stack.current, list):
                stack.current.append('\\%s' % special)
        i += 1
    return str(stack)
|