collective.solr/src/collective/solr/queryparser.py

from re import compile

# Solr/lucene reserved characters/terms: + - && || ! ( ) { } [ ] ^ " ~ * ? : \
# (see http://wiki.apache.org/solr/SolrQuerySyntax)
# Four groups for tokenizer:
# 1) Whitespace (\s+)
# 2) Any non reserved characters (normal text) ([^(){}\[\]+\-!^\"~*?:\\\\\s]+)
# 3) Any grouping characters ([(){}[\]\"])
# 4) Any special operators ([+\-!^~*?:\\])
# Raw string literal: every backslash below belongs to the regular expression
# itself, so the pattern does not rely on Python passing unknown string
# escapes (such as "\s" or "\-") through unchanged.
# Each match yields a 4-tuple (whitespace, text, grouping, special) in which
# exactly one member is non-empty.
query_tokenizer = compile(
    r'(?:(\s+)|([^(){}\[\]+\-!^"~*?:\\\s]+)|([(){}\[\]"])|([+\-!^~*?:\\]))'
)


class Whitespace(object):
    """Marker token standing for whitespace inside a parsed query.

    Instances are always falsy, so a plain truth test (as used by
    ``Group.__str__``) skips them, and they render as a single space.
    """

    def __nonzero__(self):
        # Python 2 truth-value hook.
        return False

    # Python 3 consults ``__bool__`` instead of ``__nonzero__``; without this
    # alias instances would be truthy there and no longer be filtered out.
    __bool__ = __nonzero__

    def __str__(self):
        return ' '


class Group(list):
    """List of query tokens enclosed by a ``start`` and an ``end`` delimiter."""

    def __init__(self, start=None, end=None):
        self.start = start
        self.end = end
        # Flipped to True by the parser when the group is closed and popped.
        self.isgroup = False

    def __str__(self):
        # Only truthy members count when deciding how to render.
        meaningful = [member for member in self if member]
        if not meaningful:
            return ''
        if len(meaningful) == 1:
            # A lone member needs no delimiters around it.
            return str(meaningful[0])
        # Several members: render everything, falsy members included,
        # between the delimiters.
        body = ''.join(str(member) for member in self)
        return '%s%s%s' % (self.start, body, self.end)


class Quote(Group):
    """Group collecting the tokens that follow an opening double quote."""

    def __str__(self):
        # An empty ``end`` means no finishing quote was recorded.  If the
        # content contains whitespace, wrap it in parentheses (updating the
        # delimiters in place) so it is still rendered as one group.
        unterminated = not self.end
        if unterminated and any(isinstance(tok, Whitespace) for tok in self):
            self.start = '(%s' % self.start
            self.end = ')'
        content = ''.join(str(tok) for tok in self)
        return '%s%s%s' % (self.start, content, self.end)


class Range(Group):
    """Group for ``[a TO b]`` / ``{a TO b}`` range expressions.

    Rendering rules:

    * an empty range renders as the empty string;
    * content without a ``TO`` token is not a valid range, so the content is
      emitted with both delimiters backslash-escaped;
    * otherwise the tokens before and after the first ``TO`` become the lower
      and upper bound, with whitespace tokens removed.  A bound that is
      missing *or* consists only of whitespace is rendered as ``*``.
    """

    @staticmethod
    def _bound(tokens):
        """Join ``tokens`` without whitespace, falling back to ``*``."""
        joined = ''.join([
            str(x) for x in tokens
            if not isinstance(x, Whitespace)])
        # An empty bound (no tokens, or whitespace only) means "open ended".
        return joined or '*'

    def __str__(self):
        if len(self) == 0:
            return ''
        if 'TO' not in self:
            # Not a valid range: escape the delimiters.
            return '\\%s%s\\%s' % (
                self.start,
                ''.join([str(x) for x in self]),
                self.end)
        # Split on the first 'TO' token.
        split = self.index('TO')
        first = self._bound(self[:split])
        last = self._bound(self[split + 1:])
        return '%s%s TO %s%s' % (self.start, first, last, self.end)


class Stack(list):
    """Stack of nested token containers; the bottom entry is the root list."""

    def __init__(self):
        # The root container that collects the top-level tokens.
        self.append([])

    @property
    def current(self):
        """The container on top of the stack."""
        return self[-1]

    def add(self, item):
        """Append ``item`` to the current container, then make it current."""
        parent = self[-1]
        parent.append(item)
        self.append(item)

    def __str__(self):
        # Only the root is rendered; nested containers are reached through it.
        return ''.join(map(str, self[0]))


def quote(term, textfield=False):
    """Quote ``term`` for use in a Solr/Lucene query string.

    The term is split with ``query_tokenizer`` and rebuilt on a ``Stack`` of
    ``Group``/``Quote``/``Range`` containers.  Reserved characters found in a
    position where the parser accepts them as query syntax are kept; the
    others are backslash-escaped or, in a few cases noted below, dropped.

    :param term: the raw search text.  On Python 2 a ``unicode`` value is
        encoded to UTF-8; on Python 3 a ``bytes`` value is decoded from UTF-8.
    :param textfield: if true, a ``+`` or ``-`` that directly follows a text
        token (and precedes text or a quote) is escaped, so that intra-word
        hyphens are treated as plain text rather than operators.
    :returns: the rendered ``Stack`` as a string.
    """
    # ``unicode`` only exists on Python 2; referencing it unguarded raises
    # NameError on Python 3.
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = None  # Python 3: ``str`` is already text
    if text_type is not None:
        if isinstance(term, text_type):
            term = term.encode('utf-8')
    elif isinstance(term, bytes):
        # The tokenizer is a text pattern and cannot scan bytes.
        term = term.decode('utf-8')

    stack = Stack()
    tokens = query_tokenizer.findall(term.strip())
    # Index-based loop: some branches look at the neighbouring tokens and may
    # consume the following one.
    i = 0
    stop = len(tokens)
    while i < stop:
        # Exactly one of the four members is non-empty for each token.
        whitespace, text, grouping, special = tokens[i]

        if whitespace:
            if isinstance(stack.current, Group):
                # Inside a group/quote/range: record the whitespace; the
                # container decides how to render it.
                stack.current.append(Whitespace())
            elif isinstance(stack.current, list):
                # Whitespace at the root level: move everything collected so
                # far into a new parenthesised group and continue inside it.
                new = Group('(', ')')
                new.extend(stack.current)
                new.append(Whitespace())
                stack.current[:] = []
                stack.add(new)

        elif grouping:
            # [] (inclusive range), {} (exclusive range), always with TO inside
            # () group
            # "" for quotes
            if grouping == '"':
                if isinstance(stack.current, Quote):
                    if not stack.current:
                        # Empty pair of quotes: keep both quotes escaped.
                        stack.current.end = '\\"'
                    else:
                        # Proper phrase: switch to real quote delimiters.
                        stack.current.start = stack.current.end = '"'
                        stack.current.isgroup = True
                    stack.pop()
                else:
                    # Opening quote.  Until a closing quote is seen it is
                    # treated as a single escaped quote with no end.
                    new = Quote(start='\\"', end='')
                    stack.add(new)
            elif isinstance(stack.current, Quote):
                # Grouping characters inside a quote are escaped literally.
                stack.current.append('\\%s' % grouping)
            elif grouping in '[{':
                new = Range(start=grouping, end={'[': ']', '{': '}'}[grouping])
                stack.add(new)
            elif grouping == '(':
                new = Group(start='(', end=')')
                stack.add(new)
            elif grouping in ']})':
                if (isinstance(stack.current, Group)
                        and stack.current.end == grouping):
                    # Closes the innermost open container.
                    stack.current.isgroup = True
                    stack.pop()
                else:
                    # Stray closing character: escape it.
                    stack.current.append('\\%s' % grouping)

        elif text:
            stack.current.append(text)

        elif special:
            if special == '\\':
                # Inspect next to see if it's quoted special or quoted group
                if (i + 1) < stop:
                    _, _, g2, s2 = tokens[i + 1]
                    if s2:
                        # Already-escaped special character: keep the pair.
                        stack.current.append('%s%s' % (special, s2))
                        # Jump ahead
                        i += 1
                    elif g2:
                        # Already-escaped grouping character: keep the pair.
                        stack.current.append('%s%s' % (special, g2))
                        # Jump ahead
                        i += 1
                    else:
                        # Backslash before text/whitespace: escape it.
                        stack.current.append('\\%s' % special)
                else:
                    # Trailing backslash: escape it.
                    stack.current.append('\\\\')
            elif isinstance(stack.current, Quote):
                # Inside a quote every special character is escaped.
                stack.current.append('\\%s' % special)
            elif special in '+-':
                # NOTE: a '+' or '-' that is the very last token is dropped,
                # since nothing is appended when there is no next token.
                if (i + 1) < stop:
                    _, t2, g2, _ = tokens[i + 1]
                    # We allow + and - in front of phrase and text
                    if t2 or g2 == '"':
                        if textfield and i > 0 and tokens[i - 1][1]:
                            # Quote intra-word hyphens, so they are normal text
                            # and not syntax
                            stack.current.append('\\%s' % special)
                        else:
                            stack.current.append(special)
                    else:
                        # Quote it
                        stack.current.append('\\%s' % special)
            elif special in '~^':
                # Fuzzy or proximity is always after a term or phrase, and
                # sometimes before int or float like roam~0.8 or
                # "jakarta apache"~10
                if i > 0:
                    _, t0, g0, _ = tokens[i - 1]
                    if t0 or g0 == '"':
                        # Look ahead to check for integer or float
                        if (i + 1) < stop:
                            _, t2, _, _ = tokens[i + 1]
                            try:  # float(t2) might fail
                                if t2 and float(t2):
                                    stack.current.append(
                                        '%s%s' % (special, t2))
                                    # Jump ahead
                                    i += 1
                                else:
                                    stack.current.append(special)
                            except ValueError:
                                stack.current.append(special)
                        else:
                            # Operator is the last token.
                            stack.current.append(special)
                    else:
                        # Not preceded by text or a quote: escape it.
                        stack.current.append('\\%s' % special)
                else:
                    # Operator is the first token: escape it.
                    stack.current.append('\\%s' % special)
            elif special in '?*':
                # ? and * can not be the first characters of a search.
                # The wildcard is kept when the previous item in the current
                # container is a string other than the same wildcard, or
                # whenever we are inside a range; otherwise it is dropped.
                current = stack.current
                after_text = (
                    current
                    and not getattr(current[-1], 'isgroup', False)
                    and isinstance(current[-1], str)
                    and current[-1] not in special)
                if after_text or isinstance(current, Range):
                    current.append(special)
            elif isinstance(stack.current, Range):
                # Remaining specials are kept verbatim inside a range.
                stack.current.append(special)
            elif isinstance(stack.current, Group):
                stack.current.append('\\%s' % special)
            elif isinstance(stack.current, list):
                stack.current.append('\\%s' % special)
        i += 1
    return str(stack)