# coding=utf-8 # Copyright 2007 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. from itertools import chain from whoosh.compat import next, xrange from whoosh.analysis.acore import Composable from whoosh.util.text import rcompile # Default list of stop words (words so common it's usually wasteful to index # them). This list is used by the StopFilter class, which allows you to supply # an optional list to override this one. STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may', 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this', 'to', 'us', 'we', 'when', 'will', 'with', 'yet', 'you', 'your')) # Simple pattern for filtering URLs, may be useful url_pattern = rcompile(""" ( [A-Za-z+]+:// # URL protocol \\S+? # URL body (?=\\s|[.]\\s|$|[.]$) # Stop at space/end, or a dot followed by space/end ) | ( # or... \w+([:.]?\w+)* # word characters, with opt. internal colons/dots ) """, verbose=True) # Filters class Filter(Composable): """Base class for Filter objects. A Filter subclass must implement a filter() method that takes a single argument, which is an iterator of Token objects, and yield a series of Token objects in return. Filters that do morphological transformation of tokens (e.g. stemming) should set their ``is_morph`` attribute to True. """ def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.__dict__ == other.__dict__) def __ne__(self, other): return not self == other def __call__(self, tokens): raise NotImplementedError class PassFilter(Filter): """An identity filter: passes the tokens through untouched. """ def __call__(self, tokens): return tokens class LoggingFilter(Filter): """Prints the contents of every filter that passes through as a debug log entry. """ def __init__(self, logger=None): """ :param target: the logger to use. If omitted, the "whoosh.analysis" logger is used. """ if logger is None: import logging logger = logging.getLogger("whoosh.analysis") self.logger = logger def __call__(self, tokens): logger = self.logger for t in tokens: logger.debug(repr(t)) yield t class MultiFilter(Filter): """Chooses one of two or more sub-filters based on the 'mode' attribute of the token stream. """ default_filter = PassFilter() def __init__(self, **kwargs): """Use keyword arguments to associate mode attribute values with instantiated filters. >>> iwf_for_index = IntraWordFilter(mergewords=True, mergenums=False) >>> iwf_for_query = IntraWordFilter(mergewords=False, mergenums=False) >>> mf = MultiFilter(index=iwf_for_index, query=iwf_for_query) This class expects that the value of the mode attribute is consistent among all tokens in a token stream. """ self.filters = kwargs def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.filters == other.filters) def __call__(self, tokens): # Only selects on the first token t = next(tokens) filter = self.filters.get(t.mode, self.default_filter) return filter(chain([t], tokens)) class TeeFilter(Filter): """Interleaves the results of two or more filters (or filter chains). NOTE: because it needs to create copies of each token for each sub-filter, this filter is quite slow. >>> target = "ALFA BRAVO CHARLIE" >>> # In one branch, we'll lower-case the tokens >>> f1 = LowercaseFilter() >>> # In the other branch, we'll reverse the tokens >>> f2 = ReverseTextFilter() >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2) >>> [token.text for token in ana(target)] ["alfa", "AFLA", "bravo", "OVARB", "charlie", "EILRAHC"] To combine the incoming token stream with the output of a filter chain, use ``TeeFilter`` and make one of the filters a :class:`PassFilter`. >>> f1 = PassFilter() >>> f2 = BiWordFilter() >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2) | LowercaseFilter() >>> [token.text for token in ana(target)] ["alfa", "alfa-bravo", "bravo", "bravo-charlie", "charlie"] """ def __init__(self, *filters): if len(filters) < 2: raise Exception("TeeFilter requires two or more filters") self.filters = filters def __eq__(self, other): return (self.__class__ is other.__class__ and self.filters == other.fitlers) def __call__(self, tokens): from itertools import tee count = len(self.filters) # Tee the token iterator and wrap each teed iterator with the # corresponding filter gens = [filter(t.copy() for t in gen) for filter, gen in zip(self.filters, tee(tokens, count))] # Keep a count of the number of running iterators running = count while running: for i, gen in enumerate(gens): if gen is not None: try: yield next(gen) except StopIteration: gens[i] = None running -= 1 class ReverseTextFilter(Filter): """Reverses the text of each token. >>> ana = RegexTokenizer() | ReverseTextFilter() >>> [token.text for token in ana("hello there")] ["olleh", "ereht"] """ def __call__(self, tokens): for t in tokens: t.text = t.text[::-1] yield t class LowercaseFilter(Filter): """Uses unicode.lower() to lowercase token text. >>> rext = RegexTokenizer() >>> stream = rext("This is a TEST") >>> [token.text for token in LowercaseFilter(stream)] ["this", "is", "a", "test"] """ def __call__(self, tokens): for t in tokens: t.text = t.text.lower() yield t class StripFilter(Filter): """Calls unicode.strip() on the token text. """ def __call__(self, tokens): for t in tokens: t.text = t.text.strip() yield t class StopFilter(Filter): """Marks "stop" words (words too common to index) in the stream (and by default removes them). Make sure you precede this filter with a :class:`LowercaseFilter`. >>> stopper = RegexTokenizer() | StopFilter() >>> [token.text for token in stopper(u"this is a test")] ["test"] >>> es_stopper = RegexTokenizer() | StopFilter(lang="es") >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")] ["lapiz", "mesa"] The list of available languages is in `whoosh.lang.languages`. You can use :func:`whoosh.lang.has_stopwords` to check if a given language has a stop word list available. """ def __init__(self, stoplist=STOP_WORDS, minsize=2, maxsize=None, renumber=True, lang=None): """ :param stoplist: A collection of words to remove from the stream. This is converted to a frozenset. The default is a list of common English stop words. :param minsize: The minimum length of token texts. Tokens with text smaller than this will be stopped. The default is 2. :param maxsize: The maximum length of token texts. Tokens with text larger than this will be stopped. Use None to allow any length. :param renumber: Change the 'pos' attribute of unstopped tokens to reflect their position with the stopped words removed. :param lang: Automatically get a list of stop words for the given language """ stops = set() if stoplist: stops.update(stoplist) if lang: from whoosh.lang import stopwords_for_language stops.update(stopwords_for_language(lang)) self.stops = frozenset(stops) self.min = minsize self.max = maxsize self.renumber = renumber def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.stops == other.stops and self.min == other.min and self.renumber == other.renumber) def __call__(self, tokens): stoplist = self.stops minsize = self.min maxsize = self.max renumber = self.renumber pos = None for t in tokens: text = t.text if (len(text) >= minsize and (maxsize is None or len(text) <= maxsize) and text not in stoplist): # This is not a stop word if renumber and t.positions: if pos is None: pos = t.pos else: pos += 1 t.pos = pos t.stopped = False yield t else: # This is a stop word if not t.removestops: # This IS a stop word, but we're not removing them t.stopped = True yield t class CharsetFilter(Filter): """Translates the text of tokens by calling unicode.translate() using the supplied character mapping object. This is useful for case and accent folding. The ``whoosh.support.charset`` module has a useful map for accent folding. >>> from whoosh.support.charset import accent_map >>> retokenizer = RegexTokenizer() >>> chfilter = CharsetFilter(accent_map) >>> [t.text for t in chfilter(retokenizer(u'café'))] [u'cafe'] Another way to get a character mapping object is to convert a Sphinx charset table file using :func:`whoosh.support.charset.charset_table_to_dict`. >>> from whoosh.support.charset import charset_table_to_dict >>> from whoosh.support.charset import default_charset >>> retokenizer = RegexTokenizer() >>> charmap = charset_table_to_dict(default_charset) >>> chfilter = CharsetFilter(charmap) >>> [t.text for t in chfilter(retokenizer(u'Stra\\xdfe'))] [u'strase'] The Sphinx charset table format is described at http://www.sphinxsearch.com/docs/current.html#conf-charset-table. """ __inittypes__ = dict(charmap=dict) def __init__(self, charmap): """ :param charmap: a dictionary mapping from integer character numbers to unicode characters, as required by the unicode.translate() method. """ self.charmap = charmap def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.charmap == other.charmap) def __call__(self, tokens): assert hasattr(tokens, "__iter__") charmap = self.charmap for t in tokens: t.text = t.text.translate(charmap) yield t class DelimitedAttributeFilter(Filter): """Looks for delimiter characters in the text of each token and stores the data after the delimiter in a named attribute on the token. The defaults are set up to use the ``^`` character as a delimiter and store the value after the ``^`` as the boost for the token. >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost") >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter() >>> for t in ana(u("image render^2 file^0.5")) ... print("%r %f" % (t.text, t.boost)) 'image' 1.0 'render' 2.0 'file' 0.5 Note that you need to make sure your tokenizer includes the delimiter and data as part of the token! """ def __init__(self, delimiter="^", attribute="boost", default=1.0, type=float): """ :param delimiter: a string that, when present in a token's text, separates the actual text from the "data" payload. :param attribute: the name of the attribute in which to store the data on the token. :param default: the value to use for the attribute for tokens that don't have delimited data. :param type: the type of the data, for example ``str`` or ``float``. This is used to convert the string value of the data before storing it in the attribute. """ self.delim = delimiter self.attr = attribute self.default = default self.type = type def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.delim == other.delim and self.attr == other.attr and self.default == other.default) def __call__(self, tokens): delim = self.delim attr = self.attr default = self.default type_ = self.type for t in tokens: text = t.text pos = text.find(delim) if pos > -1: setattr(t, attr, type_(text[pos + 1:])) if t.chars: t.endchar -= len(t.text) - pos t.text = text[:pos] else: setattr(t, attr, default) yield t class SubstitutionFilter(Filter): """Performs a regular expression substitution on the token text. This is especially useful for removing text from tokens, for example hyphens:: ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("-", "") Because it has the full power of the re.sub() method behind it, this filter can perform some fairly complex transformations. For example, to take tokens like ``'a=b', 'c=d', 'e=f'`` and change them to ``'b=a', 'd=c', 'f=e'``:: # Analyzer that swaps the text on either side of an equal sign rt = RegexTokenizer(r"\\S+") sf = SubstitutionFilter("([^/]*)/(./*)", r"\\2/\\1") ana = rt | sf """ def __init__(self, pattern, replacement): """ :param pattern: a pattern string or compiled regular expression object describing the text to replace. :param replacement: the substitution text. """ self.pattern = rcompile(pattern) self.replacement = replacement def __eq__(self, other): return (other and self.__class__ is other.__class__ and self.pattern == other.pattern and self.replacement == other.replacement) def __call__(self, tokens): pattern = self.pattern replacement = self.replacement for t in tokens: t.text = pattern.sub(replacement, t.text) yield t