Implement fuzzy search (w/o a config UI)

This commit is contained in:
Carsten Senger 2013-07-04 15:34:32 +02:00
parent dfb947682b
commit b512e1faa0
4 changed files with 82 additions and 23 deletions

View File

@ -7,6 +7,7 @@ from collective.solr.queryparser import quote
from collective.solr.utils import isSimpleTerm
from collective.solr.utils import isSimpleSearch
from collective.solr.utils import isWildCard
from collective.solr.utils import splitSimpleSearch
from collective.solr.utils import prepare_wildcard
@ -37,6 +38,53 @@ def iso8601date(value):
return value
def makeSimpleExpressions(term, levenstein_distance):
    """Build the search expressions for one token of a simple query.

    ``term`` is a single word, wildcard word or double-quoted phrase.
    ``levenstein_distance`` is the fuzzy (Levenshtein) distance; when it
    is truthy, ``~<distance>`` is appended to the quoted and plain forms.

    Returns a ``(value, base_value)`` tuple of strings, each wrapped in
    parentheses:

    * quoted phrase -- both elements are the phrase plus the optional
      ``~<distance>`` suffix;
    * wildcard term -- ``value`` is ``prepare_wildcard(term)`` and
      ``base_value`` is the quoted term with ``*`` and ``?`` removed;
    * plain term -- ``value`` is a prefix search OR'ed with the term plus
      the optional suffix, ``base_value`` is the unmodified term.
    """
    fuzzy_suffix = '~%s' % levenstein_distance if levenstein_distance else ''

    if '"' in term:
        # Quoted literal: the same expression serves as value and base.
        # NOTE(review): in Lucene query syntax a ``~N`` after a quoted
        # phrase appears to mean proximity slop rather than edit
        # distance -- confirm this is the intended behaviour.
        phrase = '%s%s' % (term, fuzzy_suffix)
        return '(%s)' % phrase, '(%s)' % phrase

    if isWildCard(term):
        # The user supplied wildcards: no fuzzy suffix is added, and the
        # base value is the term with the wildcard characters stripped.
        wildcard_expr = prepare_wildcard(term)
        bare_term = quote(term.replace('*', '').replace('?', ''))
        return '(%s)' % wildcard_expr, '(%s)' % bare_term

    # Plain word: match it as a prefix or as the (optionally fuzzy) word.
    prefix_or_word = '%s* OR %s%s' % (prepare_wildcard(term), term,
                                      fuzzy_suffix)
    return '(%s)' % prefix_or_word, '(%s)' % term
def mangleSearchableText(value, config):
    """Rewrite a ``SearchableText`` query value for simple searches.

    ``config`` is read for two optional attributes: ``search_pattern``
    (default ``''``) and ``levenstein_distance`` (default ``0``).

    If ``value`` is not a simple search it is returned unchanged.
    Otherwise every token from ``splitSimpleSearch`` is expanded with
    ``makeSimpleExpressions`` and the results are joined with a single
    space. Without a search pattern that joined string is returned; with
    one, the pattern is formatted with ``value`` and ``base_value`` and
    returned as a one-element set.
    """
    search_pattern = getattr(config, 'search_pattern', '')
    fuzzy_distance = getattr(config, 'levenstein_distance', 0)

    if not isSimpleSearch(value):
        return value

    # One (value, base_value) pair per token of the query.
    expressions = [makeSimpleExpressions(token, fuzzy_distance)
                   for token in splitSimpleSearch(value)]
    base_query = ' '.join([base_expr for _, base_expr in expressions])
    expanded_query = ' '.join([expr for expr, _ in expressions])

    if not search_pattern:
        return expanded_query

    formatted = search_pattern.format(value=quote(expanded_query),
                                      base_value=base_query)
    return set([formatted])  # add literal query parameter
def mangleQuery(keywords, config, schema):
""" translate / mangle query parameters to replace zope specifics
with equivalent constructs for solr """
@ -76,24 +124,8 @@ def mangleQuery(keywords, config, schema):
for key, value in keywords.items():
args = extras.get(key, {})
if key == 'SearchableText':
pattern = getattr(config, 'search_pattern', '')
simple_term = isSimpleTerm(value)
if pattern and isSimpleSearch(value):
base_value = value
if simple_term: # use prefix/wildcard search
value = '(%s* OR %s)' % (prepare_wildcard(value), value)
elif isWildCard(value):
value = prepare_wildcard(value)
base_value = quote(value.replace('*', '').replace('?', ''))
# simple queries use custom search pattern
value = pattern.format(value=quote(value),
base_value=base_value)
keywords[key] = set([value]) # add literal query parameter
continue
elif simple_term: # use prefix/wildcard search
keywords[key] = '(%s* OR %s)' % (
prepare_wildcard(value), value)
continue
keywords[key] = mangleSearchableText(value, config)
continue
if key in epi_indexes:
path = keywords['%s_parents' % key] = value
del keywords[key]

View File

@ -470,7 +470,9 @@ class SolrServerTests(SolrTestCase):
# the pattern is applied for multi-word searches
response = solrSearchResults(SearchableText='foo bar', Language='all')
query = response.responseHeader['params']['q']
self.assertEqual(query, '(Title:(foo bar)^5 OR getId:(foo bar))')
self.assertEqual(query,
'(Title:((foo* OR foo) (bar* OR bar))^5 OR '
'getId:((foo* OR foo) (bar* OR bar)))')
# extra parameters should be unaffected
response = solrSearchResults(SearchableText='"news"', Type='xy', Language='all')
query = response.responseHeader['params']['q']
@ -479,14 +481,14 @@ class SolrServerTests(SolrTestCase):
self.config.search_pattern = '(Title:{value} OR getId:{base_value})'
response = solrSearchResults(SearchableText='news', Language='all')
query = response.responseHeader['params']['q']
self.assertEqual(query, '(Title:(news* OR news) OR getId:news)')
self.assertEqual(query, '(Title:(news* OR news) OR getId:(news))')
# and they handle wildcards as advertised
response = solrSearchResults(SearchableText='news*', Language='all')
query = response.responseHeader['params']['q']
self.assertEqual(query, '(Title:news* OR getId:news)')
self.assertEqual(query, '(Title:(news*) OR getId:(news))')
response = solrSearchResults(SearchableText='*news*', Language='all')
query = response.responseHeader['params']['q']
self.assertEqual(query, '(Title:news* OR getId:news)')
self.assertEqual(query, '(Title:(news*) OR getId:(news))')
def testSolrSearchResultsWithDictRequest(self):
self.maintenance.reindex()

View File

@ -6,7 +6,7 @@ from Testing import ZopeTestCase as ztc
from collective.solr.tests.utils import getData
from collective.solr.parser import SolrResponse
from collective.solr.utils import findObjects, isSimpleTerm, isSimpleSearch
from collective.solr.utils import isWildCard
from collective.solr.utils import isWildCard, splitSimpleSearch
from collective.solr.utils import setupTranslationMap, prepareData
from collective.solr.utils import padResults
@ -93,6 +93,13 @@ class UtilsTests(ztc.ZopeTestCase):
self.failIf(isSimpleSearch('foo 42 bar11'))
self.failUnless(isSimpleSearch('2000 foo'))
def testSplitSimpleSearch(self):
    # Simple searches are split into words and double-quoted phrases.
    accepted = (
        ('foo bar', ['foo', 'bar']),
        ('foo "bar foobar" baz', ['foo', '"bar foobar"', 'baz']),
    )
    for query, expected_tokens in accepted:
        self.assertEqual(splitSimpleSearch(query), expected_tokens)

    # Input that is not a simple search must be refused.
    for query in ('foo AND bar', 'foo42'):
        self.assertRaises(AssertionError, splitSimpleSearch, query)
def testIsWildCard(self):
self.failUnless(isWildCard('foo*'))
self.failUnless(isWildCard('fo?'))

View File

@ -111,6 +111,24 @@ def isSimpleSearch(term):
return False
def splitSimpleSearch(term):
    """Split a simple search term into tokens (words and phrases).

    Text outside double quotes is split on whitespace into words; each
    non-empty run of text between double quotes is kept as one token,
    with the surrounding quotes restored.

    Returns a list of token strings in their original order.
    Raises ``AssertionError`` if ``isSimpleSearch(term)`` is false.
    """
    if not isSimpleSearch(term):
        raise AssertionError('term is not a simple search')

    tokens = []
    # Splitting on the quote character leaves the text outside quotes at
    # even positions and the text inside quotes at odd positions.
    for position, part in enumerate(term.split('"')):
        if position % 2 == 0:
            # split() without a separator never yields empty strings, so
            # no extra filtering is needed.
            tokens.extend(part.split())
        elif part:
            # Skip empty phrases such as '""'.
            tokens.append('"%s"' % part)
    return tokens
# Matches a term made up solely of word characters, digits, whitespace,
# `*` and `?` that contains at least one `*` or `?`.
wildCard = compile(r'^[\w\d\s*?]*[*?]+[\w\d\s*?]*$', UNICODE)
def isWildCard(term):
if isinstance(term, str):