Implement fuzzy search (w/o a config UI)
This commit is contained in:
parent
dfb947682b
commit
b512e1faa0
|
@ -7,6 +7,7 @@ from collective.solr.queryparser import quote
|
|||
from collective.solr.utils import isSimpleTerm
|
||||
from collective.solr.utils import isSimpleSearch
|
||||
from collective.solr.utils import isWildCard
|
||||
from collective.solr.utils import splitSimpleSearch
|
||||
from collective.solr.utils import prepare_wildcard
|
||||
|
||||
|
||||
|
@ -37,6 +38,53 @@ def iso8601date(value):
|
|||
return value
|
||||
|
||||
|
||||
def makeSimpleExpressions(term, levenstein_distance):
    """Build the query sub-expressions for one token of a simple search.

    ``term`` is a single token (a word, a wildcard word or a quoted
    phrase).  When ``levenstein_distance`` is truthy, a
    ``~<levenstein_distance>`` suffix is attached to the term where
    noted below; otherwise no suffix is added.

    Returns a ``(value, base_value)`` tuple, both wrapped in parentheses:

    * term containing a double quote: both elements are the term
      followed by the suffix;
    * term for which ``isWildCard`` is true: ``value`` is
      ``prepare_wildcard(term)`` and ``base_value`` is ``quote()`` of
      the term with every ``*`` and ``?`` removed;
    * any other term: ``value`` is ``<prepared>* OR <term><suffix>``
      and ``base_value`` is the term unchanged.
    """
    fuzzy_suffix = '~%s' % levenstein_distance if levenstein_distance else ''

    if '"' in term:
        # quoted literals: the same expression serves as value and base
        literal = '%s%s' % (term, fuzzy_suffix)
        return '(%s)' % literal, '(%s)' % literal

    if isWildCard(term):
        prepared = prepare_wildcard(term)
        without_wildcards = term.replace('*', '').replace('?', '')
        return '(%s)' % prepared, '(%s)' % quote(without_wildcards)

    # plain word: prefix match OR the word itself (with optional suffix)
    prefix_or_word = '%s* OR %s%s' % (
        prepare_wildcard(term), term, fuzzy_suffix)
    return '(%s)' % prefix_or_word, '(%s)' % term
|
||||
|
||||
|
||||
def mangleSearchableText(value, config):
    """Rewrite a ``SearchableText`` query value.

    ``config`` is read for two optional attributes: ``search_pattern``
    (default ``''``) and ``levenstein_distance`` (default ``0``).

    A value that ``isSimpleSearch`` rejects is returned unchanged.
    Otherwise the value is tokenized with ``splitSimpleSearch``, each
    token is expanded by ``makeSimpleExpressions`` and the resulting
    expressions are joined with single spaces.

    With a non-empty ``search_pattern`` the pattern is formatted using
    ``value`` (passed through ``quote``) and ``base_value`` and the
    result is returned as a one-element ``set``; without a pattern the
    joined value expression is returned as a plain string.
    """
    search_pattern = getattr(config, 'search_pattern', '')
    distance = getattr(config, 'levenstein_distance', 0)

    if not isSimpleSearch(value):
        return value

    # one (value, base_value) pair per token, in query order
    pairs = [makeSimpleExpressions(token, distance)
             for token in splitSimpleSearch(value)]
    joined_base = ' '.join([pair[1] for pair in pairs])
    joined_value = ' '.join([pair[0] for pair in pairs])

    if not search_pattern:
        return joined_value

    formatted = search_pattern.format(value=quote(joined_value),
                                      base_value=joined_base)
    # returned inside a set: add literal query parameter
    return set([formatted])
|
||||
|
||||
|
||||
def mangleQuery(keywords, config, schema):
|
||||
""" translate / mangle query parameters to replace zope specifics
|
||||
with equivalent constructs for solr """
|
||||
|
@ -76,24 +124,8 @@ def mangleQuery(keywords, config, schema):
|
|||
for key, value in keywords.items():
|
||||
args = extras.get(key, {})
|
||||
if key == 'SearchableText':
|
||||
pattern = getattr(config, 'search_pattern', '')
|
||||
simple_term = isSimpleTerm(value)
|
||||
if pattern and isSimpleSearch(value):
|
||||
base_value = value
|
||||
if simple_term: # use prefix/wildcard search
|
||||
value = '(%s* OR %s)' % (prepare_wildcard(value), value)
|
||||
elif isWildCard(value):
|
||||
value = prepare_wildcard(value)
|
||||
base_value = quote(value.replace('*', '').replace('?', ''))
|
||||
# simple queries use custom search pattern
|
||||
value = pattern.format(value=quote(value),
|
||||
base_value=base_value)
|
||||
keywords[key] = set([value]) # add literal query parameter
|
||||
continue
|
||||
elif simple_term: # use prefix/wildcard search
|
||||
keywords[key] = '(%s* OR %s)' % (
|
||||
prepare_wildcard(value), value)
|
||||
continue
|
||||
keywords[key] = mangleSearchableText(value, config)
|
||||
continue
|
||||
if key in epi_indexes:
|
||||
path = keywords['%s_parents' % key] = value
|
||||
del keywords[key]
|
||||
|
|
|
@ -470,7 +470,9 @@ class SolrServerTests(SolrTestCase):
|
|||
# the pattern is applied for multi-word searches
|
||||
response = solrSearchResults(SearchableText='foo bar', Language='all')
|
||||
query = response.responseHeader['params']['q']
|
||||
self.assertEqual(query, '(Title:(foo bar)^5 OR getId:(foo bar))')
|
||||
self.assertEqual(query,
|
||||
'(Title:((foo* OR foo) (bar* OR bar))^5 OR '
|
||||
'getId:((foo* OR foo) (bar* OR bar)))')
|
||||
# extra parameters should be unaffected
|
||||
response = solrSearchResults(SearchableText='"news"', Type='xy', Language='all')
|
||||
query = response.responseHeader['params']['q']
|
||||
|
@ -479,14 +481,14 @@ class SolrServerTests(SolrTestCase):
|
|||
self.config.search_pattern = '(Title:{value} OR getId:{base_value})'
|
||||
response = solrSearchResults(SearchableText='news', Language='all')
|
||||
query = response.responseHeader['params']['q']
|
||||
self.assertEqual(query, '(Title:(news* OR news) OR getId:news)')
|
||||
self.assertEqual(query, '(Title:(news* OR news) OR getId:(news))')
|
||||
# and they handle wildcards as advertised
|
||||
response = solrSearchResults(SearchableText='news*', Language='all')
|
||||
query = response.responseHeader['params']['q']
|
||||
self.assertEqual(query, '(Title:news* OR getId:news)')
|
||||
self.assertEqual(query, '(Title:(news*) OR getId:(news))')
|
||||
response = solrSearchResults(SearchableText='*news*', Language='all')
|
||||
query = response.responseHeader['params']['q']
|
||||
self.assertEqual(query, '(Title:news* OR getId:news)')
|
||||
self.assertEqual(query, '(Title:(news*) OR getId:(news))')
|
||||
|
||||
def testSolrSearchResultsWithDictRequest(self):
|
||||
self.maintenance.reindex()
|
||||
|
|
|
@ -6,7 +6,7 @@ from Testing import ZopeTestCase as ztc
|
|||
from collective.solr.tests.utils import getData
|
||||
from collective.solr.parser import SolrResponse
|
||||
from collective.solr.utils import findObjects, isSimpleTerm, isSimpleSearch
|
||||
from collective.solr.utils import isWildCard
|
||||
from collective.solr.utils import isWildCard, splitSimpleSearch
|
||||
from collective.solr.utils import setupTranslationMap, prepareData
|
||||
from collective.solr.utils import padResults
|
||||
|
||||
|
@ -93,6 +93,13 @@ class UtilsTests(ztc.ZopeTestCase):
|
|||
self.failIf(isSimpleSearch('foo 42 bar11'))
|
||||
self.failUnless(isSimpleSearch('2000 foo'))
|
||||
|
||||
def testSplitSimpleSearch(self):
    """Check tokenizing of simple searches and rejection of other input."""
    # (query, expected tokens): quoted phrases stay a single token
    accepted = (
        ('foo bar', ['foo', 'bar']),
        ('foo "bar foobar" baz', ['foo', '"bar foobar"', 'baz']),
    )
    for query, expected in accepted:
        self.assertEqual(splitSimpleSearch(query), expected)

    # these inputs must be refused with an AssertionError
    for rejected in ('foo AND bar', 'foo42'):
        self.assertRaises(AssertionError, splitSimpleSearch, rejected)
|
||||
|
||||
def testIsWildCard(self):
|
||||
self.failUnless(isWildCard('foo*'))
|
||||
self.failUnless(isWildCard('fo?'))
|
||||
|
|
|
@ -111,6 +111,24 @@ def isSimpleSearch(term):
|
|||
return False
|
||||
|
||||
|
||||
def splitSimpleSearch(term):
    """Break a simple search string into word and phrase tokens.

    Text outside double quotes is split on whitespace into words; each
    non-empty stretch between a pair of double quotes becomes a single
    token that keeps its surrounding quotes.  Tokens are returned as a
    list in the order they occur in ``term``.

    Raises ``AssertionError`` when ``isSimpleSearch(term)`` is false.
    """
    if not isSimpleSearch(term):
        raise AssertionError('term is not a simple search')

    tokens = []
    # Splitting on '"' alternates chunks: even positions lie outside
    # quotes, odd positions lie inside them.
    for position, chunk in enumerate(term.split('"')):
        inside_quotes = position % 2 == 1
        if not inside_quotes:
            tokens.extend(chunk.split())
        elif chunk:
            tokens.append('"%s"' % chunk)
    return tokens
|
||||
|
||||
|
||||
wildCard = compile(r'^[\w\d\s*?]*[*?]+[\w\d\s*?]*$', UNICODE)
|
||||
def isWildCard(term):
|
||||
if isinstance(term, str):
|
||||
|
|
Loading…
Reference in New Issue