summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Carsten Senger <senger@rehfisch.de> 2013-07-04 13:34:32 (GMT)
committer Carsten Senger <senger@rehfisch.de> 2013-07-04 13:34:32 (GMT)
commit b512e1faa0a1655ce85dff19cb1c81ab276708a9 (patch)
tree 46bb0e9e936dde09f6960d6addf5d091cd26b5bd
parent dfb947682b3b5306a367f683bec17c9d897eb350 (diff)
download collective.solr-b512e1faa0a1655ce85dff19cb1c81ab276708a9.zip
collective.solr-b512e1faa0a1655ce85dff19cb1c81ab276708a9.tar.gz
collective.solr-b512e1faa0a1655ce85dff19cb1c81ab276708a9.tar.bz2
Implement fuzzy search (w/o a config UI)
-rw-r--r-- src/collective/solr/mangler.py 68
-rw-r--r-- src/collective/solr/tests/test_server.py 10
-rw-r--r-- src/collective/solr/tests/test_utils.py 9
-rw-r--r-- src/collective/solr/utils.py 18
4 files changed, 82 insertions, 23 deletions
diff --git a/src/collective/solr/mangler.py b/src/collective/solr/mangler.py
index f601bec..8a44856 100644
--- a/src/collective/solr/mangler.py
+++ b/src/collective/solr/mangler.py
@@ -7,6 +7,7 @@ from collective.solr.queryparser import quote
from collective.solr.utils import isSimpleTerm
from collective.solr.utils import isSimpleSearch
from collective.solr.utils import isWildCard
+from collective.solr.utils import splitSimpleSearch
from collective.solr.utils import prepare_wildcard
@@ -37,6 +38,53 @@ def iso8601date(value):
return value
+def makeSimpleExpressions(term, levenstein_distance):
+    """Build the search expressions for a single token of a simple query.
+
+    ``term`` is one word, wildcard word or double-quoted phrase.  When
+    ``levenstein_distance`` (the Levenshtein distance) is truthy it is
+    appended as a ``~<distance>`` suffix where applicable.
+
+    Returns a ``(value, base_value)`` tuple of strings, each wrapped in
+    parentheses.
+    """
+    fuzzy_suffix = '~%s' % levenstein_distance if levenstein_distance else ''
+
+    if '"' in term:
+        # Quoted literal: the same expression serves as value and base.
+        # NOTE(review): on a quoted phrase Solr may read ``~N`` as phrase
+        # slop rather than an edit distance -- confirm this is intended.
+        expression = base_expression = '%s%s' % (term, fuzzy_suffix)
+    elif isWildCard(term):
+        # Explicit wildcards: no fuzzy suffix; the base expression is the
+        # term with its ``*`` and ``?`` characters stripped.
+        expression = prepare_wildcard(term)
+        base_expression = quote(term.replace('*', '').replace('?', ''))
+    else:
+        # Plain word: prefix match OR the (optionally fuzzy) word itself.
+        expression = '%s* OR %s%s' % (
+            prepare_wildcard(term), term, fuzzy_suffix)
+        base_expression = term
+
+    return '(%s)' % expression, '(%s)' % base_expression
+
+
+def mangleSearchableText(value, config):
+    """Translate a ``SearchableText`` query value into a Solr expression.
+
+    ``config`` is read for two optional attributes: ``search_pattern``
+    (default ``''``) and ``levenstein_distance`` (default ``0``).
+
+    A value that is not a simple search is returned untouched.  Otherwise
+    every token of the value is expanded with ``makeSimpleExpressions`` and
+    the results are joined with single spaces.  With a search pattern
+    configured, the pattern is filled in and returned inside a one-element
+    set; without one, the joined expression string is returned.
+    """
+    pattern = getattr(config, 'search_pattern', '')
+    levenstein_distance = getattr(config, 'levenstein_distance', 0)
+
+    if not isSimpleSearch(value):
+        return value
+
+    # One (value, base_value) pair per token, in query order.
+    expression_pairs = [
+        makeSimpleExpressions(token, levenstein_distance)
+        for token in splitSimpleSearch(value)
+    ]
+    base_value = ' '.join([base for _, base in expression_pairs])
+    value = ' '.join([expression for expression, _ in expression_pairs])
+
+    if not pattern:
+        return value
+
+    formatted = pattern.format(value=quote(value), base_value=base_value)
+    return set([formatted])  # add literal query parameter
+
+
def mangleQuery(keywords, config, schema):
""" translate / mangle query parameters to replace zope specifics
with equivalent constructs for solr """
@@ -76,24 +124,8 @@ def mangleQuery(keywords, config, schema):
for key, value in keywords.items():
args = extras.get(key, {})
if key == 'SearchableText':
- pattern = getattr(config, 'search_pattern', '')
- simple_term = isSimpleTerm(value)
- if pattern and isSimpleSearch(value):
- base_value = value
- if simple_term: # use prefix/wildcard search
- value = '(%s* OR %s)' % (prepare_wildcard(value), value)
- elif isWildCard(value):
- value = prepare_wildcard(value)
- base_value = quote(value.replace('*', '').replace('?', ''))
- # simple queries use custom search pattern
- value = pattern.format(value=quote(value),
- base_value=base_value)
- keywords[key] = set([value]) # add literal query parameter
- continue
- elif simple_term: # use prefix/wildcard search
- keywords[key] = '(%s* OR %s)' % (
- prepare_wildcard(value), value)
- continue
+ keywords[key] = mangleSearchableText(value, config)
+ continue
if key in epi_indexes:
path = keywords['%s_parents' % key] = value
del keywords[key]
diff --git a/src/collective/solr/tests/test_server.py b/src/collective/solr/tests/test_server.py
index 6fcdfbf..011d56d 100644
--- a/src/collective/solr/tests/test_server.py
+++ b/src/collective/solr/tests/test_server.py
@@ -470,7 +470,9 @@ class SolrServerTests(SolrTestCase):
# the pattern is applied for multi-word searches
response = solrSearchResults(SearchableText='foo bar', Language='all')
query = response.responseHeader['params']['q']
- self.assertEqual(query, '(Title:(foo bar)^5 OR getId:(foo bar))')
+ self.assertEqual(query,
+ '(Title:((foo* OR foo) (bar* OR bar))^5 OR '
+ 'getId:((foo* OR foo) (bar* OR bar)))')
# extra parameters should be unaffected
response = solrSearchResults(SearchableText='"news"', Type='xy', Language='all')
query = response.responseHeader['params']['q']
@@ -479,14 +481,14 @@ class SolrServerTests(SolrTestCase):
self.config.search_pattern = '(Title:{value} OR getId:{base_value})'
response = solrSearchResults(SearchableText='news', Language='all')
query = response.responseHeader['params']['q']
- self.assertEqual(query, '(Title:(news* OR news) OR getId:news)')
+ self.assertEqual(query, '(Title:(news* OR news) OR getId:(news))')
# and they handle wildcards as advertised
response = solrSearchResults(SearchableText='news*', Language='all')
query = response.responseHeader['params']['q']
- self.assertEqual(query, '(Title:news* OR getId:news)')
+ self.assertEqual(query, '(Title:(news*) OR getId:(news))')
response = solrSearchResults(SearchableText='*news*', Language='all')
query = response.responseHeader['params']['q']
- self.assertEqual(query, '(Title:news* OR getId:news)')
+ self.assertEqual(query, '(Title:(news*) OR getId:(news))')
def testSolrSearchResultsWithDictRequest(self):
self.maintenance.reindex()
diff --git a/src/collective/solr/tests/test_utils.py b/src/collective/solr/tests/test_utils.py
index c8f0000..cbc9865 100644
--- a/src/collective/solr/tests/test_utils.py
+++ b/src/collective/solr/tests/test_utils.py
@@ -6,7 +6,7 @@ from Testing import ZopeTestCase as ztc
from collective.solr.tests.utils import getData
from collective.solr.parser import SolrResponse
from collective.solr.utils import findObjects, isSimpleTerm, isSimpleSearch
-from collective.solr.utils import isWildCard
+from collective.solr.utils import isWildCard, splitSimpleSearch
from collective.solr.utils import setupTranslationMap, prepareData
from collective.solr.utils import padResults
@@ -93,6 +93,13 @@ class UtilsTests(ztc.ZopeTestCase):
self.failIf(isSimpleSearch('foo 42 bar11'))
self.failUnless(isSimpleSearch('2000 foo'))
+    def testSplitSimpleSearch(self):
+        # simple searches are split into words and quoted phrases
+        accepted = (
+            ('foo bar', ['foo', 'bar']),
+            ('foo "bar foobar" baz', ['foo', '"bar foobar"', 'baz']),
+        )
+        for query, tokens in accepted:
+            self.assertEqual(splitSimpleSearch(query), tokens)
+        # anything that is not a simple search is refused
+        for query in ('foo AND bar', 'foo42'):
+            self.assertRaises(AssertionError, splitSimpleSearch, query)
+
def testIsWildCard(self):
self.failUnless(isWildCard('foo*'))
self.failUnless(isWildCard('fo?'))
diff --git a/src/collective/solr/utils.py b/src/collective/solr/utils.py
index b3e8f27..0716eb8 100644
--- a/src/collective/solr/utils.py
+++ b/src/collective/solr/utils.py
@@ -111,6 +111,24 @@ def isSimpleSearch(term):
return False
+def splitSimpleSearch(term):
+    """Split a simple search term into tokens (words and phrases).
+
+    Text outside double quotes is split on whitespace into single words.
+    Each non-empty run of text between a pair of double quotes is kept as
+    one token, with the surrounding quotes put back.
+
+    Raises ``AssertionError`` when ``isSimpleSearch(term)`` is false.
+    """
+    if not isSimpleSearch(term):
+        raise AssertionError('term is not a simple search')
+    tokens = []
+    # Splitting on the quote character leaves unquoted text at the even
+    # positions and the content of quoted phrases at the odd ones.
+    for position, segment in enumerate(term.split('"')):
+        if position % 2 == 0:
+            # split() without arguments never yields empty strings
+            tokens.extend(segment.split())
+        elif segment:
+            tokens.append('"%s"' % segment)
+    return tokens
+
+
wildCard = compile(r'^[\w\d\s*?]*[*?]+[\w\d\s*?]*$', UNICODE)
def isWildCard(term):
if isinstance(term, str):