Patch: collective.solr -- per-term mangling of simple SearchableText queries.

Summary of the changes below:
  * mangler.py: the inline SearchableText handling in mangleQuery() moves into
    mangleSearchableText(), which splits a simple search into tokens with
    splitSimpleSearch() and builds one expression per token through
    makeSimpleExpressions() (wildcard form, plus an optional "~N" suffix taken
    from config.levenstein_distance).
  * utils.py: adds splitSimpleSearch(), which returns words and quoted phrases
    and raises AssertionError for input that is not a simple search.
  * tests: expectations updated for the parenthesised per-term output; new
    test for splitSimpleSearch().

Note: leading whitespace and blank lines were reconstructed from the hunk
headers and Python layout rules, so apply with whitespace-insensitive
matching (for example "git apply --ignore-whitespace" or "patch -l").

diff --git a/src/collective/solr/mangler.py b/src/collective/solr/mangler.py
index f601bec..8a44856 100644
--- a/src/collective/solr/mangler.py
+++ b/src/collective/solr/mangler.py
@@ -7,6 +7,7 @@ from collective.solr.queryparser import quote
 from collective.solr.utils import isSimpleTerm
 from collective.solr.utils import isSimpleSearch
 from collective.solr.utils import isWildCard
+from collective.solr.utils import splitSimpleSearch
 from collective.solr.utils import prepare_wildcard
 
 
@@ -37,6 +38,53 @@ def iso8601date(value):
     return value
 
 
+def makeSimpleExpressions(term, levenstein_distance):
+    '''Return a search expression for part of the query that
+    includes the levenstein distance and wildcards where appropriate.
+    Returns both an expression for "value" and "base_value"'''
+
+    base_value = term
+    if levenstein_distance:
+        levenstein_expr = '~%s' % levenstein_distance
+    else:
+        levenstein_expr = ''
+    if '"' in term:  # quoted literals
+        value = '%s%s' % (term, levenstein_expr)
+        base_value = value
+    elif isWildCard(term):
+        value = prepare_wildcard(term)
+        base_value = quote(term.replace('*', '').replace('?', ''))
+    else:
+        value = '%s* OR %s%s' % (prepare_wildcard(term), term,
+                                 levenstein_expr)
+    return '(%s)' % value, '(%s)' % base_value
+
+
+def mangleSearchableText(value, config):
+    pattern = getattr(config, 'search_pattern', '')
+    levenstein_distance = getattr(config, 'levenstein_distance', 0)
+    value_parts = []
+    base_value_parts = []
+
+    if not isSimpleSearch(value):
+        return value
+
+    for term in splitSimpleSearch(value):
+        (term_value,
+         term_base_value) = makeSimpleExpressions(term,
+                                                  levenstein_distance)
+        value_parts.append(term_value)
+        base_value_parts.append(term_base_value)
+
+    base_value = ' '.join(base_value_parts)
+    value = ' '.join(value_parts)
+    if pattern:
+        value = pattern.format(value=quote(value),
+                               base_value=base_value)
+        return set([value])  # add literal query parameter
+    return value
+
+
 def mangleQuery(keywords, config, schema):
     """ translate / mangle query parameters to replace zope specifics
         with equivalent constructs for solr """
@@ -76,24 +124,8 @@ def mangleQuery(keywords, config, schema):
     for key, value in keywords.items():
         args = extras.get(key, {})
         if key == 'SearchableText':
-            pattern = getattr(config, 'search_pattern', '')
-            simple_term = isSimpleTerm(value)
-            if pattern and isSimpleSearch(value):
-                base_value = value
-                if simple_term:  # use prefix/wildcard search
-                    value = '(%s* OR %s)' % (prepare_wildcard(value), value)
-                elif isWildCard(value):
-                    value = prepare_wildcard(value)
-                    base_value = quote(value.replace('*', '').replace('?', ''))
-                # simple queries use custom search pattern
-                value = pattern.format(value=quote(value),
-                    base_value=base_value)
-                keywords[key] = set([value])  # add literal query parameter
-                continue
-            elif simple_term:  # use prefix/wildcard search
-                keywords[key] = '(%s* OR %s)' % (
-                    prepare_wildcard(value), value)
-                continue
+            keywords[key] = mangleSearchableText(value, config)
+            continue
         if key in epi_indexes:
             path = keywords['%s_parents' % key] = value
             del keywords[key]
diff --git a/src/collective/solr/tests/test_server.py b/src/collective/solr/tests/test_server.py
index 6fcdfbf..011d56d 100644
--- a/src/collective/solr/tests/test_server.py
+++ b/src/collective/solr/tests/test_server.py
@@ -470,7 +470,9 @@ class SolrServerTests(SolrTestCase):
         # the pattern is applied for multi-word searches
         response = solrSearchResults(SearchableText='foo bar', Language='all')
         query = response.responseHeader['params']['q']
-        self.assertEqual(query, '(Title:(foo bar)^5 OR getId:(foo bar))')
+        self.assertEqual(query,
+            '(Title:((foo* OR foo) (bar* OR bar))^5 OR '
+            'getId:((foo* OR foo) (bar* OR bar)))')
         # extra parameters should be unaffected
         response = solrSearchResults(SearchableText='"news"', Type='xy', Language='all')
         query = response.responseHeader['params']['q']
@@ -479,14 +481,14 @@ class SolrServerTests(SolrTestCase):
         self.config.search_pattern = '(Title:{value} OR getId:{base_value})'
         response = solrSearchResults(SearchableText='news', Language='all')
         query = response.responseHeader['params']['q']
-        self.assertEqual(query, '(Title:(news* OR news) OR getId:news)')
+        self.assertEqual(query, '(Title:(news* OR news) OR getId:(news))')
         # and they handle wildcards as advertised
         response = solrSearchResults(SearchableText='news*', Language='all')
         query = response.responseHeader['params']['q']
-        self.assertEqual(query, '(Title:news* OR getId:news)')
+        self.assertEqual(query, '(Title:(news*) OR getId:(news))')
         response = solrSearchResults(SearchableText='*news*', Language='all')
         query = response.responseHeader['params']['q']
-        self.assertEqual(query, '(Title:news* OR getId:news)')
+        self.assertEqual(query, '(Title:(news*) OR getId:(news))')
 
     def testSolrSearchResultsWithDictRequest(self):
         self.maintenance.reindex()
diff --git a/src/collective/solr/tests/test_utils.py b/src/collective/solr/tests/test_utils.py
index c8f0000..cbc9865 100644
--- a/src/collective/solr/tests/test_utils.py
+++ b/src/collective/solr/tests/test_utils.py
@@ -6,7 +6,7 @@ from Testing import ZopeTestCase as ztc
 from collective.solr.tests.utils import getData
 from collective.solr.parser import SolrResponse
 from collective.solr.utils import findObjects, isSimpleTerm, isSimpleSearch
-from collective.solr.utils import isWildCard
+from collective.solr.utils import isWildCard, splitSimpleSearch
 from collective.solr.utils import setupTranslationMap, prepareData
 from collective.solr.utils import padResults
 
@@ -93,6 +93,13 @@ class UtilsTests(ztc.ZopeTestCase):
         self.failIf(isSimpleSearch('foo 42 bar11'))
         self.failUnless(isSimpleSearch('2000 foo'))
 
+    def testSplitSimpleSearch(self):
+        self.assertEqual(splitSimpleSearch('foo bar'), ['foo', 'bar'])
+        self.assertEqual(splitSimpleSearch('foo "bar foobar" baz'),
+                         ['foo', '"bar foobar"', 'baz'])
+        self.assertRaises(AssertionError, splitSimpleSearch, 'foo AND bar')
+        self.assertRaises(AssertionError, splitSimpleSearch, 'foo42')
+
     def testIsWildCard(self):
         self.failUnless(isWildCard('foo*'))
         self.failUnless(isWildCard('fo?'))
diff --git a/src/collective/solr/utils.py b/src/collective/solr/utils.py
index b3e8f27..0716eb8 100644
--- a/src/collective/solr/utils.py
+++ b/src/collective/solr/utils.py
@@ -111,6 +111,24 @@ def isSimpleSearch(term):
     return False
 
 
+def splitSimpleSearch(term):
+    '''Split a simple search term into tokens (words and phrases)'''
+    if not isSimpleSearch(term):
+        raise AssertionError('term is not a simple search')
+    parts = term.split('"')
+    tokens = []
+    for i in range(0, len(parts)):
+        if i % 2 == 0:
+            # unquoted text
+            words = [word for word in parts[i].split() if word]
+            tokens.extend(words)
+        else:
+            # The uneven parts are those inside quotes.
+            if parts[i]:
+                tokens.append('"%s"' % parts[i])
+    return tokens
+
+
 wildCard = compile(r'^[\w\d\s*?]*[*?]+[\w\d\s*?]*$', UNICODE)
 def isWildCard(term):
     if isinstance(term, str):