sql: normalize phonenumbers in fts index (#76875)
gitea/wcs/pipeline/head This commit looks good Details

* restricted to values shorter than 30 characters
* indexed with weight 'D' to lower the score compared to fields with
  phone number validation
This commit is contained in:
Benjamin Dauvergne 2023-04-21 15:17:48 +02:00 committed by Frédéric Péters
parent 87e3e9aa51
commit 2d619766b7
3 changed files with 35 additions and 18 deletions

View File

@ -5328,10 +5328,15 @@ def test_fts_phone(pub):
formdata.just_created()
formdata.store()
assert formdef.data_class().count([FtsMatch('01 23 45 67 89')]) == 1
assert formdef.data_class().count([FtsMatch('0123456789')]) == 1
assert formdef.data_class().count([FtsMatch('+33123456789')]) == 1
assert formdef.data_class().count([FtsMatch('+33(0)123456789')]) == 1
formdata = formdef.data_class()()
formdata.data = {'1': None, '2': '0123456789'}
formdata.just_created()
formdata.store()
assert formdef.data_class().count([FtsMatch('01 23 45 67 89')]) == 2
assert formdef.data_class().count([FtsMatch('0123456789')]) == 2
assert formdef.data_class().count([FtsMatch('+33123456789')]) == 2
assert formdef.data_class().count([FtsMatch('+33(0)123456789')]) == 2
assert formdef.data_class().count([FtsMatch('+33(0)123456789 foo')]) == 1
assert formdef.data_class().count([FtsMatch('+33(0)123456789 bar')]) == 0
assert formdef.data_class().count([FtsMatch('foo +33(0)123456789')]) == 1

View File

@ -2624,6 +2624,11 @@ class SqlDataMixin(SqlMixin):
if isinstance(value, str) and len(value) < 10000:
# avoid overlong strings, typically base64-encoded values
fts_strings[weight].add(value)
# normalize values looking like phonenumbers, because
# phonenumbers are normalized by the FTS criteria
if len(value) < 30 and value != normalize_phone_number_for_fts_if_needed(value):
# use weight 'D' to give preference to fields with the phonenumber validation
fts_strings['D'].add(normalize_phone_number_for_fts_if_needed(value))
elif type(value) in (tuple, list):
for val in value:
fts_strings[weight].add(val)

View File

@ -343,26 +343,33 @@ class ILike(Criteria):
self.value = '%' + like_escape(self.value) + '%'
# Verbose-mode pattern used with ``.match()`` to locate the leftmost
# phone-like token in a string.  The lazy ``.*?`` prefix skips any leading
# text (``.`` does not cross newlines here), and the named group ``phone``
# captures: a "+" followed by a digit 1-9, or a "0" at a word boundary, then
# 6 to 20 digits/separators (hyphen, parentheses, dot, whitespace, slash),
# ending on a word boundary.
phone_re = re.compile(
r'''.*?(?P<phone> # a phone number
((\+[1-9])|(\b0)) # starting with an international prefix, or 0
[-\(\)\d\.\s/]{6,20} # then a bunch of numbers/symbols
\b) # till the end of the "word"''',
re.X,
)
def normalize_phone_number_for_fts_if_needed(value):
    """Return ``value`` with a phone-like token rewritten for FTS matching.

    ``value`` is matched against the module-level ``phone_re`` pattern.  When
    its ``phone`` group is found and the stripped token is not simply two
    runs of digits joined by a hyphen, every occurrence of that token in
    ``value`` is replaced by the result of
    ``misc.normalize_phone_number_for_fts``.  In all other cases ``value``
    is returned unchanged.
    """
    found = phone_re.match(value)
    if not found:
        return value

    candidate = found.group('phone').strip()
    if re.match(r'^\d+-\d+$', candidate):
        # plain "digits-digits" tokens are deliberately left untouched
        return value

    # it looks like a phone number: rewrite it to its
    # "international/E164" format to match what's stored in the database.
    return value.replace(candidate, misc.normalize_phone_number_for_fts(candidate))
class FtsMatch(Criteria):
def __init__(self, value, extra_normalize=True, **kwargs):
    """Build a full-text-search criteria from ``value``.

    ``value`` is first passed through ``get_fts_value``.  When
    ``extra_normalize`` is true, phone-like tokens in the result are then
    normalized by ``normalize_phone_number_for_fts_if_needed``; that shared
    helper is the single place holding the phone detection logic, so it is
    not duplicated inline here.  Extra keyword arguments are accepted and
    ignored.
    """
    # make Criteria.__repr__ work
    self.attribute = 'fts'
    self.value = self.get_fts_value(value)
    if extra_normalize:
        self.value = normalize_phone_number_for_fts_if_needed(self.value)
@classmethod
def get_fts_value(cls, value):