misc: rewrite qommon.misc.simplify() to use unicodedata module (#6846)

This commit is contained in:
Thomas NOËL 2015-03-27 19:10:58 +01:00 committed by Frédéric Péters
parent 4230ff7d9d
commit 8d61f8a015
1 changed file with 11 additions and 132 deletions

View File

@@ -23,6 +23,7 @@ import urllib
import socket
import base64
import json
import unicodedata
from quixote import get_publisher, get_session
from quixote.html import htmltext
@@ -127,140 +128,18 @@ def get_provider(provider_key):
def get_provider_key(provider_id):
    """Return *provider_id* with its URL punctuation replaced by dashes.

    The sequence '://' is collapsed into a single '-' first; every
    remaining '/', '?' and ':' is then replaced by '-' as well.
    """
    key = provider_id
    # Order matters: '://' must be handled before its individual characters.
    for separator in ('://', '/', '?', ':'):
        key = key.replace(separator, '-')
    return key
# Replacement table for the non-ASCII part of Latin-1 (U+00A0..U+00FF):
# each key is a single unicode character, each value the plain ASCII text
# substituted for it.  Accented letters map to their base letter, ligatures
# and digraph letters to two letters (keeping the case of the original),
# and symbols to a short ASCII spelling, in braces when it is a name.
xlate = {
    u'\N{ACUTE ACCENT}': "'",
    u'\N{BROKEN BAR}': '|',
    u'\N{CEDILLA}': '{cedilla}',
    u'\N{CENT SIGN}': '{cent}',
    u'\N{COPYRIGHT SIGN}': '{C}',
    u'\N{CURRENCY SIGN}': '{currency}',
    u'\N{DEGREE SIGN}': '{degrees}',
    u'\N{DIAERESIS}': '{umlaut}',
    u'\N{DIVISION SIGN}': '/',
    u'\N{FEMININE ORDINAL INDICATOR}': '{^a}',
    u'\N{INVERTED EXCLAMATION MARK}': '!',
    u'\N{INVERTED QUESTION MARK}': '?',
    u'\N{LATIN CAPITAL LETTER A WITH ACUTE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH GRAVE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH TILDE}': 'A',
    u'\N{LATIN CAPITAL LETTER AE}': 'Ae',
    u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}': 'C',
    u'\N{LATIN CAPITAL LETTER E WITH ACUTE}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH DIAERESIS}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH GRAVE}': 'E',
    u'\N{LATIN CAPITAL LETTER ETH}': 'Th',
    u'\N{LATIN CAPITAL LETTER I WITH ACUTE}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH DIAERESIS}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH GRAVE}': 'I',
    u'\N{LATIN CAPITAL LETTER N WITH TILDE}': 'N',
    u'\N{LATIN CAPITAL LETTER O WITH ACUTE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH GRAVE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH STROKE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH TILDE}': 'O',
    # Capitalised like the other upper-case digraphs (AE -> 'Ae', ETH -> 'Th').
    u'\N{LATIN CAPITAL LETTER THORN}': 'Th',
    u'\N{LATIN CAPITAL LETTER U WITH ACUTE}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH GRAVE}': 'U',
    u'\N{LATIN CAPITAL LETTER Y WITH ACUTE}': 'Y',
    u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}': 'a',
    u'\N{LATIN SMALL LETTER A WITH DIAERESIS}': 'a',
    u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH TILDE}': 'a',
    u'\N{LATIN SMALL LETTER AE}': 'ae',
    u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'c',
    u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'e',
    u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}': 'e',
    u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'e',
    u'\N{LATIN SMALL LETTER E WITH GRAVE}': 'e',
    u'\N{LATIN SMALL LETTER ETH}': 'th',
    u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'i',
    u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}': 'i',
    u'\N{LATIN SMALL LETTER I WITH DIAERESIS}': 'i',
    u'\N{LATIN SMALL LETTER I WITH GRAVE}': 'i',
    u'\N{LATIN SMALL LETTER N WITH TILDE}': 'n',
    u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}': 'o',
    u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'o',
    u'\N{LATIN SMALL LETTER O WITH GRAVE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH STROKE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH TILDE}': 'o',
    u'\N{LATIN SMALL LETTER SHARP S}': 'ss',
    u'\N{LATIN SMALL LETTER THORN}': 'th',
    u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'u',
    u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}': 'u',
    u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'u',
    u'\N{LATIN SMALL LETTER U WITH GRAVE}': 'u',
    u'\N{LATIN SMALL LETTER Y WITH ACUTE}': 'y',
    u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}': 'y',
    u'\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}': '<<',
    u'\N{MACRON}': '_',
    u'\N{MASCULINE ORDINAL INDICATOR}': '{^o}',
    u'\N{MICRO SIGN}': '{micro}',
    u'\N{MIDDLE DOT}': '*',
    u'\N{MULTIPLICATION SIGN}': '*',
    u'\N{NOT SIGN}': '{not}',
    u'\N{PILCROW SIGN}': '{paragraph}',
    u'\N{PLUS-MINUS SIGN}': '{+/-}',
    u'\N{POUND SIGN}': '{pound}',
    u'\N{REGISTERED SIGN}': '{R}',
    u'\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}': '>>',
    u'\N{SECTION SIGN}': '{section}',
    u'\N{SOFT HYPHEN}': '-',
    u'\N{SUPERSCRIPT ONE}': '{^1}',
    u'\N{SUPERSCRIPT THREE}': '{^3}',
    u'\N{SUPERSCRIPT TWO}': '{^2}',
    u'\N{VULGAR FRACTION ONE HALF}': '{1/2}',
    u'\N{VULGAR FRACTION ONE QUARTER}': '{1/4}',
    u'\N{VULGAR FRACTION THREE QUARTERS}': '{3/4}',
    u'\N{YEN SIGN}': '{yen}',
}
def _to_unicode(value):
    """Return *value* as a unicode string.

    Unicode input is returned unchanged.  Byte strings are decoded with the
    publisher's ``site_charset`` when a publisher exists and defines one,
    and as ISO-8859-1 otherwise; undecodable bytes are dropped ('ignore').
    """
    if isinstance(value, unicode):
        return value
    publisher = get_publisher()
    if publisher and publisher.site_charset:
        return unicode(value, publisher.site_charset, 'ignore')
    return unicode(value, 'iso-8859-1', 'ignore')


def latin1_to_ascii(unicrap):
    """Replace Latin-1 characters with a 7-bit ASCII equivalent.

    Takes a unicode string (byte strings are decoded first) and returns a
    plain ASCII string.  This is a best effort conversion, not a plain
    strip: characters in the 7-bit ASCII range are preserved, characters
    listed in ``xlate`` (accented letters, most symbols) are replaced by
    their ASCII spelling, and any other non-ASCII character is deleted.

    <http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/251871>
    """
    pieces = []
    for char in _to_unicode(unicrap):
        if char in xlate:
            pieces.append(xlate[char])
        elif ord(char) < 0x80:
            pieces.append(str(char))
        # anything else is non-ASCII without a known replacement: dropped
    return ''.join(pieces)


# No longer used by simplify(); kept so the module-level name stays
# available -- NOTE(review): confirm nothing imports it, then remove.
simplifyRegex = re.compile(r'''([\s()'"/:\'\.,\|\;\?\&\[\]]+)''')


def simplify(s, space='-'):
    """Return a lowercase, ASCII-only simplified form of *s*.

    Accented characters are reduced to their base letter through NFKD
    decomposition, every character that is not a word character,
    whitespace or *space* is removed, and each run of whitespace and/or
    *space* is collapsed into a single *space*.

    :param s: unicode or byte string to simplify; ``None`` yields ``''``.
    :param space: separator put in place of whitespace runs.
    :return: the simplified string.
    """
    if s is None:
        return ''
    s = _to_unicode(s)
    # NFKD splits an accented letter into base letter + combining mark; the
    # ASCII encoding with 'ignore' then drops the marks and any character
    # that has no ASCII decomposition.
    s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
    # Escape the separator so that values such as ']', '^' or '\\' cannot
    # alter the character classes below.
    separator = re.escape(space)
    s = re.sub(r'[^\w\s%s]' % separator, '', s).strip().lower()
    # A callable replacement inserts `space` literally (no template escapes).
    return re.sub(r'[\s%s]+' % separator, lambda _match: space, s)
def get_datetime_language():
lang = get_cfg('language', {}).get('language', None)