misc: rewrite qommon.misc.simplify() to use unicodedata module (#6846)
This commit is contained in:
parent
4230ff7d9d
commit
8d61f8a015
|
@ -23,6 +23,7 @@ import urllib
|
|||
import socket
|
||||
import base64
|
||||
import json
|
||||
import unicodedata
|
||||
|
||||
from quixote import get_publisher, get_session
|
||||
from quixote.html import htmltext
|
||||
|
@ -127,140 +128,18 @@ def get_provider(provider_key):
|
|||
def get_provider_key(provider_id):
    """Build a flat key from a provider identifier.

    Every '://', '/', '?' and ':' found in *provider_id* is replaced by a
    dash, and the resulting string is returned.
    """
    key = provider_id
    # '://' must be handled first so that it collapses to a single dash
    # instead of three.
    for separator in ('://', '/', '?', ':'):
        key = key.replace(separator, '-')
    return key
|
||||
|
||||
|
||||
# Transliteration table: maps non-ASCII Latin-1 characters to a 7-bit ASCII
# replacement string.  Accented letters map to their unaccented base letter,
# ligatures and special letters to a short letter sequence, and symbols to
# either a look-alike ASCII character or a '{name}' placeholder.
xlate = {
    # Punctuation and symbols (first part).
    u'\N{ACUTE ACCENT}': "'",
    u'\N{BROKEN BAR}': '|',
    u'\N{CEDILLA}': '{cedilla}',
    u'\N{CENT SIGN}': '{cent}',
    u'\N{COPYRIGHT SIGN}': '{C}',
    u'\N{CURRENCY SIGN}': '{currency}',
    u'\N{DEGREE SIGN}': '{degrees}',
    u'\N{DIAERESIS}': '{umlaut}',
    u'\N{DIVISION SIGN}': '/',
    u'\N{FEMININE ORDINAL INDICATOR}': '{^a}',
    u'\N{INVERTED EXCLAMATION MARK}': '!',
    u'\N{INVERTED QUESTION MARK}': '?',
    # Capital letters.
    u'\N{LATIN CAPITAL LETTER A WITH ACUTE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH GRAVE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}': 'A',
    u'\N{LATIN CAPITAL LETTER A WITH TILDE}': 'A',
    u'\N{LATIN CAPITAL LETTER AE}': 'Ae',
    u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}': 'C',
    u'\N{LATIN CAPITAL LETTER E WITH ACUTE}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH DIAERESIS}': 'E',
    u'\N{LATIN CAPITAL LETTER E WITH GRAVE}': 'E',
    u'\N{LATIN CAPITAL LETTER ETH}': 'Th',
    u'\N{LATIN CAPITAL LETTER I WITH ACUTE}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH DIAERESIS}': 'I',
    u'\N{LATIN CAPITAL LETTER I WITH GRAVE}': 'I',
    u'\N{LATIN CAPITAL LETTER N WITH TILDE}': 'N',
    u'\N{LATIN CAPITAL LETTER O WITH ACUTE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH GRAVE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH STROKE}': 'O',
    u'\N{LATIN CAPITAL LETTER O WITH TILDE}': 'O',
    # Capitalised like the other capital-letter replacements ('Ae', 'Th'
    # for ETH); the table previously emitted a lowercase 'th' here.
    u'\N{LATIN CAPITAL LETTER THORN}': 'Th',
    u'\N{LATIN CAPITAL LETTER U WITH ACUTE}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}': 'U',
    u'\N{LATIN CAPITAL LETTER U WITH GRAVE}': 'U',
    u'\N{LATIN CAPITAL LETTER Y WITH ACUTE}': 'Y',
    # Small letters.
    u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}': 'a',
    u'\N{LATIN SMALL LETTER A WITH DIAERESIS}': 'a',
    u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'a',
    u'\N{LATIN SMALL LETTER A WITH TILDE}': 'a',
    u'\N{LATIN SMALL LETTER AE}': 'ae',
    u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'c',
    u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'e',
    u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}': 'e',
    u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'e',
    u'\N{LATIN SMALL LETTER E WITH GRAVE}': 'e',
    u'\N{LATIN SMALL LETTER ETH}': 'th',
    u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'i',
    u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}': 'i',
    u'\N{LATIN SMALL LETTER I WITH DIAERESIS}': 'i',
    u'\N{LATIN SMALL LETTER I WITH GRAVE}': 'i',
    u'\N{LATIN SMALL LETTER N WITH TILDE}': 'n',
    u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}': 'o',
    u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'o',
    u'\N{LATIN SMALL LETTER O WITH GRAVE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH STROKE}': 'o',
    u'\N{LATIN SMALL LETTER O WITH TILDE}': 'o',
    u'\N{LATIN SMALL LETTER SHARP S}': 'ss',
    u'\N{LATIN SMALL LETTER THORN}': 'th',
    u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'u',
    u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}': 'u',
    u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'u',
    u'\N{LATIN SMALL LETTER U WITH GRAVE}': 'u',
    u'\N{LATIN SMALL LETTER Y WITH ACUTE}': 'y',
    u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}': 'y',
    # Punctuation and symbols (second part).
    u'\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}': '<<',
    u'\N{MACRON}': '_',
    u'\N{MASCULINE ORDINAL INDICATOR}': '{^o}',
    u'\N{MICRO SIGN}': '{micro}',
    u'\N{MIDDLE DOT}': '*',
    u'\N{MULTIPLICATION SIGN}': '*',
    u'\N{NOT SIGN}': '{not}',
    u'\N{PILCROW SIGN}': '{paragraph}',
    u'\N{PLUS-MINUS SIGN}': '{+/-}',
    u'\N{POUND SIGN}': '{pound}',
    u'\N{REGISTERED SIGN}': '{R}',
    u'\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}': '>>',
    u'\N{SECTION SIGN}': '{section}',
    u'\N{SOFT HYPHEN}': '-',
    u'\N{SUPERSCRIPT ONE}': '{^1}',
    u'\N{SUPERSCRIPT THREE}': '{^3}',
    u'\N{SUPERSCRIPT TWO}': '{^2}',
    u'\N{VULGAR FRACTION ONE HALF}': '{1/2}',
    u'\N{VULGAR FRACTION ONE QUARTER}': '{1/4}',
    u'\N{VULGAR FRACTION THREE QUARTERS}': '{3/4}',
    u'\N{YEN SIGN}': '{yen}',
}
|
||||
|
||||
def _decode_to_unicode(value):
    """Return *value* as a unicode string, decoding byte strings if needed.

    Byte strings are decoded with the publisher's ``site_charset`` when a
    publisher is available and defines one, and with ISO-8859-1 otherwise.
    Undecodable bytes are ignored.  Unicode input is returned unchanged.
    """
    if isinstance(value, unicode):
        return value
    publisher = get_publisher()
    charset = 'iso-8859-1'
    if publisher and publisher.site_charset:
        charset = publisher.site_charset
    return unicode(value, charset, 'ignore')


def latin1_to_ascii(unicrap):
    """Replace Latin-1 characters with 7-bit ASCII equivalents.

    Takes a unicode string (byte strings are decoded first, using the site
    charset or ISO-8859-1) and returns a plain ASCII string.  Characters in
    the 7-bit ASCII range are preserved, characters listed in ``xlate`` are
    replaced by their mapped value, and any other character is deleted.

    <http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/251871>
    """
    pieces = []
    for char in _decode_to_unicode(unicrap):
        if char in xlate:
            pieces.append(xlate[char])
        elif ord(char) < 0x80:
            pieces.append(str(char))
        # anything else (non-ASCII and not in the table) is dropped
    return ''.join(pieces)


# Runs of whitespace and punctuation; kept as a module-level name for
# backward compatibility, simplify() below does not use it.
simplifyRegex = re.compile(r'''([\s()'"/:\'\.,\|\;\?\&\[\]]+)''')


def simplify(s, space='-'):
    """Return a lowercase, ASCII-only simplified form of *s*.

    *s* may be None (an empty string is returned), a unicode string or a
    byte string (decoded with the site charset, or ISO-8859-1 as fallback).
    *space* is the separator put in place of runs of whitespace and/or
    separator characters.

    The string is normalised to NFKD and encoded to ASCII ignoring errors:
    the combining marks produced by the decomposition are dropped, and so
    is any character that has no ASCII decomposition.  Every character
    that is not a word character, whitespace or part of *space* is then
    removed, the result is stripped and lowercased, and finally each run
    of whitespace/separator characters is replaced by a single *space*.
    """
    if s is None:
        return ''
    s = _decode_to_unicode(s)
    s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
    # Escape the separator: it is interpolated inside a character class,
    # where characters such as ']', '^', '\' or a doubled '-' would
    # otherwise change or break the pattern.
    separator_class = re.escape(space)
    s = re.sub(r'[^\w\s%s]' % separator_class, '', s).strip().lower()
    # A callable replacement inserts `space` literally, without regex
    # template processing of backslashes.
    return re.sub(r'[\s%s]+' % separator_class, lambda match: space, s)
|
||||
|
||||
def get_datetime_language():
|
||||
lang = get_cfg('language', {}).get('language', None)
|
||||
|
|
Loading…
Reference in New Issue