passerelle/passerelle/utils/conversion.py

70 lines
2.2 KiB
Python

# passerelle - uniform access to multiple data sources and services
# Copyright (C) 2016 Entr'ouvert
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import codecs
import io
import re
import unicodedata
import warnings
from StringIO import StringIO

import unidecode
from PIL import Image


def to_pdf(content):
    """Return ``content`` as the bytes of a PDF document.

    Content that already starts with a PDF header, optionally preceded by
    ``codecs.BOM`` or ``codecs.BOM_UTF8``, is returned unchanged.  Anything
    else is opened as an image with PIL, converted to RGB when necessary and
    saved in PDF format.

    :param content: raw bytes of a PDF document or of an image.
    :returns: the bytes of a PDF document.
    :raises ValueError: ``'invalid image'`` when PIL cannot open the content,
        ``'unsafe image'`` when PIL flags it as a decompression bomb.
    """
    # Byte literals keep the header check purely binary: the codecs BOM
    # constants are byte strings and must not be mixed with text literals.
    pdf_magic = b'%PDF'
    if content.startswith((pdf_magic, codecs.BOM + pdf_magic, codecs.BOM_UTF8 + pdf_magic)):
        return content

    # Pillow reports very large images with DecompressionBombError (a real
    # exception) rather than with the warning; fall back to the warning class
    # on releases that do not define the error.
    bomb_errors = (
        Image.DecompressionBombWarning,
        getattr(Image, 'DecompressionBombError', Image.DecompressionBombWarning),
    )
    try:
        with warnings.catch_warnings():
            # Escalate the warning to an exception so it can be caught below.
            warnings.simplefilter('error', Image.DecompressionBombWarning)
            image = Image.open(io.BytesIO(content))
    except IOError:
        raise ValueError('invalid image')
    except bomb_errors:
        raise ValueError('unsafe image')

    if image.mode != 'RGB':
        # PDF cannot handle alpha (RGBA)
        image = image.convert('RGB')
    out = io.BytesIO()
    image.save(out, format='PDF')
    return out.getvalue()
# copied from
# https://stackoverflow.com/questions/10294032/python-replace-typographical-quotes-dashes-etc-with-their-ascii-counterparts
def char_filter(string):
    """Yield the characters of ``string``, substituting ASCII where needed.

    The string is NFC-normalized and each character is transliterated with
    unidecode.  When the transliteration starts with a Latin letter the
    original character is yielded untouched; otherwise the transliteration
    is yielded in its place.
    """
    starts_with_letter = re.compile('[a-zA-Z]+').match
    for original in unicodedata.normalize('NFC', string):
        ascii_form = unidecode.unidecode(original)
        yield original if starts_with_letter(ascii_form) else ascii_form
def clean_string(string):
    """Return ``string`` rebuilt from the pieces yielded by char_filter."""
    pieces = char_filter(string)
    return "".join(pieces)
def ensure_encoding(s, encoding):
    """Return ``s`` limited to what ``encoding`` is able to represent.

    The string goes through clean_string first, then is encoded and decoded
    again with ``encoding``; the ``'replace'`` error handler substitutes any
    character the codec cannot encode.
    """
    cleaned = clean_string(s)
    encoded = cleaned.encode(encoding, 'replace')
    return encoded.decode(encoding)
def to_ascii(s):
    """Return the ASCII transliteration of ``s`` as a text string.

    :param s: the string to transliterate with unidecode.
    :returns: the transliterated text, containing only ASCII characters.
    """
    transliterated = unidecode.unidecode(s)
    # unidecode may hand back an ASCII byte string (Python 2) or a text
    # string (Python 3); only the former has to be decoded, and calling
    # .decode() on a text string would raise AttributeError.
    if isinstance(transliterated, bytes):
        transliterated = transliterated.decode('ascii')
    return transliterated