to_pdf: detect PDF with BOM prefix (#32088)

This commit is contained in:
Thomas NOËL 2019-04-08 16:51:44 +02:00
parent 1eff11f7f3
commit b00b070523
5 changed files with 254 additions and 1 deletions

View File

@ -14,6 +14,7 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import codecs
import re
import unicodedata
import warnings
@ -24,7 +25,7 @@ from PIL import Image
def to_pdf(content):
if content.startswith('%PDF'):
if content.startswith(('%PDF', codecs.BOM + '%PDF', codecs.BOM_UTF8 + '%PDF')):
return content
try:
with warnings.catch_warnings():

80
tests/data/minimal.pdf Normal file
View File

@ -0,0 +1,80 @@
%PDF-1.1
%¥±ë
% MIT License
%
% Copyright (c) 2010 Brendan Zagaeski
%
% Permission is hereby granted, free of charge, to any person obtaining a copy
% of this software and associated documentation files (the "Software"), to deal
% in the Software without restriction, including without limitation the rights
% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
% copies of the Software, and to permit persons to whom the Software is
% furnished to do so, subject to the following conditions:
%
% The above copyright notice and this permission notice shall be included in all
% copies or substantial portions of the Software.
%
% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
% SOFTWARE.
1 0 obj
<< /Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<< /Type /Pages
/Kids [3 0 R]
/Count 1
/MediaBox [0 0 300 144]
>>
endobj
3 0 obj
<< /Type /Page
/Parent 2 0 R
/Resources
<< /Font
<< /F1
<< /Type /Font
/Subtype /Type1
/BaseFont /Times-Roman
>>
>>
>>
/Contents 4 0 R
>>
endobj
4 0 obj
<< /Length 55 >>
stream
BT
/F1 18 Tf
0 0 Td
(Hello World) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000001130 00000 n
0000001189 00000 n
0000001290 00000 n
0000001569 00000 n
trailer
<< /Root 1 0 R
/Size 5
>>
startxref
1677
%%EOF

View File

@ -0,0 +1,80 @@
倥䙄ㄭㄮ
숥슥쎱<EFBFBD>
‥䥍⁔楌散獮<EFBFBD>
<EFBFBD>
‥潃祰楲桧⁴挨〲〱䈠敲摮湡娠条敡歳<EFBFBD>
<EFBFBD>
‥敐浲獩楳湯椠⁳敨敲祢朠慲瑮摥‬牦敥漠⁦档牡敧‬潴愠祮瀠牥潳扯慴湩湩⁧⁡潣祰
‥景琠楨⁳潳瑦慷敲愠摮愠獳捯慩整潤畣敭瑮瑡潩楦敬⁳琨敨∠潓瑦慷敲⤢潴搠慥<EFBFBD>
‥湩琠敨匠景睴牡⁥楷桴畯⁴敲瑳楲瑣潩Ɱ椠据畬楤杮眠瑩潨瑵氠浩瑩瑡潩桴⁥楲桧獴
‥潴甠敳潣祰潭楤祦敭杲ⱥ瀠扵楬桳楤瑳楲畢整畳汢捩湥敳湡⽤牯猠汥<EFBFBD>
‥潣楰獥漠潓瑦慷敲潴瀠牥業⁴数獲湯⁳潴眠潨潓瑦慷敲椠<EFBFBD>
‥畦湲獩敨⁤潴搠潳‬畳橢捥⁴潴琠敨映汯潬楷杮挠湯楤楴湯㩳
<EFBFBD>
‥桔⁥扡癯⁥潣祰楲桧⁴潮楴散愠摮琠楨⁳数浲獩楳湯渠瑯捩⁥桳污敢椠据畬敤⁤湩愠汬
‥潣楰獥漠⁲畳獢慴瑮慩潰瑲潩獮漠潓瑦慷敲<EFBFBD>
<EFBFBD>
‥䡔⁅体呆䅗䕒䤠⁓剐噏䑉䑅∠十䤠≓‬䥗䡔問⁔䅗剒乁奔传⁆乁⁙䥋䑎‬塅剐卅⁓剏
‥䵉䱐䕉ⱄ䤠䍎啌䥄䝎䈠呕丠呏䰠䵉呉䑅吠⁏䡔⁅䅗剒乁䥔卅传⁆䕍䍒䅈呎䉁䱉呉ⱙ
‥䥆乔卅但⁒䅐呒䍉䱕剁倠剕佐䕓䄠䑎丠乏义剆义䕇䕍呎义丠⁏噅久⁔䡓䱁⁌䡔<EFBFBD>
‥啁䡔剏⁓剏䌠偏剙䝉呈䠠䱏䕄卒䈠⁅䥌䉁䕌䘠剏䄠奎䌠䅌䵉‬䅄䅍䕇⁓剏传䡔剅
‥䥌䉁䱉呉ⱙ圠䕈䡔剅䤠乁䄠呃佉䙏䌠乏剔䍁ⱔ吠剏⁔剏传䡔剅䥗䕓剁卉义⁇剆䵏<EFBFBD>
‥問⁔䙏传⁒义䌠乏䕎呃佉䥗䡔吠䕈匠䙏坔剁⁅剏吠䕈唠䕓传⁒呏䕈⁒䕄䱁义升䤠<EFBFBD>
‥体呆䅗䕒<EFBFBD>
‱‰扯<EFBFBD>
†㰼⼠祔数⼠慃慴潬<EFBFBD>
††⼠慐敧⁳<EFBFBD>
†㸾
湥潤橢
‰扯<EFBFBD>
†㰼⼠祔数⼠慐敧<EFBFBD>
††⼠楋獤嬠″‰嵒
††⼠潃湵⁴<EFBFBD>
††⼠敍楤䉡硯嬠‰‰〳‰㐱崴
†㸾
湥潤橢
″‰扯<EFBFBD>
†㰼†启灹倯条<EFBFBD>
†††倯牡湥⁴<EFBFBD>
†††刯獥畯捲獥
†††㰠‼䘯湯<EFBFBD>
†††††㰠‼䘯<EFBFBD>
†††††††㰠‼启灹䘯湯<EFBFBD>
†††††††††匯扵祴数⼠祔数<EFBFBD>
†††††††††䈯獡䙥湯⁴启浩獥刭浯湡
†††††††㸠<EFBFBD>
†††††㸠<EFBFBD>
†††㸠<EFBFBD>
†††䌯湯整瑮⁳‴‰<EFBFBD>
†㸾
湥潤橢
‴‰扯<EFBFBD>
†㰼⼠敌杮桴㔠‵㸾
瑳敲浡
†呂
††䘯‱㠱吠<EFBFBD>
††‰‰摔
††䠨汥潬圠牯摬
橔
†呅
湥獤牴慥<EFBFBD>
湥潤橢
牸晥
<EFBFBD>
〰〰〰〰〰㘠㔵㔳映<EFBFBD>
〰〰〰ㄱ〠〰〰渠<EFBFBD>
〰〰〰ㄱ㤸〠〰〰渠<EFBFBD>
〰〰〰㈱〹〠〰〰渠<EFBFBD>
〰〰〰㔱㤶〠〰〰渠<EFBFBD>
牴楡敬<EFBFBD>
†㰼†刯潯⁴‱‰<EFBFBD>
†††匯穩<EFBFBD>
†㸾
瑳牡硴敲<EFBFBD>
㘱㜷
┥佅<EFBFBD>

View File

@ -0,0 +1,80 @@
%PDF-1.1
%¥±ë
% MIT License
%
% Copyright (c) 2010 Brendan Zagaeski
%
% Permission is hereby granted, free of charge, to any person obtaining a copy
% of this software and associated documentation files (the "Software"), to deal
% in the Software without restriction, including without limitation the rights
% to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
% copies of the Software, and to permit persons to whom the Software is
% furnished to do so, subject to the following conditions:
%
% The above copyright notice and this permission notice shall be included in all
% copies or substantial portions of the Software.
%
% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
% AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
% OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
% SOFTWARE.
1 0 obj
<< /Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<< /Type /Pages
/Kids [3 0 R]
/Count 1
/MediaBox [0 0 300 144]
>>
endobj
3 0 obj
<< /Type /Page
/Parent 2 0 R
/Resources
<< /Font
<< /F1
<< /Type /Font
/Subtype /Type1
/BaseFont /Times-Roman
>>
>>
>>
/Contents 4 0 R
>>
endobj
4 0 obj
<< /Length 55 >>
stream
BT
/F1 18 Tf
0 0 Td
(Hello World) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000001130 00000 n
0000001189 00000 n
0000001290 00000 n
0000001569 00000 n
trailer
<< /Root 1 0 R
/Size 5
>>
startxref
1677
%%EOF

View File

@ -0,0 +1,12 @@
import os
from passerelle.utils.conversion import to_pdf
def test_pdf_to_pdf_do_nothing():
    """Check that to_pdf() hands back content that is already a PDF, untouched.

    Each fixture under ``tests/data`` is loaded and passed through
    ``to_pdf``; the result must be equal to the input. The fixtures are a
    plain PDF plus the two ``*_bom*`` variants added for the BOM-prefix
    detection.
    """
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    for filename in ('minimal.pdf', 'minimal_bom.pdf', 'minimal_bomutf8.pdf'):
        # PDF fixtures are binary data: read them as raw bytes (no newline
        # translation or text decoding) and close the handle deterministically
        # instead of relying on garbage collection.
        with open(os.path.join(data_dir, filename), 'rb') as fd:
            pdf = fd.read()
        # The filename in the assertion message tells which fixture failed.
        assert to_pdf(pdf) == pdf, filename