pdf: support des boutons radio et des listes à choix (#75373) #138

Merged
bdauvergne merged 3 commits from wip/75373-pdf-champs-du-PDF-inaccessibles into main 2023-03-16 23:48:46 +01:00
6 changed files with 285 additions and 68 deletions

View File

@ -15,6 +15,8 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from django import forms
from django.urls import reverse
from django.utils.html import mark_safe
from django.utils.translation import gettext_lazy as _
from passerelle.utils.forms import ConditionField, TemplateField
@ -40,11 +42,34 @@ class FieldsMappingEditForm(forms.ModelForm):
elif field.widget_type == 'text':
help_text = _('text template')
field_class = TemplateField
elif field.widget_type == 'radio':
values = ', '.join('"%s"' % value for value in field.radio_possible_values)
help_text = _('text template, possibles values %s') % values
field_class = TemplateField
elif field.widget_type in ('list', 'combo'):
ds_url = (
reverse(
'generic-endpoint',
kwargs={
'connector': 'pdf',
'endpoint': 'field-values',
'slug': self.instance.slug,
},
)
+ '?digest_id='
+ field.digest_id
)
help_text = mark_safe(
_('text template, possibles values <a href="%s">data source</a>') % ds_url
)
field_class = TemplateField
else:
continue
label = _('field {number} ({help_text})').format(number=i + 1, help_text=help_text)
label = _('field {number}').format(number=i + 1)
initial = fields_mapping.get(name, '')
self.fields[name] = field_class(label=label, required=False, initial=initial)
self.fields[name] = field_class(
label=label, required=False, initial=initial, help_text=help_text
)
self.fields[name].page_number = page.page_number
self.fields[name].widget.attrs['tabindex'] = '0'
self.fields[name].widget.attrs['class'] = '0'

View File

@ -133,6 +133,7 @@ class Resource(BaseResource):
description=_('Returns the assembly of received PDF files'),
perm='can_access',
methods=['post'],
display_order=0,
post={
'request_body': {'schema': {'application/json': ASSEMBLE_SCHEMA}},
'input_example': {
@ -207,6 +208,7 @@ class Resource(BaseResource):
description=_('Fills the input PDF form with fields applying mappings to the received payload'),
perm='can_access',
methods=['post'],
display_order=1,
parameters={
'filename': {'description': _('file name')},
'flatten': {'description': _('remove PDF fields, keep only the drawed values')},
@ -250,6 +252,11 @@ class Resource(BaseResource):
value = evaluate_condition(mapping_template, post_data)
elif field.widget_type == 'text':
value = evaluate_template(mapping_template, post_data)
elif field.widget_type == 'radio':
value = evaluate_template(mapping_template, post_data)
elif field.widget_type in ('combo', 'list'):
value = evaluate_template(mapping_template, post_data)
self.logger.info('field=%r value=%r', field, value)
else:
raise NotImplementedError
if value is not None:
@ -258,3 +265,28 @@ class Resource(BaseResource):
response['Content-Disposition'] = 'attachment; filename="%s"' % filename
pdf.write(response, flatten=flatten_pdf)
return response
@endpoint(
    name='field-values',
    description=_('Return possible values for PDF\'s combo or list form fields'),
    perm='can_access',
    parameters={
        'digest_id': {'description': _('Identifier of the field')},
    },
)
def field_values(self, request, digest_id):
    '''List the possible values of the list or combo field whose
    ``digest_id`` matches, as ``{'data': [{'id': ..., 'text': ...}, ...]}``.

    Both ``id`` and ``text`` carry the second element of each pair of the
    field's ``combo_possible_values``.

    Raises APIError when no form file is configured, when no field matches
    ``digest_id`` or when the matching field is neither a list nor a combo.
    '''
    if not self.fill_form_file:
        raise APIError('not PDF file configured')
    with self.fill_form_file.open() as fd:
        pdf_content = fd.read()
    pdf = PDF(pdf_content)

    matches = []
    for page in pdf.pages:
        for candidate in page.fields:
            if candidate.digest_id == digest_id:
                matches.append(candidate)
    if not matches:
        raise APIError(f'unknown digest-id {digest_id!r}')

    # the first matching field wins
    field = matches[0]
    if field.widget_type not in ('list', 'combo'):
        raise APIError(f'wrong field type for digest-id {digest_id!r}: {field.widget_type}')

    choices = []
    for _export, value in field.combo_possible_values:
        choices.append({'id': value, 'text': value})
    return {'data': choices}

View File

@ -59,7 +59,7 @@ class PageThumbnailView(ResourceView):
thumbnail = page.thumbnail_png()
image = PIL.Image.open(io.BytesIO(thumbnail))
draw = PIL.ImageDraw.Draw(image, 'RGBA')
for i, (field, area_rect) in enumerate(page.thumbnail_field_rects()):
for i, field, area_rect in page.thumbnail_field_rects():
draw.rectangle(area_rect, fill=(255, 0, 0, 50))
x = area_rect.x1
y = (area_rect.y1 + area_rect.y2) / 2 - 5

View File

@ -25,6 +25,10 @@ import typing
import pdfrw
# Bit masks tested against a field's integer flags value (see FieldFlags).
# The trailing "bit N" notes use 1-based positions: bit N == 1 << (N - 1).
RADIO_FLAG = 1 << 15 # bit 16
PUSH_BUTTON_FLAG = 1 << 16 # bit 17
# NOTE(review): the PDF reference appears to name bit 18 of choice fields
# "Combo" (set = combo box, clear = list box) -- confirm the LIST naming.
LIST_FLAG = 1 << 17 # bit 18
class Rect(typing.NamedTuple):
x1: float
@ -32,15 +36,97 @@ class Rect(typing.NamedTuple):
x2: float
y2: float
@classmethod
def from_pdf_annotation(cls, annotation):
    '''Build an instance from the ``Rect`` entry of ``annotation``.

    Every coordinate found in ``annotation.Rect`` is converted to ``float``
    and the values are handed positionally to the constructor.
    '''
    coordinates = [float(coordinate) for coordinate in annotation.Rect]
    return cls(*coordinates)
def rect_compare(rect1, rect2):
    '''Three-way comparison sorting rects top to bottom, then left to right.

    PDF origin is in the bottom-left corner, so a larger ``y`` is higher on
    the page. Rects whose vertical extents overlap (same horizontal band) are
    considered at the same height and are ordered by their left edge only.

    Assumes ``y1`` is the bottom edge and ``y2`` the top edge of each rect.

    Returns -1 if ``rect1`` sorts before ``rect2``, 1 if it sorts after and 0
    when both share a band and the same left edge. Suitable for
    ``functools.cmp_to_key``.
    '''
    # rect1 lies entirely above rect2
    if rect1.y1 > rect2.y2:
        return -1
    # rect2 lies entirely above rect1; this must mirror the test above,
    # otherwise compare(a, b) and compare(b, a) can both claim "after"
    if rect2.y1 > rect1.y2:
        return 1
    # same horizontal band: left to right
    if rect1.x1 < rect2.x1:
        return -1
    if rect1.x1 > rect2.x1:
        return 1
    return 0
class FieldFlags(int):
    '''Integer flags value exposing named accessors for individual bits.

    Each accessor returns the result of masking the value with the matching
    module-level flag constant: 0 when the bit is clear, the (truthy) mask
    value when it is set.
    '''

    def _masked(self, mask):
        # shared bitwise test used by all accessors
        return self & mask

    @property
    def is_radio(self):
        return self._masked(RADIO_FLAG)

    @property
    def is_push_button(self):
        return self._masked(PUSH_BUTTON_FLAG)

    @property
    def is_list(self):
        return self._masked(LIST_FLAG)
@dataclasses.dataclass(frozen=True)
class Widget:
page: 'Page' = dataclasses.field(compare=False, repr=False)
name: str
widget_type: str = dataclasses.field(compare=False)
rect: Rect = dataclasses.field(compare=False)
on_value: str = dataclasses.field(compare=False, default=pdfrw.PdfName.On)
annotation: pdfrw.PdfDict = dataclasses.field(default=None, repr=False)
annotation: pdfrw.PdfDict = dataclasses.field(repr=False)
@functools.cached_property
def name(self):
    '''Dotted name of the field, built from its own ``T`` entry and the
    ``T`` entries of its ancestors (outermost ancestor first).

    Ancestors reached through ``Parent`` that carry no ``T`` entry are
    skipped instead of failing on ``None.decode()``.
    '''
    node = self.annotation
    parts = [node.T.decode()]
    while node.Parent:
        node = node.Parent
        # a nameless intermediate node contributes nothing to the name
        if node.T:
            parts.append(node.T.decode())
    # parts were collected leaf-first, the name reads root-first
    return '.'.join(reversed(parts))
@property
def field_flags(self):
    '''Flags read from the annotation's ``Ff`` entry, wrapped in FieldFlags.

    A missing or falsy ``Ff`` is treated as 0.
    '''
    raw_flags = self.annotation.Ff
    if not raw_flags:
        raw_flags = 0
    return FieldFlags(int(raw_flags))
@property
def field_type(self):
    '''Return the annotation's ``FT`` entry unchanged; widget_type compares
    it against ``pdfrw.PdfName`` values.'''
    return self.annotation.FT
@property
def widget_type(self):
    '''Classify the field as 'radio', 'checkbox', 'text', 'list' or 'combo'.

    The decision is based on field_type and, for button and choice fields,
    on field_flags.

    Raises NotImplementedError for push buttons and for any other field
    type.
    '''
    kind = self.field_type
    if kind == pdfrw.PdfName.Btn:
        flags = self.field_flags
        if not flags.is_push_button:
            return 'radio' if flags.is_radio else 'checkbox'
        # push buttons are not supported: fall through to the final raise
    if kind == pdfrw.PdfName.Tx:
        return 'text'
    if kind == pdfrw.PdfName.Ch:
        # NOTE(review): the PDF reference appears to call this bit "Combo"
        # (set = combo box); both kinds are handled alike in the visible
        # callers -- confirm the list/combo naming.
        return 'list' if self.field_flags.is_list else 'combo'
    raise NotImplementedError
@property
def rect(self):
    '''First rectangle of rects; used as the field's position when sorting.'''
    return self.rects[0]
@property
def rects(self):
    '''List of rectangles covered by the field.

    A radio field yields one rectangle per kid, in kids_ordered_by_rect
    order. Any other field yields the rectangle of its own annotation.

    A radio field without kids falls back to its own ``Rect`` entry when it
    has one, so that ``rect`` does not fail on an empty list.
    '''
    if self.widget_type == 'radio':
        kid_rects = [Rect.from_pdf_annotation(kid) for kid in self.kids_ordered_by_rect]
        # keep the historical result unless there is an own Rect to fall
        # back on
        if kid_rects or not self.annotation.Rect:
            return kid_rects
    return [Rect.from_pdf_annotation(self.annotation)]
@property
def digest_id(self):
@ -52,6 +138,39 @@ class Widget:
b32_encoded = base64.b32encode(digest).decode()
return b32_encoded.strip('=').upper()
@property
def checkbox_true_value(self):
    '''Name of the appearance state representing the checked checkbox.

    It is the first key of the annotation's ``AP.N`` entry that is not
    ``Off``. ``pdfrw.PdfName.On`` is returned when ``AP`` or ``AP.N`` is
    missing or when no state other than ``Off`` is declared.
    '''
    assert self.widget_type == 'checkbox'
    # Missing entries appear to read as None on the annotation (as the
    # `Ff or 0` and `kid.AP and kid.AP.N` patterns in this file assume), so
    # guard each level instead of relying on KeyError.
    appearance = self.annotation.AP
    normal_states = appearance.N if appearance else None
    if not normal_states:
        return pdfrw.PdfName.On
    true_values = [state for state in normal_states.keys() if state != pdfrw.PdfName.Off]
    return true_values[0] if true_values else pdfrw.PdfName.On
@property
def kids_ordered_by_rect(self):
    '''Kids of a radio field as a new list sorted with rect_compare applied
    to each kid's rectangle; empty when the annotation has no ``Kids``.'''
    assert self.widget_type == 'radio'

    def by_position(first, second):
        return rect_compare(Rect.from_pdf_annotation(first), Rect.from_pdf_annotation(second))

    return sorted(self.annotation.Kids or [], key=functools.cmp_to_key(by_position))
@property
def radio_possible_values(self):
    '''Values accepted by a radio field, in kids_ordered_by_rect order.

    For each kid having an ``AP.N`` entry, the value is the first state name
    of that entry other than ``Off``, with its leading ``/`` removed. Kids
    without ``AP.N`` or declaring only ``Off`` are skipped.
    '''
    assert self.widget_type == 'radio'
    values = []
    for kid in self.kids_ordered_by_rect:
        if not (kid.AP and kid.AP.N):
            continue
        # state names carry a leading '/', drop it
        state_names = [state[1:] for state in kid.AP.N.keys()]
        # 'Off' is the unselected state, never a selectable value, and it
        # may be listed before the "on" state
        on_names = [state_name for state_name in state_names if state_name != 'Off']
        if on_names:
            values.append(on_names[0])
    return values
@property
def combo_possible_values(self):
    '''List of ``(export_value, displayed_value)`` pairs of a list or combo
    field, read from the annotation's ``Opt`` entry.

    An option may be a pair ``[export, displayed]`` or a single string, in
    which case the string is used for both elements. A missing ``Opt``
    yields an empty list.
    '''
    assert self.widget_type in ('list', 'combo')
    options = []
    for option in self.annotation.Opt or ():
        if isinstance(option, str):
            # plain string option: indexing it would give single characters
            text = option.decode()
            options.append((text, text))
        else:
            options.append((option[0].decode(), option[1].decode()))
    return options
@property
def value(self):
if self.widget_type == 'text':
@ -59,7 +178,11 @@ class Widget:
return self.annotation.V.decode()
return ''
elif self.widget_type == 'checkbox':
return self.annotation.V == self.on_value
return self.annotation.V == self.checkbox_true_value
elif self.widget_type == 'radio':
return self.annotation.V.lstrip('/') if self.annotation.V else None
elif self.widget_type in ('list', 'combo'):
return self.annotation.V.decode() if self.annotation.V is not None else None
def set(self, value):
# allow rendering of values in Acrobat Reader
@ -68,8 +191,39 @@ class Widget:
str_value = str(value)
self.annotation.update(pdfrw.PdfDict(V=str_value, AS=str_value))
elif self.widget_type == 'checkbox':
bool_value = self.on_value if value else pdfrw.PdfName.Off
bool_value = self.checkbox_true_value if value else pdfrw.PdfName.Off
self.annotation.update(pdfrw.PdfDict(V=bool_value, AS=bool_value))
elif self.widget_type == 'radio':
if value not in self.radio_possible_values:
raise ValueError(f'"{value}" is not one of {self.radio_possible_values}')
radio_value = pdfrw.PdfName(value)
for kid in self.annotation.Kids:
if kid.AP and kid.AP.N and radio_value in kid.AP.N:
kid.update(pdfrw.PdfDict(AS=radio_value))
else:
kid.update(pdfrw.PdfDict(AS=pdfrw.PdfName.Off))
self.annotation.update(pdfrw.PdfDict(V=radio_value))
elif self.widget_type in ('list', 'combo'):
for export, combo_value in self.combo_possible_values:
if combo_value == value:
self.annotation.update(
pdfrw.PdfDict(
V=pdfrw.PdfString.from_unicode(export), AS=pdfrw.PdfString.from_unicode(export)
)
)
break
@classmethod
def from_pdf_widget(cls, page, pdf_widget):
    '''Wrap ``pdf_widget`` in a new instance bound to ``page``.

    Returns None when evaluating ``widget_type`` on the new instance raises
    NotImplementedError.
    '''
    candidate = cls(page=page, annotation=pdf_widget)
    try:
        # evaluated only to probe whether the type is supported
        candidate.widget_type
    except NotImplementedError:
        candidate = None
    return candidate
def __repr__(self):
    '''Debug representation showing the field name and its widget type.'''
    return f'<Widget {self.name!r} : {self.widget_type}>'
@dataclasses.dataclass
@ -85,54 +239,36 @@ class Page:
@property
def fields(self):
def widgets():
'''Find annotation which are widgets, if Subtype is not defined,
look at the parent (case of radio fields)'''
seen = set()
for annotation in self.page.Annots or ():
field = annotation
if field.Subtype != pdfrw.PdfName.Widget:
continue
while not field.T and field.Parent:
field = field.Parent
# skip field without name
if not field.T:
continue
# radio checkboxes have the same parent, to prevent duplicate
# fields
if field.T in seen:
continue
seen.add(field.T)
yield field
fields = []
for annotation in self.page.Annots or ():
if annotation.Subtype != pdfrw.PdfName.Widget:
continue
if not annotation.T:
continue
name = annotation.T.decode()
parent = annotation.Parent
while parent and parent.T:
name = f'{parent.T.decode()}.{name}'
parent = parent.Parent
if not annotation.FT:
continue
pdf_field_type = annotation.FT
pdf_field_flags = int(annotation.Ff or 0)
RADIO_FLAG = 2**16
PUSH_BUTTON_FLAG = 2**17
if (
pdf_field_type == pdfrw.PdfName.Btn
and not (pdf_field_flags & RADIO_FLAG)
and not (pdf_field_flags & PUSH_BUTTON_FLAG)
):
widget_type = 'checkbox'
elif pdf_field_type == pdfrw.PdfName.Tx:
widget_type = 'text'
else:
continue
on_value = None
if widget_type == 'checkbox':
try:
on_values = list(annotation.AP.N.keys())
except KeyError:
on_value = pdfrw.PdfName.On
else:
if pdfrw.PdfName.Off in on_values:
on_values.remove(pdfrw.PdfName.Off)
on_value = on_values[0]
fields.append(
Widget(
name=name,
widget_type=widget_type,
rect=Rect(*map(float, annotation.Rect)),
on_value=on_value,
page=self,
annotation=annotation,
)
)
fields.sort(key=lambda field: (-field.rect[1], field.rect[0]))
for widget in widgets():
widget = Widget.from_pdf_widget(self, widget)
if widget:
fields.append(widget)
def compare(field1, field2):
return rect_compare(field1.rect, field2.rect)
fields.sort(key=functools.cmp_to_key(compare))
return fields
@property
@ -171,20 +307,20 @@ class Page:
media_height = media_box.y2 - media_box.y1
height = int(width / media_width * media_height)
for field in self.fields:
field_rect = field.rect
yield field, Rect(
# PDF coordinates origin is in the bottom-left corner but img
# tag origin is in the top-left corner
x1=int((field_rect.x1 - media_box.x1) / media_width * width),
y1=int((media_box.y2 - field_rect.y1) / media_height * height),
x2=int((field_rect.x2 - media_box.x1) / media_width * width),
y2=int((media_box.y2 - field_rect.y2) / media_height * height),
)
for i, field in enumerate(self.fields):
for field_rect in field.rects:
yield i, field, Rect(
# PDF coordinates origin is in the bottom-left corner but img
# tag origin is in the top-left corner
x1=int((field_rect.x1 - media_box.x1) / media_width * width),
y1=int((media_box.y2 - field_rect.y1) / media_height * height),
x2=int((field_rect.x2 - media_box.x1) / media_width * width),
y2=int((media_box.y2 - field_rect.y2) / media_height * height),
)
def fields_image_map(self, width=None, sep='\n', id_prefix='', id_suffix=''):
tags = []
for field, area_rect in self.thumbnail_field_rects(width=width):
for _, field, area_rect in self.thumbnail_field_rects(width=width):
coords = ','.join(map(str, area_rect))
tags.append(
f'<area shape="rect" '

Binary file not shown.

View File

@ -93,3 +93,27 @@ def test_field_set(pdf):
else:
raise NotImplementedError
assert check == {1, 2}
def test_radio_button():
    '''The Gender radio field of the sample PDF exposes the values H and F
    and reports the value it was set to.'''
    target = 'topmostSubform[0].Page1[0].Gender[0]'
    with open('tests/data/cerfa_14011-02.pdf', 'rb') as fd:
        pdf = PDF(content=fd)
        matching = [field for field in pdf.page(0).fields if field.name == target]
        assert len(matching) == 1
        gender = matching[0]
        assert gender.radio_possible_values == ['H', 'F']
        gender.set('H')
        assert gender.value == 'H'
def test_combo_box():
    '''The Pays combo field of the sample PDF lists 235 options, keeps no
    value after an unknown one is set and stores a known one.'''
    target = 'topmostSubform[0].Page1[0].Pays[0]'
    with open('tests/data/cerfa_14011-02.pdf', 'rb') as fd:
        pdf = PDF(content=fd)
        matching = [field for field in pdf.page(0).fields if field.name == target]
        assert len(matching) == 1
        country = matching[0]
        assert len(country.combo_possible_values) == 235
        # 'X' is not among the options: the value stays unset
        country.set('X')
        assert country.value is None
        country.set('FRANCE')
        assert country.value == 'FRANCE'