class PDFError(Exception):
    """Base class for every exception defined by the pdfforms package."""


class PDFKeyError(KeyError, PDFError):
    """A key was looked up in PDF data but is not present.

    Also a KeyError, so callers may catch it under either name.
    """


class PDFSpecError(ValueError, PDFError):
    """PDF data is present but does not have the expected structure.

    Also a ValueError, so callers may catch it under either name.
    """


class NoFormDataError(ValueError, PDFError):
    """No form data could be located in a PDF document.

    Also a ValueError, so callers may catch it under either name.
    """
import cliutil + +from pathlib import Path +from pdfminer.pdfdocument import PDFDocument # type:ignore[import] +from pdfminer.pdfparser import PDFParser # type:ignore[import] +from pdfminer.pdftypes import resolve1 # type:ignore[import] + +from typing import ( + Any, + BinaryIO, + Iterator, + Mapping, + Optional, + Sequence, + TextIO, +) + +PROGNAME = 'pdfform-extract' +logger = logging.getLogger('conservancy_beancount.pdfforms.extract') + +class FormExtractor: + def __init__( + self, + pdf: PDFDocument, + form_key: Optional[str]=None, + source: Optional[str]=None, + ) -> None: + if form_key is None: + form_key = pdfutils.guess_form_key(pdf) + self.document = pdf + self.form_key = form_key + self.source = source + + @classmethod + @contextlib.contextmanager + def from_path( + cls, + path: Path, + form_key: Optional[str]=None, + ) -> Iterator['FormExtractor']: + pdf_file = path.open('rb') + try: + yield cls.from_file(pdf_file, form_key, path) + finally: + pdf_file.close() + + @classmethod + def from_file( + cls, + source: BinaryIO, + form_key: Optional[str]=None, + source_path: Optional[Path]=None, + ) -> 'FormExtractor': + if source_path is None: + source_path = Path(source.name) + parser = PDFParser(source) + pdf_doc = PDFDocument(parser) + return cls(pdf_doc, form_key, source_path.name) + + def _extract_field( + self, + field: fieldmod.FormField, + name_prefix: str='', + ) -> Iterator[Mapping[str, Any]]: + name = name_prefix + field.name() + yield_this = not field.is_readonly() + try: + field_type = field.field_type().name + except ValueError: + yield_this = False + if yield_this: + retval = { + 'fdf': { + 'type': field_type, + 'name': name, + }, + 'description': f'{field_type} {name}', + 'value': field.fill_value(), + } + if isinstance(field, fieldmod.CheckboxField): + retval['fdf']['options'] = field.options() + yield retval + name += '.' 
class FormYAMLDumper(yaml.dumper.SafeDumper):
    """Safe YAML dumper that keeps mapping order and defaults to block style."""

    def represent_mapping(self, tag: Any, value: Any, flow_style: Any=None) -> Any:
        """Represent a mapping without re-sorting it or flowing it by default

        ``flow_style`` is only overridden when the caller left it as None;
        an explicit choice is passed through untouched.
        """
        # When the parent method is able to call value.items() itself, it
        # sorts what comes back. Passing it the items view up front skips
        # that step, so entries are written in their original order.
        try:
            pairs = value.items()
        except AttributeError:
            # Not a mapping object; hand it over unchanged.
            pairs = value
        return super().represent_mapping(
            tag,
            pairs,
            False if flow_style is None else flow_style,
        )
+""") + return parser.parse_args(arglist) + +def main(arglist: Optional[Sequence[str]]=None, + stdout: TextIO=sys.stdout, + stderr: TextIO=sys.stderr, +) -> int: + args = parse_arguments(arglist) + cliutil.set_loglevel(logger, args.loglevel) + with contextlib.ExitStack() as exit_stack: + if args.document == cliutil.STDSTREAM_PATH: + extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key) + else: + extractor = exit_stack.enter_context( + FormExtractor.from_path(args.document, args.form_key), + ) + extracted_form = extractor.extract() + with contextlib.ExitStack() as exit_stack: + out_file = cliutil.text_output(args.output_file, stdout) + if out_file is not stdout: + exit_stack.enter_context(out_file) + yaml.dump(extracted_form, out_file, Dumper=FormYAMLDumper) + return 0 + +entry_point = cliutil.make_entry_point(__name__, PROGNAME) + +if __name__ == '__main__': + exit(entry_point()) diff --git a/conservancy_beancount/pdfforms/fields.py b/conservancy_beancount/pdfforms/fields.py new file mode 100644 index 0000000000000000000000000000000000000000..8a49567a6492c5078066ef4d840a3a217f81673c --- /dev/null +++ b/conservancy_beancount/pdfforms/fields.py @@ -0,0 +1,245 @@ +"""fields.py - Python classes to read and write PDF form data""" +# Copyright © 2020 Brett Smith +# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0 +# +# Full copyright and licensing details can be found at toplevel file +# LICENSE.txt in the repository. + +import enum +import functools + +from pdfminer.pdftypes import resolve1 # type:ignore[import] +from pdfminer import psparser # type:ignore[import] +from . 
class FieldFlags(enum.IntFlag):
    """Bit masks for the integer stored under a form field's ``Ff`` key.

    Button flags and text flags are declared in one enum. ``RichText`` has
    the same value as ``RadiosInUnison``, so Python treats it as an alias
    of that earlier member.
    """
    # Flags for all fields
    ReadOnly = 1 << 0
    Required = 1 << 1
    NoExport = 1 << 2
    # Flags for buttons
    NoToggleToOff = 1 << 14
    Radio = 1 << 15
    Pushbutton = 1 << 16
    RadiosInUnison = 1 << 25
    # Flags for text
    Multiline = 1 << 12
    Password = 1 << 13
    FileSelect = 1 << 20
    DoNotSpellCheck = 1 << 22
    DoNotScroll = 1 << 23
    Comb = 1 << 24
    RichText = 1 << 25


class FieldType(enum.Enum):
    """Known values of a form field's ``FT`` key.

    The short members carry the PDF names and are the canonical ones, so
    ``FieldType['Tx']`` style lookups work. The upper-case members repeat a
    value and are therefore readable aliases of the same member.
    """
    Btn = 'Btn'
    BUTTON = 'Btn'
    Ch = 'Ch'
    CHOICE = 'Ch'
    Sig = 'Sig'
    SIG = 'Sig'
    SIGNATURE = 'Sig'
    Tx = 'Tx'
    TEXT = 'Tx'
class CheckboxField(FormField):
    """Form field for a checkbox, read and written as a bool.

    The stored ``V`` entry is a PSLiteral naming one of the checkbox's two
    states. This class converts between that literal and True/False, using
    the state names reported by options().
    """
    __slots__: Sequence[str] = []
    # State names assumed when the field does not declare its own.
    OFF = 'Off'
    ON = 'Yes'

    def options(self) -> Sequence[str]:
        """Return ``[on_name, off_name]``, the state names for this checkbox

        Names are read from the keys of the field's ``AP`` -> ``N``
        dictionary. Missing names fall back to the ON and OFF defaults.
        Raises PDFSpecError if more than two states are declared, or if two
        states are declared and neither is named OFF or ON.
        """
        # This is deliberately not wrapped in functools.lru_cache: a cache on
        # a method keeps every field it has seen alive for the life of the
        # process, and would go stale if the source dictionary changed.
        try:
            # Either dictionary may be stored as an indirect reference.
            appearances = resolve1(self._source['AP'])
            keys: Tuple[str, ...] = tuple(resolve1(appearances['N']))
        except KeyError:
            keys = ()
        count = len(keys)
        if count == 0:
            return [self.ON, self.OFF]
        elif count == 1:
            if keys[0] == self.OFF:
                # Only the off state is declared. Reporting it as the on
                # state too would make an unchecked box read as checked.
                return [self.ON, self.OFF]
            return [keys[0], self.OFF]
        elif count > 2:
            raise PDFSpecError("checkbox has more than two states available")
        if self.OFF in keys:
            off_index = keys.index(self.OFF)
        elif self.ON in keys:
            # No state is named OFF, so the one that is not ON must be off.
            off_index = 1 - keys.index(self.ON)
        else:
            raise PDFSpecError("checkbox defines two on states")
        return [keys[1 - off_index], keys[off_index]]

    def _bool_value(self, literal_value: Optional[psparser.PSLiteral]) -> Optional[bool]:
        """Convert a stored state literal to True, False, or None if unset

        Raises PDFSpecError if the value has no ``name`` attribute or names
        a state that options() does not report.
        """
        if literal_value is None:
            return None
        try:
            value = literal_value.name
        except AttributeError:
            raise PDFSpecError("checkbox value is not a PSLiteral") from None
        on, off = self.options()
        if value == on:
            return True
        elif value == off:
            return False
        else:
            raise PDFSpecError(f"checkbox has unknown value {value!r}")

    def value(self) -> Optional[bool]:
        """Return whether the checkbox is checked, or None if it has no value"""
        return self._bool_value(super().value())

    def set_value(self, value: Optional[bool]) -> None:
        """Store the on or off state literal for ``value``; None stores None"""
        if value is None:
            literal_value: Optional[psparser.PSLiteral] = None
        else:
            on, off = self.options()
            literal_value = psparser.PSLiteralTable.intern(on if value else off)
        super().set_value(literal_value)
def guess_form_key(pdf: PDFDocument) -> str:
    """Guess and return the PDF document catalog key with form data

    This function knows common catalog keys that hold PDF form data,
    searches the given document for form data, and returns the first
    candidate key whose value contains a ``Fields`` entry.
    Raises NoFormDataError (a ValueError) if no candidate has form data.
    """
    catalog = pdf.catalog
    for key in [
            'AcroForm',
            'FDF',
    ]:
        try:
            has_fields = 'Fields' in resolve1(catalog[key])
        except (KeyError, TypeError):
            # The catalog lacks this key, or its value is not a container
            # that supports membership tests.
            continue
        # The membership result must be checked. Merely surviving the
        # lookup does not mean this entry holds any form fields.
        if has_fields:
            return key
    raise pdferrors.NoFormDataError("could not find catalog key with form data")
'odfpy>=1.4.0,!=1.4.1', # Debian:python3-odf + 'pdfminer.six>=20200101', 'PyYAML>=3.0', # Debian:python3-yaml 'regex', # Debian:python3-regex 'rt>=2.0', @@ -31,6 +32,7 @@ setup( packages=[ 'conservancy_beancount', + 'conservancy_beancount.pdfforms', 'conservancy_beancount.plugin', 'conservancy_beancount.reports', 'conservancy_beancount.tools', @@ -46,6 +48,7 @@ setup( 'fund-report = conservancy_beancount.reports.fund:entry_point', 'ledger-report = conservancy_beancount.reports.ledger:entry_point', 'opening-balances = conservancy_beancount.tools.opening_balances:entry_point', + 'pdfform-extract = conservancy_beancount.pdfforms.extract:entry_point', 'split-ods-links = conservancy_beancount.tools.split_ods_links:entry_point', ], }, diff --git a/tests/pdfforms/form1.fdf b/tests/pdfforms/form1.fdf new file mode 100644 index 0000000000000000000000000000000000000000..c032a440a81823d3dee214e870cccd67c276f44e --- /dev/null +++ b/tests/pdfforms/form1.fdf @@ -0,0 +1,72 @@ +%FDF-1.2 +%âãÏÓ +1 0 obj +<< +/FDF +<< +/Fields [ +<< +/T (topform) +/Kids [ + << + /T (text1_0) + /FT /Tx + /V () + >> + << + /T (button1) + /Kids [ + << + /FT /Btn + /T (button1_0) + /AP << /N << /1 1 0 R >> >> + >> + << + /FT /Btn + /T (button1_1) + /AP << /N << /2 1 0 R >> >> + >> + ] + >> + << + /T (text1_1) + /FT /Tx + /V () + >> + << + /T (text2_0) + /FT /Tx + /V () + >> + << + /T (button2) + /Kids [ + << + /FT /Btn + /T (button2_0) + /AP << /N << /1 1 0 R >> >> + >> + << + /FT /Btn + /T (button2_1) + /AP << /N << /2 1 0 R >> >> + >> + ] + >> + << + % Readonly + /T (text2_R) + /FT /Tx + /Ff 1 + >> +] +>>] +>> +>> +endobj +trailer + +<< +/Root 1 0 R +>> +%%EOF diff --git a/tests/pdfforms/form1.yml b/tests/pdfforms/form1.yml new file mode 100644 index 0000000000000000000000000000000000000000..a17e187a8c504ed19e4034abb1468affbbd266c5 --- /dev/null +++ b/tests/pdfforms/form1.yml @@ -0,0 +1,25 @@ +- fdf: + type: Tx + name: topform.text1_0 +- fdf: + type: Btn + name: topform.button1.button1_0 + options: 
def compare_to_yaml(actual, yaml_path, from_file, form_key):
    """Assert that extracted form data matches an expected-fields YAML file

    ``actual`` is the mapping produced by the extractor. ``yaml_path`` is a
    path object, or a string naming a file under the ``pdfforms`` test data
    directory. The YAML file holds a list of mappings; each key listed for
    an expected field must be equal in the actual field at the same
    position. Keys that the YAML file does not mention are not checked.
    """
    if isinstance(yaml_path, str):
        yaml_path = testutil.test_path(f'pdfforms/{yaml_path}')
    with yaml_path.open() as yaml_file:
        expect_fields = yaml.safe_load(yaml_file)
    assert actual.get('from file') == from_file
    assert actual.get('form key') == form_key
    actual_fields = actual.get('fields', ())
    paired_fields = itertools.zip_longest(actual_fields, expect_fields)
    for index, (act_f, exp_f) in enumerate(paired_fields):
        # zip_longest pads the shorter list with None. Report that as a
        # count mismatch instead of crashing on None a few lines below.
        assert exp_f is not None, f"unexpected extra field #{index}: {act_f!r}"
        assert act_f is not None, f"missing expected field #{index}: {exp_f!r}"
        for key, exp_value in exp_f.items():
            assert act_f[key] == exp_value
+ ('form1.fdf', 'FDF', 'form1.yml'), +]) +def test_extract_from_file(fdf_filename, form_key, fields_yaml): + with testutil.test_path(f'pdfforms/{fdf_filename}').open('rb') as fdf_file: + extractor = extractmod.FormExtractor.from_file(fdf_file) + actual = extractor.extract() + compare_to_yaml(actual, fields_yaml, fdf_filename, form_key) + +@pytest.mark.parametrize('fdf_filename,form_key,fields_yaml', [ + ('form1.fdf', 'FDF', 'form1.yml'), +]) +def test_main(fdf_filename, form_key, fields_yaml): + fdf_path = testutil.test_path(f'pdfforms/{fdf_filename}') + arglist = [str(fdf_path)] + stdout = io.StringIO() + stderr = io.StringIO() + returncode = extractmod.main(arglist, stdout, stderr) + assert returncode == 0 + assert not stderr.getvalue() + stdout.seek(0) + actual = yaml.safe_load(stdout) + compare_to_yaml(actual, fields_yaml, fdf_filename, form_key) diff --git a/tests/test_pdfforms_fields.py b/tests/test_pdfforms_fields.py new file mode 100644 index 0000000000000000000000000000000000000000..13521259c3774bf39c3181280ca710dde873b75b --- /dev/null +++ b/tests/test_pdfforms_fields.py @@ -0,0 +1,350 @@ +"""test_pdfforms_fields.py - Unit tests for PDF forms manipulation""" +# Copyright © 2020 Brett Smith +# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0 +# +# Full copyright and licensing details can be found at toplevel file +# LICENSE.txt in the repository. 
def appearance_states(*names):
    """Build a stand-in appearance dictionary keyed by the given state names

    Names that are None are skipped. Every other name maps to its own fresh
    placeholder object, since the tests only look at the keys.
    """
    states = {}
    for state_name in names:
        if state_name is None:
            continue
        states[state_name] = object()
    return states
field.is_readonly() == flags % 2 + +@pytest.mark.parametrize('kid_count', range(3)) +def test_kids(kid_count): + kids = [field_source(f'kid{n}', field_type='Ch') for n in range(kid_count)] + source = field_source(kids=iter(kids)) + field = fieldsmod.FormField(source) + got_kids = list(field.kids()) + assert len(got_kids) == len(kids) + assert field.is_terminal() == (not kids) + for actual, expected in zip(got_kids, kids): + assert actual.name() == expected['T'].decode('ascii') + +def test_kids_by_type(): + kids = [field_source(field_type='Tx'), field_source(field_type='Btn')] + source = field_source('topform', kids=iter(kids)) + actual = fieldsmod.FormField.by_type(source).kids() + assert isinstance(next(actual), fieldsmod.TextField) + assert isinstance(next(actual), fieldsmod.CheckboxField) + assert next(actual, None) is None + +def test_inheritance(): + parent_source = field_source(b'parent', 'parent value', 'Tx', 17) + kid_source = field_source('kid', parent=parent_source) + parent_source['Kids'] = [kid_source] + field = fieldsmod.FormField(kid_source) + parent = field.parent() + assert parent is not None + assert parent.name() == 'parent' + assert not parent.is_terminal() + assert field.is_terminal() + assert field.name() == 'kid' + assert field.field_type() is fieldsmod.FieldType.TEXT + assert field.value() == 'parent value' + assert field.flags() == 17 + assert not list(field.kids()) + +@pytest.mark.parametrize('field_type,value', [ + ('Tx', b'new value'), + ('Btn', PSLiteral('Yes')), +]) +def test_set_value(field_type, value): + source = field_source(field_type=field_type) + field = fieldsmod.FormField(source) + assert field.value() is None + field.set_value(value) + assert field.value() == value + +@pytest.mark.parametrize('field_type,expected', [ + ('Tx', fieldsmod.TextField), + ('Btn', fieldsmod.CheckboxField), +]) +def test_by_type(field_type, expected): + source = field_source(field_type=field_type) + field = fieldsmod.FormField.by_type(source) + assert 
isinstance(field, expected) + +def test_container_by_type(): + kids = [field_source(field_type='Tx'), field_source(field_type='Btn')] + source = field_source('topform', kids=iter(kids)) + field = fieldsmod.FormField.by_type(source) + assert isinstance(field, fieldsmod.FormField) + +@pytest.mark.parametrize('flag', [ + # If you add dedicated classes for these types of buttons, you can remove + # their test cases. + fieldsmod.FieldFlags.Radio, + fieldsmod.FieldFlags.Pushbutton, +]) +def test_unsupported_button_by_type(flag): + source = field_source(field_type='Btn', flags=flag) + field = fieldsmod.FormField.by_type(source) + assert type(field) is fieldsmod.FormField + +@pytest.mark.parametrize('field_type', [ + # If you add dedicated classes for these types of fields, you can remove + # their test cases. + 'Ch', + 'Sig', +]) +def test_unsupported_field_by_type(field_type): + source = field_source(field_type=field_type) + field = fieldsmod.FormField.by_type(source) + assert type(field) is fieldsmod.FormField + +@pytest.mark.parametrize('value', [None, 'Off', 'Yes']) +def test_checkbox_value(value): + source = field_source('cb', value, 'Btn', literal=True) + field = fieldsmod.CheckboxField(source) + assert field.value() == (value and value == 'Yes') + +@pytest.mark.parametrize('value,expected', [ + (None, None), + (False, 'Off'), + (True, 'Yes'), +]) +def test_checkbox_set_value(value, expected): + source = field_source('cb', field_type='Btn') + field = fieldsmod.CheckboxField(source) + field.set_value(value) + actual = fieldsmod.FormField.value(field) + if expected is None: + assert actual is None + else: + assert actual.name == expected + +@pytest.mark.parametrize('on_key,off_key', itertools.product( + ['1', '2', 'On', 'Yes'], + ['Off', None], +)) +def test_checkbox_options(on_key, off_key): + source = field_source('cb', field_type='Btn') + source['AP'] = {'N': appearance_states(on_key, off_key)} + field = fieldsmod.CheckboxField(source) + assert field.options() == 
[on_key, 'Off'] + +def test_checkbox_options_yes_no(): + # I'm not sure this is actually allowed under the spec, but… + expected = ['Yes', 'No'] + source = field_source('cb', field_type='Btn') + source['AP'] = {'N': appearance_states(*expected)} + field = fieldsmod.CheckboxField(source) + assert field.options() == expected + +@pytest.mark.parametrize('on_key,off_key,set_value', itertools.product( + ['1', '2', 'On', 'Yes'], + ['Off', None], + [True, False, None], +)) +def test_checkbox_set_custom_value(on_key, off_key, set_value): + source = field_source('cb', field_type='Btn') + source['AP'] = {'N': appearance_states(on_key, off_key)} + field = fieldsmod.CheckboxField(source) + field.set_value(set_value) + actual = fieldsmod.FormField.value(field) + if set_value is None: + assert actual is None + elif set_value: + assert actual.name == (on_key or 'Yes') + else: + assert actual.name == 'Off' + +@pytest.mark.parametrize('encoding,prefix', [ + ('ascii', b''), + ('utf-16be', codecs.BOM_UTF16_BE), +]) +def test_text_value(encoding, prefix): + expected = f'{encoding} encoding test' + value = prefix + expected.encode(encoding) + source = field_source('t', value, 'Tx') + field = fieldsmod.TextField(source) + assert field.value() == expected + +def test_text_value_none(): + source = field_source(field_type='Tx') + assert fieldsmod.TextField(source).value() is None + +@pytest.mark.parametrize('text,bprefix', [ + ('ASCII test', b''), + ('UTF—16 test', codecs.BOM_UTF16_BE), +]) +def test_text_set_value(text, bprefix): + source = field_source(field_type='Tx') + field = fieldsmod.TextField(source) + field.set_value(text) + assert field.value() == text + actual = fieldsmod.FormField.value(field) + assert actual == bprefix + text.encode('utf-16be' if bprefix else 'ascii') + +def test_text_set_value_none(): + source = field_source('t', b'set None test', 'Tx') + field = fieldsmod.TextField(source) + field.set_value(None) + assert fieldsmod.FormField.value(field) is None + +def 
test_empty_as_filled_fdf(): + source = field_source() + field = fieldsmod.FormField(source) + assert field.as_filled_fdf() == {} + +@pytest.mark.parametrize('field_type,field_class,set_value', [ + ('Btn', fieldsmod.CheckboxField, True), + ('Btn', fieldsmod.CheckboxField, False), + ('Ch', fieldsmod.FormField, None), + ('Tx', fieldsmod.TextField, 'export test'), + ('Tx', fieldsmod.TextField, 'UTF—16 export'), +]) +def test_as_filled_fdf_after_set_value(field_type, field_class, set_value): + source = field_source(field_type, field_type=field_type) + field = field_class(source) + field.set_value(set_value) + actual = field.as_filled_fdf() + assert actual['T'] == field_type + expect_len = 2 + if set_value is None: + assert 'V' not in actual + expect_len = 1 + elif field_class is fieldsmod.CheckboxField: + assert actual['V'].name == ('Yes' if set_value else 'Off') + else: + assert actual['V'] == set_value + assert len(actual) == expect_len + +@pytest.mark.parametrize('field_type,expected', [ + ('Btn', None), + ('Tx', ''), +]) +def test_as_filled_fdf_default_value(field_type, expected): + source = field_source(field_type=field_type) + field = fieldsmod.FormField.by_type(source) + actual = field.as_filled_fdf() + assert actual.get('V') == expected + +def test_as_filled_fdf_recursion(): + buttons = [field_source(f'bt{n}', field_type='Btn') for n in range(1, 3)] + pair = field_source('Buttons', kids=iter(buttons)) + text = field_source('tx', field_type='Tx') + source = field_source('topform', kids=[text, pair]) + field = fieldsmod.FormField(source) + actual = field.as_filled_fdf() + assert actual['T'] == 'topform' + assert 'V' not in actual + actual = iter(actual['Kids']) + assert next(actual)['T'] == 'tx' + actual = next(actual) + assert actual['T'] == 'Buttons' + assert 'V' not in actual + actual = iter(actual['Kids']) + assert next(actual)['T'] == 'bt1' + assert next(actual)['T'] == 'bt2' + assert next(actual, None) is None + 
+@pytest.mark.parametrize('name,value,field_type', [ + (None, None, None), + ('mt', 'mapping text', 'Tx'), + ('mb', 'Yes', 'Btn'), +]) +def test_simple_as_mapping(name, value, field_type): + source = field_source(name, value, field_type) + field = fieldsmod.FormField(source) + actual = field.as_mapping() + key, mapped = next(actual) + assert key == (name or '') + assert mapped is field + assert next(actual, None) is None + +def test_recursive_as_mapping(): + btn_kids = [field_source(f'btn{n}', field_type='Btn') for n in range(1, 3)] + buttons = field_source('buttons', kids=iter(btn_kids)) + text_kids = [field_source(f'tx{n}', field_type='Tx') for n in range(1, 3)] + texts = field_source('texts', kids=iter(text_kids)) + source = field_source('root', kids=[texts, buttons]) + root_field = fieldsmod.FormField(source) + actual = root_field.as_mapping() + for expected_key in [ + 'root', + 'root.texts', + 'root.texts.tx1', + 'root.texts.tx2', + 'root.buttons', + 'root.buttons.btn1', + 'root.buttons.btn2', + ]: + key, field = next(actual) + assert key == expected_key + _, _, expected_name = expected_key.rpartition('.') + assert field.name() == expected_name + assert next(actual, None) is None