Changeset - 13c66e8ce296
[Not reviewed]
0 1 9
Brett Smith - 3 years ago 2021-01-09 15:09:08
brettcsmith@brettcsmith.org
pdfforms: Initial module and tool to extract PDF form data to YAML.

Next steps:

* A tool to fill the PDF form based on values written to that YAML.
* An extension to fill some of those values with numbers queried from the
books (which is why we need something more involved than FDF).
10 files changed with 1007 insertions and 0 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/pdfforms/__init__.py
Show inline comments
 
new file 100644
conservancy_beancount/pdfforms/errors.py
Show inline comments
 
new file 100644
 
"""errors.py - Exception classes for PDF reporting errors"""
 
# Copyright © 2021  Brett Smith
 
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
 
#
 
# Full copyright and licensing details can be found at toplevel file
 
# LICENSE.txt in the repository.
 

	
 
class PDFError(Exception):
    """Common base class for the PDF error classes defined in this module."""
 

	
 
class PDFKeyError(KeyError, PDFError):
    """PDF error that can also be caught as a built-in ``KeyError``."""
 

	
 
class PDFSpecError(ValueError, PDFError):
    """PDF error that can also be caught as a built-in ``ValueError``."""
 

	
 
class NoFormDataError(ValueError, PDFError):
    """``ValueError``-compatible PDF error for documents without form data."""
conservancy_beancount/pdfforms/extract.py
Show inline comments
 
new file 100644
 
"""extract.py - Extract form data from PDF files"""
 
# Copyright © 2021  Brett Smith
 
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
 
#
 
# Full copyright and licensing details can be found at toplevel file
 
# LICENSE.txt in the repository.
 

	
 
import argparse
 
import contextlib
 
import logging
 
import sys
 

	
 
import yaml
 

	
 
from . import fields as fieldmod
 
from . import utils as pdfutils
 
from .. import cliutil
 

	
 
from pathlib import Path
 
from pdfminer.pdfdocument import PDFDocument  # type:ignore[import]
 
from pdfminer.pdfparser import PDFParser  # type:ignore[import]
 
from pdfminer.pdftypes import resolve1  # type:ignore[import]
 

	
 
from typing import (
    Any,
    BinaryIO,
    Dict,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    TextIO,
)
 

	
 
PROGNAME = 'pdfform-extract'
 
logger = logging.getLogger('conservancy_beancount.pdfforms.extract')
 

	
 
class FormExtractor:
    """Extract the fillable form fields of a parsed PDF/FDF document.

    Attributes:
      document: the parsed ``PDFDocument``.
      form_key: the document catalog key whose value holds the ``Fields``
        list. Guessed with ``pdfutils.guess_form_key`` when not given.
      source: an optional label naming where the document came from. It is
        only copied into the extracted data.
    """

    def __init__(
            self,
            pdf: PDFDocument,
            form_key: Optional[str]=None,
            source: Optional[str]=None,
    ) -> None:
        if form_key is None:
            form_key = pdfutils.guess_form_key(pdf)
        self.document = pdf
        self.form_key = form_key
        self.source = source

    @classmethod
    @contextlib.contextmanager
    def from_path(
            cls,
            path: Path,
            form_key: Optional[str]=None,
    ) -> Iterator['FormExtractor']:
        """Context manager: open ``path`` and yield an extractor for it.

        The file stays open for the duration of the ``with`` block and is
        closed on exit, including when the block raises.
        """
        with path.open('rb') as pdf_file:
            yield cls.from_file(pdf_file, form_key, path)

    @classmethod
    def from_file(
            cls,
            source: BinaryIO,
            form_key: Optional[str]=None,
            source_path: Optional[Path]=None,
    ) -> 'FormExtractor':
        """Build an extractor from an already-open binary file object.

        The extractor's ``source`` label is the final component of
        ``source_path`` when given, otherwise of ``source.name``. Streams
        without a string ``name`` (for example ``io.BytesIO``, or files
        opened from a file descriptor) get ``None`` as their label instead
        of failing, because the label is purely informational.

        The caller remains responsible for closing ``source``.
        """
        source_name: Optional[str]
        if source_path is not None:
            source_name = source_path.name
        else:
            raw_name = getattr(source, 'name', None)
            source_name = Path(raw_name).name if isinstance(raw_name, str) else None
        pdf_doc = PDFDocument(PDFParser(source))
        return cls(pdf_doc, form_key, source_name)

    def _extract_field(
            self,
            field: fieldmod.FormField,
            name_prefix: str='',
    ) -> Iterator[Mapping[str, Any]]:
        """Yield one mapping for ``field`` and each of its descendants.

        ``name_prefix`` is prepended to the field's own name; kids are
        visited with this field's full name plus ``.`` as their prefix.

        A field is skipped when it is read-only or when its field type
        cannot be determined (``field_type()`` raises ``ValueError``). Its
        kids are still visited in either case.
        """
        name = name_prefix + field.name()
        yield_this = not field.is_readonly()
        try:
            field_type = field.field_type().name
        except ValueError:
            # No usable type: this is treated as a pure container.
            yield_this = False
        if yield_this:
            fdf_info: Dict[str, Any] = {
                'type': field_type,
                'name': name,
            }
            if isinstance(field, fieldmod.CheckboxField):
                fdf_info['options'] = field.options()
            yield {
                'fdf': fdf_info,
                'description': f'{field_type} {name}',
                'value': field.fill_value(),
            }
        kid_prefix = name + '.'
        for kid in field.kids():
            yield from self._extract_field(kid, kid_prefix)

    def extract(self) -> Mapping[str, Any]:
        """Return the form data as a mapping ready for serialization.

        The result has the keys ``from file`` (the source label),
        ``form key``, and ``fields`` (a flat list of the mappings yielded
        by ``_extract_field`` for every top-level field, in document order).
        """
        form = resolve1(self.document.catalog[self.form_key])
        fields: List[Mapping[str, Any]] = []
        for field_source in form['Fields']:
            top_field = fieldmod.FormField.by_type(resolve1(field_source))
            fields.extend(self._extract_field(top_field))
        return {
            'from file': self.source,
            'form key': self.form_key,
            'fields': fields,
        }
 

	
 

	
 
class FormYAMLDumper(yaml.dumper.SafeDumper):
    """Safe YAML dumper that writes mappings in block style, in given order."""

    def represent_mapping(self, tag: Any, value: Any, flow_style: Any=None) -> Any:
        """Represent ``value`` as a mapping node without flowing or re-sorting.

        An explicit ``flow_style`` from the caller is passed through; only
        the unspecified (``None``) case is forced to block style.
        """
        style = False if flow_style is None else flow_style
        # The super method re-sorts whatever it can call .items() on. Handing
        # it the items view directly sidesteps that and keeps our ordering.
        # Objects with no .items() are passed along untouched.
        with contextlib.suppress(AttributeError):
            value = value.items()
        return super().represent_mapping(tag, value, style)
 

	
 

	
 
def parse_arguments(arglist: Optional[Sequence[str]]=None) -> argparse.Namespace:
    """Parse the command line for the form extraction tool.

    ``arglist`` is handed to ``ArgumentParser.parse_args`` unchanged, so
    ``None`` means "use the process arguments". The returned namespace
    carries ``form_key``, ``output_file`` and ``document``, plus whatever
    the shared ``cliutil`` version and loglevel helpers register.
    """
    argparser = argparse.ArgumentParser(prog=PROGNAME)
    for add_shared_option in (
            cliutil.add_version_argument,
            cliutil.add_loglevel_argument,
    ):
        add_shared_option(argparser)

    form_key_help = """Key in the document catalog with form data.
Default is guessed by examining the document.
"""
    argparser.add_argument(
        '--form-key', '-f',
        metavar='KEY',
        help=form_key_help,
    )

    output_help = """Write output YAML to this file, or stdout when PATH is `-`.
Default stdout.
"""
    argparser.add_argument(
        '--output-file', '-O',
        metavar='PATH',
        type=Path,
        help=output_help,
    )

    document_help = """PDF or FDF file to extract form data from.
Use `-` to read from stdin.
"""
    argparser.add_argument(
        'document',
        type=Path,
        help=document_help,
    )
    return argparser.parse_args(arglist)
 

	
 
def main(arglist: Optional[Sequence[str]]=None,
         stdout: TextIO=sys.stdout,
         stderr: TextIO=sys.stderr,
) -> int:
    """Run the extraction tool: read a form, write its fields as YAML.

    Reads the document named on the command line (or standard input when
    it is the ``cliutil.STDSTREAM_PATH`` placeholder), extracts the form
    data, and dumps it with ``FormYAMLDumper`` to the chosen output.
    ``stderr`` is accepted but not used in this function. Returns 0.
    """
    args = parse_arguments(arglist)
    cliutil.set_loglevel(logger, args.loglevel)

    # Extraction must finish while the input file is still open.
    with contextlib.ExitStack() as input_stack:
        if args.document != cliutil.STDSTREAM_PATH:
            extractor = input_stack.enter_context(
                FormExtractor.from_path(args.document, args.form_key),
            )
        else:
            extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key)
        extracted_form = extractor.extract()

    with contextlib.ExitStack() as output_stack:
        out_file = cliutil.text_output(args.output_file, stdout)
        # Only take ownership of files we opened; never close the caller's
        # stdout.
        if out_file is not stdout:
            output_stack.enter_context(out_file)
        yaml.dump(extracted_form, out_file, Dumper=FormYAMLDumper)
    return 0
 

	
 
entry_point = cliutil.make_entry_point(__name__, PROGNAME)
 

	
 
if __name__ == '__main__':
    # Use sys.exit() rather than the bare exit() helper: exit() is installed
    # by the site module for interactive use and is absent under `python -S`.
    sys.exit(entry_point())
conservancy_beancount/pdfforms/fields.py
Show inline comments
 
new file 100644
 
"""fields.py - Python classes to read and write PDF form data"""
 
# Copyright © 2020  Brett Smith
 
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
 
#
 
# Full copyright and licensing details can be found at toplevel file
 
# LICENSE.txt in the repository.
 

	
 
import enum
 
import functools
 

	
 
from pdfminer.pdftypes import resolve1  # type:ignore[import]
 
from pdfminer import psparser  # type:ignore[import]
 
from . import utils as pdfutils
 
from .errors import PDFKeyError, PDFSpecError
 

	
 
from typing import (
 
    Any,
 
    Iterator,
 
    Optional,
 
    Mapping,
 
    MutableMapping,
 
    Sequence,
 
    Tuple,
 
    Union,
 
)
 

	
 
FieldSource = MutableMapping[str, Any]
 

	
 
class FieldFlags(enum.IntFlag):
    """Named bits for the integer flags value of a form field.

    Each member is a single bit, written here as ``1 << position``. The
    button and text groups reuse bit 25, so ``RichText`` is an enum alias
    of ``RadiosInUnison``. Which name applies depends on the field's type.
    """

    # Flags for all fields
    ReadOnly = 1 << 0
    Required = 1 << 1
    NoExport = 1 << 2
    # Flags for buttons
    NoToggleToOff = 1 << 14
    Radio = 1 << 15
    Pushbutton = 1 << 16
    RadiosInUnison = 1 << 25
    # Flags for text
    Multiline = 1 << 12
    Password = 1 << 13
    FileSelect = 1 << 20
    DoNotSpellCheck = 1 << 22
    DoNotScroll = 1 << 23
    Comb = 1 << 24
    RichText = 1 << 25
 

	
 

	
 
class FieldType(enum.Enum):
    """The field types a form field can declare.

    The short members (``Btn``, ``Ch``, ``Sig``, ``Tx``) are the canonical
    ones; the upper-case names are aliases of them, so for example
    ``FieldType.TEXT is FieldType.Tx``.
    """

    Btn = 'Btn'
    BUTTON = 'Btn'
    Ch = 'Ch'
    CHOICE = 'Ch'
    Sig = 'Sig'
    SIG = 'Sig'
    SIGNATURE = 'Sig'
    Tx = 'Tx'
    TEXT = 'Tx'
 

	
 

	
 
class FormField:
    """Generic wrapper around one form field's source mapping.

    The wrapper reads and writes the mapping it was given. It does not
    copy it. Keys listed in ``INHERITABLE`` are looked up through the chain
    of ``Parent`` entries when the field itself does not define them.
    Subclasses override ``DEFAULT_FILL`` and the value accessors to work
    with friendlier Python types.
    """

    __slots__ = ['_source']
    # Marks "no default supplied" so that None can be a real default.
    _SENTINEL = object()
    # What fill_value() reports when the field itself has no 'V' entry.
    DEFAULT_FILL: object = None
    INHERITABLE = frozenset([
        'DV',
        'Ff',
        'FT',
        'MaxLen',
        'Opt',
        'V',
    ])

    def __init__(self, source: FieldSource) -> None:
        self._source = source

    @classmethod
    def by_type(cls, source: FieldSource) -> 'FormField':
        """Wrap ``source`` in the most specific field class available.

        Text fields become ``TextField``. Buttons with neither the Radio
        nor the Pushbutton flag become ``CheckboxField``. Anything else,
        including a source whose field type cannot be read, stays an
        instance of ``cls``.
        """
        field = cls(source)
        try:
            field_type = field.field_type()
        except ValueError:
            return field
        flags = field.flags()
        if field_type is FieldType.TEXT:
            field.__class__ = TextField
        elif field_type is FieldType.BUTTON:
            # Radio buttons and pushbuttons have no dedicated class.
            if not flags & (FieldFlags.Radio | FieldFlags.Pushbutton):
                field.__class__ = CheckboxField
        return field

    def _get_value(self, key: str, default: Any=_SENTINEL) -> Any:
        """Return the resolved value stored under ``key``.

        For inheritable keys the search continues up the ``Parent`` chain.
        When nothing is found, returns ``default`` if one was passed and
        raises ``PDFKeyError`` otherwise.
        """
        inheritable = key in self.INHERITABLE
        node: Optional[FieldSource] = self._source
        while node is not None:
            try:
                return resolve1(node[key])
            except KeyError:
                if not inheritable:
                    break
                node = resolve1(node.get('Parent'))
        if default is not self._SENTINEL:
            return default
        raise PDFKeyError(key)

    def field_type(self) -> FieldType:
        """Return this field's (possibly inherited) type.

        Raises ``PDFSpecError`` when no type is set or when the stored
        value does not name a ``FieldType`` member.
        """
        try:
            raw_type = self._get_value('FT')
        except KeyError:
            raise PDFSpecError("field does not specify a field type") from None
        try:
            return FieldType[raw_type.name]
        except (AttributeError, KeyError):
            raise PDFSpecError(f"field has invalid field type {raw_type!r}") from None

    def kids(self) -> Iterator['FormField']:
        """Iterate over this field's children, each wrapped via ``by_type``."""
        yield from (
            self.by_type(resolve1(kid_source))
            for kid_source in self._get_value('Kids', ())
        )

    def parent(self) -> Optional['FormField']:
        """Return the wrapped parent field, or None when there is none."""
        try:
            return self.by_type(self._get_value('Parent'))
        except KeyError:
            return None

    def is_terminal(self) -> bool:
        """Return True when the field has no kids (missing or empty)."""
        kid_sources = self._get_value('Kids', None)
        return not kid_sources

    def flags(self) -> int:
        """Return the (possibly inherited) flags value, 0 when unset."""
        return self._get_value('Ff', 0)  # type:ignore[no-any-return]

    def is_readonly(self) -> bool:
        """Return True when the ReadOnly bit is set in the field's flags."""
        return (self.flags() & FieldFlags.ReadOnly) != 0

    def name(self) -> str:
        """Return the field's own decoded name, empty when it has none."""
        raw_name = self._get_value('T', b'')
        return pdfutils.decode_text(raw_name)

    def value(self) -> Any:
        """Return the raw (possibly inherited) value, None when unset."""
        return self._get_value('V', None)  # type:ignore[no-any-return]

    def set_value(self, value: Any) -> None:
        """Store ``value`` as this field's own value in the source mapping."""
        self._source['V'] = value

    def fill_value(self) -> Any:
        """Return the field's own value, or ``DEFAULT_FILL`` when unset.

        Unlike ``value()``, this reads only the field's own mapping and
        never consults a parent.
        """
        own_value = self._source.get('V', self.DEFAULT_FILL)
        return resolve1(own_value)

    def as_filled_fdf(self) -> Mapping[str, Any]:
        """Return a nested mapping of this field and its kids.

        ``T`` is included when the field is named, ``V`` when
        ``fill_value()`` is not None, and ``Kids`` when there is at least
        one child.
        """
        fdf: FieldSource = {}
        try:
            fdf['T'] = pdfutils.decode_text(self._source['T'])
        except KeyError:
            pass
        fill = self.fill_value()
        if fill is not None:
            fdf['V'] = fill
        kid_mappings = [kid.as_filled_fdf() for kid in self.kids()]
        if kid_mappings:
            fdf['Kids'] = kid_mappings
        return fdf

    def as_mapping(self, name_prefix: str='') -> Iterator[Tuple[str, 'FormField']]:
        """Yield ``(full_name, field)`` for this field, then its descendants.

        Full names join each level with ``.``; ``name_prefix`` is placed in
        front of this field's own name.
        """
        full_name = name_prefix + self.name()
        yield (full_name, self)
        kid_prefix = full_name + '.'
        for kid in self.kids():
            yield from kid.as_mapping(kid_prefix)
 

	
 

	
 
class CheckboxField(FormField):
    """Form field for a checkbox, exposing its value as ``Optional[bool]``.

    ``OFF`` and ``ON`` are the state names assumed when the field's own
    appearance dictionary does not name them.
    """

    __slots__: Sequence[str] = []
    OFF = 'Off'
    ON = 'Yes'

    def options(self) -> Sequence[str]:
        """Return the state names of this checkbox as ``[on, off]``.

        The names come from the keys of the field's ``AP``/``N`` entry.
        Missing names fall back to ``ON``/``OFF``. Raises ``PDFSpecError``
        when more than two states are defined, or when two states are
        defined and neither of them is ``OFF`` or ``ON``.

        The result is computed on every call rather than memoized:
        caching on the instance method would keep each field (and the
        document data it references) alive in a module-level cache.
        """
        try:
            # Either level may be stored as an indirect reference.
            appearances = resolve1(self._source['AP'])
            keys: Tuple[str, ...] = tuple(resolve1(appearances['N']))
        except KeyError:
            keys = ()
        count = len(keys)
        if count == 0:
            return [self.ON, self.OFF]
        elif count == 1:
            # A lone OFF state tells us nothing about the on state. Without
            # this check both options would be OFF and an off checkbox
            # would read as True.
            if keys[0] == self.OFF:
                return [self.ON, self.OFF]
            return [keys[0], self.OFF]
        elif count > 2:
            raise PDFSpecError("checkbox has more than two states available")
        try:
            off_index = keys.index(self.OFF)
        except ValueError:
            try:
                # No OFF key: whichever key is not ON is taken as off.
                off_index = 0 if keys.index(self.ON) else 1
            except ValueError:
                raise PDFSpecError("checkbox defines two on states") from None
        return [keys[0 if off_index else 1], keys[off_index]]

    def _bool_value(self, literal_value: Optional[psparser.PSLiteral]) -> Optional[bool]:
        """Translate a stored literal into True/False, or None when unset.

        Raises ``PDFSpecError`` when the value has no ``name`` attribute or
        names a state that is neither of this checkbox's options.
        """
        if literal_value is None:
            return None
        try:
            value = literal_value.name
        except AttributeError:
            raise PDFSpecError("checkbox value is not a PSLiteral") from None
        on, off = self.options()
        if value == on:
            return True
        elif value == off:
            return False
        else:
            raise PDFSpecError(f"checkbox has unknown value {value!r}")

    def value(self) -> Optional[bool]:
        """Return the (possibly inherited) checkbox state as a bool."""
        return self._bool_value(super().value())

    def set_value(self, value: Optional[bool]) -> None:
        """Store True/False as the matching state literal, or None as-is."""
        if value is None:
            literal_value: Optional[psparser.PSLiteral] = None
        else:
            on, off = self.options()
            literal_value = psparser.PSLiteralTable.intern(on if value else off)
        super().set_value(literal_value)
 

	
 

	
 
class TextField(FormField):
    """Form field for text, exposing its value as ``Optional[str]``.

    Values are kept as bytes in the source mapping and converted with the
    ``pdfutils`` text helpers on the way in and out.
    """

    __slots__: Sequence[str] = []
    DEFAULT_FILL = b''

    def _decode(self, value: Any) -> Optional[str]:
        """Decode stored bytes to str, passing None through.

        Raises ``PDFSpecError`` for any other kind of value.
        """
        if value is None:
            return None
        if not isinstance(value, bytes):
            raise PDFSpecError("text field value is not bytes")
        return pdfutils.decode_text(value)

    def value(self) -> Optional[str]:
        """Return the (possibly inherited) value as text."""
        raw_value = super().value()
        return self._decode(raw_value)

    def set_value(self, value: Optional[str]) -> None:
        """Store ``value`` in encoded form, or store None unchanged."""
        encoded = pdfutils.encode_text(value) if value is not None else None
        super().set_value(encoded)

    def fill_value(self) -> Optional[str]:
        """Return the field's own value as text (see ``DEFAULT_FILL``)."""
        raw_fill = super().fill_value()
        return self._decode(raw_fill)
conservancy_beancount/pdfforms/utils.py
Show inline comments
 
new file 100644
 
"""utils.py - Utility methods for working with PDFs"""
 
# Copyright © 2020  Brett Smith
 
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
 
#
 
# Full copyright and licensing details can be found at toplevel file
 
# LICENSE.txt in the repository.
 

	
 
from codecs import BOM_UTF16_BE
 

	
 
import pdfminer.utils  # type:ignore[import]
 

	
 
from . import errors as pdferrors
 

	
 
from pdfminer.pdfdocument import PDFDocument  # type:ignore[import]
 
from pdfminer.pdftypes import resolve1  # type:ignore[import]
 

	
 
from typing import (
 
    Callable,
 
)
 

	
 
decode_text: Callable[[bytes], str] = pdfminer.utils.decode_text
 

	
 
def encode_text(s: str) -> bytes:
    """Return ``s`` as bytes suitable for a PDF text value

    Pure-ASCII strings are encoded as plain ASCII, which keeps the output
    readable and compact. Anything else is encoded as UTF-16BE with the
    byte order mark in front.
    """
    try:
        encoded = s.encode('ascii')
    except UnicodeEncodeError:
        encoded = BOM_UTF16_BE + s.encode('utf-16be')
    return encoded
 

	
 
def guess_form_key(pdf: PDFDocument) -> str:
    """Guess and return the PDF document catalog key with form data

    This function knows common catalog keys that hold PDF form data. It
    checks them in order and returns the first one whose value contains a
    ``Fields`` entry.

    Raises NoFormDataError (a ValueError subclass) if none of the known
    keys holds form data.
    """
    catalog = pdf.catalog
    for key in ('AcroForm', 'FDF'):
        try:
            has_fields = 'Fields' in resolve1(catalog[key])
        except (KeyError, TypeError):
            # KeyError: the catalog has no such key. TypeError: the value
            # under it is not a container we can search.
            continue
        # The key alone is not enough: it must actually lead to fields.
        if has_fields:
            return key
    raise pdferrors.NoFormDataError("could not find catalog key with form data")
setup.py
Show inline comments
...
 
@@ -16,6 +16,7 @@ setup(
 
        'GitPython>=2.0',  # Debian:python3-git
 
        # 1.4.1 crashes when trying to save some documents.
 
        'odfpy>=1.4.0,!=1.4.1',  # Debian:python3-odf
 
        'pdfminer.six>=20200101',
 
        'PyYAML>=3.0',  # Debian:python3-yaml
 
        'regex',  # Debian:python3-regex
 
        'rt>=2.0',
...
 
@@ -31,6 +32,7 @@ setup(
 

	
 
    packages=[
 
        'conservancy_beancount',
 
        'conservancy_beancount.pdfforms',
 
        'conservancy_beancount.plugin',
 
        'conservancy_beancount.reports',
 
        'conservancy_beancount.tools',
...
 
@@ -46,6 +48,7 @@ setup(
 
            'fund-report = conservancy_beancount.reports.fund:entry_point',
 
            'ledger-report = conservancy_beancount.reports.ledger:entry_point',
 
            'opening-balances = conservancy_beancount.tools.opening_balances:entry_point',
 
            'pdfform-extract = conservancy_beancount.pdfforms.extract:entry_point',
 
            'split-ods-links = conservancy_beancount.tools.split_ods_links:entry_point',
 
        ],
 
    },
tests/pdfforms/form1.fdf
Show inline comments
 
new file 100644
 
%FDF-1.2
 
%âãÏÓ
 
1 0 obj
 
<<
 
/FDF
 
<<
 
/Fields [
 
<<
 
/T (topform)
 
/Kids [
 
    <<
 
        /T (text1_0)
 
        /FT /Tx
 
        /V ()
 
    >>
 
    <<
 
        /T (button1)
 
        /Kids [
 
            <<
 
                /FT /Btn
 
                /T (button1_0)
 
                /AP << /N << /1 1 0 R >> >>
 
            >>
 
            <<
 
                /FT /Btn
 
                /T (button1_1)
 
                /AP << /N << /2 1 0 R >> >>
 
            >>
 
        ]
 
    >>
 
    <<
 
        /T (text1_1)
 
        /FT /Tx
 
        /V ()
 
    >>
 
    <<
 
        /T (text2_0)
 
        /FT /Tx
 
        /V ()
 
    >>
 
    <<
 
        /T (button2)
 
        /Kids [
 
            <<
 
                /FT /Btn
 
                /T (button2_0)
 
                /AP << /N << /1 1 0 R >> >>
 
            >>
 
            <<
 
                /FT /Btn
 
                /T (button2_1)
 
                /AP << /N << /2 1 0 R >> >>
 
            >>
 
        ]
 
    >>
 
    <<
 
        % Readonly
 
        /T (text2_R)
 
        /FT /Tx
 
        /Ff 1
 
    >>
 
]
 
>>]
 
>>
 
>>
 
endobj
 
trailer
 

	
 
<<
 
/Root 1 0 R
 
>>
 
%%EOF
tests/pdfforms/form1.yml
Show inline comments
 
new file 100644
 
- fdf:
 
    type: Tx
 
    name: topform.text1_0
 
- fdf:
 
    type: Btn
 
    name: topform.button1.button1_0
 
    options: ['1', 'Off']
 
- fdf:
 
    type: Btn
 
    name: topform.button1.button1_1
 
    options: ['2', 'Off']
 
- fdf:
 
    type: Tx
 
    name: topform.text1_1
 
- fdf:
 
    type: Tx
 
    name: topform.text2_0
 
- fdf:
 
    type: Btn
 
    name: topform.button2.button2_0
 
    options: ['1', 'Off']
 
- fdf:
 
    type: Btn
 
    name: topform.button2.button2_1
 
    options: ['2', 'Off']
tests/test_pdfforms_extract.py
Show inline comments
 
new file 100644
 
"""test_pdfforms_extract.py - Unit tests for PDF form extractor"""
 
# Copyright © 2020  Brett Smith
 
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
 
#
 
# Full copyright and licensing details can be found at toplevel file
 
# LICENSE.txt in the repository.
 

	
 
import io
 
import itertools
 

	
 
import pytest
 
import yaml
 

	
 
from . import testutil
 

	
 
from pathlib import Path
 

	
 
from conservancy_beancount.pdfforms import extract as extractmod
 

	
 
def compare_to_yaml(actual, yaml_path, from_file, form_key):
    """Assert that extracted form data matches an expected YAML fixture.

    ``actual`` is the mapping produced by the extractor. ``yaml_path`` is
    either a path object or a file name inside the ``pdfforms`` test
    directory. The fixture is a list of mappings. Only the keys present in
    each expected mapping are compared, so the extractor may report extra
    keys. ``from_file`` and ``form_key`` are the expected top-level values.
    """
    if isinstance(yaml_path, str):
        yaml_path = testutil.test_path(f'pdfforms/{yaml_path}')
    with yaml_path.open() as yaml_file:
        expect_fields = yaml.safe_load(yaml_file)
    assert actual.get('from file') == from_file
    assert actual.get('form key') == form_key
    actual_fields = list(actual.get('fields', ()))
    # Check the count up front so a missing or extra field is reported as
    # an assertion failure rather than as an error from indexing None.
    assert len(actual_fields) == len(expect_fields)
    for act_f, exp_f in zip(actual_fields, expect_fields):
        for key, exp_value in exp_f.items():
            assert act_f[key] == exp_value
 

	
 
@pytest.mark.parametrize('fdf_filename,form_key,fields_yaml', [
    ('form1.fdf', 'FDF', 'form1.yml'),
])
def test_extract_from_path(fdf_filename, form_key, fields_yaml):
    """Extracting through from_path() yields the fixture's fields."""
    source_path = testutil.test_path(f'pdfforms/{fdf_filename}')
    with extractmod.FormExtractor.from_path(source_path) as extractor:
        extracted = extractor.extract()
    compare_to_yaml(extracted, fields_yaml, fdf_filename, form_key)
 

	
 
@pytest.mark.parametrize('fdf_filename,form_key,fields_yaml', [
    ('form1.fdf', 'FDF', 'form1.yml'),
])
def test_extract_from_file(fdf_filename, form_key, fields_yaml):
    """Extracting from an open file object yields the fixture's fields."""
    source_path = testutil.test_path(f'pdfforms/{fdf_filename}')
    with source_path.open('rb') as fdf_file:
        extracted = extractmod.FormExtractor.from_file(fdf_file).extract()
    compare_to_yaml(extracted, fields_yaml, fdf_filename, form_key)
 

	
 
@pytest.mark.parametrize('fdf_filename,form_key,fields_yaml', [
    ('form1.fdf', 'FDF', 'form1.yml'),
])
def test_main(fdf_filename, form_key, fields_yaml):
    """main() exits 0, writes nothing to stderr, and prints the YAML."""
    out_stream = io.StringIO()
    err_stream = io.StringIO()
    arglist = [str(testutil.test_path(f'pdfforms/{fdf_filename}'))]
    assert extractmod.main(arglist, out_stream, err_stream) == 0
    assert not err_stream.getvalue()
    extracted = yaml.safe_load(out_stream.getvalue())
    compare_to_yaml(extracted, fields_yaml, fdf_filename, form_key)
tests/test_pdfforms_fields.py
Show inline comments
 
new file 100644
 
"""test_pdfforms_fields.py - Unit tests for PDF forms manipulation"""
 
# Copyright © 2020  Brett Smith
 
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
 
#
 
# Full copyright and licensing details can be found at toplevel file
 
# LICENSE.txt in the repository.
 

	
 
import codecs
 
import itertools
 

	
 
import pytest
 

	
 
from pdfminer.psparser import PSLiteral
 

	
 
from conservancy_beancount.pdfforms import fields as fieldsmod
 

	
 
def field_source(
        name=None,
        value=None,
        field_type=None,
        flags=None,
        parent=None,
        kids=None,
        *,
        literal=None,
):
    """Build a field source dict for tests from the arguments given.

    Arguments left as None add no entry. A str ``name`` is stored as ASCII
    bytes. ``value`` is wrapped in a PSLiteral when ``literal`` is true;
    when ``literal`` is None that is decided by ``field_type`` (wrapped
    for any type other than ``'Tx'``). ``kids`` is stored as a list.
    """
    source = {}
    if name is not None:
        source['T'] = name.encode('ascii') if isinstance(name, str) else name
    if value is not None:
        if literal is None:
            literal = field_type and field_type != 'Tx'
        source['V'] = PSLiteral(value) if literal else value
    if field_type is not None:
        source['FT'] = PSLiteral(field_type)
    for key, entry in (('Ff', flags), ('Parent', parent)):
        if entry is not None:
            source[key] = entry
    if kids is not None:
        source['Kids'] = list(kids)
    return source
 

	
 
def appearance_states(*names):
    """Map each non-None name to a fresh placeholder object."""
    states = {}
    for state_name in names:
        if state_name is None:
            continue
        states[state_name] = object()
    return states
 

	
 
def test_empty_field():
    """A field with an empty source reports empty defaults everywhere."""
    field = fieldsmod.FormField(field_source())
    assert field.value() is None
    assert field.parent() is None
    assert field.flags() == 0
    assert field.is_terminal()
    assert not field.name()
    assert not list(field.kids())
    with pytest.raises(ValueError):
        field.field_type()
 

	
 
def test_text_field_base():
    """The base class reports a text field's type, name and raw bytes."""
    field = fieldsmod.FormField(field_source(b's', b'string of text', 'Tx'))
    assert field.name() == 's'
    assert field.field_type() is fieldsmod.FieldType.TEXT
    assert field.value() == b'string of text'
 

	
 
@pytest.mark.parametrize('value', ['Off', 'Yes', 'On'])
def test_checkbox_field_base(value):
    """The base class reports a button's type, name and raw literal."""
    field = fieldsmod.FormField(field_source(b'cb', value, 'Btn', literal=True))
    assert field.name() == 'cb'
    assert field.field_type() is fieldsmod.FieldType.BUTTON
    assert field.value().name == value
 

	
 
@pytest.mark.parametrize('flags', range(4))
def test_readonly_flag(flags):
    """is_readonly() follows the lowest bit of the flags value."""
    field = fieldsmod.FormField(field_source(flags=flags))
    lowest_bit = flags % 2
    assert field.flags() == flags
    assert field.is_readonly() == lowest_bit
 

	
 
@pytest.mark.parametrize('kid_count', range(3))
def test_kids(kid_count):
    """kids() yields one named field per child source, in order."""
    kid_sources = [
        field_source(f'kid{n}', field_type='Ch')
        for n in range(kid_count)
    ]
    field = fieldsmod.FormField(field_source(kids=iter(kid_sources)))
    found = list(field.kids())
    assert len(found) == len(kid_sources)
    assert field.is_terminal() == (not kid_sources)
    for kid, kid_source in zip(found, kid_sources):
        assert kid.name() == kid_source['T'].decode('ascii')
 

	
 
def test_kids_by_type():
    """kids() wraps each child in the class matching its field type."""
    kid_sources = [field_source(field_type='Tx'), field_source(field_type='Btn')]
    top = fieldsmod.FormField.by_type(field_source('topform', kids=iter(kid_sources)))
    kid_iter = top.kids()
    for expected_class in (fieldsmod.TextField, fieldsmod.CheckboxField):
        assert isinstance(next(kid_iter), expected_class)
    assert next(kid_iter, None) is None
 

	
 
def test_inheritance():
    """A kid inherits type, value and flags from its parent, not its name."""
    parent_source = field_source(b'parent', 'parent value', 'Tx', 17)
    kid_source = field_source('kid', parent=parent_source)
    parent_source['Kids'] = [kid_source]
    kid = fieldsmod.FormField(kid_source)

    parent = kid.parent()
    assert parent is not None
    assert parent.name() == 'parent'
    assert not parent.is_terminal()

    assert kid.is_terminal()
    assert not list(kid.kids())
    assert kid.name() == 'kid'
    assert kid.field_type() is fieldsmod.FieldType.TEXT
    assert kid.value() == 'parent value'
    assert kid.flags() == 17
 

	
 
@pytest.mark.parametrize('field_type,value', [
    ('Tx', b'new value'),
    ('Btn', PSLiteral('Yes')),
])
def test_set_value(field_type, value):
    """The base class stores and returns raw values unchanged."""
    field = fieldsmod.FormField(field_source(field_type=field_type))
    assert field.value() is None
    field.set_value(value)
    assert field.value() == value
 

	
 
@pytest.mark.parametrize('field_type,expected', [
    ('Tx', fieldsmod.TextField),
    ('Btn', fieldsmod.CheckboxField),
])
def test_by_type(field_type, expected):
    """by_type() picks the dedicated class for supported field types."""
    typed_source = field_source(field_type=field_type)
    assert isinstance(fieldsmod.FormField.by_type(typed_source), expected)
 

	
 
def test_container_by_type():
    """A container with no field type of its own stays a plain FormField."""
    kids = [field_source(field_type='Tx'), field_source(field_type='Btn')]
    source = field_source('topform', kids=iter(kids))
    field = fieldsmod.FormField.by_type(source)
    # Check the exact type: isinstance() would also accept TextField and
    # CheckboxField, so it could never catch a wrongly specialized result.
    assert type(field) is fieldsmod.FormField
 

	
 
@pytest.mark.parametrize('flag', [
    # If you add dedicated classes for these types of buttons, you can remove
    # their test cases.
    fieldsmod.FieldFlags.Radio,
    fieldsmod.FieldFlags.Pushbutton,
])
def test_unsupported_button_by_type(flag):
    """Buttons without a dedicated class stay plain FormField objects."""
    button_source = field_source(field_type='Btn', flags=flag)
    assert type(fieldsmod.FormField.by_type(button_source)) is fieldsmod.FormField
 

	
 
@pytest.mark.parametrize('field_type', [
    # If you add dedicated classes for these types of fields, you can remove
    # their test cases.
    'Ch',
    'Sig',
])
def test_unsupported_field_by_type(field_type):
    """Field types without a dedicated class stay plain FormField objects."""
    typed_source = field_source(field_type=field_type)
    assert type(fieldsmod.FormField.by_type(typed_source)) is fieldsmod.FormField
 

	
 
@pytest.mark.parametrize('value', [None, 'Off', 'Yes'])
def test_checkbox_value(value):
    """A checkbox reads None, 'Off' and 'Yes' as None, False and True."""
    expected = value and value == 'Yes'
    field = fieldsmod.CheckboxField(field_source('cb', value, 'Btn', literal=True))
    assert field.value() == expected
 

	
 
@pytest.mark.parametrize('value,expected', [
    (None, None),
    (False, 'Off'),
    (True, 'Yes'),
])
def test_checkbox_set_value(value, expected):
    """Setting a bool stores the default state literal. None stays None."""
    field = fieldsmod.CheckboxField(field_source('cb', field_type='Btn'))
    field.set_value(value)
    # Read through the base class to see the raw stored literal.
    stored = fieldsmod.FormField.value(field)
    if expected is not None:
        assert stored.name == expected
    else:
        assert stored is None
 

	
 
@pytest.mark.parametrize('on_key,off_key', itertools.product(
    ['1', '2', 'On', 'Yes'],
    ['Off', None],
))
def test_checkbox_options(on_key, off_key):
    """options() reports the custom on state and 'Off', listed or not."""
    cb_source = field_source('cb', field_type='Btn')
    cb_source['AP'] = {'N': appearance_states(on_key, off_key)}
    options = fieldsmod.CheckboxField(cb_source).options()
    assert options == [on_key, 'Off']
 

	
 
def test_checkbox_options_yes_no():
    """options() accepts a 'Yes'/'No' pair with no 'Off' state."""
    # I'm not sure this is actually allowed under the spec, but…
    yes_no = ['Yes', 'No']
    cb_source = field_source('cb', field_type='Btn')
    cb_source['AP'] = {'N': appearance_states(*yes_no)}
    assert fieldsmod.CheckboxField(cb_source).options() == yes_no
 

	
 
@pytest.mark.parametrize('on_key,off_key,set_value', itertools.product(
    ['1', '2', 'On', 'Yes'],
    ['Off', None],
    [True, False, None],
))
def test_checkbox_set_custom_value(on_key, off_key, set_value):
    """set_value() stores the checkbox's own state names as literals."""
    cb_source = field_source('cb', field_type='Btn')
    cb_source['AP'] = {'N': appearance_states(on_key, off_key)}
    field = fieldsmod.CheckboxField(cb_source)
    field.set_value(set_value)
    # Read through the base class to see the raw stored literal.
    stored = fieldsmod.FormField.value(field)
    if set_value is None:
        assert stored is None
        return
    expected_name = (on_key or 'Yes') if set_value else 'Off'
    assert stored.name == expected_name
 

	
 
@pytest.mark.parametrize('encoding,prefix', [
    ('ascii', b''),
    ('utf-16be', codecs.BOM_UTF16_BE),
])
def test_text_value(encoding, prefix):
    """A text field decodes both ASCII and BOM-prefixed UTF-16BE bytes."""
    text = f'{encoding} encoding test'
    raw_value = prefix + text.encode(encoding)
    field = fieldsmod.TextField(field_source('t', raw_value, 'Tx'))
    assert field.value() == text
 

	
 
def test_text_value_none():
    """A text field with no stored value reports None."""
    field = fieldsmod.TextField(field_source(field_type='Tx'))
    assert field.value() is None
 

	
 
@pytest.mark.parametrize('text,bprefix', [
    ('ASCII test', b''),
    ('UTF—16 test', codecs.BOM_UTF16_BE),
])
def test_text_set_value(text, bprefix):
    """Setting text stores encoded bytes and reads back the same text."""
    field = fieldsmod.TextField(field_source(field_type='Tx'))
    field.set_value(text)
    assert field.value() == text
    encoding = 'utf-16be' if bprefix else 'ascii'
    # Read through the base class to see the raw stored bytes.
    stored = fieldsmod.FormField.value(field)
    assert stored == bprefix + text.encode(encoding)
 

	
 
def test_text_set_value_none():
    """Setting None on a text field replaces the stored value with None."""
    field = fieldsmod.TextField(field_source('t', b'set None test', 'Tx'))
    field.set_value(None)
    stored = fieldsmod.FormField.value(field)
    assert stored is None
 

	
 
def test_empty_as_filled_fdf():
    """An empty field exports as an empty mapping."""
    field = fieldsmod.FormField(field_source())
    assert field.as_filled_fdf() == {}
 

	
 
@pytest.mark.parametrize('field_type,field_class,set_value', [
    ('Btn', fieldsmod.CheckboxField, True),
    ('Btn', fieldsmod.CheckboxField, False),
    ('Ch', fieldsmod.FormField, None),
    ('Tx', fieldsmod.TextField, 'export test'),
    ('Tx', fieldsmod.TextField, 'UTF—16 export'),
])
def test_as_filled_fdf_after_set_value(field_type, field_class, set_value):
    """The export holds the name plus the value that was set, if any."""
    field = field_class(field_source(field_type, field_type=field_type))
    field.set_value(set_value)
    exported = field.as_filled_fdf()
    assert exported['T'] == field_type
    if set_value is None:
        assert 'V' not in exported
        expected_size = 1
    else:
        expected_size = 2
        if field_class is fieldsmod.CheckboxField:
            assert exported['V'].name == ('Yes' if set_value else 'Off')
        else:
            assert exported['V'] == set_value
    assert len(exported) == expected_size
 

	
 
@pytest.mark.parametrize('field_type,expected', [
    ('Btn', None),
    ('Tx', ''),
])
def test_as_filled_fdf_default_value(field_type, expected):
    """With no value set, the export uses each class's default fill."""
    field = fieldsmod.FormField.by_type(field_source(field_type=field_type))
    exported = field.as_filled_fdf()
    assert exported.get('V') == expected
 

	
 
def test_as_filled_fdf_recursion():
    """The export nests kids under 'Kids', keeping their order."""
    button_sources = [
        field_source(f'bt{n}', field_type='Btn')
        for n in range(1, 3)
    ]
    pair_source = field_source('Buttons', kids=iter(button_sources))
    text_source = field_source('tx', field_type='Tx')
    top_source = field_source('topform', kids=[text_source, pair_source])

    top = fieldsmod.FormField(top_source).as_filled_fdf()
    assert top['T'] == 'topform'
    assert 'V' not in top

    top_kids = iter(top['Kids'])
    assert next(top_kids)['T'] == 'tx'
    pair = next(top_kids)
    assert pair['T'] == 'Buttons'
    assert 'V' not in pair

    pair_kids = iter(pair['Kids'])
    assert next(pair_kids)['T'] == 'bt1'
    assert next(pair_kids)['T'] == 'bt2'
    assert next(pair_kids, None) is None
 

	
 
@pytest.mark.parametrize('name,value,field_type', [
    (None, None, None),
    ('mt', 'mapping text', 'Tx'),
    ('mb', 'Yes', 'Btn'),
])
def test_simple_as_mapping(name, value, field_type):
    """A childless field maps to exactly one (name, field) pair."""
    field = fieldsmod.FormField(field_source(name, value, field_type))
    pairs = field.as_mapping()
    first_key, first_field = next(pairs)
    assert first_key == (name or '')
    assert first_field is field
    assert next(pairs, None) is None
 

	
 
def test_recursive_as_mapping():
    """as_mapping() walks the tree depth-first with dotted full names."""
    button_sources = [
        field_source(f'btn{n}', field_type='Btn')
        for n in range(1, 3)
    ]
    buttons_source = field_source('buttons', kids=iter(button_sources))
    text_sources = [
        field_source(f'tx{n}', field_type='Tx')
        for n in range(1, 3)
    ]
    texts_source = field_source('texts', kids=iter(text_sources))
    root_source = field_source('root', kids=[texts_source, buttons_source])

    expected_keys = [
        'root',
        'root.texts',
        'root.texts.tx1',
        'root.texts.tx2',
        'root.buttons',
        'root.buttons.btn1',
        'root.buttons.btn2',
    ]
    pairs = fieldsmod.FormField(root_source).as_mapping()
    for expected_key in expected_keys:
        key, field = next(pairs)
        assert key == expected_key
        # The last dotted component is the field's own name.
        assert field.name() == expected_key.rpartition('.')[2]
    assert next(pairs, None) is None
0 comments (0 inline, 0 general)