Changeset - 2cf4fdcfadf7
[Not reviewed]
0 4 3
Brett Smith - 4 years ago 2021-01-09 15:09:08
brettcsmith@brettcsmith.org
pdfforms.fill: New module+tool.

After you edit the YAML generated by pdfform-extract,
you can re-fill the original PDF with pdfform-fill.
7 files changed with 896 insertions and 26 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/pdfforms/fill.py
Show inline comments
 
new file 100644
 
"""fill.py - PDF writer class"""
 
# Copyright © 2021  Brett Smith
 
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
 
#
 
# Full copyright and licensing details can be found at toplevel file
 
# LICENSE.txt in the repository.
 

	
 
import argparse
 
import contextlib
 
import inspect
 
import itertools
 
import logging
 
import os
 
import re
 
import subprocess
 
import sys
 

	
 
from codecs import BOM_UTF16_BE
 
from pathlib import Path
 

	
 
import yaml
 

	
 
from pdfminer import psparser  # type:ignore[import]
 
from pdfminer.pdfdocument import PDFDocument  # type:ignore[import]
 
from pdfminer.pdfparser import PDFParser  # type:ignore[import]
 
from pdfminer.pdftypes import resolve1  # type:ignore[import]
 

	
 
from . import fields as fieldmod
 
from . import utils as pdfutils
 
from .. import cliutil
 

	
 
from typing import (
 
    Any,
 
    BinaryIO,
 
    Dict,
 
    Iterator,
 
    List,
 
    Mapping,
 
    Match,
 
    NamedTuple,
 
    Optional,
 
    Sequence,
 
    TextIO,
 
    Tuple,
 
    Type,
 
    Union,
 
    cast,
 
)
 

	
 
EmitBytes = Iterator[bytes]
 
FieldSource = Mapping[str, Any]
 

	
 
PROGNAME = 'pdfform-fill'
 
logger = logging.getLogger('conservancy_beancount.pdfforms.extract')
 

	
 
SUPPORTED_VALUE_TYPES: Mapping[Type[fieldmod.FormField], Tuple[Type, ...]] = {
 
    ft: inspect.signature(ft.set_value).parameters['value'].annotation.__args__
 
    for ft in vars(fieldmod).values()
 
    if isinstance(ft, type)
 
    and issubclass(ft, fieldmod.FormField)
 
    and ft is not fieldmod.FormField
 
}
 

	
 
class PDFWriter:
 
    """Convert an arbitrary Python object out to PDF"""
 
    HEADER = b'''%FDF-1.2
 
%\xe2\xe3\xcf\xd3
 
1 0 obj
 
'''
 
    FOOTER = b'''
 
endobj
 
trailer
 
<</Root 1 0 R>>
 
%%EOF
 
'''
 
    # From the PDF spec section 7.3.5 "Name Objects"
 
    LITERAL_ESC_RE = re.compile(b'[^\x21\x22\x24-\x7e]+')
 
    STRING_ESC = {ord(c): f'\\{c}' for c in '()\\'}
 

	
 
    @staticmethod
 
    def escape_literal(match: Match[bytes]) -> bytes:
 
        return b''.join(
 
            hex(c).replace('0x', '#', 1).encode('ascii')
 
            for c in match.group(0)
 
        )
 

	
 
    def emit_array(self, obj: Sequence[Any]) -> EmitBytes:
 
        yield b'[\n'
 
        for item in obj:
 
            yield from self.emit(item)
 
            yield b'\n'
 
        yield b']'
 

	
 
    def emit_boolean(self, obj: bool) -> EmitBytes:
 
        yield b'true' if obj else b'false'
 

	
 
    def emit_dictionary(self, obj: Mapping[str, Any]) -> EmitBytes:
 
        yield b'<<\n'
 
        for key, value in obj.items():
 
            yield from self.emit_literal(key)
 
            yield b' '
 
            yield from self.emit(value)
 
            yield b'\n'
 
        yield b'>>'
 

	
 
    def emit_literal(self, obj: Union[str, psparser.PSLiteral]) -> EmitBytes:
 
        if isinstance(obj, psparser.PSLiteral):
 
            obj = cast(str, obj.name)
 
        yield b'/'
 
        yield self.LITERAL_ESC_RE.sub(self.escape_literal, obj.encode('ascii'))
 

	
 
    def emit_null(self, obj: None=None) -> EmitBytes:
 
        yield b'null'
 

	
 
    def emit_number(self, obj: Union[int, float]) -> EmitBytes:
 
        yield str(obj).encode('ascii')
 

	
 
    def emit_string(self, obj: str) -> EmitBytes:
 
        yield b'('
 
        yield pdfutils.encode_text(obj.translate(self.STRING_ESC))
 
        yield b')'
 

	
 
    def emit(self, obj: Any) -> EmitBytes:
 
        if obj is None:
 
            yield from self.emit_null(obj)
 
        elif isinstance(obj, bool):
 
            yield from self.emit_boolean(obj)
 
        elif isinstance(obj, psparser.PSLiteral):
 
            yield from self.emit_literal(obj)
 
        elif isinstance(obj, (int, float)):
 
            yield from self.emit_number(obj)
 
        elif isinstance(obj, str):
 
            yield from self.emit_string(obj)
 
        elif isinstance(obj, bytes):
 
            raise ValueError("can't emit raw bytes")
 
        elif isinstance(obj, Mapping):
 
            yield from self.emit_dictionary(obj)
 
        elif isinstance(obj, Sequence):
 
            yield from self.emit_array(obj)
 
        else:
 
            raise ValueError(f"don't know how to emit {type(obj).__name__}")
 

	
 
    def write_document(self, obj: Any, out_file: BinaryIO) -> None:
 
        out_file.write(self.HEADER)
 
        for out_bytes in self.emit(obj):
 
            out_file.write(out_bytes)
 
        out_file.write(self.FOOTER)
 

	
 

	
 
class FillProblem(NamedTuple):
 
    level: int
 
    yaml_index: int
 
    name: Optional[str]
 
    errdesc: str
 

	
 
    def log(self, logger: logging.Logger=logger) -> None:
 
        logger.log(
 
            self.level,
 
            "YAML form field #%d%s%s",
 
            self.yaml_index + 1,
 
            ' ' if self.name is None else f' ({self.name}) ',
 
            self.errdesc,
 
        )
 

	
 

	
 
def _ensure_field(
 
        field_map: Dict[str, fieldmod.FormField],
 
        key: str,
 
        field: Optional[FieldSource]=None,
 
        yaml_index: int=-2,
 
) -> Tuple[fieldmod.FormField, Optional[FillProblem]]:
 
    try:
 
        return (field_map[key], None)
 
    except KeyError:
 
        if field is None:
 
            field = {}
 
        problem: Optional[FillProblem] = None
 
        parent_key, _, kid_name = key.rpartition('.')
 
        kid_source: fieldmod.FieldSource = {'T': pdfutils.encode_text(kid_name)}
 
        try:
 
            field_type = field['type']
 
        except KeyError:
 
            pass
 
        else:
 
            try:
 
                field_type = fieldmod.FieldType[field_type.title()].value
 
            except KeyError:
 
                problem = FillProblem(
 
                    logging.ERROR, yaml_index, key,
 
                    f"has unknown FDF type {field_type!r}",
 
                )
 
            kid_source['FT'] = psparser.PSLiteralTable.intern(field_type)
 
        try:
 
            options = iter(field['options'])
 
        except KeyError:
 
            pass
 
        else:
 
            kid_source['AP'] = {'N': {opt: None for opt in options}}
 
        kid = fieldmod.FormField.by_type(kid_source)
 
        if parent_key:
 
            parent, _ = _ensure_field(field_map, parent_key)
 
            parent.add_kid(kid)
 
        field_map[key] = kid
 
        return (kid, problem)
 

	
 
def _set_field_value(
 
        field: fieldmod.FormField,
 
        value: Any,
 
        yaml_index: int=-2,
 
        yaml_key: Optional[str]=None,
 
) -> Iterator[FillProblem]:
 
    set_ok = True
 
    if value is not None:
 
        field_type = type(field)
 
        try:
 
            set_ok = isinstance(value, SUPPORTED_VALUE_TYPES[field_type])
 
        except KeyError:
 
            yield FillProblem(logging.ERROR, yaml_index, yaml_key,
 
                              "assigns a value to an unsupported field type")
 
        else:
 
            # bools shouldn't be allowed in as ints for this purpose.
 
            if set_ok and isinstance(value, bool):
 
                set_ok = any(issubclass(t, bool)
 
                             for t in SUPPORTED_VALUE_TYPES[field_type])
 
            if not set_ok:
 
                set_type = type(value).__name__
 
                yield FillProblem(logging.ERROR, yaml_index, yaml_key,
 
                                  f"assigns a {set_type} value to a {field_type.__name__}")
 
        if field.is_readonly():
 
            yield FillProblem(logging.WARNING, yaml_index, yaml_key,
 
                              "assigns a value to a readonly field")
 
    if set_ok:
 
        field.set_value(value)
 

	
 
def generate_form(
 
        form_source: Sequence[FieldSource],
 
) -> Tuple[Sequence[FieldSource], Sequence[FillProblem]]:
 
    problems: List[FillProblem] = []
 
    field_map: Dict[str, fieldmod.FormField] = {}
 
    for index, fill in enumerate(form_source):
 
        try:
 
            field_key = fill['fdf']['name']
 
        except KeyError:
 
            problems.append(FillProblem(logging.ERROR, index, None, "has no FDF name"))
 
            continue
 
        field, problem = _ensure_field(field_map, field_key, fill['fdf'], index)
 
        if problem is not None:
 
            problems.append(problem)
 
        try:
 
            set_value = fill['value']
 
        except KeyError:
 
            pass
 
        else:
 
            problems.extend(_set_field_value(field, set_value, index, field_key))
 
    fields = [
 
        field.as_filled_fdf()
 
        for key, field in field_map.items()
 
        if '.' not in key
 
    ]
 
    return (fields, problems)
 

	
 
def merge_form(
 
        form_fills: Sequence[FieldSource],
 
        form_source: Sequence[fieldmod.FieldSource],
 
) -> Tuple[Sequence[FieldSource], Sequence[FillProblem]]:
 
    problems: List[FillProblem] = []
 
    field_list = [fieldmod.FormField.by_type(resolve1(field)) for field in form_source]
 
    field_map = dict(
 
        kvpair
 
        for field in field_list
 
        for kvpair in field.as_mapping()
 
    )
 
    for index, fill in enumerate(form_fills):
 
        try:
 
            field_key = fill['fdf']['name']
 
        except KeyError:
 
            problems.append(FillProblem(logging.ERROR, index, None, "has no FDF name"))
 
            continue
 
        try:
 
            field = field_map[field_key]
 
        except KeyError:
 
            problems.append(FillProblem(
 
                logging.ERROR, index, field_key,
 
                "refers to a field that does not exist in the source form",
 
            ))
 
            continue
 
        try:
 
            expect_type = fieldmod.FieldType[fill['fdf']['type'].title()]
 
        except KeyError:
 
            pass
 
        else:
 
            try:
 
                actual_type = field.field_type()
 
            except ValueError:
 
                type_name: Optional[str] = None
 
            else:
 
                type_name = actual_type.value
 
            if expect_type.value != type_name:
 
                problems.append(FillProblem(
 
                    logging.WARNING, index, field_key,
 
                    f"has type {expect_type.name} but source has type {type_name}",
 
                ))
 
        try:
 
            set_value = fill['value']
 
        except KeyError:
 
            pass
 
        else:
 
            problems.extend(_set_field_value(field, set_value, index, field_key))
 
    return ([field.as_filled_fdf() for field in field_list], problems)
 

	
 
def parse_arguments(arglist: Optional[Sequence[str]]=None) -> argparse.Namespace:
 
    parser = argparse.ArgumentParser(prog=PROGNAME)
 
    cliutil.add_version_argument(parser)
 
    cliutil.add_loglevel_argument(parser)
 
    parser.add_argument(
 
        '--force',
 
        action='count',
 
        default=0,
 
        help="""Continue with filling the PDF even if there are problems in the
 
input YAML. Pass this option twice to continue even with major problems.
 
""")
 
    parser.add_argument(
 
        '--pdftk',
 
        type=Path,
 
        default=Path('pdftk'),
 
        help="""Path of the `pdftk` executable.
 
Default searched from your $PATH.
 
""")
 
    parser.add_argument(
 
        '--form-key', '-f',
 
        metavar='KEY',
 
        help="""Key in the document catalog with form data.
 
Default is guessed by examining the document.
 
""")
 
    parser.add_argument(
 
        '--output-file', '-O',
 
        metavar='PATH',
 
        type=Path,
 
        help="""Write output to this file, or stdout when PATH is `-`.
 
Default is generated from the input filename.
 
""")
 
    parser.add_argument(
 
        'yaml_file',
 
        type=Path,
 
        help="""YAML file with values generated from pdfform-extract
 
""")
 
    parser.add_argument(
 
        'pdf_file',
 
        nargs='?',
 
        type=Path,
 
        help="""PDF file with forms to fill. If omitted, pdfform-fill generates
 
FDF output that you can give to `pdftk fill_form` later.
 
""")
 
    return parser.parse_args(arglist)
 

	
 
def change_suffix(path: Path, suffix: str, backup: str='_filled') -> Path:
 
    if path.suffix == suffix:
 
        return path.with_name(f'{path.stem}{backup}{suffix}')
 
    else:
 
        return path.with_suffix(suffix)
 

	
 
def main(arglist: Optional[Sequence[str]]=None,
 
         stdout: TextIO=sys.stdout,
 
         stderr: TextIO=sys.stderr,
 
) -> int:
 
    args = parse_arguments(arglist)
 
    cliutil.set_loglevel(logger, args.loglevel)
 

	
 
    with args.yaml_file.open() as yaml_file:
 
        try:
 
            yaml_source = yaml.safe_load(yaml_file)
 
        except yaml.error.YAMLError as error:
 
            logger.critical("error parsing %s: %s", args.yaml_file, error)
 
            return os.EX_NOINPUT
 
    if not isinstance(yaml_source.get('fields'), list):
 
        logger.critical("YAML file does not include a list of fields")
 
        return os.EX_NOINPUT
 

	
 
    if args.pdf_file is None:
 
        fill_mode = False
 
        if args.form_key is None:
 
            args.form_key = yaml_source.get('form key', 'FDF')
 
        fields, problems = generate_form(yaml_source['fields'])
 
    else:
 
        with args.pdf_file.open('rb') as pdf_file:
 
            parser = PDFParser(pdf_file)
 
            pdf_doc = PDFDocument(parser)
 
            if args.form_key is None:
 
                try:
 
                    args.form_key = pdfutils.guess_form_key(pdf_doc)
 
                except ValueError as error:
 
                    logger.error("%s", error.args[0])
 
                    logger.info("you can specify a form key using --form-key")
 
                    return os.EX_NOINPUT
 
            fields, problems = merge_form(
 
                yaml_source['fields'],
 
                resolve1(pdf_doc.catalog[args.form_key])['Fields'],
 
            )
 
        fill_mode = cliutil.can_run(['pdftk', '--version'])
 
        if not fill_mode:
 
            logger.warning("cannot run pdftk to fill the PDF form; writing FDF instead")
 

	
 
    worst_problem = -1
 
    for problem in problems:
 
        problem.log()
 
        worst_problem = max(worst_problem, problem.level)
 
    if args.force > 1:
 
        problems_fatal = False
 
    elif args.force == 1:
 
        problems_fatal = worst_problem > logging.WARNING
 
    else:
 
        problems_fatal = worst_problem >= 0
 
    if problems_fatal:
 
        return os.EX_DATAERR
 

	
 
    if args.output_file is None:
 
        args.output_file = change_suffix(
 
            args.pdf_file or args.yaml_file,
 
            '.pdf' if fill_mode else '.fdf',
 
        )
 
        logger.info("writing output to %s", args.output_file)
 
    out_writer = PDFWriter()
 
    # pdftk always expects form fill data to be under the `FDF` key,
 
    # regardless of what the original PDF uses.
 
    out_doc = {'FDF': {'Fields': fields}}
 
    with contextlib.ExitStack() as exit_stack:
 
        if fill_mode:
 
            pdftk = exit_stack.enter_context(subprocess.Popen([
 
                args.pdftk, str(args.pdf_file),
 
                'fill_form', '-',
 
                'output', str(args.output_file),
 
            ], stdin=subprocess.PIPE))
 
            out_file = exit_stack.enter_context(cast(BinaryIO, pdftk.stdin))
 
        else:
 
            out_file = cliutil.bytes_output(args.output_file, stdout)
 
        out_writer.write_document(out_doc, out_file)
 
    try:
 
        return pdftk.returncode
 
    except NameError:
 
        return os.EX_OK
 

	
 
entry_point = cliutil.make_entry_point(__name__, PROGNAME)
 

	
 
if __name__ == '__main__':
 
    exit(entry_point())
setup.py
Show inline comments
...
 
@@ -40,16 +40,17 @@ setup(
 
    entry_points={
 
        'console_scripts': [
 
            'accrual-report = conservancy_beancount.reports.accrual:entry_point',
 
            'assemble-audit-reports = conservancy_beancount.tools.audit_report:entry_point',
 
            'balance-sheet-report = conservancy_beancount.reports.balance_sheet:entry_point',
 
            'budget-report = conservancy_beancount.reports.budget:entry_point',
 
            'bean-sort = conservancy_beancount.tools.sort_entries:entry_point',
 
            'extract-odf-links = conservancy_beancount.tools.extract_odf_links:entry_point',
 
            'fund-report = conservancy_beancount.reports.fund:entry_point',
 
            'ledger-report = conservancy_beancount.reports.ledger:entry_point',
 
            'opening-balances = conservancy_beancount.tools.opening_balances:entry_point',
 
            'pdfform-extract = conservancy_beancount.pdfforms.extract:entry_point',
 
            'pdfform-fill = conservancy_beancount.pdfforms.fill:entry_point',
 
            'split-ods-links = conservancy_beancount.tools.split_ods_links:entry_point',
 
        ],
 
    },
 
)
tests/pdfforms/form1.fdf
Show inline comments
...
 
@@ -50,23 +50,29 @@
 
                /FT /Btn
 
                /T (button2_1)
 
                /AP << /N << /2 1 0 R >> >>
 
            >>
 
        ]
 
    >>
 
    <<
 
        % Readonly
 
        /T (text2_R)
 
        /FT /Tx
 
        /Ff 1
 
    >>
 
    <<
 
        % Submit button
 
        /T (submit)
 
        /FT /Btn
 
        /Ff 65536
 
    >>
 
]
 
>>]
 
>>
 
>>
 
endobj
 
trailer
 

	
 
<<
 
/Root 1 0 R
 
>>
 
%%EOF
tests/pdfforms/form1.yml
Show inline comments
 
- fdf:
 
    type: Tx
 
    name: topform.text1_0
 
- fdf:
 
    type: Btn
 
    name: topform.button1.button1_0
 
    options: ['1', 'Off']
 
- fdf:
 
    type: Btn
 
    name: topform.button1.button1_1
 
    options: ['2', 'Off']
 
- fdf:
 
    type: Tx
 
    name: topform.text1_1
 
- fdf:
 
    type: Tx
 
    name: topform.text2_0
 
- fdf:
 
    type: Btn
 
    name: topform.button2.button2_0
 
    options: ['1', 'Off']
 
- fdf:
 
    type: Btn
 
    name: topform.button2.button2_1
 
    options: ['2', 'Off']
 
from file: form1.fdf
 
fields:
 
  - fdf:
 
      type: Tx
 
      name: topform.text1_0
 
  - fdf:
 
      type: Btn
 
      name: topform.button1.button1_0
 
      options: ['1', 'Off']
 
  - fdf:
 
      type: Btn
 
      name: topform.button1.button1_1
 
      options: ['2', 'Off']
 
  - fdf:
 
      type: Tx
 
      name: topform.text1_1
 
  - fdf:
 
      type: Tx
 
      name: topform.text2_0
 
  - fdf:
 
      type: Btn
 
      name: topform.button2.button2_0
 
      options: ['1', 'Off']
 
  - fdf:
 
      type: Btn
 
      name: topform.button2.button2_1
 
      options: ['2', 'Off']
 
  - fdf:
 
      type: Btn
 
      name: topform.submit
tests/pdfforms/form1_fill.yml
Show inline comments
 
new file 100644
 
from file: form1.fdf
 
fields:
 
  - fdf:
 
      type: Tx
 
      name: topform.text1_0
 
    value: text 1.0
 
  - fdf:
 
      type: Btn
 
      name: topform.button1.button1_0
 
      options: ['1', 'Off']
 
    value: on
 
  - fdf:
 
      type: Btn
 
      name: topform.button1.button1_1
 
      options: ['2', 'Off']
 
  - fdf:
 
      type: Tx
 
      name: topform.text1_1
 
    value: text 1.1
 
  - fdf:
 
      type: Tx
 
      name: topform.text2_0
 
    value: text 2.0
 
  - fdf:
 
      type: Btn
 
      name: topform.button2.button2_0
 
      options: ['1', 'Off']
 
  - fdf:
 
      type: Btn
 
      name: topform.button2.button2_1
 
      options: ['2', 'Off']
 
    value: on
tests/test_pdfforms_extract.py
Show inline comments
...
 
@@ -12,25 +12,25 @@ import pytest
 
import yaml
 

	
 
from . import testutil
 

	
 
from pathlib import Path
 

	
 
from conservancy_beancount.pdfforms import extract as extractmod
 

	
 
def compare_to_yaml(actual, yaml_path, from_file, form_key):
 
    if isinstance(yaml_path, str):
 
        yaml_path = testutil.test_path(f'pdfforms/{yaml_path}')
 
    with yaml_path.open() as yaml_file:
 
        expect_fields = yaml.safe_load(yaml_file)
 
        expect_fields = yaml.safe_load(yaml_file)['fields']
 
    assert actual.get('from file') == from_file
 
    assert actual.get('form key') == form_key
 
    for act_f, exp_f in itertools.zip_longest(actual.get('fields', ()), expect_fields):
 
        for key, exp_value in exp_f.items():
 
            assert act_f[key] == exp_value
 

	
 
@pytest.mark.parametrize('fdf_filename,form_key,fields_yaml', [
 
    ('form1.fdf', 'FDF', 'form1.yml'),
 
])
 
def test_extract_from_path(fdf_filename, form_key, fields_yaml):
 
    fdf_path = testutil.test_path(f'pdfforms/{fdf_filename}')
 
    with extractmod.FormExtractor.from_path(fdf_path) as extractor:
tests/test_pdfforms_fill.py
Show inline comments
 
new file 100644
 
"""test_pdfforms_writer.py - Unit tests for PDF writer"""
 
# Copyright © 2020  Brett Smith
 
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
 
#
 
# Full copyright and licensing details can be found at toplevel file
 
# LICENSE.txt in the repository.
 

	
 
import codecs
 
import io
 
import logging
 
import re
 
import shutil
 

	
 
from pathlib import Path
 

	
 
import pytest
 
import yaml
 

	
 
from . import testutil
 
from pdfminer.pdfdocument import PDFDocument
 
from pdfminer.pdfparser import PDFParser
 
from pdfminer.pdftypes import resolve1
 
from pdfminer.psparser import PSLiteral
 

	
 
from conservancy_beancount.pdfforms import fill as fillmod
 

	
 
PDFTK = shutil.which('pdftk')
 
# Per the PDF spec, 7.2.2 "Character Set" Table 1
 
WHITESPACE = b'\x00\x09\x0A\x0C\x0D\x20'
 
WHITESPACE_RE = re.compile(b'[' + WHITESPACE + b']+')
 

	
 
@pytest.fixture(scope='module')
 
def writer():
 
    return fillmod.PDFWriter()
 

	
 
def expected_re(expected):
 
    pattern = re.escape(expected)
 
    # Unescape some things that don't strictly need to be escaped.
 
    pattern = re.sub(rb'\\(<|>| )', rb'\1', pattern)
 
    # Allow arbitrary whitespace around punctuation tokens.
 
    pattern = re.sub(rb'(<<|>>|\\\[|\\\])', rb'\\s*\1\\s*', pattern)
 
    # Allow any kind of whitespace where any is required.
 
    pattern = WHITESPACE_RE.sub(rb'\\s+', pattern)
 
    return pattern
 

	
 
def utf16_str(s):
 
    return b''.join([
 
        b'(',
 
        codecs.BOM_UTF16_BE,
 
        s.encode('utf-16be'),
 
        b')',
 
    ])
 

	
 
def open_pdf(source):
 
    if isinstance(source, Path):
 
        source = source.open('rb')
 
    else:
 
        source.seek(0)
 
    return PDFDocument(PDFParser(source))
 

	
 
def merge_form(yaml_fills, form_filename='form1.fdf', form_key='FDF'):
 
    with testutil.test_path(f'pdfforms/{form_filename}') as fdf_path:
 
        pdf = open_pdf(fdf_path)
 
        pdf_fields = resolve1(pdf.catalog[form_key])['Fields']
 
        return fillmod.merge_form(yaml_fills, pdf_fields)
 

	
 
@pytest.mark.parametrize('source,expected', [
 
    (None, b'null'),
 
    (True, b'true'),
 
    (False, b'false'),
 
    (0, b'0'),
 
    (1, b'1'),
 
    (345, b'345'),
 
    (34.56, b'34.56'),
 
    ('', b'()'),
 
    ('ascii', b'(ascii)'),
 
    (')parens(', br'(\)parens\()'),
 
    ('UTF—16', utf16_str('UTF—16')),
 
    (')¤(', utf16_str(r'\)¤\(')),
 
    (PSLiteral('lit'), b'/lit'),
 
    (PSLiteral('# header'), b'/#23#20header'),
 
])
 
def test_write_scalar(writer, source, expected):
 
    actual = b''.join(writer.emit(source)).strip(WHITESPACE)
 
    assert actual == expected
 

	
 
@pytest.mark.parametrize('source,expected', [
 
    ([], b'[]'),
 
    ([1, 2, 3], b'[1 2 3]'),
 
    ([[1, 3], [2, 4], []], b'[[1 3][2 4][]]'),
 
    ({}, b'<<>>'),
 
    ({'Yes': True, 'No': False}, b'<</Yes true /No false>>'),
 
    ({'Kids': [1, 2, 3]}, b'<</Kids [1 2 3]>>'),
 
])
 
def test_write_compound(writer, source, expected):
 
    pattern = expected_re(expected)
 
    actual = b''.join(writer.emit(source))
 
    assert re.fullmatch(pattern, actual)
 

	
 
def test_write_document(writer):
 
    pysrc = {'FDF': {'Fields': [
 
        {'FT': PSLiteral('Tx'), 'T': 'text'},
 
        {'FT': PSLiteral('Btn'), 'T': 'check'},
 
    ]}}
 
    doc = io.BytesIO()
 
    writer.write_document(pysrc, doc)
 
    pdf = open_pdf(doc)
 
    assert len(pdf.catalog) == 1
 
    actual = resolve1(pdf.catalog['FDF'])
 
    assert len(actual) == 1
 
    f1, f2 = actual['Fields']
 
    assert f1['FT'].name == 'Tx'
 
    assert f1['T'] == b'text'
 
    assert f2['FT'].name == 'Btn'
 
    assert f2['T'] == b'check'
 

	
 
def test_merge():
 
    with testutil.test_path('pdfforms/form1_fill.yml').open() as yaml_file:
 
        form_yaml = yaml.safe_load(yaml_file)['fields']
 
    actual, errors = merge_form(form_yaml)
 
    assert not errors
 
    expected = {
 
        'text1_0': 'text 1.0',
 
        'button1_0': PSLiteral('1'),
 
        'button1_1': None,
 
        'text1_1': 'text 1.1',
 
        'text2_0': 'text 2.0',
 
        'button2_0': None,
 
        'button2_1': PSLiteral('2'),
 
    }
 
    for field in actual:
 
        try:
 
            expect_value = expected.pop(field['T'])
 
        except KeyError:
 
            pass
 
        else:
 
            actual_value = field.get('V')
 
            if isinstance(expect_value, PSLiteral):
 
                assert actual_value.name == expect_value.name
 
            else:
 
                assert actual_value == expect_value
 
        actual.extend(field.get('Kids', ()))
 
    assert not expected, "not all expected fields found in filled form data"
 

	
 
@pytest.mark.parametrize('name', [None, 'nonesuchfield'])
 
def test_merge_bad_name(name):
 
    fill = {'fdf': {}}
 
    if name is not None:
 
        fill['fdf']['name'] = name
 
    _, errors = merge_form([fill])
 
    error, = errors
 
    assert error.level >= logging.ERROR
 
    assert error.yaml_index == 0
 
    assert error.name == name
 

	
 
@pytest.mark.parametrize('name,yaml_type', [
 
    ('topform.text1_0', 'Btn'),
 
    ('topform.button1.button1_0', 'Tx'),
 
])
 
def test_merge_yaml_wrong_type(name, yaml_type):
 
    fill = {'fdf': {'name': name, 'type': yaml_type}}
 
    _, errors = merge_form([fill])
 
    error, = errors
 
    assert error.level >= logging.WARNING
 
    assert error.yaml_index == 0
 
    assert error.name == name
 

	
 
@pytest.mark.parametrize('value', ['', ' ', 'readwrite'])
 
def test_merge_readonly_field(value):
 
    fill = [{
 
        'fdf': {'name': 'topform.text2_R'},
 
        'value': value,
 
    }]
 
    _, errors = merge_form(fill)
 
    error, = errors
 
    assert error.level >= logging.WARNING
 
    assert error.yaml_index == 0
 
    assert error.name == 'topform.text2_R'
 

	
 
@pytest.mark.parametrize('value', [None, True, 'Yes'])
 
def test_merge_nonterminal_field(value):
 
    yaml_fills = [{
 
        'fdf': {'name': 'topform.button1'},
 
        'value': value,
 
    }]
 
    _, errors = merge_form(yaml_fills)
 
    if value is None:
 
        assert not errors
 
    else:
 
        error, = errors
 
        assert error.level >= logging.WARNING
 
        assert error.yaml_index == 0
 
        assert error.name == 'topform.button1'
 

	
 
@pytest.mark.parametrize('value', [None, True, 'Yes'])
 
def test_merge_unsupported_field_type(value):
 
    yaml_fills = [{
 
        'fdf': {'name': 'topform.submit', 'type': 'Btn'},
 
        'value': value,
 
    }]
 
    _, errors = merge_form(yaml_fills)
 
    if value is None:
 
        assert not errors
 
    else:
 
        error, = errors
 
        assert error.level >= logging.WARNING
 
        assert error.yaml_index == 0
 
        assert error.name == 'topform.submit'
 

	
 
@pytest.mark.parametrize('value', [True, False, [], {}])
 
def test_merge_unsupported_text_value(value):
 
    yaml_fills = [{
 
        'fdf': {'name': 'topform.text1_0'},
 
        'value': value,
 
    }]
 
    _, errors = merge_form(yaml_fills)
 
    error, = errors
 
    assert error.level >= logging.ERROR
 
    assert error.yaml_index == 0
 
    assert error.name == 'topform.text1_0'
 

	
 
@pytest.mark.parametrize('value', ['', 'Off', 'Yes', [], {}])
 
def test_merge_unsupported_checkbox_value(value):
 
    yaml_fills = [{
 
        'fdf': {'name': 'topform.button1.button1_0'},
 
        'value': value,
 
    }]
 
    _, errors = merge_form(yaml_fills)
 
    error, = errors
 
    assert error.level >= logging.ERROR
 
    assert error.yaml_index == 0
 
    assert error.name == 'topform.button1.button1_0'
 

	
 
def test_generate():
 
    source = [
 
        {'fdf': {'name': 'form.text', 'type': 'Tx'}, 'value': 'generated'},
 
        {'fdf': {'name': 'form.button', 'type': 'Btn'}, 'value': True},
 
    ]
 
    actual, errors = fillmod.generate_form(source)
 
    assert not errors
 
    form_root, = actual
 
    assert form_root['T'] == 'form'
 
    assert 'V' not in form_root
 
    text, checkbox = form_root['Kids']
 
    assert text['T'] == 'text'
 
    assert text['V'] == 'generated'
 
    assert not text.get('Kids')
 
    assert checkbox['T'] == 'button'
 
    assert checkbox['V'].name == 'Yes'
 
    assert not checkbox.get('Kids')
 

	
 
@pytest.mark.parametrize('options,value', [
 
    (['1'], True),
 
    (['1'], False),
 
    (['On', 'Off'], True),
 
    (['On', 'Off'], False),
 
])
 
def test_generate_checkbox_with_options(options, value):
 
    source = [{
 
        'fdf': {'name': 'cbox', 'type': 'Btn', 'options': options},
 
        'value': value,
 
    }]
 
    actual, errors = fillmod.generate_form(source)
 
    assert not errors
 
    assert actual[0]['V'].name == (options[0] if value else 'Off')
 

	
 
@pytest.mark.parametrize('yaml_type', [None, 'Ch', 'Sig'])
 
def test_generate_unsupported_field_type(yaml_type):
 
    source = [{
 
        'fdf': {'name': 'badtype', 'type': yaml_type},
 
        'value': 'unsupported type value',
 
    }]
 
    if yaml_type is None:
 
        del source[0]['fdf']['type']
 
    _, errors = fillmod.generate_form(source)
 
    error, = errors
 
    assert error.level >= logging.ERROR
 
    assert error.yaml_index == 0
 
    assert error.name == 'badtype'
 

	
 
def test_generate_invalid_field_type():
 
    source = [{
 
        'fdf': {'name': 'badtype', 'type': '<unknown>'},
 
        'value': 'unsupported type value',
 
    }]
 
    _, errors = fillmod.generate_form(source)
 
    assert errors
 
    found_msg = False
 
    for error in errors:
 
        assert error.level >= logging.ERROR
 
        assert error.yaml_index == 0
 
        assert error.name == 'badtype'
 
        found_msg = found_msg or '<unknown>' in error.errdesc
 
    assert found_msg, "no errors mentioned unknown field type"
 

	
 
@pytest.mark.parametrize('value', [True, False, [], {}])
 
def test_generate_unsupported_text_value(value):
 
    source = [{
 
        'fdf': {'name': 'badtext', 'type': 'Tx'},
 
        'value': value,
 
    }]
 
    _, errors = fillmod.generate_form(source)
 
    error, = errors
 
    assert error.level >= logging.ERROR
 
    assert error.yaml_index == 0
 
    assert error.name == 'badtext'
 

	
 
@pytest.mark.parametrize('value', ['', 'Off', 'Yes', [], {}])
 
def test_generate_unsupported_checkbox_value(value):
 
    source = [{
 
        'fdf': {'name': 'badbutton', 'type': 'Btn'},
 
        'value': value,
 
    }]
 
    _, errors = fillmod.generate_form(source)
 
    error, = errors
 
    assert error.level >= logging.ERROR
 
    assert error.yaml_index == 0
 
    assert error.name == 'badbutton'
 

	
 
def test_main_generate_fdf():
 
    arglist = ['--output-file=-', str(testutil.test_path('pdfforms/form1_fill.yml'))]
 
    stdout = io.BytesIO()
 
    stderr = io.StringIO()
 
    retcode = fillmod.main(arglist, stdout, stderr)
 
    assert retcode == 0
 
    assert not stderr.getvalue()
 
    patterns = iter(expected_re(p) for p in [
 
            b'/T (text1_0)',
 
            b'/V (text 1.0)',
 
            b'/T (button1_0)',
 
            b'/V /1',
 
            b'/T (text2_0)',
 
            b'/V (text 2.0)',
 
            b'/T (button2_1)',
 
            b'/V /2',
 
    ])
 
    pattern = next(patterns)
 
    stdout.seek(0)
 
    for line in stdout:
 
        if re.search(pattern, line):
 
            try:
 
                pattern = next(patterns)
 
            except StopIteration:
 
                break
 
    else:
 
        pytest.fail(f"pattern {pattern!r} not found in FDF output")
 

	
 
@pytest.mark.skipUnless(PDFTK, "need pdftk installed")
 
@pytest.mark.xfail(reason="`pdftk fill_form` expects a full PDF")
 
def test_main_fill_pdf():
 
    arglist = [
 
        '--pdftk', PDFTK,
 
        '--output-file', '-',
 
        str(testutil.test_path('pdfforms/form1_fill.yml')),
 
        str(testutil.test_path('pdfforms/form1.fdf')),
 
    ]
 
    stdout = io.BytesIO()
 
    stderr = io.StringIO()
 
    retcode = fillmod.main(arglist, stdout, stderr)
 
    assert retcode == 0
 
    assert not stderr.getvalue()
 
    patterns = iter(expected_re(p) for p in [
 
            b'/T (text1_0)',
 
            b'/V (text 1.0)',
 
            b'/T (button1_0)',
 
            b'/V /1',
 
            b'/T (text2_0)',
 
            b'/V (text 2.0)',
 
            b'/T (button2_1)',
 
            b'/V /2',
 
    ])
 
    pattern = next(patterns)
 
    stdout.seek(0)
 
    for line in stdout:
 
        if re.search(pattern, line):
 
            try:
 
                pattern = next(patterns)
 
            except StopIteration:
 
                break
 
    else:
 
        pytest.fail(f"pattern {pattern!r} not found in FDF output")
0 comments (0 inline, 0 general)