Changeset - 1908358c309f
[Not reviewed]
1 1 1
Brett Smith - 3 years ago 2021-01-09 20:16:07
brettcsmith@brettcsmith.org
extract: Lay the groundwork for specialized PDF extractors.

* Start a whole extract submodule.
* Parametrize FormExtractor.
* Add a FormExtractor._transform_fields() hook.
2 files changed with 21 insertions and 9 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/pdfforms/extract/__init__.py
Show inline comments
 
file renamed from conservancy_beancount/pdfforms/extract.py to conservancy_beancount/pdfforms/extract/__init__.py
 
"""extract.py - Extract form data from PDF files"""
 
"""pdfforms/extract/__init__.py - Extract form data from PDF files"""
 
# Copyright © 2021  Brett Smith
 
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
 
#
 
# Full copyright and licensing details can be found at toplevel file
 
# LICENSE.txt in the repository.
 

	
...
 
@@ -9,29 +9,31 @@ import argparse
 
import contextlib
 
import logging
 
import sys
 

	
 
import yaml
 

	
 
from . import fields as fieldmod
 
from . import utils as pdfutils
 
from .. import cliutil
 
from .. import fields as fieldmod
 
from .. import utils as pdfutils
 
from ... import cliutil
 

	
 
from pathlib import Path
 
from pdfminer.pdfdocument import PDFDocument  # type:ignore[import]
 
from pdfminer.pdfparser import PDFParser  # type:ignore[import]
 
from pdfminer.pdftypes import resolve1  # type:ignore[import]
 

	
 
from typing import (
 
    Any,
 
    BinaryIO,
 
    Iterable,
 
    Iterator,
 
    Mapping,
 
    Optional,
 
    Sequence,
 
    TextIO,
 
    Type,
 
)
 

	
 
PROGNAME = 'pdfform-extract'
 
logger = logging.getLogger('conservancy_beancount.pdfforms.extract')
 

	
 
class FormExtractor:
...
 
@@ -97,20 +99,28 @@ class FormExtractor:
 
                retval['fdf']['options'] = field.options()
 
            yield retval
 
        name += '.'
 
        for kid in field.kids():
 
            yield from self._extract_field(kid, name)
 

	
 
    def _transform_fields(self, fields: Iterable[fieldmod.FormField]) -> None:
        """Hook called by ``extract()`` before the fields are serialized.

        ``extract()`` passes an iterator over the ``FormField`` objects it
        built from the form's ``'Fields'`` entry, then generates its output
        from those same objects. The base implementation does nothing.

        NOTE(review): presumably specialized extractors override this to
        adjust fields in place -- confirm against the subclasses.
        """
 
        pass
 

	
 
    def extract(self) -> Mapping[str, Any]:
 
        fields = [
 
            fieldmod.FormField.by_type(resolve1(field_source))
 
            for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
 
        ]
 
        self._transform_fields(iter(fields))
 
        return {
 
            'from file': self.source,
 
            'form key': self.form_key,
 
            'fields': [
 
                field
 
                for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
 
                for field in self._extract_field(fieldmod.FormField.by_type(resolve1(field_source)))
 
                yaml_field
 
                for field in fields
 
                for yaml_field in self._extract_field(field)
 
            ],
 
        }
 

	
 

	
 
class FormYAMLDumper(yaml.dumper.SafeDumper):
 
    def represent_mapping(self, tag: Any, value: Any, flow_style: Any=None) -> Any:
...
 
@@ -152,21 +162,22 @@ Use `-` to read from stdin.
 
""")
 
    return parser.parse_args(arglist)
 

	
 
def main(arglist: Optional[Sequence[str]]=None,
 
         stdout: TextIO=sys.stdout,
 
         stderr: TextIO=sys.stderr,
 
         extract_cls: Type[FormExtractor]=FormExtractor,
 
) -> int:
 
    args = parse_arguments(arglist)
 
    cliutil.set_loglevel(logger, args.loglevel)
 
    with contextlib.ExitStack() as exit_stack:
 
        if args.document == cliutil.STDSTREAM_PATH:
 
            extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key)
 
            extractor = extract_cls.from_file(sys.stdin.buffer, args.form_key)
 
        else:
 
            extractor = exit_stack.enter_context(
 
                FormExtractor.from_path(args.document, args.form_key),
 
                extract_cls.from_path(args.document, args.form_key),
 
            )
 
        extracted_form = extractor.extract()
 
    with contextlib.ExitStack() as exit_stack:
 
        out_file = cliutil.text_output(args.output_file, stdout)
 
        if out_file is not stdout:
 
            exit_stack.enter_context(out_file)
setup.py
Show inline comments
...
 
@@ -30,12 +30,13 @@ setup(
 
        'pytest',  # Debian:python3-pytest
 
    ],
 

	
 
    packages=[
 
        'conservancy_beancount',
 
        'conservancy_beancount.pdfforms',
 
        'conservancy_beancount.pdfforms.extract',
 
        'conservancy_beancount.plugin',
 
        'conservancy_beancount.reports',
 
        'conservancy_beancount.tools',
 
    ],
 
    entry_points={
 
        'console_scripts': [
0 comments (0 inline, 0 general)