Changeset - 1908358c309f
[Not reviewed]
1 1 1
Brett Smith - 4 years ago 2021-01-09 20:16:07
brettcsmith@brettcsmith.org
extract: Lay the groundwork for specialized PDF extractors.

* Start a whole extract submodule.
* Parametrize FormExtractor.
* Add a FormExtractor._transform_fields() hook.
2 files changed with 21 insertions and 9 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/pdfforms/extract/__init__.py
Show inline comments
 
file renamed from conservancy_beancount/pdfforms/extract.py to conservancy_beancount/pdfforms/extract/__init__.py
 
"""extract.py - Extract form data from PDF files"""
 
"""pdfforms/extract/__init__.py - Extract form data from PDF files"""
 
# Copyright © 2021  Brett Smith
 
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
...
 
@@ -13,7 +13,7 @@ import sys
 
import yaml
 

	
 
from . import fields as fieldmod
 
from . import utils as pdfutils
 
from .. import cliutil
 
from .. import fields as fieldmod
 
from .. import utils as pdfutils
 
from ... import cliutil
 

	
 
from pathlib import Path
...
 
@@ -25,4 +25,5 @@ from typing import (
 
    Any,
 
    BinaryIO,
 
    Iterable,
 
    Iterator,
 
    Mapping,
...
 
@@ -30,4 +31,5 @@ from typing import (
 
    Sequence,
 
    TextIO,
 
    Type,
 
)
 

	
...
 
@@ -101,12 +103,20 @@ class FormExtractor:
 
            yield from self._extract_field(kid, name)
 

	
 
    def _transform_fields(self, fields: Iterable[fieldmod.FormField]) -> None:
 
        """Hook invoked by extract() before the fields are written to the result.

        extract() passes an iterator over the FormField objects it built from
        the document's form, then goes on to extract output from that same
        list. This base implementation does nothing.

        NOTE(review): because the return value is None, an override presumably
        has to adjust the given FormField objects in place for the change to
        show up in extract()'s output -- confirm against the subclasses.
        """
        pass
 

	
 
    def extract(self) -> Mapping[str, Any]:
 
        fields = [
 
            fieldmod.FormField.by_type(resolve1(field_source))
 
            for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
 
        ]
 
        self._transform_fields(iter(fields))
 
        return {
 
            'from file': self.source,
 
            'form key': self.form_key,
 
            'fields': [
 
                field
 
                for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
 
                for field in self._extract_field(fieldmod.FormField.by_type(resolve1(field_source)))
 
                yaml_field
 
                for field in fields
 
                for yaml_field in self._extract_field(field)
 
            ],
 
        }
...
 
@@ -156,4 +166,5 @@ def main(arglist: Optional[Sequence[str]]=None,
 
         stdout: TextIO=sys.stdout,
 
         stderr: TextIO=sys.stderr,
 
         extract_cls: Type[FormExtractor]=FormExtractor,
 
) -> int:
 
    args = parse_arguments(arglist)
...
 
@@ -161,8 +172,8 @@ def main(arglist: Optional[Sequence[str]]=None,
 
    with contextlib.ExitStack() as exit_stack:
 
        if args.document == cliutil.STDSTREAM_PATH:
 
            extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key)
 
            extractor = extract_cls.from_file(sys.stdin.buffer, args.form_key)
 
        else:
 
            extractor = exit_stack.enter_context(
 
                FormExtractor.from_path(args.document, args.form_key),
 
                extract_cls.from_path(args.document, args.form_key),
 
            )
 
        extracted_form = extractor.extract()
setup.py
Show inline comments
...
 
@@ -34,4 +34,5 @@ setup(
 
        'conservancy_beancount',
 
        'conservancy_beancount.pdfforms',
 
        'conservancy_beancount.pdfforms.extract',
 
        'conservancy_beancount.plugin',
 
        'conservancy_beancount.reports',
0 comments (0 inline, 0 general)