Changeset - 1908358c309f
[Not reviewed]
1 1 1
Brett Smith - 3 years ago 2021-01-09 20:16:07
brettcsmith@brettcsmith.org
extract: Lay the groundwork for specialized PDF extractors.

* Start a whole extract submodule.
* Parametrize main() over the FormExtractor class it uses (new extract_cls argument).
* Add a FormExtractor._transform_fields() hook.
2 files changed with 21 insertions and 9 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/pdfforms/extract/__init__.py
Show inline comments
 
file renamed from conservancy_beancount/pdfforms/extract.py to conservancy_beancount/pdfforms/extract/__init__.py
 
"""extract.py - Extract form data from PDF files"""
 
"""pdfforms/extract/__init__.py - Extract form data from PDF files"""
 
# Copyright © 2021  Brett Smith
...
 
@@ -14,5 +14,5 @@ import yaml
 

	
 
from . import fields as fieldmod
 
from . import utils as pdfutils
 
from .. import cliutil
 
from .. import fields as fieldmod
 
from .. import utils as pdfutils
 
from ... import cliutil
 

	
...
 
@@ -26,2 +26,3 @@ from typing import (
 
    BinaryIO,
 
    Iterable,
 
    Iterator,
...
 
@@ -31,2 +32,3 @@ from typing import (
 
    TextIO,
 
    Type,
 
)
...
 
@@ -102,3 +104,11 @@ class FormExtractor:
 

	
 
    def _transform_fields(self, fields: Iterable[fieldmod.FormField]) -> None:
 
        pass
 

	
 
    def extract(self) -> Mapping[str, Any]:
 
        fields = [
 
            fieldmod.FormField.by_type(resolve1(field_source))
 
            for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
 
        ]
 
        self._transform_fields(iter(fields))
 
        return {
...
 
@@ -107,5 +117,5 @@ class FormExtractor:
 
            'fields': [
 
                field
 
                for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
 
                for field in self._extract_field(fieldmod.FormField.by_type(resolve1(field_source)))
 
                yaml_field
 
                for field in fields
 
                for yaml_field in self._extract_field(field)
 
            ],
...
 
@@ -157,2 +167,3 @@ def main(arglist: Optional[Sequence[str]]=None,
 
         stderr: TextIO=sys.stderr,
 
         extract_cls: Type[FormExtractor]=FormExtractor,
 
) -> int:
...
 
@@ -162,6 +173,6 @@ def main(arglist: Optional[Sequence[str]]=None,
 
        if args.document == cliutil.STDSTREAM_PATH:
 
            extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key)
 
            extractor = extract_cls.from_file(sys.stdin.buffer, args.form_key)
 
        else:
 
            extractor = exit_stack.enter_context(
 
                FormExtractor.from_path(args.document, args.form_key),
 
                extract_cls.from_path(args.document, args.form_key),
 
            )
setup.py
Show inline comments
...
 
@@ -35,2 +35,3 @@ setup(
 
        'conservancy_beancount.pdfforms',
 
        'conservancy_beancount.pdfforms.extract',
 
        'conservancy_beancount.plugin',
0 comments (0 inline, 0 general)