Changeset - 1908358c309f
[Not reviewed]
1 1 1
Brett Smith - 3 years ago 2021-01-09 20:16:07
brettcsmith@brettcsmith.org
extract: Lay the groundwork for specialized PDF extractors.

* Start a whole extract submodule.
* Parametrize FormExtractor.
* Add a FormExtractor._transform_fields() hook.
2 files changed with 21 insertions and 9 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/pdfforms/extract/__init__.py
Show inline comments
 
file renamed from conservancy_beancount/pdfforms/extract.py to conservancy_beancount/pdfforms/extract/__init__.py
 
"""extract.py - Extract form data from PDF files"""
 
"""pdfforms/extract/__init__.py - Extract form data from PDF files"""
 
# Copyright © 2021  Brett Smith
 
# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
 
#
...
 
@@ -12,9 +12,9 @@ import sys
 

	
 
import yaml
 

	
 
from . import fields as fieldmod
 
from . import utils as pdfutils
 
from .. import cliutil
 
from .. import fields as fieldmod
 
from .. import utils as pdfutils
 
from ... import cliutil
 

	
 
from pathlib import Path
 
from pdfminer.pdfdocument import PDFDocument  # type:ignore[import]
...
 
@@ -24,11 +24,13 @@ from pdfminer.pdftypes import resolve1  # type:ignore[import]
 
from typing import (
 
    Any,
 
    BinaryIO,
 
    Iterable,
 
    Iterator,
 
    Mapping,
 
    Optional,
 
    Sequence,
 
    TextIO,
 
    Type,
 
)
 

	
 
PROGNAME = 'pdfform-extract'
...
 
@@ -100,14 +102,22 @@ class FormExtractor:
 
        for kid in field.kids():
 
            yield from self._extract_field(kid, name)
 

	
 
    def _transform_fields(self, fields: Iterable[fieldmod.FormField]) -> None:
        """Hook for adjusting form fields before they are extracted.

        ``extract()`` calls this once with an iterator over the fields it
        read from the document, before building its result, and ignores the
        return value. This base implementation does nothing.

        NOTE(review): presumably specialized extractors override this to
        modify the fields in place; no override is visible here -- confirm.
        """
 
        pass
 

	
 
    def extract(self) -> Mapping[str, Any]:
 
        fields = [
 
            fieldmod.FormField.by_type(resolve1(field_source))
 
            for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
 
        ]
 
        self._transform_fields(iter(fields))
 
        return {
 
            'from file': self.source,
 
            'form key': self.form_key,
 
            'fields': [
 
                field
 
                for field_source in resolve1(self.document.catalog[self.form_key])['Fields']
 
                for field in self._extract_field(fieldmod.FormField.by_type(resolve1(field_source)))
 
                yaml_field
 
                for field in fields
 
                for yaml_field in self._extract_field(field)
 
            ],
 
        }
 

	
...
 
@@ -155,15 +165,16 @@ Use `-` to read from stdin.
 
def main(arglist: Optional[Sequence[str]]=None,
 
         stdout: TextIO=sys.stdout,
 
         stderr: TextIO=sys.stderr,
 
         extract_cls: Type[FormExtractor]=FormExtractor,
 
) -> int:
 
    args = parse_arguments(arglist)
 
    cliutil.set_loglevel(logger, args.loglevel)
 
    with contextlib.ExitStack() as exit_stack:
 
        if args.document == cliutil.STDSTREAM_PATH:
 
            extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key)
 
            extractor = extract_cls.from_file(sys.stdin.buffer, args.form_key)
 
        else:
 
            extractor = exit_stack.enter_context(
 
                FormExtractor.from_path(args.document, args.form_key),
 
                extract_cls.from_path(args.document, args.form_key),
 
            )
 
        extracted_form = extractor.extract()
 
    with contextlib.ExitStack() as exit_stack:
setup.py
Show inline comments
...
 
@@ -33,6 +33,7 @@ setup(
 
    packages=[
 
        'conservancy_beancount',
 
        'conservancy_beancount.pdfforms',
 
        'conservancy_beancount.pdfforms.extract',
 
        'conservancy_beancount.plugin',
 
        'conservancy_beancount.reports',
 
        'conservancy_beancount.tools',
0 comments (0 inline, 0 general)