diff --git a/conservancy_beancount/pdfforms/extract.py b/conservancy_beancount/pdfforms/extract/__init__.py similarity index 85% rename from conservancy_beancount/pdfforms/extract.py rename to conservancy_beancount/pdfforms/extract/__init__.py index 01c792139627e1bc163cf9c395a9e5a7b67be721..a2230dfbddcaee8dcadf48a6b462aa9fc0cf7351 100644 --- a/conservancy_beancount/pdfforms/extract.py +++ b/conservancy_beancount/pdfforms/extract/__init__.py @@ -1,4 +1,4 @@ -"""extract.py - Extract form data from PDF files""" +"""pdfforms/extract/__init__.py - Extract form data from PDF files""" # Copyright © 2021 Brett Smith # License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0 # @@ -12,9 +12,9 @@ import sys import yaml -from . import fields as fieldmod -from . import utils as pdfutils -from .. import cliutil +from .. import fields as fieldmod +from .. import utils as pdfutils +from ... import cliutil from pathlib import Path from pdfminer.pdfdocument import PDFDocument # type:ignore[import] @@ -24,11 +24,13 @@ from pdfminer.pdftypes import resolve1 # type:ignore[import] from typing import ( Any, BinaryIO, + Iterable, Iterator, Mapping, Optional, Sequence, TextIO, + Type, ) PROGNAME = 'pdfform-extract' @@ -100,14 +102,22 @@ class FormExtractor: for kid in field.kids(): yield from self._extract_field(kid, name) + def _transform_fields(self, fields: Iterable[fieldmod.FormField]) -> None: + pass + def extract(self) -> Mapping[str, Any]: + fields = [ + fieldmod.FormField.by_type(resolve1(field_source)) + for field_source in resolve1(self.document.catalog[self.form_key])['Fields'] + ] + self._transform_fields(iter(fields)) return { 'from file': self.source, 'form key': self.form_key, 'fields': [ - field - for field_source in resolve1(self.document.catalog[self.form_key])['Fields'] - for field in self._extract_field(fieldmod.FormField.by_type(resolve1(field_source))) + yaml_field + for field in fields + for yaml_field in self._extract_field(field) ], } @@ -155,15 +165,16 @@ Use `-` to read from stdin. def main(arglist: Optional[Sequence[str]]=None, stdout: TextIO=sys.stdout, stderr: TextIO=sys.stderr, + extract_cls: Type[FormExtractor]=FormExtractor, ) -> int: args = parse_arguments(arglist) cliutil.set_loglevel(logger, args.loglevel) with contextlib.ExitStack() as exit_stack: if args.document == cliutil.STDSTREAM_PATH: - extractor = FormExtractor.from_file(sys.stdin.buffer, args.form_key) + extractor = extract_cls.from_file(sys.stdin.buffer, args.form_key) else: extractor = exit_stack.enter_context( - FormExtractor.from_path(args.document, args.form_key), + extract_cls.from_path(args.document, args.form_key), ) extracted_form = extractor.extract() with contextlib.ExitStack() as exit_stack: diff --git a/setup.py b/setup.py index c2b78971fff288476a39cf37b0645b77b5ab48e4..d3f404f4d933424eb78382b5ae7d168bb869469d 100755 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ setup( packages=[ 'conservancy_beancount', 'conservancy_beancount.pdfforms', + 'conservancy_beancount.pdfforms.extract', 'conservancy_beancount.plugin', 'conservancy_beancount.reports', 'conservancy_beancount.tools',