From 1c95c1b1b1fb4cb0cebd4c358e8b9ac0a3459694 2021-01-09 20:17:55
From: Brett Smith
Date: 2021-01-09 20:17:55
Subject: [PATCH] irs990scheduleA: New PDF extractor.

---
Note: the new-file hunk below carries review changes (added docstrings and
a _make_shifts fix for shift_count=0), so its content no longer matches the
blob ID recorded on that file's "index" line.

diff --git a/conservancy_beancount/pdfforms/extract/irs990scheduleA.py b/conservancy_beancount/pdfforms/extract/irs990scheduleA.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a94ad7c48b918422654288537685232d17378a4
--- /dev/null
+++ b/conservancy_beancount/pdfforms/extract/irs990scheduleA.py
@@ -0,0 +1,107 @@
+"""irs990scheduleA.py - Extract IRS 990 Schedule A form data from the prior FY"""
+# Copyright © 2021 Brett Smith
+# License: AGPLv3-or-later WITH Beancount-Plugin-Additional-Permission-1.0
+#
+# Full copyright and licensing details can be found at toplevel file
+# LICENSE.txt in the repository.
+
+import collections
+import functools
+import itertools
+import logging
+
+from . import FormExtractor, main
+from .. import fields as fieldmod
+from ... import cliutil
+
+from typing import (
+    Iterable,
+    Iterator,
+    Optional,
+    Tuple,
+)
+
+PROGNAME = 'pdfform-extract-irs990scheduleA'
+logger = logging.getLogger('conservancy_beancount.pdfforms.extract.irs990scheduleA')
+
+def _make_shifts(
+        key_fmt: str,
+        start_count: int,
+        shift_count: int=4,
+        clear_count: int=2,
+) -> Iterator[Tuple[str, Optional[str]]]:
+    """Yield ``(destination, source)`` field-name pairs for one row of fields.
+
+    Field names are built with ``key_fmt.format(index)``. The first
+    ``shift_count`` fields, starting at index ``start_count``, are each paired
+    with the name of the next field, and the ``clear_count`` fields after
+    those are paired with ``None``.
+
+    The boundary between the two runs is computed directly rather than read
+    back from the first loop's variable, so ``shift_count=0`` yields only the
+    ``None`` pairs instead of raising UnboundLocalError.
+    """
+    clear_start = start_count + shift_count
+    for index in range(start_count, clear_start):
+        yield (key_fmt.format(index), key_fmt.format(index + 1))
+    for index in range(clear_start, clear_start + clear_count):
+        yield (key_fmt.format(index), None)
+
+class IRS990ScheduleAExtractor(FormExtractor):
+    """Form extractor that moves or blanks selected Schedule A page 2 values.
+
+    ``_FIELD_SOURCES`` maps each field name to rewrite to the name of the
+    field whose value it receives, or to ``None`` when it is blanked.
+    """
+    # Fields that are blanked without receiving a value from another field.
+    _BLANK_FIELDS = [
+        'topmostSubform[0].Page2[0].Table_SectionA[0].Line5[0].f2_25[0]',
+        'topmostSubform[0].Page2[0].Table_SectionA[0].Line6[0].f2_26[0]',
+        'topmostSubform[0].Page2[0].Table_SectionB[0].Line11[0].f2_51[0]',
+        'topmostSubform[0].Page2[0].f2_52[0]',
+        'topmostSubform[0].Page2[0].f2_53[0]',
+        'topmostSubform[0].Page2[0].c2_2[0]',
+        'topmostSubform[0].Page2[0].c2_4[0]',
+    ]
+    _FIELD_SOURCES = dict(itertools.chain(
+        _make_shifts('topmostSubform[0].Page2[0].Table_SectionA[0].Line1[0].f2_{}[0]', 1),
+        _make_shifts('topmostSubform[0].Page2[0].Table_SectionA[0].Line2[0].f2_{}[0]', 7),
+        _make_shifts('topmostSubform[0].Page2[0].Table_SectionA[0].Line3[0].f2_{}[0]', 13),
+        _make_shifts('topmostSubform[0].Page2[0].Table_SectionA[0].Line4[0].f2_{}[0]', 19),
+        _make_shifts('topmostSubform[0].Page2[0].Table_SectionB[0].Line7[0].f2_{}[0]', 27),
+        _make_shifts('topmostSubform[0].Page2[0].Table_SectionB[0].Line8[0].f2_{}[0]', 33),
+        _make_shifts('topmostSubform[0].Page2[0].Table_SectionB[0].Line9[0].f2_{}[0]', 39),
+        _make_shifts('topmostSubform[0].Page2[0].Table_SectionB[0].Line10[0].f2_{}[0]', 45),
+        iter((key, None) for key in _BLANK_FIELDS),
+    ))
+    # Part II Section C
+    _FIELD_SOURCES['topmostSubform[0].Page2[0].f2_54[0]'] = 'topmostSubform[0].Page2[0].f2_53[0]'
+    _FIELD_SOURCES['topmostSubform[0].Page2[0].c2_3[0]'] = 'topmostSubform[0].Page2[0].c2_2[0]'
+    _FIELD_SOURCES['topmostSubform[0].Page2[0].c2_5[0]'] = 'topmostSubform[0].Page2[0].c2_4[0]'
+
+    def _transform_fields(self, fields: Iterable[fieldmod.FormField]) -> None:
+        """Set new values on the mapped form fields per ``_FIELD_SOURCES``.
+
+        A KeyError is raised if a field named in ``_FIELD_SOURCES`` is
+        missing from the mapping built from ``fields``.
+        """
+        fields_map = dict(
+            kvpair
+            for field in fields
+            for kvpair in field.as_mapping()
+        )
+        # Read every source value before writing anything: several fields
+        # are both a source and a destination.
+        new_values = {
+            key: None if src_key is None else fields_map[src_key].value()
+            for key, src_key in self._FIELD_SOURCES.items()
+        }
+        for key, value in new_values.items():
+            fields_map[key].set_value(value)
+
+
+main = functools.partial(main, extract_cls=IRS990ScheduleAExtractor)
+entry_point = cliutil.make_entry_point(__name__, PROGNAME)
+
+if __name__ == '__main__':
+    exit(entry_point())
diff --git a/setup.py b/setup.py
index d3f404f4d933424eb78382b5ae7d168bb869469d..36dc6ffbc80102f86aaf954ab0ba36d79c913cc7 100755
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ from setuptools import setup
 setup(
     name='conservancy_beancount',
     description="Plugin, library, and reports for reading Conservancy's books",
-    version='1.15.2',
+    version='1.15.3',
     author='Software Freedom Conservancy',
     author_email='info@sfconservancy.org',
     license='GNU AGPLv3+',
@@ -50,6 +50,7 @@ setup(
             'ledger-report = conservancy_beancount.reports.ledger:entry_point',
             'opening-balances = conservancy_beancount.tools.opening_balances:entry_point',
             'pdfform-extract = conservancy_beancount.pdfforms.extract:entry_point',
+            'pdfform-extract-irs990scheduleA = conservancy_beancount.pdfforms.extract.irs990scheduleA:entry_point',
             'pdfform-fill = conservancy_beancount.pdfforms.fill:entry_point',
             'split-ods-links = conservancy_beancount.tools.split_ods_links:entry_point',
         ],