Changeset - e764f3d0ef05
[Not reviewed]
Ben Sturmfels (bsturmfels) - 2023-02-11 04:23:15
ben@sturm.com.au
reconcile: Factor out the output formatting
2 files changed with 28 insertions and 13 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/reconcile/statement_reconciler.py
...
 
@@ -500,220 +500,226 @@ def metadata_for_match(match: Tuple[List, List, List], statement_filename: str,
    statement_filename = get_repo_relative_path(statement_filename)
    csv_filename = get_repo_relative_path(csv_filename)
    metadata = []
    statement_entries, books_entries, _ = match
    for books_entry in books_entries:
        for statement_entry in statement_entries:
            if not books_entry['bank_statement']:
                metadata.append((books_entry['filename'], books_entry['line'], f'    bank-statement: "{statement_filename}"'))
                metadata.append((books_entry['filename'], books_entry['line'], f'    bank-statement-csv: "{csv_filename}:{statement_entry["line"]}"'))
    return metadata


def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> None:
    """Insert reconciliation metadata in the books files.

    Takes a list of edits to make as tuples of form (filename, lineno, metadata):

    [
        ('2021/main.beancount', 4245, '    bank-statement: statement.pdf'),
        ('2021/main.beancount', 1057, '    bank-statement: statement.pdf'),
        ('2021/payroll.beancount', 257, '    bank-statement: statement.pdf'),
        ...,
    ]

    Beancount doesn't provide any infrastructure for programmatically
    updating the books, only appending in the case of importers. So
    we're on our own here.
    """
    file_contents: dict[str, list] = {}
    file_offsets: dict[str, int] = collections.defaultdict(int)
    # Load each books file into memory and insert the relevant metadata lines.
    # Line numbers change as we do this, so we keep track of the offset for each
    # file. Changes must be sorted by line number first or else the offsets will
    # break because we're jumping around making edits.
    for filename, line, metadata in sorted(metadata_to_apply):
        if filename not in file_contents:
            with open(filename, 'r') as f:
                file_contents[filename] = f.readlines()
        # Insert is inefficient, but fast enough for now in practice.
        file_contents[filename].insert(line + file_offsets[filename], metadata.rstrip() + '\n')
        file_offsets[filename] += 1
    # Write each updated file back to disk.
    for filename, contents in file_contents.items():
        with open(filename, 'w') as f:
            f.writelines(contents)
            print(f'Wrote {filename}.')
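
The per-file offset above is the subtle part: each insertion pushes every later line in that file down by one, so edits must be applied in ascending line order while tracking how many lines have already been added. A minimal sketch of the same idea, using hypothetical data:

    # Two hypothetical edits into one in-memory file, sorted by line number.
    lines = ['alpha\n', 'beta\n', 'gamma\n']
    edits = [(1, 'after-alpha\n'), (2, 'after-beta\n')]
    offset = 0
    for lineno, text in edits:
        # Without the running offset, the second insert would land one line early.
        lines.insert(lineno + offset, text)
        offset += 1
    assert lines == ['alpha\n', 'after-alpha\n', 'beta\n', 'after-beta\n', 'gamma\n']
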
def get_repo_relative_path(path: str) -> str:
    """Chop off the unique per-person CONSERVANCY_REPOSITORY.

    CSV and PDF statement metadata should be relative to
    CONSERVANCY_REPOSITORY, i.e. without regard to exactly where on
    your computer all the files live.
    """
    return os.path.relpath(path, start=os.getenv('CONSERVANCY_REPOSITORY'))
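
For example, os.path.relpath strips the shared repository prefix (hypothetical paths):

    # Assuming CONSERVANCY_REPOSITORY points at /home/alice/conservancy.
    repo = '/home/alice/conservancy'
    assert os.path.relpath(f'{repo}/2022/main.beancount', start=repo) == '2022/main.beancount'
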
def parse_path(path: str) -> str:
    """Validate that a file exists for use in argparse."""
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
    return path


def parse_repo_relative_path(path: str) -> str:
    """Validate that a file exists and is within $CONSERVANCY_REPOSITORY.

    For use with argparse.
    """
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
    repo = os.getenv('CONSERVANCY_REPOSITORY')
    if not repo:
        raise argparse.ArgumentTypeError('$CONSERVANCY_REPOSITORY is not set.')
    if not path.startswith(repo):
        raise argparse.ArgumentTypeError(f'File {path} does not share a common prefix with $CONSERVANCY_REPOSITORY {repo}.')
    return path
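
These validators work because argparse calls the type= callable on the raw string and turns any ArgumentTypeError into a clean usage error rather than a traceback. A sketch with a standalone parser (not part of this changeset):

    parser = argparse.ArgumentParser()
    parser.add_argument('--csv-statement', type=parse_repo_relative_path)
    # Given a missing file, parsing exits with:
    #   error: argument --csv-statement: File nope.csv does not exist.
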
def parse_decimal_with_separator(number_text: str) -> decimal.Decimal:
    """decimal.Decimal can't parse numbers with a thousands separator."""
    number_text = number_text.replace(',', '')
    return decimal.Decimal(number_text)
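
For instance, decimal.Decimal('1,234.56') raises decimal.InvalidOperation, whereas:

    assert parse_decimal_with_separator('1,234.56') == decimal.Decimal('1234.56')
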
def parse_arguments(argv: List[str]) -> argparse.Namespace:
    parser = argparse.ArgumentParser(prog=PROGNAME, description='Reconciliation helper')
    cliutil.add_version_argument(parser)
    cliutil.add_loglevel_argument(parser)
    parser.add_argument('--beancount-file', required=True, type=parse_path)
    parser.add_argument('--csv-statement', required=True, type=parse_repo_relative_path)
    parser.add_argument('--bank-statement', required=True, type=parse_repo_relative_path)
    parser.add_argument('--account', required=True, help='eg. Liabilities:CreditCard:AMEX')
    # parser.add_argument('--report-group-regex')
    parser.add_argument('--show-reconciled-matches', action='store_true')
    parser.add_argument('--non-interactive', action='store_true', help="Don't prompt to write to the books")
    # parser.add_argument('--statement-balance', type=parse_decimal_with_separator, required=True, help="A.K.A \"cleared balance\" taken from the end of the period on the PDF statement. Required because CSV statements don't include final or running totals")
    args = parser.parse_args(args=argv)
    return args
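
A typical invocation, for reference (hypothetical file paths):

    args = parse_arguments([
        '--beancount-file', '2022/main.beancount',
        '--csv-statement', 'Financial/Bank-Statements/AMEX/2022-01-12_AMEX.csv',
        '--bank-statement', 'Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf',
        '--account', 'Liabilities:CreditCard:AMEX',
    ])
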
def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, decimal.Decimal, decimal.Decimal]:
    """Calculate the totals of transactions matched/not-matched."""
    total_matched = decimal.Decimal(0)
    total_missing_from_books = decimal.Decimal(0)
    total_missing_from_statement = decimal.Decimal(0)
    for statement_entries, books_entries, _ in matches:
        if statement_entries and books_entries:
            total_matched += sum(c['amount'] for c in statement_entries)
        elif statement_entries:
            total_missing_from_books += sum(c['amount'] for c in statement_entries)
        else:
            total_missing_from_statement += sum(c['amount'] for c in books_entries)
    return total_matched, total_missing_from_books, total_missing_from_statement


def process_unmatched(statement_trans: List[dict], books_trans: List[dict]) -> List[Tuple[List, List, List]]:
    """Format the remaining unmatched transactions to be added to one single list of matches."""
    matches: List[Tuple[List, List, List]] = []
    for r1 in statement_trans:
        matches.append(([r1], [], ['no match']))
    for r2 in books_trans:
        matches.append(([], [r2], ['no match']))
    return matches


def format_output(matches, begin_date, end_date, csv_statement, show_reconciled_matches) -> str:
    with io.StringIO() as out:
        match_output = format_matches(matches, csv_statement, show_reconciled_matches)
        _, total_missing_from_books, total_missing_from_statement = totals(matches)
        print('-' * 155, file=out)
        statement_heading = f'Statement transactions {begin_date} to {end_date}'
        print(f'{statement_heading:<52}            {"Books transactions":<58}   Notes', file=out)
        print('-' * 155, file=out)
        for _, output in sorted(match_output, key=lambda x: x[0]):
            print(output, file=out)
        print('-' * 155, file=out)
        print(f'Sub-total not on statement: {total_missing_from_statement:12,.2f}', file=out)
        print(f'Sub-total not in books:     {total_missing_from_books:12,.2f}', file=out)
        print(f'Total:                      {total_missing_from_statement + total_missing_from_books:12,.2f}', file=out)
        print('-' * 155, file=out)
        return out.getvalue()
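
Building the report into io.StringIO and returning a string, rather than printing directly, is the point of this changeset: callers and tests can now capture the report and decide what to do with it. A sketch of the intended use (hypothetical dates):

    report = format_output(matches, datetime.date(2022, 1, 1),
                           datetime.date(2022, 1, 31), 'statement.csv', True)
    print(report)                 # the CLI path
    assert 'Total:' in report     # the test path
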
def main(arglist: Optional[Sequence[str]] = None,
         stdout: TextIO = sys.stdout,
         stderr: TextIO = sys.stderr,
         config: Optional[configmod.Config] = None,
         ) -> int:
    args = parse_arguments(arglist)
    cliutil.set_loglevel(logger, args.loglevel)
    if config is None:
        config = configmod.Config()
        config.load_file()

    # Validate and normalise the statement into our standard
    # transaction data structure.
    if 'AMEX' in args.account:
        validate_csv = validate_amex_csv
        read_csv = read_amex_csv
    else:
        validate_csv = validate_fr_csv
        read_csv = read_fr_csv

    with open(args.csv_statement) as f:
        sample = f.read(200)
        # Validate should return true/false and a message.
        validate_csv(sample)
        f.seek(0)
        # TODO: Needs a custom read_transactions_from_csv for each of AMEX and
        # FR since AMEX has a header row and FR doesn't.
        statement_trans = read_csv(f)

    # Dates are taken from the beginning/end of the statement.
    begin_date = statement_trans[0]['date']
    end_date = statement_trans[-1]['date']

    # Query the Beancount books data for the above period.
    #
    # There are pros and cons both to using Beancount's in-memory
    # entries list directly and to using Beancount Query Language
    # (BQL) to get a list of transactions. Using BQL because it's
    # convenient, but we don't have access to the full transaction
    # entry objects. Feels a bit strange that these approaches are so
    # disconnected.
    #
    # beancount.query.query_compile.compile() and
    # beancount.query.query_execute.filter_entries() look useful in this respect,
    # but I'm not clear on how to use compile(). An example would help.
    entries, _, options = loader.load_file(args.beancount_file)
    # String concatenation looks bad, but there's no SQL injection possible here
    # because BQL can't write back to the Beancount files. I hope!
    query = f'SELECT filename, META("lineno") AS line, META("bank-statement") AS bank_statement, date, number(cost(position)), payee, ENTRY_META("entity") as entity, ANY_META("check-id") as check_id, narration where account = "{args.account}" and date >= {begin_date} and date <= {end_date}'
    _, result_rows = run_query(entries, options, query)
    books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])

    # Apply two passes of matching, one for standard matches and one
    # for subset matches.
    matches, remaining_statement_trans, remaining_books_trans = match_statement_and_books(statement_trans, books_trans)
    subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(
        remaining_statement_trans, remaining_books_trans)
    matches.extend(subset_matches)

    # Add the remaining unmatched to make one big list of matches, successful or not.
    unmatched = process_unmatched(remaining_statement_trans, remaining_books_trans)
    matches.extend(unmatched)

    # Print out results of our matching.
    match_output = format_matches(matches, args.csv_statement, args.show_reconciled_matches)
    _, total_missing_from_books, total_missing_from_statement = totals(matches)
    print('-' * 155)
    statement_heading = f'Statement transactions {begin_date} to {end_date}'
    print(f'{statement_heading:<52}            {"Books transactions":<58}   Notes')
    print('-' * 155)
    for _, output in sorted(match_output, key=lambda x: x[0]):
        print(output)
    print('-' * 155)
    print(f'Sub-total not on statement: {total_missing_from_statement:12,.2f}')
    print(f'Sub-total not in books:     {total_missing_from_books:12,.2f}')
    print(f'Total:                      {total_missing_from_statement + total_missing_from_books:12,.2f}')
    print('-' * 155)
    print(format_output(matches, begin_date, end_date, args.csv_statement, args.show_reconciled_matches))

    # Write statement metadata back to the books.
    metadata_to_apply = []
    for match in matches:
        metadata_to_apply.extend(metadata_for_match(match, args.bank_statement, args.csv_statement))
    if metadata_to_apply and not args.non_interactive:
        print('Mark matched transactions as reconciled in the books? (y/N) ', end='')
        if input().lower() == 'y':
            write_metadata_to_books(metadata_to_apply)


entry_point = cliutil.make_entry_point(__name__, PROGNAME)

if __name__ == '__main__':
    exit(entry_point())

tests/test_reconcile.py
 
import datetime
import decimal
import io
import os
import tempfile
import textwrap

from conservancy_beancount.reconcile.statement_reconciler import (
    date_proximity,
    format_output,
    match_statement_and_books,
    metadata_for_match,
    payee_match,
    read_amex_csv,
    read_fr_csv,
    remove_duplicate_words,
    remove_payee_junk,
    subset_match,
    totals,
    write_metadata_to_books,
)

 
# These data structures represent individual transactions as taken from the
# statement ("S") or the books ("B").

# Statement transaction examples.
S1 = {
    'date': datetime.date(2022, 1, 1),
    'amount': decimal.Decimal('10.00'),
    'payee': 'Patreon         / Patreon   / 123456/ ST-A1B2C3D4G5H6       /',
    'check_id': '',
    'line': 222,
}
S2 = {
    'date': datetime.date(2022, 1, 2),
    'amount': decimal.Decimal('20.00'),
    'payee': 'BT*LINODE           PHILADELPHIA        P',
    'check_id': '',
    'line': 333,
}
S3 = {
    'date': datetime.date(2022, 1, 3),
    'amount': decimal.Decimal('30.00'),
    'payee': 'USPS PO 4067540039 0PORTLAND            OR',
    'check_id': '',
    'line': 444,
}
S4 = {
    'date': datetime.date(2022, 8, 11),
    'amount': decimal.Decimal('-2260.00'),
    'payee': 'Trust 0000000362 210',
    'check_id': '',
    'line': 555,
}

# Books transaction examples.
B1 = {
    'date': datetime.date(2022, 1, 1),
    'amount': decimal.Decimal('10.00'),
    'payee': 'Patreon',
    'check_id': '',
    'filename': '2022/imports.beancount',
    'line': 777,
    'bank_statement': '',
}
B2 = {
    'date': datetime.date(2022, 1, 2),
    'amount': decimal.Decimal('20.00'),
    'payee': 'Linode',
    'check_id': '',
    'filename': '2022/main.beancount',
    'line': 888,
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
}
B3_next_day = {
    'date': datetime.date(2022, 1, 4),
    'amount': decimal.Decimal('30.00'),
    'payee': 'USPS',
    'check_id': '',
    'filename': '2022/main.beancount',
    'line': 999,
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
}
B3_next_week = {
    'date': datetime.date(2022, 1, 10),
    'amount': decimal.Decimal('30.00'),
    'payee': 'USPS',
    'check_id': '',
    'filename': '2022/main.beancount',
    'line': 999,
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
}
B3_mismatch_amount = {
    'date': datetime.date(2022, 1, 3),
    'amount': decimal.Decimal('31.00'),
    'payee': 'USPS',
    'check_id': '',
    'filename': '2022/main.beancount',
    'line': 999,
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
}
B3_payee_mismatch_1 = {
    'date': datetime.date(2022, 1, 3),
    'amount': decimal.Decimal('30.00'),
    'payee': 'Credit X',
    'check_id': '',
    'filename': '2022/main.beancount',
    'line': 999,
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
}
B3_payee_mismatch_2 = {
    'date': datetime.date(2022, 1, 3),
    'amount': decimal.Decimal('30.00'),
    'payee': 'Credit Y',
    'check_id': '',
    'filename': '2022/main.beancount',
    'line': 999,
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
}
B3_unmatched_check_id = {
    'date': datetime.date(2022, 1, 3),
    'amount': decimal.Decimal('30.00'),
    'payee': 'USPS',
    'check_id': '1234',
    'filename': '2022/main.beancount',
    'line': 999,
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
}
B4A = {
    'date': datetime.date(2022, 8, 11),
    'amount': decimal.Decimal('-250.00'),
    'payee': 'TRUST 0000000362 ACH Retirement Plan',
    'check_id': '',
    'line': 1000,
}
B4B = {
    'date': datetime.date(2022, 8, 11),
    'amount': decimal.Decimal('-250.00'),
...
 
@@ -254,128 +255,136 @@ def test_date_proximity():
    assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=30)) == 0.5
    assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=60)) == 0.0
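
date_proximity itself isn't shown in this hunk, but these assertions pin down a linear decay reaching zero at 60 days apart. A sketch consistent with them (not necessarily the project's exact code):

    def date_proximity_sketch(d1: datetime.date, d2: datetime.date) -> float:
        # 1.0 for the same day, falling linearly to 0.0 at 60 days apart.
        days_apart = abs((d1 - d2).days)
        return max(0.0, 1.0 - days_apart / 60)

    assert date_proximity_sketch(datetime.date(2021, 8, 23), datetime.date(2021, 7, 24)) == 0.5
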
def test_remove_duplicate_words():
    assert remove_duplicate_words('Hi Foo Kow FOO') == 'Hi Foo Kow'


def test_payee_matches_when_first_word_matches():
    assert payee_match('Gandi San Francisco', 'Gandi example.com renewal 1234567') == 1.0
    assert payee_match('USPS 123456789 Portland', 'USPS John Brown') == 0.8


def test_metadata_for_match(monkeypatch):
    monkeypatch.setenv('CONSERVANCY_REPOSITORY', '.')
    assert metadata_for_match(([S1], [B1], []), 'statement.pdf', 'statement.csv') == [
        ('2022/imports.beancount', 777, '    bank-statement: "statement.pdf"'),
        ('2022/imports.beancount', 777, '    bank-statement-csv: "statement.csv:222"'),
    ]


def test_no_metadata_if_no_matches():
    assert metadata_for_match(([S1], [], ['no match']), 'statement.pdf', 'statement.csv') == []
    assert metadata_for_match(([], [B1], ['no match']), 'statement.pdf', 'statement.csv') == []
    assert metadata_for_match(([S1], [B2], ['no match']), 'statement.pdf', 'statement.csv') == []


def test_write_to_books():
    books = textwrap.dedent("""\
        2021-08-16 txn "Gandi" "transfer seleniumconf.us"
          Liabilities:CreditCard:AMEX            -15.50 USD
          Expenses:Hosting                        15.50 USD""")
    f = tempfile.NamedTemporaryFile('w', delete=False)
    f.write(books)
    f.close()
    metadata = [(f.name, 2, '    bank-statement: statement.pdf')]
    write_metadata_to_books(metadata)
    with open(f.name) as f:
        output = f.read()
    assert output == textwrap.dedent("""\
        2021-08-16 txn "Gandi" "transfer seleniumconf.us"
          Liabilities:CreditCard:AMEX            -15.50 USD
            bank-statement: statement.pdf
          Expenses:Hosting                        15.50 USD""")
    os.remove(f.name)


def test_totals():
    assert totals([
        ([S1], [B1], []),
        ([S2], [], []),
        ([], [B3_next_day], []),
    ]) == (decimal.Decimal('10'), decimal.Decimal('20'), decimal.Decimal('30'))


def test_payee_not_considered_if_check_id_present():
    # These records match aside from check-id.
    statement = [S3]
    books = [B3_unmatched_check_id]
    assert match_statement_and_books(statement, books) == (
        [],
        [S3],
        [B3_unmatched_check_id],
    )


def test_subset_sum_match():
    statement = [S4]
    books = [B4A, B4B, B4C]
    assert subset_match(statement, books) == (
        [([S4], [B4A, B4B, B4C], [])],
        [],  # No remaining statement trans.
        [],  # No remaining books trans.
    )
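
subset_match itself isn't shown in this hunk. For intuition, one way to find a group of books transactions summing to a single statement amount is brute force over combinations; a sketch, not the project's implementation:

    import itertools

    def find_summing_subset(target, candidates):
        # Try ever-larger combinations until one sums to the target amount.
        # Fine for the handful of candidate transactions per statement line.
        for size in range(2, len(candidates) + 1):
            for combo in itertools.combinations(candidates, size):
                if sum(t['amount'] for t in combo) == target:
                    return list(combo)
        return None
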
def test_subset_passes_through_all_non_matches():
    """This was used to locate a bug where some of the non-matches had
    gone missing due to mutation of books_trans."""
    statement_trans = [
        S1,  # No match
        S4,  # Match
    ]
    books_trans = [
        B2,  # No match
        B4A, B4B, B4C,  # Match
        B3_next_day, B3_next_week,  # No match
    ]
    assert subset_match(statement_trans, books_trans) == (
        [([S4], [B4A, B4B, B4C], [])],  # Matched
        [S1],  # No match: preserved intact
        [B2, B3_next_day, B3_next_week]  # No match: preserved intact
    )


def test_handles_amex_csv():
    CSV = """Date,Receipt,Description,Card Member,Account #,Amount,Extended Details,Appears On Your Statement As,Address,City/State,Zip Code,Country,Reference,Category\n08/19/2021,,Gandi.net           San Francisco,RODNEY R BROWN,-99999,28.15,"00000009999 00000009999999999999\nGandi.net\nSan Francisco\n00000009999999999999",Gandi.net           San Francisco,"NEPTUNUSSTRAAT 41-63\nHOOFDDORP",,2132 JA,NETHERLANDS (THE),'999999999999999999',Merchandise & Supplies-Internet Purchase\n"""
    expected = [
        {
            'date': datetime.date(2021, 8, 19),
            'amount': decimal.Decimal('-28.15'),
            'payee': 'Gandi San Francisco',
            'check_id': '',
            'line': 2,
        },
    ]
    assert read_amex_csv(io.StringIO(CSV)) == expected


def test_handles_fr_csv():
    CSV = """"DD99999999999","03/31/2022","LAST STATEMENT","","","$1,000.00"\n"9999999999999","04/01/2022","INCOMING WIRE","GONDOR S.S. A111111111BCDE0F","$6.50","$1,006.50"\n"DD99999999999","04/18/2022","CHECK  3741","","$-4.50","$1,002.00"\n"DD99999999999","04/30/2022","THIS STATEMENT","","","$102.00"\n"""
    expected = [
        {
            'date': datetime.date(2022, 4, 1),
            'amount': decimal.Decimal('6.50'),
            'payee': 'GONDOR S.S. A1111111',
            'check_id': '',
            'line': 2,
        },
        {
            'date': datetime.date(2022, 4, 18),
            'amount': decimal.Decimal('-4.50'),
            'payee': '',
            'check_id': '3741',
            'line': 3,
        },
    ]
    assert read_fr_csv(io.StringIO(CSV)) == expected


def test_format_output():
    statement = [S1]
    books = [B1]
    matches, _, _ = match_statement_and_books(statement, books)
    output = format_output(matches, datetime.date(2022, 1, 1), datetime.date(2022, 2, 1), 'test.csv', True)
    assert '2022-01-01:       10.00 Patreon         / Patreon   / 12345  →  2022-01-01:       10.00 Patreon                              ✓ Matched' in output