Changeset - d8f4eac53bb7
[Not reviewed]
0 1 1
Ben Sturmfels (bsturmfels) - 2 years ago 2022-02-18 09:36:11
ben@sturm.com.au
reconcile: Improve reconciler matching and add test cases.
2 files changed with 336 insertions and 57 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/reconcile/prototype_amex_reconciler.py
Show inline comments
...
 
@@ -44,11 +44,14 @@ TODO/ISSUES:
 
"""
 
import argparse
 
import collections
 
import copy
 
import csv
 
import datetime
 
import decimal
 
import io
 
import logging
 
import os
 
import re
 
import sys
 
from typing import Callable, Dict, List, Tuple, TextIO
 

	
...
 
@@ -62,6 +65,49 @@ if not sys.warnoptions:
 
    warnings.filterwarnings('ignore', category=UserWarning, module='thefuzz.fuzz')
 
from thefuzz import fuzz  # type: ignore
 

	
 
logger = logging.getLogger()
 
logger.setLevel(logging.DEBUG)
 

	
 
# Console logging.
 
logger.addHandler(logging.StreamHandler())
 

	
 

	
 
JUNK_WORDS = [
 
    'software',
 
    'freedom',
 
    'conservancy',
 
    'conse',
 
    'payment',
 
    'echeck',
 
    'bill',
 
    'debit',
 
    'wire',
 
    'credit',
 
    "int'l",
 
    "in.l",
 
    'llc',
 
    'online',
 
    'donation',
 
]
 
JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS]
 
ZERO_RE = re.compile('^0+')
 

	
 
def remove_payee_junk(payee):
 
    for r in JUNK_WORDS_RES:
 
        payee = r.sub('', payee)
 
    payee = ZERO_RE.sub('', payee)
 
    payee = payee.replace(' - ', ' ')
 
    payee = re.sub(r'\.0\.\d+', ' ', payee)
 
    payee = payee.replace('.0', ' ')
 
    payee = payee.replace('/', ' ')
 
    payee = re.sub(re.escape('.com'), ' ', payee, flags=re.IGNORECASE)
 
    payee = re.sub(re.escape('.net'), ' ', payee, flags=re.IGNORECASE)
 
    payee = payee.replace('*', ' ')
 
    payee = ' '.join([i for i in payee.split(' ') if len(i) > 3])
 
    payee = payee.replace('-', ' ')
 
    payee.strip()
 
    return payee
 

	
 
# NOTE: Statement doesn't seem to give us a running balance or a final total.
 

	
 
def read_transactions_from_csv(f: TextIO, standardize_statement_record: Callable) -> list:
...
 
@@ -74,8 +120,10 @@ def standardize_amex_record(row: Dict, line: int) -> Dict:
 
    return {
 
        'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(),
 
        'amount': -1 * decimal.Decimal(row['Amount']),
 
        # Descriptions have too much noise, so taking just the start
 
        # significantly assists the fuzzy matching.
 
        'payee': remove_payee_junk(row['Description'] or '')[:25],
 
        'line': line,
 
        'payee': row['Description'] or '',
 
    }
 

	
 

	
...
 
@@ -83,39 +131,46 @@ def standardize_fr_record(row: Dict, line: int) -> Dict:
 
    return {
 
        'date': datetime.datetime.strptime(row['Date'], '%m/%d/%Y').date(),
 
        'amount': decimal.Decimal(row['Amount']),
 
        'payee': row['Detail'] or row['Description'],
 
        'payee': remove_payee_junk(row['Detail'] or row['Serial Num'])[:20],
 
        'line': line,
 
    }
 

	
 

	
 
def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
 
    """Turn a Beancount query result row into a standard dict representing a transaction."""
 
    # if '0000000362' in row.narration:
 
    #     import pdb; pdb.set_trace()
 
    return {
 
        'date': row.date,
 
        'amount': row.number_cost_position,
 
        'payee': row.payee if row.payee else row.narration,
 
        'payee': remove_payee_junk(row.payee or row.narration)[:30],
 
        'entity': row.entity,
 
        'check_id': row.check_id,
 
        'filename': row.filename,
 
        'line': row.posting_line,
 
        'statement': row.posting_statement,
 
        'line': row.line,
 
        'bank_statement': row.bank_statement,
 
    }
 

	
 

	
 
def format_record(record: Dict) -> str:
 
    return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:20]:<20}"
 
def format_record(records: list[dict]) -> str:
 
    if len(records) == 1:
 
        record = records[0]
 
        return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:25]:<25}"
 
    else:
 
        raise NotImplementedError
 

	
 

	
 
def sort_records(records: List) -> List:
 
    return sorted(records, key=lambda x: (x['date'], x['amount']))
 

	
 

	
 
def match_statement_and_books(statement_trans: list, books_trans: list, show_reconciled_matches: bool, csv_statement: str) -> tuple[list, list, decimal.Decimal]:
 
def match_statement_and_books2(statement_trans: list, books_trans: list):
 
    matches = []
 
    metadata_to_apply = []
 
    total_matched = decimal.Decimal(0)
 
    total_missing_from_books = decimal.Decimal(0)
 
    total_missing_from_statement = decimal.Decimal(0)
 
    # We need a realised list and should be a copy so we can safely delete
 
    # items.
 
    books_trans = list(books_trans)
 

	
 
    # Run through all the statement transactions to find a matching transaction in
 
    # run through all the statement transactions to find a matching transaction in
 
    # the books. If found, the books transaction is marked off so that it can only
 
    # be matched once. Some transactions will be matched, some will be on the
 
    # statement but not the books and some on the books but not the statement.
...
 
@@ -125,49 +180,96 @@ def match_statement_and_books(statement_trans: list, books_trans: list, show_rec
 
    # the books transactions to find an closely matching payee, then do another pass
 
    # disregarding payee.
 

	
 
    # TODO: What if th
 
    for r1 in statement_trans:
 
        for r2 in books_trans:
 
            match, note = records_match(r1, r2)
 
            if match:
 
                if not r2['statement'] or show_reconciled_matches:
 
                        matches.append([r2['date'], f'{format_record(r1)}  →  {format_record(r2)}  ✓ {note}'])
 
                        total_matched += r2['amount']
 
                if not r2['statement']:
 
                    metadata_to_apply.append((r2['filename'], r2['line'], f'    bank-statement: "{os.path.basename(csv_statement)}:{r2["line"]}"\n'))
 
                books_trans.remove(r2)
 
                break
 
        best_match_score = 0
 
        best_match_index = None
 
        best_match_note = ''
 
        matches_found = 0
 
        for i, r2 in enumerate(books_trans):
 
            score, note = records_match2(r1, r2)
 
            if score >= 0.5 and score >= best_match_score:
 
                matches_found += 1
 
                best_match_score = score
 
                best_match_index = i
 
                best_match_note = note
 
                # if note == 'payee mismatch':
 
                #     payee_only_mismatches += 1
 
        if best_match_score > 0.5 and matches_found == 1 and 'payee_mismatch' not in best_match_note or best_match_score > 0.8:
 
            if best_match_score <= 0.8:
 
                best_match_note.append('only one decent match')
 
            matches.append(([r1], [books_trans[best_match_index]], best_match_note))
 
            del books_trans[best_match_index]
 
        else:
 
            matches.append([r1['date'], f'{format_record(r1)}  →  {" ":^45}  ✗ Not in books ({os.path.basename(csv_statement)}:{r1["line"]})'])
 
            total_missing_from_books += r1['amount']
 
            matches.append(([r1], [], ['no match']))
 
    for r2 in books_trans:
 
        matches.append([r2['date'], f'{" ":^45}  →  {format_record(r2)}  ✗ Not on statement ({os.path.basename(r2["filename"])}:{r2["line"]})'])
 
        total_missing_from_statement += r2['amount']
 
    return matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement
 
        matches.append(([], [r2], ['no match']))
 
    return matches
 

	
 
def format_matches(statement_trans, books_trans, show_reconciled_matches: bool, csv_statement: str):
 
    match_output = []
 
    metadata_to_apply = []
 
    total_matched = decimal.Decimal(0)
 
    total_missing_from_books = decimal.Decimal(0)
 
    total_missing_from_statement = decimal.Decimal(0)
 

	
 
    matches = match_statement_and_books2(statement_trans, books_trans)
 
    for r1, r2, note in matches:
 
        note = ', '.join(note)
 
        note = ': ' + note if note else note
 
        if r1 and r2:
 
            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {format_record(r2)}  ✓ Matched{note}'])
 
        elif r1:
 
            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {" ":^50}  ✗ Not in books ({os.path.basename(csv_statement)}:{r1[0]["line"]})'])
 
        else:
 
            match_output.append([r2[0]['date'], f'{" ":^50}  →  {format_record(r2)}  ✗ Not on statement ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})'])
 
    return match_output, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement
 

	
 
# TODO: Time for some test cases I think.
 

	
 
# TODO: Could potentially return a score so that we can find the best match from
 
# a pool of candidates. How would be then remove that candidate from the global
 
# pool?
 

	
 
def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
 
def date_proximity(d1, d2):
 
    diff = abs((d1 - d2).days)
 
    if diff > 60:
 
        return 0
 
    else:
 
        return 1.0 - (diff / 60.0)
 

	
 
def records_match2(r1: Dict, r2: Dict) -> Tuple[bool, str]:
 
    """Do these records represent the same transaction?"""
 
    date_matches_exactly = r1['date'] == r2['date']
 
    date_matches_loosly = r1['date'] >= r2['date'] - datetime.timedelta(days=3) and r1['date'] <= r2['date'] + datetime.timedelta(days=3)
 
    amount_matches = r1['amount'] == r2['amount']
 
    payee_match_quality = fuzz.token_set_ratio(r1['payee'], r2['payee'])
 
    payee_matches = payee_match_quality > 50
 
    if date_matches_exactly and amount_matches and payee_matches:
 
        return True, 'Matched'
 
    elif date_matches_loosly and amount_matches and payee_matches:
 
        return True, 'Matched +/- 3 days'
 
    elif date_matches_exactly and amount_matches:
 
        return True, f'Matched ignoring payee'
 
    elif date_matches_loosly and amount_matches:
 
        return True, f'Matched +/- 3 days, ignoring payee'
 

	
 
    date_score = date_proximity(r1['date'], r2['date'])
 
    if r1['date'] == r2['date']:
 
        date_message = ''
 
    elif date_score > 0.0:
 
        diff = abs((r1['date'] - r2['date']).days)
 
        date_message = f'+/- {diff} days'
 
    else:
 
        date_message = 'date mismatch'
 

	
 
    if r1['amount'] == r2['amount']:
 
        amount_score, amount_message = 2.0, ''
 
    else:
 
        amount_score, amount_message = 0.0, 'amount mismatch'
 

	
 
    if r2['check_id']:
 
        payee_score = 1.0 if r1['payee'] == str(r2['check_id']) else 0
 
        payee_message = 'check # matched'
 
    else:
 
        return False, ''
 
        payee_match_quality_payee = fuzz.token_set_ratio(r1['payee'], r2['payee'])
 
        payee_match_quality_entity = fuzz.token_set_ratio(r1['payee'], r2['entity'])
 
        payee_score = max(payee_match_quality_payee, payee_match_quality_entity) / 100.0
 
        if payee_score > 0.8:
 
            payee_message = ''
 
        elif payee_score > 0.5:
 
            payee_message = 'partial payee match'
 
        else:
 
            payee_message = 'payee mismatch'
 

	
 
    overall_score = (date_score + amount_score + payee_score) / 4
 
    overall_message = [m for m in [date_message, amount_message, payee_message] if m]
 
    return overall_score, overall_message
 

	
 

	
 
# TODO: Is there a way to pull the side-effecting code out of this function?
...
 
@@ -221,6 +323,9 @@ def parse_args(argv):
 
def main(args):
 
    # TODO: Should put in a sanity check to make sure the statement you're feeding
 
    # in matches the account you've provided.
 

	
 
    # TODO: Can we open the files first, then pass the streams on to the rest of the program?
 

	
 
    if 'AMEX' in args.account:
 
        standardize_statement_record = standardize_amex_record
 
    else:
...
 
@@ -253,14 +358,16 @@ def main(args):
 

	
 
    # String concatenation looks bad, but there's no SQL injection possible here
 
    # because BQL can't write back to the Beancount files. I hope!
 
    query = f"SELECT filename, META('lineno') AS posting_line, META('bank-statement') AS posting_statement, date, number(cost(position)), payee, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}"
 
    query = f"SELECT id, filename, META('lineno') AS line, META('bank-statement') AS bank_statement, date, number(cost(position)), payee, ANY_META('entity') as entity, ANY_META('check-id') as check_id, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}"
 
    result_types, result_rows = run_query(entries, options, query)
 

	
 
    books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])
 

	
 
    matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement = match_statement_and_books(
 
    matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement = format_matches(
 
        statement_trans, books_trans, args.show_reconciled_matches, args.csv_statement)
 

	
 
    # assert books_balance == books_balance_reconciled + total_matched + total_missing_from_statement
 

	
 
    out = io.StringIO()
 
    print('-' * 155)
 
    print(f'{"Statement transaction":<38}            {"Books transaction":<44}   Notes')
...
 
@@ -268,17 +375,14 @@ def main(args):
 
    for _, output in sorted(matches):
 
        print(output)
 
    print('-' * 155)
 
    print(f'Period: {begin_date} to {end_date}')
 
    print(f'Statement/cleared balance:  {args.statement_balance:12,.2f}    (as provided by you)')
 
    print(f'Books balance (all):        {books_balance:12,.2f}    (all transactions, includes unreconciled)')
 
    print(f'Books balance (reconciled): {books_balance_reconciled:12,.2f}    (transactions with "bank-statement" tag only)')
 
    print(f'Matched above:              {total_matched:12,.2f}    ("bank-statement" tag yet to be applied)')
 
    print(f'On statement only:          {total_missing_from_books:12,.2f}    (no match in books)')
 
    print(f'On books only:              {total_missing_from_statement:12,.2f}    (no match on statement)')
 
    print(f'Statement period: {begin_date} to {end_date}')
 
    print(f'Statement/cleared balance:    {args.statement_balance:12,.2f}    (as provided by you)')
 
    print(f'Books balance (all):          {books_balance:12,.2f}    (all transactions, includes unreconciled)')
 
    print(f'Books balance (reconciled):   {books_balance_reconciled:12,.2f}    (transactions with "bank-statement" tag only)')
 
    print(f'Matched above:                {total_matched:12,.2f}    ("bank-statement" tag yet to be applied)')
 
    print(f'Unmatched on books:           {total_missing_from_statement:12,.2f}')
 
    print(f'Unmatched statement:          {total_missing_from_books:12,.2f}')
 
    print('-' * 155)
 
    # print(f'Remaning to reconcile:          {books_balance - books_balance_reconciled - total_matched:12,.2f}')
 
    # print(f'Total reconciled inc. above:    {books_balance_reconciled + total_matched:12,.2f}')
 
    # print('-' * 155)
 

	
 
    # Write statement metadata back to books
 
    if metadata_to_apply and not args.non_interactive:
tests/test_reconcile.py
Show inline comments
 
new file 100644
 
import datetime
 
import decimal
 

	
 
from conservancy_beancount.reconcile.prototype_amex_reconciler import match_statement_and_books2 as match_statement_and_books, remove_payee_junk, date_proximity
 

	
 
S1 = {
 
    'date': datetime.date(2022, 1, 1),
 
    'amount': decimal.Decimal('10.00'),
 
    'payee': 'Patreon         / Patreon   / 123456/ ST-A1B2C3D4G5H6       /',
 
    'line': 222,
 
}
 
S2 = {
 
    'date': datetime.date(2022, 1, 2),
 
    'amount': decimal.Decimal('20.00'),
 
    'payee': 'BT*LINODE           PHILADELPHIA        P',
 
    'line': 333,
 
}
 
S3 = {
 
    'date': datetime.date(2022, 1, 3),
 
    'amount': decimal.Decimal('30.00'),
 
    'payee': 'USPS PO 4067540039 0PORTLAND            OR',
 
    'line': 444,
 
}
 

	
 
B1 = {
 
    'date': datetime.date(2022, 1, 1),
 
    'amount': decimal.Decimal('10.00'),
 
    'payee': 'Patreon',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/imports.beancount',
 
    'line': 777,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B2 = {
 
    'date': datetime.date(2022, 1, 2),
 
    'amount': decimal.Decimal('20.00'),
 
    'payee': 'Linode',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 888,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B3_next_day = {
 
    'date': datetime.date(2022, 1, 4),
 
    'amount': decimal.Decimal('30.00'),
 
    'payee': 'USPS',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 999,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B3_next_week = {
 
    'date': datetime.date(2022, 1, 10),
 
    'amount': decimal.Decimal('30.00'),
 
    'payee': 'USPS',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 999,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B3_mismatch_amount = {
 
    'date': datetime.date(2022, 1, 3),
 
    'amount': decimal.Decimal('31.00'),
 
    'payee': 'USPS',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 999,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B3_payee_mismatch_1 = {
 
    'date': datetime.date(2022, 1, 3),
 
    'amount': decimal.Decimal('30.00'),
 
    'payee': 'Credit X',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 999,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B3_payee_mismatch_2 = {
 
    'date': datetime.date(2022, 1, 3),
 
    'amount': decimal.Decimal('30.00'),
 
    'payee': 'Credit Y',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 999,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 

	
 

	
 
def test_one_exact_match():
 
    statement = [S1]
 
    books = [B1]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S1], [B1], []),
 
    ]
 

	
 
def test_multiple_exact_matches():
 
    statement = [S1, S2]
 
    books = [B1, B2]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S1], [B1], []),
 
        ([S2], [B2], []),
 
    ]
 

	
 
def test_one_mismatch():
 
    statement = [S1]
 
    books = []
 
    assert match_statement_and_books(statement, books) == [
 
        ([S1], [], ['no match']),
 
    ]
 

	
 
def test_multiple_mismatches():
 
    statement = [S1]
 
    books = [B2]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S1], [], ['no match']),
 
        ([], [B2], ['no match']),
 
    ]
 

	
 
def test_next_day_matches():
 
    statement = [S3]
 
    books = [B3_next_day]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S3], [B3_next_day], ['+/- 1 days']),
 
    ]
 

	
 
def test_next_week_matches():
 
    statement = [S3]
 
    books = [B3_next_week]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S3], [B3_next_week], ['+/- 7 days']),
 
    ]
 

	
 
def test_incorrect_amount_does_not_match():
 
    statement = [S3]
 
    books = [B3_mismatch_amount]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S3], [], ['no match']),
 
        ([], [B3_mismatch_amount], ['no match']),
 
    ]
 

	
 
def test_payee_mismatch_ok_when_only_one_that_amount_and_date():
 
    statement = [S3]
 
    books = [B3_payee_mismatch_1]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S3], [B3_payee_mismatch_1], ['payee mismatch', 'only one decent match']),
 
    ]
 

	
 
def test_payee_mismatch_not_ok_when_multiple_that_amount_and_date():
 
    statement = [S3]
 
    books = [B3_payee_mismatch_1, B3_payee_mismatch_2]
 
    match = match_statement_and_books(statement, books)
 
    assert match == [
 
        ([S3], [], ['no match']),
 
        ([], [B3_payee_mismatch_1], ['no match']),
 
        ([], [B3_payee_mismatch_2], ['no match']),
 
    ]
 

	
 
# def test_subset_sum_with_same_date_and_payee():
 

	
 
def test_remove_payee_junk():
 
    assert remove_payee_junk('WIDGETSRUS INC PAYMENT 1') == 'WIDGETSRUS'
 
    assert remove_payee_junk('0000010017') == '10017'
 

	
 
def test_date_proximity():
 
    assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23)) == 1.0
 
    assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=30)) == 0.5
 
    assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=60)) == 0.0
0 comments (0 inline, 0 general)