Changeset - d8f4eac53bb7
[Not reviewed]
0 1 1
Ben Sturmfels (bsturmfels) - 2 years ago 2022-02-18 09:36:11
ben@sturm.com.au
reconcile: Improve reconciler matching and add test cases.
2 files changed with 336 insertions and 57 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/reconcile/prototype_amex_reconciler.py
Show inline comments
...
 
@@ -46,2 +46,3 @@ import argparse
 
import collections
 
import copy
 
import csv
...
 
@@ -50,3 +51,5 @@ import decimal
 
import io
 
import logging
 
import os
 
import re
 
import sys
...
 
@@ -64,2 +67,45 @@ from thefuzz import fuzz  # type: ignore
 

	
 
logger = logging.getLogger()
 
logger.setLevel(logging.DEBUG)
 

	
 
# Console logging.
 
logger.addHandler(logging.StreamHandler())
 

	
 

	
 
JUNK_WORDS = [
 
    'software',
 
    'freedom',
 
    'conservancy',
 
    'conse',
 
    'payment',
 
    'echeck',
 
    'bill',
 
    'debit',
 
    'wire',
 
    'credit',
 
    "int'l",
 
    "in.l",
 
    'llc',
 
    'online',
 
    'donation',
 
]
 
JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS]
 
ZERO_RE = re.compile('^0+')
 

	
 
def remove_payee_junk(payee):
 
    for r in JUNK_WORDS_RES:
 
        payee = r.sub('', payee)
 
    payee = ZERO_RE.sub('', payee)
 
    payee = payee.replace(' - ', ' ')
 
    payee = re.sub(r'\.0\.\d+', ' ', payee)
 
    payee = payee.replace('.0', ' ')
 
    payee = payee.replace('/', ' ')
 
    payee = re.sub(re.escape('.com'), ' ', payee, flags=re.IGNORECASE)
 
    payee = re.sub(re.escape('.net'), ' ', payee, flags=re.IGNORECASE)
 
    payee = payee.replace('*', ' ')
 
    payee = ' '.join([i for i in payee.split(' ') if len(i) > 3])
 
    payee = payee.replace('-', ' ')
 
    payee.strip()
 
    return payee
 

	
 
# NOTE: Statement doesn't seem to give us a running balance or a final total.
...
 
@@ -76,4 +122,6 @@ def standardize_amex_record(row: Dict, line: int) -> Dict:
 
        'amount': -1 * decimal.Decimal(row['Amount']),
 
        # Descriptions have too much noise, so taking just the start
 
        # significantly assists the fuzzy matching.
 
        'payee': remove_payee_junk(row['Description'] or '')[:25],
 
        'line': line,
 
        'payee': row['Description'] or '',
 
    }
...
 
@@ -85,3 +133,3 @@ def standardize_fr_record(row: Dict, line: int) -> Dict:
 
        'amount': decimal.Decimal(row['Amount']),
 
        'payee': row['Detail'] or row['Description'],
 
        'payee': remove_payee_junk(row['Detail'] or row['Serial Num'])[:20],
 
        'line': line,
...
 
@@ -92,2 +140,4 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
 
    """Turn a Beancount query result row into a standard dict representing a transaction."""
 
    # if '0000000362' in row.narration:
 
    #     import pdb; pdb.set_trace()
 
    return {
...
 
@@ -95,6 +145,8 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
 
        'amount': row.number_cost_position,
 
        'payee': row.payee if row.payee else row.narration,
 
        'payee': remove_payee_junk(row.payee or row.narration)[:30],
 
        'entity': row.entity,
 
        'check_id': row.check_id,
 
        'filename': row.filename,
 
        'line': row.posting_line,
 
        'statement': row.posting_statement,
 
        'line': row.line,
 
        'bank_statement': row.bank_statement,
 
    }
...
 
@@ -102,4 +154,8 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
 

	
 
def format_record(record: Dict) -> str:
 
    return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:20]:<20}"
 
def format_record(records: list[dict]) -> str:
 
    if len(records) == 1:
 
        record = records[0]
 
        return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:25]:<25}"
 
    else:
 
        raise NotImplementedError
 

	
...
 
@@ -110,10 +166,9 @@ def sort_records(records: List) -> List:
 

	
 
def match_statement_and_books(statement_trans: list, books_trans: list, show_reconciled_matches: bool, csv_statement: str) -> tuple[list, list, decimal.Decimal]:
 
def match_statement_and_books2(statement_trans: list, books_trans: list):
 
    matches = []
 
    metadata_to_apply = []
 
    total_matched = decimal.Decimal(0)
 
    total_missing_from_books = decimal.Decimal(0)
 
    total_missing_from_statement = decimal.Decimal(0)
 
    # We need a realised list and should be a copy so we can safely delete
 
    # items.
 
    books_trans = list(books_trans)
 

	
 
    # Run through all the statement transactions to find a matching transaction in
 
    # run through all the statement transactions to find a matching transaction in
 
    # the books. If found, the books transaction is marked off so that it can only
...
 
@@ -127,23 +182,46 @@ def match_statement_and_books(statement_trans: list, books_trans: list, show_rec
 

	
 
    # TODO: What if th
 
    for r1 in statement_trans:
 
        for r2 in books_trans:
 
            match, note = records_match(r1, r2)
 
            if match:
 
                if not r2['statement'] or show_reconciled_matches:
 
                        matches.append([r2['date'], f'{format_record(r1)}  →  {format_record(r2)}  ✓ {note}'])
 
                        total_matched += r2['amount']
 
                if not r2['statement']:
 
                    metadata_to_apply.append((r2['filename'], r2['line'], f'    bank-statement: "{os.path.basename(csv_statement)}:{r2["line"]}"\n'))
 
                books_trans.remove(r2)
 
                break
 
        best_match_score = 0
 
        best_match_index = None
 
        best_match_note = ''
 
        matches_found = 0
 
        for i, r2 in enumerate(books_trans):
 
            score, note = records_match2(r1, r2)
 
            if score >= 0.5 and score >= best_match_score:
 
                matches_found += 1
 
                best_match_score = score
 
                best_match_index = i
 
                best_match_note = note
 
                # if note == 'payee mismatch':
 
                #     payee_only_mismatches += 1
 
        if best_match_score > 0.5 and matches_found == 1 and 'payee_mismatch' not in best_match_note or best_match_score > 0.8:
 
            if best_match_score <= 0.8:
 
                best_match_note.append('only one decent match')
 
            matches.append(([r1], [books_trans[best_match_index]], best_match_note))
 
            del books_trans[best_match_index]
 
        else:
 
            matches.append([r1['date'], f'{format_record(r1)}  →  {" ":^45}  ✗ Not in books ({os.path.basename(csv_statement)}:{r1["line"]})'])
 
            total_missing_from_books += r1['amount']
 
            matches.append(([r1], [], ['no match']))
 
    for r2 in books_trans:
 
        matches.append([r2['date'], f'{" ":^45}  →  {format_record(r2)}  ✗ Not on statement ({os.path.basename(r2["filename"])}:{r2["line"]})'])
 
        total_missing_from_statement += r2['amount']
 
    return matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement
 
        matches.append(([], [r2], ['no match']))
 
    return matches
 

	
 
def format_matches(statement_trans, books_trans, show_reconciled_matches: bool, csv_statement: str):
 
    match_output = []
 
    metadata_to_apply = []
 
    total_matched = decimal.Decimal(0)
 
    total_missing_from_books = decimal.Decimal(0)
 
    total_missing_from_statement = decimal.Decimal(0)
 

	
 
    matches = match_statement_and_books2(statement_trans, books_trans)
 
    for r1, r2, note in matches:
 
        note = ', '.join(note)
 
        note = ': ' + note if note else note
 
        if r1 and r2:
 
            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {format_record(r2)}  ✓ Matched{note}'])
 
        elif r1:
 
            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {" ":^50}  ✗ Not in books ({os.path.basename(csv_statement)}:{r1[0]["line"]})'])
 
        else:
 
            match_output.append([r2[0]['date'], f'{" ":^50}  →  {format_record(r2)}  ✗ Not on statement ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})'])
 
    return match_output, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement
 

	
 
# TODO: Time for some test cases I think.
 

	
...
 
@@ -153,19 +231,43 @@ def match_statement_and_books(statement_trans: list, books_trans: list, show_rec
 

	
 
def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
 
def date_proximity(d1, d2):
 
    diff = abs((d1 - d2).days)
 
    if diff > 60:
 
        return 0
 
    else:
 
        return 1.0 - (diff / 60.0)
 

	
 
def records_match2(r1: Dict, r2: Dict) -> Tuple[bool, str]:
 
    """Do these records represent the same transaction?"""
 
    date_matches_exactly = r1['date'] == r2['date']
 
    date_matches_loosly = r1['date'] >= r2['date'] - datetime.timedelta(days=3) and r1['date'] <= r2['date'] + datetime.timedelta(days=3)
 
    amount_matches = r1['amount'] == r2['amount']
 
    payee_match_quality = fuzz.token_set_ratio(r1['payee'], r2['payee'])
 
    payee_matches = payee_match_quality > 50
 
    if date_matches_exactly and amount_matches and payee_matches:
 
        return True, 'Matched'
 
    elif date_matches_loosly and amount_matches and payee_matches:
 
        return True, 'Matched +/- 3 days'
 
    elif date_matches_exactly and amount_matches:
 
        return True, f'Matched ignoring payee'
 
    elif date_matches_loosly and amount_matches:
 
        return True, f'Matched +/- 3 days, ignoring payee'
 

	
 
    date_score = date_proximity(r1['date'], r2['date'])
 
    if r1['date'] == r2['date']:
 
        date_message = ''
 
    elif date_score > 0.0:
 
        diff = abs((r1['date'] - r2['date']).days)
 
        date_message = f'+/- {diff} days'
 
    else:
 
        date_message = 'date mismatch'
 

	
 
    if r1['amount'] == r2['amount']:
 
        amount_score, amount_message = 2.0, ''
 
    else:
 
        amount_score, amount_message = 0.0, 'amount mismatch'
 

	
 
    if r2['check_id']:
 
        payee_score = 1.0 if r1['payee'] == str(r2['check_id']) else 0
 
        payee_message = 'check # matched'
 
    else:
 
        return False, ''
 
        payee_match_quality_payee = fuzz.token_set_ratio(r1['payee'], r2['payee'])
 
        payee_match_quality_entity = fuzz.token_set_ratio(r1['payee'], r2['entity'])
 
        payee_score = max(payee_match_quality_payee, payee_match_quality_entity) / 100.0
 
        if payee_score > 0.8:
 
            payee_message = ''
 
        elif payee_score > 0.5:
 
            payee_message = 'partial payee match'
 
        else:
 
            payee_message = 'payee mismatch'
 

	
 
    overall_score = (date_score + amount_score + payee_score) / 4
 
    overall_message = [m for m in [date_message, amount_message, payee_message] if m]
 
    return overall_score, overall_message
 

	
...
 
@@ -223,2 +325,5 @@ def main(args):
 
    # in matches the account you've provided.
 

	
 
    # TODO: Can we open the files first, then pass the streams on to the rest of the program?
 

	
 
    if 'AMEX' in args.account:
...
 
@@ -255,3 +360,3 @@ def main(args):
 
    # because BQL can't write back to the Beancount files. I hope!
 
    query = f"SELECT filename, META('lineno') AS posting_line, META('bank-statement') AS posting_statement, date, number(cost(position)), payee, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}"
 
    query = f"SELECT id, filename, META('lineno') AS line, META('bank-statement') AS bank_statement, date, number(cost(position)), payee, ANY_META('entity') as entity, ANY_META('check-id') as check_id, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}"
 
    result_types, result_rows = run_query(entries, options, query)
...
 
@@ -260,5 +365,7 @@ def main(args):
 

	
 
    matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement = match_statement_and_books(
 
    matches, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement = format_matches(
 
        statement_trans, books_trans, args.show_reconciled_matches, args.csv_statement)
 

	
 
    # assert books_balance == books_balance_reconciled + total_matched + total_missing_from_statement
 

	
 
    out = io.StringIO()
...
 
@@ -270,13 +377,10 @@ def main(args):
 
    print('-' * 155)
 
    print(f'Period: {begin_date} to {end_date}')
 
    print(f'Statement/cleared balance:  {args.statement_balance:12,.2f}    (as provided by you)')
 
    print(f'Books balance (all):        {books_balance:12,.2f}    (all transactions, includes unreconciled)')
 
    print(f'Books balance (reconciled): {books_balance_reconciled:12,.2f}    (transactions with "bank-statement" tag only)')
 
    print(f'Matched above:              {total_matched:12,.2f}    ("bank-statement" tag yet to be applied)')
 
    print(f'On statement only:          {total_missing_from_books:12,.2f}    (no match in books)')
 
    print(f'On books only:              {total_missing_from_statement:12,.2f}    (no match on statement)')
 
    print(f'Statement period: {begin_date} to {end_date}')
 
    print(f'Statement/cleared balance:    {args.statement_balance:12,.2f}    (as provided by you)')
 
    print(f'Books balance (all):          {books_balance:12,.2f}    (all transactions, includes unreconciled)')
 
    print(f'Books balance (reconciled):   {books_balance_reconciled:12,.2f}    (transactions with "bank-statement" tag only)')
 
    print(f'Matched above:                {total_matched:12,.2f}    ("bank-statement" tag yet to be applied)')
 
    print(f'Unmatched on books:           {total_missing_from_statement:12,.2f}')
 
    print(f'Unmatched statement:          {total_missing_from_books:12,.2f}')
 
    print('-' * 155)
 
    # print(f'Remaning to reconcile:          {books_balance - books_balance_reconciled - total_matched:12,.2f}')
 
    # print(f'Total reconciled inc. above:    {books_balance_reconciled + total_matched:12,.2f}')
 
    # print('-' * 155)
 

	
tests/test_reconcile.py
Show inline comments
 
new file 100644
 
import datetime
 
import decimal
 

	
 
from conservancy_beancount.reconcile.prototype_amex_reconciler import match_statement_and_books2 as match_statement_and_books, remove_payee_junk, date_proximity
 

	
 
S1 = {
 
    'date': datetime.date(2022, 1, 1),
 
    'amount': decimal.Decimal('10.00'),
 
    'payee': 'Patreon         / Patreon   / 123456/ ST-A1B2C3D4G5H6       /',
 
    'line': 222,
 
}
 
S2 = {
 
    'date': datetime.date(2022, 1, 2),
 
    'amount': decimal.Decimal('20.00'),
 
    'payee': 'BT*LINODE           PHILADELPHIA        P',
 
    'line': 333,
 
}
 
S3 = {
 
    'date': datetime.date(2022, 1, 3),
 
    'amount': decimal.Decimal('30.00'),
 
    'payee': 'USPS PO 4067540039 0PORTLAND            OR',
 
    'line': 444,
 
}
 

	
 
B1 = {
 
    'date': datetime.date(2022, 1, 1),
 
    'amount': decimal.Decimal('10.00'),
 
    'payee': 'Patreon',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/imports.beancount',
 
    'line': 777,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B2 = {
 
    'date': datetime.date(2022, 1, 2),
 
    'amount': decimal.Decimal('20.00'),
 
    'payee': 'Linode',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 888,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B3_next_day = {
 
    'date': datetime.date(2022, 1, 4),
 
    'amount': decimal.Decimal('30.00'),
 
    'payee': 'USPS',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 999,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B3_next_week = {
 
    'date': datetime.date(2022, 1, 10),
 
    'amount': decimal.Decimal('30.00'),
 
    'payee': 'USPS',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 999,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B3_mismatch_amount = {
 
    'date': datetime.date(2022, 1, 3),
 
    'amount': decimal.Decimal('31.00'),
 
    'payee': 'USPS',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 999,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B3_payee_mismatch_1 = {
 
    'date': datetime.date(2022, 1, 3),
 
    'amount': decimal.Decimal('30.00'),
 
    'payee': 'Credit X',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 999,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 
B3_payee_mismatch_2 = {
 
    'date': datetime.date(2022, 1, 3),
 
    'amount': decimal.Decimal('30.00'),
 
    'payee': 'Credit Y',
 
    'entity': '',
 
    'check_id': None,
 
    'filename': '2022/main.beancount',
 
    'line': 999,
 
    'bank_statement': "Financial/Bank-Statements/AMEX/2022-01-12_AMEX_statement.pdf"
 
}
 

	
 

	
 
def test_one_exact_match():
 
    statement = [S1]
 
    books = [B1]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S1], [B1], []),
 
    ]
 

	
 
def test_multiple_exact_matches():
 
    statement = [S1, S2]
 
    books = [B1, B2]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S1], [B1], []),
 
        ([S2], [B2], []),
 
    ]
 

	
 
def test_one_mismatch():
 
    statement = [S1]
 
    books = []
 
    assert match_statement_and_books(statement, books) == [
 
        ([S1], [], ['no match']),
 
    ]
 

	
 
def test_multiple_mismatches():
 
    statement = [S1]
 
    books = [B2]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S1], [], ['no match']),
 
        ([], [B2], ['no match']),
 
    ]
 

	
 
def test_next_day_matches():
 
    statement = [S3]
 
    books = [B3_next_day]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S3], [B3_next_day], ['+/- 1 days']),
 
    ]
 

	
 
def test_next_week_matches():
 
    statement = [S3]
 
    books = [B3_next_week]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S3], [B3_next_week], ['+/- 7 days']),
 
    ]
 

	
 
def test_incorrect_amount_does_not_match():
 
    statement = [S3]
 
    books = [B3_mismatch_amount]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S3], [], ['no match']),
 
        ([], [B3_mismatch_amount], ['no match']),
 
    ]
 

	
 
def test_payee_mismatch_ok_when_only_one_that_amount_and_date():
 
    statement = [S3]
 
    books = [B3_payee_mismatch_1]
 
    assert match_statement_and_books(statement, books) == [
 
        ([S3], [B3_payee_mismatch_1], ['payee mismatch', 'only one decent match']),
 
    ]
 

	
 
def test_payee_mismatch_not_ok_when_multiple_that_amount_and_date():
 
    statement = [S3]
 
    books = [B3_payee_mismatch_1, B3_payee_mismatch_2]
 
    match = match_statement_and_books(statement, books)
 
    assert match == [
 
        ([S3], [], ['no match']),
 
        ([], [B3_payee_mismatch_1], ['no match']),
 
        ([], [B3_payee_mismatch_2], ['no match']),
 
    ]
 

	
 
# def test_subset_sum_with_same_date_and_payee():
 

	
 
def test_remove_payee_junk():
 
    assert remove_payee_junk('WIDGETSRUS INC PAYMENT 1') == 'WIDGETSRUS'
 
    assert remove_payee_junk('0000010017') == '10017'
 

	
 
def test_date_proximity():
 
    assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23)) == 1.0
 
    assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=30)) == 0.5
 
    assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=60)) == 0.0
0 comments (0 inline, 0 general)