Changeset - 32fc4517a054
[Not reviewed]
0 2 0
Ben Sturmfels (bsturmfels) - 2 years ago 2022-02-18 13:27:48
ben@sturm.com.au
reconcile: show check number, remove duplicate words in payee.
2 files changed with 99 insertions and 77 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/reconcile/prototype_amex_reconciler.py
Show inline comments
...
 
@@ -90,2 +90,3 @@ JUNK_WORDS = [
 
    'donation',
 
    'usd',
 
]
...
 
@@ -94,3 +95,13 @@ ZERO_RE = re.compile('^0+')
 

	
 
def remove_payee_junk(payee):
 
def remove_duplicate_words(text):
 
    unique_words = []
 
    known_words = set()
 
    for word in text.split():
 
        if word.lower() not in known_words:
 
            unique_words.append(word)
 
            known_words.add(word.lower())
 
    return ' '.join(unique_words)
 

	
 

	
 
def remove_payee_junk(payee: str) -> str:
 
    for r in JUNK_WORDS_RES:
...
 
@@ -105,4 +116,5 @@ def remove_payee_junk(payee):
 
    payee = payee.replace('*', ' ')
 
    payee = ' '.join([i for i in payee.split(' ') if len(i) > 3])
 
    payee = ' '.join([i for i in payee.split(' ') if len(i) > 2])
 
    payee = payee.replace('-', ' ')
 
    payee = remove_duplicate_words(payee)
 
    payee.strip()
...
 
@@ -124,3 +136,4 @@ def standardize_amex_record(row: Dict, line: int) -> Dict:
 
        # significantly assists the fuzzy matching.
 
        'payee': remove_payee_junk(row['Description'] or '')[:25],
 
        'payee': remove_payee_junk(row['Description'] or '')[:20],
 
        'check_id': '',
 
        'line': line,
...
 
@@ -133,3 +146,4 @@ def standardize_fr_record(row: Dict, line: int) -> Dict:
 
        'amount': decimal.Decimal(row['Amount']),
 
        'payee': remove_payee_junk(row['Detail'] or row['Serial Num'])[:20],
 
        'payee': remove_payee_junk(row['Detail'] or '')[:20],
 
        'check_id': row['Serial Num'].lstrip('0'),
 
        'line': line,
...
 
@@ -140,4 +154,2 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
 
    """Turn a Beancount query result row into a standard dict representing a transaction."""
 
    # if '0000000362' in row.narration:
 
    #     import pdb; pdb.set_trace()
 
    return {
...
 
@@ -145,5 +157,4 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
 
        'amount': row.number_cost_position,
 
        'payee': remove_payee_junk(row.payee or row.narration)[:30],
 
        'entity': row.entity,
 
        'check_id': row.check_id,
 
        'payee': remove_payee_junk(f'{row.payee or ""} {row.entity or ""} {row.narration or ""}'),
 
        'check_id': str(row.check_id or ''),
 
        'filename': row.filename,
...
 
@@ -157,3 +168,10 @@ def format_record(records: list[dict]) -> str:
 
        record = records[0]
 
        return f"{record['date'].isoformat()}: {record['amount']:12,.2f} {record['payee'][:25]:<25}"
 

	
 
        if record['payee'] and record['check_id']:
 
            output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59)
 
        elif record['payee']:
 
            output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:35]}".ljust(59)
 
        else:
 
            output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} #{record['check_id']}".ljust(59)
 
        return output
 
    else:
...
 
@@ -166,3 +184,47 @@ def sort_records(records: List) -> List:
 

	
 
def match_statement_and_books2(statement_trans: list, books_trans: list):
 
def records_match(r1: Dict, r2: Dict) -> Tuple[bool, str]:
 
    """Do these records represent the same transaction?"""
 

	
 
    date_score = date_proximity(r1['date'], r2['date'])
 
    if r1['date'] == r2['date']:
 
        date_message = ''
 
    elif date_score > 0.0:
 
        diff = abs((r1['date'] - r2['date']).days)
 
        date_message = f'+/- {diff} days'
 
    else:
 
        date_message = 'date mismatch'
 

	
 
    if r1['amount'] == r2['amount']:
 
        amount_score, amount_message = 2.0, ''
 
    else:
 
        amount_score, amount_message = 0.0, 'amount mismatch'
 

	
 
    if r1['check_id'] and r2['check_id'] and r1['check_id'] == r2['check_id']:
 
        check_score = 1.0
 
    else:
 
        check_score = 0.0
 

	
 
    payee_score = fuzz.token_set_ratio(r1['payee'], r2['payee']) / 100.00
 
    if check_score == 1.0 or payee_score > 0.8:
 
        payee_message = ''
 
    elif payee_score > 0.4:
 
        payee_message = 'partial payee match'
 
    else:
 
        payee_message = 'payee mismatch'
 

	
 
    overall_score = (date_score + amount_score + check_score + payee_score) / 4
 
    overall_message = [m for m in [date_message, amount_message, payee_message] if m]
 
    return overall_score, overall_message
 

	
 

	
 
def match_statement_and_books(statement_trans: list, books_trans: list):
 
    """
 

	
 

	
 
    Runs through all the statement transactions to find a matching transaction
 
    in the books. If found, the books transaction is marked off so that it can
 
    only be matched once. Some transactions will be matched, some will be on the
 
    statement but not the books and some on the books but not the statement.
 

	
 
    """
 
    matches = []
...
 
@@ -172,12 +234,2 @@ def match_statement_and_books2(statement_trans: list, books_trans: list):
 

	
 
    # run through all the statement transactions to find a matching transaction in
 
    # the books. If found, the books transaction is marked off so that it can only
 
    # be matched once. Some transactions will be matched, some will be on the
 
    # statement but not the books and some on the books but not the statement.
 
    #
 
    # Doesn't currently find exact matches when there are a bunch of transactions
 
    # for the same amount on the same day. Probably ought to do a first pass through
 
    # the books transactions to find an closely matching payee, then do another pass
 
    # disregarding payee.
 

	
 
    for r1 in statement_trans:
...
 
@@ -188,3 +240,3 @@ def match_statement_and_books2(statement_trans: list, books_trans: list):
 
        for i, r2 in enumerate(books_trans):
 
            score, note = records_match2(r1, r2)
 
            score, note = records_match(r1, r2)
 
            if score >= 0.5 and score >= best_match_score:
...
 
@@ -194,4 +246,2 @@ def match_statement_and_books2(statement_trans: list, books_trans: list):
 
                best_match_note = note
 
                # if note == 'payee mismatch':
 
                #     payee_only_mismatches += 1
 
        if best_match_score > 0.5 and matches_found == 1 and 'payee_mismatch' not in best_match_note or best_match_score > 0.8:
...
 
@@ -207,2 +257,3 @@ def match_statement_and_books2(statement_trans: list, books_trans: list):
 

	
 

	
 
def format_matches(statement_trans, books_trans, show_reconciled_matches: bool, csv_statement: str):
...
 
@@ -214,3 +265,3 @@ def format_matches(statement_trans, books_trans, show_reconciled_matches: bool,
 

	
 
    matches = match_statement_and_books2(statement_trans, books_trans)
 
    matches = match_statement_and_books(statement_trans, books_trans)
 
    for r1, r2, note in matches:
...
 
@@ -221,5 +272,5 @@ def format_matches(statement_trans, books_trans, show_reconciled_matches: bool,
 
        elif r1:
 
            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {" ":^50}  ✗ Not in books ({os.path.basename(csv_statement)}:{r1[0]["line"]})'])
 
            match_output.append([r1[0]['date'], f'{format_record(r1)}  →  {" ":^59}  ✗ NOT IN BOOKS ({os.path.basename(csv_statement)}:{r1[0]["line"]})'])
 
        else:
 
            match_output.append([r2[0]['date'], f'{" ":^50}  →  {format_record(r2)}  ✗ Not on statement ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})'])
 
            match_output.append([r2[0]['date'], f'{" ":^59}  →  {format_record(r2)}  ✗ NOT ON STATEMENT ({os.path.basename(r2[0]["filename"])}:{r2[0]["line"]})'])
 
    return match_output, metadata_to_apply, total_matched, total_missing_from_books, total_missing_from_statement
...
 
@@ -238,36 +289,2 @@ def date_proximity(d1, d2):
 

	
 
def records_match2(r1: Dict, r2: Dict) -> Tuple[bool, str]:
 
    """Do these records represent the same transaction?"""
 

	
 
    date_score = date_proximity(r1['date'], r2['date'])
 
    if r1['date'] == r2['date']:
 
        date_message = ''
 
    elif date_score > 0.0:
 
        diff = abs((r1['date'] - r2['date']).days)
 
        date_message = f'+/- {diff} days'
 
    else:
 
        date_message = 'date mismatch'
 

	
 
    if r1['amount'] == r2['amount']:
 
        amount_score, amount_message = 2.0, ''
 
    else:
 
        amount_score, amount_message = 0.0, 'amount mismatch'
 

	
 
    if r2['check_id']:
 
        payee_score = 1.0 if r1['payee'] == str(r2['check_id']) else 0
 
        payee_message = 'check # matched'
 
    else:
 
        payee_match_quality_payee = fuzz.token_set_ratio(r1['payee'], r2['payee'])
 
        payee_match_quality_entity = fuzz.token_set_ratio(r1['payee'], r2['entity'])
 
        payee_score = max(payee_match_quality_payee, payee_match_quality_entity) / 100.0
 
        if payee_score > 0.8:
 
            payee_message = ''
 
        elif payee_score > 0.5:
 
            payee_message = 'partial payee match'
 
        else:
 
            payee_message = 'payee mismatch'
 

	
 
    overall_score = (date_score + amount_score + payee_score) / 4
 
    overall_message = [m for m in [date_message, amount_message, payee_message] if m]
 
    return overall_score, overall_message
 

	
...
 
@@ -360,3 +377,3 @@ def main(args):
 
    # because BQL can't write back to the Beancount files. I hope!
 
    query = f"SELECT id, filename, META('lineno') AS line, META('bank-statement') AS bank_statement, date, number(cost(position)), payee, ANY_META('entity') as entity, ANY_META('check-id') as check_id, narration where account = '{args.account}' and date >= {begin_date} and date <= {end_date}"
 
    query = f'SELECT filename, META("lineno") AS line, META("bank-statement") AS bank_statement, date, number(cost(position)), payee, ENTRY_META("entity") as entity, ANY_META("check-id") as check_id, narration where account = "{args.account}" and date >= {begin_date} and date <= {end_date}'
 
    result_types, result_rows = run_query(entries, options, query)
tests/test_reconcile.py
Show inline comments
...
 
@@ -3,3 +3,8 @@ import decimal
 

	
 
from conservancy_beancount.reconcile.prototype_amex_reconciler import match_statement_and_books2 as match_statement_and_books, remove_payee_junk, date_proximity
 
from conservancy_beancount.reconcile.prototype_amex_reconciler import (
 
    match_statement_and_books,
 
    remove_payee_junk,
 
    date_proximity,
 
    remove_duplicate_words,
 
)
 

	
...
 
@@ -9,2 +14,3 @@ S1 = {
 
    'payee': 'Patreon         / Patreon   / 123456/ ST-A1B2C3D4G5H6       /',
 
    'check_id': '',
 
    'line': 222,
...
 
@@ -15,2 +21,3 @@ S2 = {
 
    'payee': 'BT*LINODE           PHILADELPHIA        P',
 
    'check_id': '',
 
    'line': 333,
...
 
@@ -21,2 +28,3 @@ S3 = {
 
    'payee': 'USPS PO 4067540039 0PORTLAND            OR',
 
    'check_id': '',
 
    'line': 444,
...
 
@@ -28,4 +36,3 @@ B1 = {
 
    'payee': 'Patreon',
 
    'entity': '',
 
    'check_id': None,
 
    'check_id': '',
 
    'filename': '2022/imports.beancount',
...
 
@@ -38,4 +45,3 @@ B2 = {
 
    'payee': 'Linode',
 
    'entity': '',
 
    'check_id': None,
 
    'check_id': '',
 
    'filename': '2022/main.beancount',
...
 
@@ -48,4 +54,3 @@ B3_next_day = {
 
    'payee': 'USPS',
 
    'entity': '',
 
    'check_id': None,
 
    'check_id': '',
 
    'filename': '2022/main.beancount',
...
 
@@ -58,4 +63,3 @@ B3_next_week = {
 
    'payee': 'USPS',
 
    'entity': '',
 
    'check_id': None,
 
    'check_id': '',
 
    'filename': '2022/main.beancount',
...
 
@@ -68,4 +72,3 @@ B3_mismatch_amount = {
 
    'payee': 'USPS',
 
    'entity': '',
 
    'check_id': None,
 
    'check_id': '',
 
    'filename': '2022/main.beancount',
...
 
@@ -78,4 +81,3 @@ B3_payee_mismatch_1 = {
 
    'payee': 'Credit X',
 
    'entity': '',
 
    'check_id': None,
 
    'check_id': '',
 
    'filename': '2022/main.beancount',
...
 
@@ -88,4 +90,3 @@ B3_payee_mismatch_2 = {
 
    'payee': 'Credit Y',
 
    'entity': '',
 
    'check_id': None,
 
    'check_id': '',
 
    'filename': '2022/main.beancount',
...
 
@@ -175 +176,5 @@ def test_date_proximity():
 
    assert date_proximity(datetime.date(2021, 8, 23), datetime.date(2021, 8, 23) - datetime.timedelta(days=60)) == 0.0
 

	
 

	
 
def test_remove_duplicate_words():
 
    assert remove_duplicate_words('Hi Foo Kow FOO') == 'Hi Foo Kow'
0 comments (0 inline, 0 general)