Changeset - 581ef0fa2378
Ben Sturmfels (bsturmfels) - 2022-03-02 23:55:09
ben@sturm.com.au
reconcile: Update docs, move a few things around.
1 file changed with 127 insertions and 70 deletions:
conservancy_beancount/reconcile/statement_reconciler.py
...
 
@@ -44,23 +44,25 @@ similar directives are already present. This is a bit like diff-ing a
statement with the books (though we're only interested in the presence
of lines, not so much their order).

Paper checks are entered in the books when written (a.k.a. "posted"),
but may not be cashed until months later, sometimes causing
reconciliation differences that live beyond a month. It's worth noting
that there are really two dates here - the posting date and the
cleared date. Beancount only allows us to model one, which is why
carrying these reconciliation differences between months feels a bit
awkward.

Problems in scope:

 - errors in the books take hours to find during reconciliation,
-   requiring manually comparing statemnts and the books and are
+   requiring manually comparing statements and the books and are
   susceptible to mistakes, such as not noticing when there are two
   payments for the same amount on the statement, but not in the books
-   ("you're entering a world of pain")
+   (as Bradley likes to quote, "you're entering a world of pain")

 - adding statement/reconciliation metadata to books is/was manual and
   prone to mistakes

 - Beancount doesn't provide any infrastructure for programmatically
   updating the books, only appending in the case of importers

 - paper checks are entered in the books when written, but may not be
   cashed until months later (reconcile errors)

 - jumping to an individual transaction in a large ledger isn't
   trivial - Emacs grep mode is the current best option
...
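
To make the matching concrete, here is a minimal sketch of the standardized record shape the matcher operates on (field names taken from format_record below; the records themselves are hypothetical):

    import datetime
    import decimal

    # One statement line and one books posting that should score as a
    # match: equal amounts, nearby dates, similar payees.
    statement_record = {
        'date': datetime.date(2022, 1, 5),
        'amount': decimal.Decimal('-100.00'),
        'payee': 'ACME INC 4821',
        'check_id': '',
    }
    books_record = {
        'date': datetime.date(2022, 1, 3),
        'amount': decimal.Decimal('-100.00'),
        'payee': 'Acme',
        'check_id': '',
    }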
 
@@ -244,6 +246,7 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]


def format_record(record: dict) -> str:
+    """Generate output lines for a standard 1:1 match."""
    if record['payee'] and record['check_id']:
        output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59)
    elif record['payee']:
...
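
As a usage sketch (hypothetical record), the payee-and-check branch above produces a fixed-width line:

    format_record({'date': datetime.date(2022, 1, 5),
                   'amount': decimal.Decimal('-1500.00'),
                   'payee': 'Acme Corporation',
                   'check_id': '1234'})
    # -> '2022-01-05:   -1,500.00 Acme Corporation #1234', left-justified
    # to 59 columns, presumably so the books column lines up beside it.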
 
@@ -254,6 +257,7 @@ def format_record(record: dict) -> str:


def format_multirecord(r1s: list[dict], r2s: list[dict], note: str) -> list[list]:
+    """Generates output lines for one statement:multiple books transaction match."""
    assert len(r1s) == 1
    assert len(r2s) > 1
    match_output = []
...
 
@@ -268,6 +272,13 @@ def sort_records(records: List) -> List:


def first_word_exact_match(a: str, b: str) -> float:
+    """Score a payee match based on the first word.
+
+    We get a whole lot of good matches this way. Helps in the
+    situation where the first word or two of a transaction description
+    is useful and the rest is garbage.
+
+    """
    if len(a) == 0 or len(b) == 0:
        return 0.0
    first_a = a.split()[0].strip()
...
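
The body is truncated above, but assuming it compares first_a against the first word of b and returns 1.0 on an exact match (0.0 otherwise), the intended behaviour is roughly:

    first_word_exact_match('ACME payroll 0012-XJ', 'ACME 873 PPD ID 9111')  # 1.0
    first_word_exact_match('GITHUB INC pmt 881', 'Microsoft 4411')          # 0.0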
 
@@ -279,6 +290,7 @@ def first_word_exact_match(a: str, b: str) -> float:


def payee_match(a: str, b: str) -> float:
+    """Score a match between two payees."""
    fuzzy_match = float(fuzz.token_set_ratio(a, b) / 100.00)
    first_word_match = first_word_exact_match(a, b)
    return max(fuzzy_match, first_word_match)
...
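
payee_match takes the better of the two signals. Assuming fuzz here is thefuzz/fuzzywuzzy's token_set_ratio (returning 0-100), a word-subset payee that a plain ratio would punish still scores well:

    # token_set_ratio treats one payee's tokens as a subset of the
    # other's, so this should score at or near 1.0.
    payee_match('GITHUB, INC.', 'GitHub')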
 
@@ -286,7 +298,6 @@ def payee_match(a: str, b: str) -> float:


def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
    """Do these records represent the same transaction?"""

    date_score = date_proximity(r1['date'], r2['date'])
    if r1['date'] == r2['date']:
        date_message = ''
...
 
@@ -329,11 +340,20 @@ def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:


def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
-    """
-    Runs through all the statement transactions to find a matching transaction
-    in the books. If found, the books transaction is marked off so that it can
-    only be matched once. Some transactions will be matched, some will be on the
-    statement but not the books and some on the books but not the statement.
+    """Match transactions between the statement and books.
+
+    If matched, the books transaction is marked off so that it can
+    only be matched once. Some transactions will be matched, some will
+    be on the statement but not the books and some on the books but
+    not the statement.
+
+    Passes through any unmatched transactions.
+
+    Currently we use the same matching logic for all types of
+    statements. It's conceivable that you could have special cases to
+    accurately match some types of statements, but that would be more
+    work to maintain and test.
+
    """
    matches = []
    remaining_books_trans = []
...
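
A hedged sketch of how a caller consumes the three return values (mirroring what main() does further down):

    matches, st_unmatched, bk_unmatched = match_statement_and_books(
        statement_trans, books_trans)
    for statement_records, books_records, notes in matches:
        # Each match pairs one statement record with one books record,
        # plus any caveat notes from records_match.
        print(statement_records, books_records, notes)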
 
@@ -363,9 +383,58 @@ def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dic
    return matches, remaining_statement_trans, remaining_books_trans


+def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
+    """Match single statement transactions with multiple books transactions.
+
+    Works similarly to match_statement_and_books in that it returns a
+    list of matches and lists of remaining statement and books
+    transactions.
+
+    """
+    matches = []
+    remaining_books_trans = []
+    remaining_statement_trans = []
+
+    groups = itertools.groupby(books_trans, key=lambda x: (x['date'], x['payee']))
+    for _, group in groups:
+        best_match_score = 0.0
+        best_match_index = None
+        best_match_note = []
+        matches_found = 0
+
+        group_items = list(group)
+        total = sum(x['amount'] for x in group_items)
+        r2 = copy.copy(group_items[0])
+        r2['amount'] = total
+        for i, r1 in enumerate(statement_trans):
+            score, note = records_match(r1, r2)
+            if score >= 0.5 and score >= best_match_score:
+                matches_found += 1
+                best_match_score = score
+                best_match_index = i
+                best_match_note = note
+        if best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note or best_match_score > 0.8:
+            matches.append(([statement_trans[best_match_index]], group_items, best_match_note))
+            if best_match_index is not None:
+                del statement_trans[best_match_index]
+        else:
+            remaining_books_trans.append(r2)
+    for r1 in statement_trans:
+        remaining_statement_trans.append(r1)
+    return matches, remaining_statement_trans, remaining_books_trans
+
+
+# TODO: Return list of tuples (instead of list of lists).
+

def format_matches(matches: List, csv_statement: str, show_reconciled_matches: bool) -> List[List]:

    """Produce a list of body output lines from the given matches.

    The first column is a date so we can re-sort the list to put the
    missing entries in the right place. The second column is the text
    output.

    """
    match_output = []
    for r1s, r2s, note in matches:
        note = ', '.join(note)
...
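
To illustrate the subset pass with hypothetical records: one statement debit covering two same-day books postings to the same payee. Note that itertools.groupby only groups adjacent records, which the sort_records ordering of books_trans provides.

    books = [
        {'date': datetime.date(2022, 1, 10), 'amount': decimal.Decimal('100.00'),
         'payee': 'Acme', 'check_id': ''},
        {'date': datetime.date(2022, 1, 10), 'amount': decimal.Decimal('200.00'),
         'payee': 'Acme', 'check_id': ''},
    ]
    statement = [
        {'date': datetime.date(2022, 1, 10), 'amount': decimal.Decimal('300.00'),
         'payee': 'ACME', 'check_id': ''},
    ]
    # The group is summed to 300.00 and compared against each statement
    # record via records_match, so this should come back as one match.
    matches, st_left, bk_left = subset_match(statement, books)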
 
@@ -387,15 +456,19 @@ def format_matches(matches: List, csv_statement: str, show_reconciled_matches: b


def date_proximity(d1: datetime.date, d2: datetime.date) -> float:
    """Scores two days based on how close they are together."""
+    ZERO_CUTOFF = 60  # Score will be zero for this many days apart.
    diff = abs(int((d1 - d2).days))
-    if diff > 60:
+    if diff >= ZERO_CUTOFF:
        return 0.0
    else:
-        return 1.0 - (diff / 60.0)
+        return 1.0 - (diff / ZERO_CUTOFF)


def metadata_for_match(match: Tuple[List, List, List], statement_filename: str, csv_filename: str) -> List[Tuple[str, int, str]]:
-    # Can we really ever have multiple statement entries? Probably not.
+    """Returns the bank-statement metadata that should be applied for a match."""
+    # TODO: Our data structure would allow multiple statement entries
+    # for a match, but would this ever make sense? Probably not.
    statement_filename = get_repo_relative_path(statement_filename)
    csv_filename = get_repo_relative_path(csv_filename)
    metadata = []
...
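
The linear falloff in date_proximity works out to, for example:

    from datetime import date

    date_proximity(date(2022, 3, 1), date(2022, 3, 1))   # 1.0 (same day)
    date_proximity(date(2022, 3, 1), date(2022, 3, 16))  # 0.75 (15 of 60 days)
    date_proximity(date(2022, 3, 1), date(2022, 6, 1))   # 0.0 (past the cutoff)
    # The switch from "> 60" to ">= ZERO_CUTOFF" doesn't change behaviour:
    # at exactly 60 days the old code already returned 1.0 - 60/60 == 0.0.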
 
@@ -420,6 +493,9 @@ def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> No
        ...,
    ]

+    Beancount doesn't provide any infrastructure for programmatically
+    updating the books, only appending in the case of importers. So
+    we're on our own here.
    """
    file_contents: dict[str, list] = {}
    file_offsets: dict[str, int] = collections.defaultdict(int)
...
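
The implementation is truncated above, so here is a minimal sketch of the offset bookkeeping that file_offsets suggests, assuming metadata_to_apply is ordered by line number within each file and lines are inserted below the flagged transaction line:

    import collections

    def apply_metadata(metadata_to_apply):
        # metadata_to_apply: [(filename, line_number, metadata_line), ...]
        file_contents = {}
        file_offsets = collections.defaultdict(int)
        for filename, lineno, metadata in metadata_to_apply:
            if filename not in file_contents:
                with open(filename) as f:
                    file_contents[filename] = f.read().splitlines()
            # Each earlier insertion in this file shifts later line
            # numbers down by one, hence the running offset.
            file_contents[filename].insert(lineno + file_offsets[filename], metadata)
            file_offsets[filename] += 1
        return file_contents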
 
@@ -442,16 +518,29 @@ def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> No


def get_repo_relative_path(path: str) -> str:
    """Chop off the unique per-person CONSERVANCY_REPOSITORY.
+
+    CSV and PDF statement metadata should be relative to
+    CONSERVANCY_REPOSITORY, i.e. without regard to exactly where on
+    your computer all the files live.
+
    """
    return os.path.relpath(path, start=os.getenv('CONSERVANCY_REPOSITORY'))


def parse_path(path: str) -> str:
    """Validate that a file exists for use in argparse."""
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
    return path


+def parse_repo_relative_path(path: str) -> str:
+    """Validate that a file exists and is within $CONSERVANCY_REPOSITORY.
+
+    For use with argparse.
+
+    """
+    if not os.path.exists(path):
+        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
+    repo = os.getenv('CONSERVANCY_REPOSITORY')
...
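
For example, with the environment variable set (hypothetical paths):

    os.environ['CONSERVANCY_REPOSITORY'] = '/home/ben/conservancy'
    get_repo_relative_path('/home/ben/conservancy/statements/AMEX/2022-01.csv')
    # -> 'statements/AMEX/2022-01.csv'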
 
@@ -484,6 +573,7 @@ def parse_arguments(argv: List[str]) -> argparse.Namespace:


def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, decimal.Decimal, decimal.Decimal]:
+    """Calculate the totals of transactions matched/not-matched."""
    total_matched = decimal.Decimal(0)
    total_missing_from_books = decimal.Decimal(0)
    total_missing_from_statement = decimal.Decimal(0)
...
 
@@ -497,41 +587,8 @@ def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, dec
    return total_matched, total_missing_from_books, total_missing_from_statement


-def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
-    matches = []
-    remaining_books_trans = []
-    remaining_statement_trans = []
-
-    groups = itertools.groupby(books_trans, key=lambda x: (x['date'], x['payee']))
-    for _, group in groups:
-        best_match_score = 0.0
-        best_match_index = None
-        best_match_note = []
-        matches_found = 0
-
-        group_items = list(group)
-        total = sum(x['amount'] for x in group_items)
-        r2 = copy.copy(group_items[0])
-        r2['amount'] = total
-        for i, r1 in enumerate(statement_trans):
-            score, note = records_match(r1, r2)
-            if score >= 0.5 and score >= best_match_score:
-                matches_found += 1
-                best_match_score = score
-                best_match_index = i
-                best_match_note = note
-        if best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note or best_match_score > 0.8:
-            matches.append(([statement_trans[best_match_index]], group_items, best_match_note))
-            if best_match_index is not None:
-                del statement_trans[best_match_index]
-        else:
-            remaining_books_trans.append(r2)
-    for r1 in statement_trans:
-        remaining_statement_trans.append(r1)
-    return matches, remaining_statement_trans, remaining_books_trans
-
-
 def process_unmatched(statement_trans: List[dict], books_trans: List[dict]) -> List[Tuple[List, List, List]]:
    """Format the remaining unmatched transactions to be added to one single list of matches."""
    matches: List[Tuple[List, List, List]] = []
    for r1 in statement_trans:
        matches.append(([r1], [], ['no match']))
...
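
The books-side loop is truncated above, but by symmetry with the statement side, process_unmatched should produce output shaped like this (st1 and bk1 are hypothetical leftover records):

    process_unmatched([st1], [bk1])
    # -> [([st1], [], ['no match']), ([], [bk1], ['no match'])]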
 
@@ -551,11 +608,8 @@ def main(arglist: Optional[Sequence[str]] = None,
        config = configmod.Config()
        config.load_file()

-    # TODO: Should put in a sanity check to make sure the statement you're feeding
-    # in matches the account you've provided.
-
-    # TODO: Can we open the files first, then pass the streams on to the rest of the program?
-
+    # Validate and normalise the statement into our standard
+    # transaction data structure.
    if 'AMEX' in args.account:
        validate_csv = validate_amex_csv
        standardize_statement_record = standardize_amex_record
...
 
@@ -569,40 +623,43 @@ def main(arglist: Optional[Sequence[str]] = None,
        f.seek(0)
        statement_trans = read_transactions_from_csv(f, standardize_statement_record)

    # Dates are taken from the beginning/end of the statement.
    begin_date = statement_trans[0]['date']
    end_date = statement_trans[-1]['date']

-    # Do we traverse and filter the in-memory entries list and filter that, or do we
-    # use Beancount Query Language (BQL) to get a list of transactions? Currently
-    # using BQL.
+    # Query the Beancount books data for the above period.
+    #
+    # There are pros and cons for using Beancount's in-memory entries
+    # list directly and also for using Beancount Query Language (BQL)
+    # to get a list of transactions. Using BQL because it's
+    # convenient, but we don't have access to the full transaction
+    # entry objects. Feels a bit strange that these approaches are so
+    # disconnected.
+    #
+    # beancount.query.query_compile.compile() and
+    # beancount.query.query_execute.filter_entries() look useful in this respect,
+    # but I'm not clear on how to use compile(). An example would help.
    entries, _, options = loader.load_file(args.beancount_file)

    # books_balance_query = f"""SELECT sum(COST(position)) AS aa WHERE account = "{args.account}"
    #     AND date <= {end_date.isoformat()}"""
    # _, result_rows = run_query(entries, options, books_balance_query, numberify=True)
    # books_balance = result_rows[0][0] if result_rows else 0

    # String concatenation looks bad, but there's no SQL injection possible here
    # because BQL can't write back to the Beancount files. I hope!
    query = f'SELECT filename, META("lineno") AS line, META("bank-statement") AS bank_statement, date, number(cost(position)), payee, ENTRY_META("entity") as entity, ANY_META("check-id") as check_id, narration where account = "{args.account}" and date >= {begin_date} and date <= {end_date}'
    _, result_rows = run_query(entries, options, query)

    books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])

    # Apply two passes of matching, one for standard matches and one
    # for subset matches.
    matches, remaining_statement_trans, remaining_books_trans = match_statement_and_books(statement_trans, books_trans)
-    subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(remaining_statement_trans, remaining_books_trans)
+    subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(
+        remaining_statement_trans, remaining_books_trans)
    matches.extend(subset_matches)

    # Add the remaining unmatched to make one big list of matches, successful or not.
    unmatched = process_unmatched(remaining_statement_trans, remaining_books_trans)
    matches.extend(unmatched)

    # Print out results of our matching.
    match_output = format_matches(matches, args.csv_statement, args.show_reconciled_matches)

    _, total_missing_from_books, total_missing_from_statement = totals(matches)

    print('-' * 155)
    statement_heading = f'Statement transactions {begin_date} to {end_date}'
    print(f'{statement_heading:<52}            {"Books transactions":<58}   Notes')
...
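
On the entries-versus-BQL question raised in the comments above, a rough sketch of the in-memory alternative (untested; it yields full Transaction objects rather than BQL result rows):

    from beancount.core import data

    def account_entries(entries, account, begin_date, end_date):
        for entry in entries:
            if (isinstance(entry, data.Transaction)
                    and begin_date <= entry.date <= end_date
                    and any(p.account == account for p in entry.postings)):
                yield entry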
 
@@ -615,7 +672,7 @@ def main(arglist: Optional[Sequence[str]] = None,
    print(f'Total:                      {total_missing_from_statement + total_missing_from_books:12,.2f}')
    print('-' * 155)

-    # Write statement metadata back to books
+    # Write statement metadata back to the books.
    metadata_to_apply = []
    for match in matches:
        metadata_to_apply.extend(metadata_for_match(match, args.bank_statement, args.csv_statement))