diff --git a/conservancy_beancount/reconcile/statement_reconciler.py b/conservancy_beancount/reconcile/statement_reconciler.py
index 8285d310237969b2a7e5ac9abfe6128e9420a964..a151011d1d93ca3d735bce11359447f0ac6eb74c 100644
--- a/conservancy_beancount/reconcile/statement_reconciler.py
+++ b/conservancy_beancount/reconcile/statement_reconciler.py
@@ -44,23 +44,25 @@ similar directives are already present.
 
 This is a bit like diff-ing a statement with the books (though we're
 only interested in the presence of lines, not so much their order).
 
+Paper checks are entered in the books when written (a.k.a. "posted"),
+but may not be cashed until months later, sometimes causing
+reconciliation differences that live beyond a month. It's worth noting
+that there are really two dates here - the posting date and the
+cleared date. Beancount only allows us to model one, which is why
+carrying these reconciliation differences between months feels a bit
+awkward.
+
 Problems in scope:
 
 - errors in the books take hours to find during reconciliation,
-  requiring manually comparing statemnts and the books and are
+  requiring manually comparing statements and the books and are
   succeptible to mistakes, such as not noticing when there are two
   payments for the same amount on the statement, but not in the books
-  ("you're entering a world of pain")
+  (as Bradley likes to quote, "you're entering a world of pain")
 
 - adding statement/reconciliation metadata to books is/was manual and
   prone to mistakes
 
-- Beancount doesn't provide any infrastructure for programmatically
-  updating the books, only appending in the case of importers
-
-- paper checks are entered in the books when written, but may not be
-  cashed until months later (reconcile errors)
-
 - jumping to an individual transaction in a large ledger isn't
   trivial - Emacs grep mode is the current best option
@@ -244,6 +246,7 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]
 
 
 def format_record(record: dict) -> str:
+    """Generate an output line for a standard 1:1 match."""
     if record['payee'] and record['check_id']:
         output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59)
     elif record['payee']:
@@ -254,6 +257,7 @@ def format_record(record: dict) -> str:
 
 
 def format_multirecord(r1s: list[dict], r2s: list[dict], note: str) -> list[list]:
+    """Generate output lines for a one statement:multiple books transaction match."""
     assert len(r1s) == 1
     assert len(r2s) > 1
     match_output = []
@@ -268,6 +272,13 @@ def sort_records(records: List) -> List:
 
 
 def first_word_exact_match(a: str, b: str) -> float:
+    """Score a payee match based on the first word.
+
+    We get a whole lot of good matches this way. It helps in the
+    situation where the first word or two of a transaction description
+    is useful and the rest is garbage.
+
+    """
     if len(a) == 0 or len(b) == 0:
         return 0.0
     first_a = a.split()[0].strip()
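(Aside, not part of the patch: the first-word heuristic boils down to
something like the sketch below. The actual comparison line of
first_word_exact_match falls outside this hunk, so details such as case
handling are assumptions here.)

    def first_word_sketch(a: str, b: str) -> float:
        # Empty payees can't match anything.
        if not a or not b:
            return 0.0
        # Compare only the first whitespace-delimited token of each payee.
        return 1.0 if a.split()[0].strip() == b.split()[0].strip() else 0.0

    first_word_sketch('AIRBNB PAYMENTS INTL 123XYZ', 'AIRBNB 456ABC')  # -> 1.0
    first_word_sketch('AIRBNB PAYMENTS', 'PAYPAL TRANSFER')            # -> 0.0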
@@ -279,6 +290,7 @@ def first_word_exact_match(a: str, b: str) -> float:
 
 
 def payee_match(a: str, b: str) -> float:
+    """Score a match between two payees."""
     fuzzy_match = float(fuzz.token_set_ratio(a, b) / 100.00)
     first_word_match = first_word_exact_match(a, b)
     return max(fuzzy_match, first_word_match)
@@ -286,7 +298,6 @@ def payee_match(a: str, b: str) -> float:
 
 def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
     """Do these records represent the same transaction?"""
-
     date_score = date_proximity(r1['date'], r2['date'])
     if r1['date'] == r2['date']:
         date_message = ''
@@ -329,11 +340,20 @@ def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
 
 
 def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
-    """
-    Runs through all the statement transactions to find a matching transaction
-    in the books. If found, the books transaction is marked off so that it can
-    only be matched once. Some transactions will be matched, some will be on the
-    statement but not the books and some on the books but not the statement.
+    """Match transactions between the statement and books.
+
+    If matched, the books transaction is marked off so that it can
+    only be matched once. Some transactions will be matched, some will
+    be on the statement but not the books and some on the books but
+    not the statement.
+
+    Passes through any unmatched transactions.
+
+    Currently we use the same matching logic for all types of
+    statements. It's conceivable that you could have special cases to
+    accurately match some types of statements, but that would be more
+    work to maintain and test.
+
     """
     matches = []
     remaining_books_trans = []
@@ -363,9 +383,58 @@ def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dic
     return matches, remaining_statement_trans, remaining_books_trans
 
 
+def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
+    """Match single statement transactions with multiple books transactions.
+
+    Works similarly to match_statement_and_books in that it returns a
+    list of matches and lists of remaining statement and books
+    transactions.
+
+    """
+    matches = []
+    remaining_books_trans = []
+    remaining_statement_trans = []
+
+    groups = itertools.groupby(books_trans, key=lambda x: (x['date'], x['payee']))
+    for _, group in groups:
+        best_match_score = 0.0
+        best_match_index = None
+        best_match_note = []
+        matches_found = 0
+
+        group_items = list(group)
+        total = sum(x['amount'] for x in group_items)
+        r2 = copy.copy(group_items[0])
+        r2['amount'] = total
+        for i, r1 in enumerate(statement_trans):
+            score, note = records_match(r1, r2)
+            if score >= 0.5 and score >= best_match_score:
+                matches_found += 1
+                best_match_score = score
+                best_match_index = i
+                best_match_note = note
+        if (best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note) or best_match_score > 0.8:
+            matches.append(([statement_trans[best_match_index]], group_items, best_match_note))
+            if best_match_index is not None:
+                del statement_trans[best_match_index]
+        else:
+            remaining_books_trans.append(r2)
+    for r1 in statement_trans:
+        remaining_statement_trans.append(r1)
+    return matches, remaining_statement_trans, remaining_books_trans
+
+
+# TODO: Return list of tuples (instead of list of lists).
 def format_matches(matches: List, csv_statement: str, show_reconciled_matches: bool) -> List[List]:
+
+    """Produce a list of body output lines from the given matches.
+
+    The first column is a date so we can re-sort the list to put the
+    missing entries in the right place. The second column is the text
+    output.
+
+    """
     match_output = []
     for r1s, r2s, note in matches:
         note = ', '.join(note)
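(Aside, not part of the patch: subset_match leans on itertools.groupby,
which only groups adjacent items, so books_trans must already be sorted
by the grouping key - sort_records takes care of that upstream. With
made-up books entries:)

    import itertools

    books_trans = [  # hypothetical entries, pre-sorted by (date, payee)
        {'date': '2022-03-01', 'payee': 'Patreon', 'amount': 10},
        {'date': '2022-03-01', 'payee': 'Patreon', 'amount': 15},
        {'date': '2022-03-04', 'payee': 'Stripe', 'amount': 99},
    ]
    for _, group in itertools.groupby(books_trans, key=lambda x: (x['date'], x['payee'])):
        group_items = list(group)
        total = sum(x['amount'] for x in group_items)
        # 25 for the two Patreon rows, then 99; each combined total is
        # what gets scored against single statement transactions.
        print(total)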
@@ -387,15 +456,19 @@ def format_matches(matches: List, csv_statement: str, show_reconciled_matches: b
 
 
 def date_proximity(d1: datetime.date, d2: datetime.date) -> float:
+    """Score two dates based on how close together they are."""
+    ZERO_CUTOFF = 60  # Score will be zero for this many days apart.
     diff = abs(int((d1 - d2).days))
-    if diff > 60:
+    if diff >= ZERO_CUTOFF:
         return 0.0
     else:
-        return 1.0 - (diff / 60.0)
+        return 1.0 - (diff / ZERO_CUTOFF)
 
 
 def metadata_for_match(match: Tuple[List, List, List], statement_filename: str, csv_filename: str) -> List[Tuple[str, int, str]]:
-    # Can we really ever have multiple statement entries? Probably not.
+    """Return the bank-statement metadata that should be applied for a match."""
+    # TODO: Our data structure would allow multiple statement entries
+    # for a match, but would this ever make sense? Probably not.
     statement_filename = get_repo_relative_path(statement_filename)
     csv_filename = get_repo_relative_path(csv_filename)
     metadata = []
@@ -420,6 +493,9 @@ def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> No
         ...,
     ]
 
+    Beancount doesn't provide any infrastructure for programmatically
+    updating the books, only appending in the case of importers. So
+    we're on our own here.
     """
     file_contents: dict[str, list] = {}
     file_offsets: dict[str, int] = collections.defaultdict(int)
@@ -442,16 +518,29 @@ def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> No
 
 
 def get_repo_relative_path(path: str) -> str:
+    """Chop off the unique per-person CONSERVANCY_REPOSITORY prefix.
+
+    CSV and PDF statement metadata should be relative to
+    CONSERVANCY_REPOSITORY, i.e. without regard to exactly where on
+    your computer all the files live.
+
+    """
     return os.path.relpath(path, start=os.getenv('CONSERVANCY_REPOSITORY'))
 
 
 def parse_path(path: str) -> str:
+    """Validate that a file exists, for use in argparse."""
     if not os.path.exists(path):
         raise argparse.ArgumentTypeError(f'File {path} does not exist.')
     return path
 
 
 def parse_repo_relative_path(path: str) -> str:
+    """Validate that a file exists and is within $CONSERVANCY_REPOSITORY.
+
+    For use with argparse.
+
+    """
     if not os.path.exists(path):
         raise argparse.ArgumentTypeError(f'File {path} does not exist.')
     repo = os.getenv('CONSERVANCY_REPOSITORY')
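(Aside, not part of the patch: with a hypothetical checkout location,
the path-relativizing looks like this.)

    import os

    os.environ['CONSERVANCY_REPOSITORY'] = '/home/alice/conservancy'  # hypothetical
    print(os.path.relpath('/home/alice/conservancy/statements/2022-03.pdf',
                          start=os.getenv('CONSERVANCY_REPOSITORY')))
    # statements/2022-03.pdf - the same for everyone, wherever the repo lives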
@@ -484,6 +573,7 @@ def parse_arguments(argv: List[str]) -> argparse.Namespace:
 
 
 def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, decimal.Decimal, decimal.Decimal]:
+    """Calculate the totals of transactions matched/not-matched."""
     total_matched = decimal.Decimal(0)
     total_missing_from_books = decimal.Decimal(0)
     total_missing_from_statement = decimal.Decimal(0)
@@ -497,41 +587,8 @@ def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, dec
     return total_matched, total_missing_from_books, total_missing_from_statement
 
 
-def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
-    matches = []
-    remaining_books_trans = []
-    remaining_statement_trans = []
-
-    groups = itertools.groupby(books_trans, key=lambda x: (x['date'], x['payee']))
-    for _, group in groups:
-        best_match_score = 0.0
-        best_match_index = None
-        best_match_note = []
-        matches_found = 0
-
-        group_items = list(group)
-        total = sum(x['amount'] for x in group_items)
-        r2 = copy.copy(group_items[0])
-        r2['amount'] = total
-        for i, r1 in enumerate(statement_trans):
-            score, note = records_match(r1, r2)
-            if score >= 0.5 and score >= best_match_score:
-                matches_found += 1
-                best_match_score = score
-                best_match_index = i
-                best_match_note = note
-        if best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note or best_match_score > 0.8:
-            matches.append(([statement_trans[best_match_index]], group_items, best_match_note))
-            if best_match_index is not None:
-                del statement_trans[best_match_index]
-        else:
-            remaining_books_trans.append(r2)
-    for r1 in statement_trans:
-        remaining_statement_trans.append(r1)
-    return matches, remaining_statement_trans, remaining_books_trans
-
-
 def process_unmatched(statement_trans: List[dict], books_trans: List[dict]) -> List[Tuple[List, List, List]]:
+    """Format the remaining unmatched transactions to be added to one single list of matches."""
     matches: List[Tuple[List, List, List]] = []
     for r1 in statement_trans:
         matches.append(([r1], [], ['no match']))
@@ -551,11 +608,8 @@ def main(arglist: Optional[Sequence[str]] = None,
     config = configmod.Config()
     config.load_file()
 
-    # TODO: Should put in a sanity check to make sure the statement you're feeding
-    # in matches the account you've provided.
-
-    # TODO: Can we open the files first, then pass the streams on to the rest of the program?
-
+    # Validate and normalise the statement into our standard
+    # transaction data structure.
     if 'AMEX' in args.account:
         validate_csv = validate_amex_csv
         standardize_statement_record = standardize_amex_record
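(Aside, not part of the patch: the "standard transaction data
structure" is a plain dict. A hypothetical statement transaction, with
keys as used by format_record and records_match above and value types
assumed, looks roughly like:)

    import datetime
    import decimal

    statement_txn = {
        'date': datetime.date(2022, 3, 1),     # parsed statement date
        'amount': decimal.Decimal('-250.00'),  # assumed Decimal here
        'payee': 'ACME Hosting',               # free-text description
        'check_id': '1234',                    # empty when not a paper check
    }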
@@ -569,40 +623,43 @@ def main(arglist: Optional[Sequence[str]] = None,
         f.seek(0)
         statement_trans = read_transactions_from_csv(f, standardize_statement_record)
 
+    # Dates are taken from the beginning/end of the statement.
     begin_date = statement_trans[0]['date']
     end_date = statement_trans[-1]['date']
 
-    # Do we traverse and filter the in-memory entries list and filter that, or do we
-    # use Beancount Query Language (BQL) to get a list of transactions? Currently
-    # using BQL.
+    # Query the Beancount books for transactions within the above period.
+    #
+    # There are pros and cons to using Beancount's in-memory entries
+    # list directly versus using Beancount Query Language (BQL)
+    # to get a list of transactions. We use BQL because it's
+    # convenient, but we don't have access to the full transaction
+    # entry objects. Feels a bit strange that these approaches are so
+    # disconnected.
    #
     # beancount.query.query_compile.compile() and
     # beancount.query.query_execute.filter_entries() look useful in this respect,
     # but I'm not clear on how to use compile(). An example would help.
     entries, _, options = loader.load_file(args.beancount_file)
 
-
-    # books_balance_query = f"""SELECT sum(COST(position)) AS aa WHERE account = "{args.account}"
-    #     AND date <= {end_date.isoformat()}"""
-    # _, result_rows = run_query(entries, options, books_balance_query, numberify=True)
-    # books_balance = result_rows[0][0] if result_rows else 0
-
     # String concatenation looks bad, but there's no SQL injection possible here
     # because BQL can't write back to the Beancount files. I hope!
     query = f'SELECT filename, META("lineno") AS line, META("bank-statement") AS bank_statement, date, number(cost(position)), payee, ENTRY_META("entity") as entity, ANY_META("check-id") as check_id, narration where account = "{args.account}" and date >= {begin_date} and date <= {end_date}'
     _, result_rows = run_query(entries, options, query)
-
     books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])
 
+    # Apply two passes of matching, one for standard matches and one
+    # for subset matches.
     matches, remaining_statement_trans, remaining_books_trans = match_statement_and_books(statement_trans, books_trans)
-    subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(remaining_statement_trans, remaining_books_trans)
+    subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(
+        remaining_statement_trans, remaining_books_trans)
     matches.extend(subset_matches)
+
+    # Add the remaining unmatched to make one big list of matches, successful or not.
     unmatched = process_unmatched(remaining_statement_trans, remaining_books_trans)
     matches.extend(unmatched)
 
+    # Print out the results of our matching.
     match_output = format_matches(matches, args.csv_statement, args.show_reconciled_matches)
-
     _, total_missing_from_books, total_missing_from_statement = totals(matches)
-
     print('-' * 155)
     statement_heading = f'Statement transactions {begin_date} to {end_date}'
     print(f'{statement_heading:<52} {"Books transactions":<58} Notes')
@@ -615,7 +672,7 @@ def main(arglist: Optional[Sequence[str]] = None,
     print(f'Total: {total_missing_from_statement + total_missing_from_books:12,.2f}')
     print('-' * 155)
 
-    # Write statement metadata back to books
+    # Write statement metadata back to the books.
     metadata_to_apply = []
     for match in matches:
         metadata_to_apply.extend(metadata_for_match(match, args.bank_statement, args.csv_statement))
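(Aside, not part of the patch: the BQL round-trip above follows this
general shape - a minimal sketch with a hypothetical ledger path and
account. run_query comes from beancount.query.query and returns column
types plus plain row tuples, not full entry objects, which is the
trade-off the comment above mentions.)

    from beancount import loader
    from beancount.query.query import run_query

    entries, _, options = loader.load_file('books/2022.beancount')  # hypothetical path
    _, rows = run_query(
        entries, options,
        'SELECT date, payee, narration WHERE account = "Assets:Checking"')  # hypothetical account
    for row in rows:
        print(row)  # each row is a tuple of the selected columns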