Changeset - 581ef0fa2378
Ben Sturmfels (bsturmfels) - 2022-03-02 23:55:09
ben@sturm.com.au
reconcile: Update docs, move a few things around.
1 file changed with 127 insertions and 70 deletions:
conservancy_beancount/reconcile/statement_reconciler.py
...
 
@@ -44,23 +44,25 @@ similar directives are already present. This is a bit like diff-ing a
statement with the books (though we're only interested in the presence
of lines, not so much their order).

Paper checks are entered in the books when written (a.k.a. "posted"),
but may not be cashed until months later, sometimes causing
reconciliation differences that live beyond a month. It's worth noting
that there are really two dates here - the posting date and the
cleared date. Beancount only allows us to model one, which is why
carrying these reconciliation differences between months feels a bit
awkward.

Problems in scope:

 - errors in the books take hours to find during reconciliation,
-   requiring manually comparing statemnts and the books and are
+   requiring manually comparing statements and the books and are
   susceptible to mistakes, such as not noticing when there are two
   payments for the same amount on the statement, but not in the books
-   ("you're entering a world of pain")
+   (as Bradley likes to quote, "you're entering a world of pain")

 - adding statement/reconciliation metadata to books is/was manual and
   prone to mistakes

 - Beancount doesn't provide any infrastructure for programmatically
   updating the books, only appending in the case of importers

 - paper checks are entered in the books when written, but may not be
   cashed until months later (reconcile errors)

 - jumping to an individual transaction in a large ledger isn't
   trivial - Emacs grep mode is the current best option
...
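
To make the matching concrete, here is a minimal sketch of the standardized record shape the matcher operates on (field names taken from format_record below; the records themselves are hypothetical):

    import datetime
    import decimal

    # One statement line and one books posting that should score as a
    # match: equal amounts, nearby dates, similar payees.
    statement_record = {
        'date': datetime.date(2022, 1, 5),
        'amount': decimal.Decimal('-100.00'),
        'payee': 'ACME INC 4821',
        'check_id': '',
    }
    books_record = {
        'date': datetime.date(2022, 1, 3),
        'amount': decimal.Decimal('-100.00'),
        'payee': 'Acme',
        'check_id': '',
    }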
 
@@ -244,6 +246,7 @@ def standardize_beancount_record(row) -> Dict:  # type: ignore[no-untyped-def]


def format_record(record: dict) -> str:
+    """Generate output lines for a standard 1:1 match."""
    if record['payee'] and record['check_id']:
        output = f"{record['date'].isoformat()}: {record['amount']:11,.2f} {record['payee'][:25]} #{record['check_id']}".ljust(59)
    elif record['payee']:
...
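
As a usage sketch (hypothetical record), the payee-and-check branch above produces a fixed-width line:

    format_record({'date': datetime.date(2022, 1, 5),
                   'amount': decimal.Decimal('-1500.00'),
                   'payee': 'Acme Corporation',
                   'check_id': '1234'})
    # -> '2022-01-05:   -1,500.00 Acme Corporation #1234', left-justified
    # to 59 columns, presumably so the books column lines up beside it.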
 
@@ -254,6 +257,7 @@ def format_record(record: dict) -> str:


def format_multirecord(r1s: list[dict], r2s: list[dict], note: str) -> list[list]:
+    """Generates output lines for one statement:multiple books transaction match."""
    assert len(r1s) == 1
    assert len(r2s) > 1
    match_output = []
...
 
@@ -268,6 +272,13 @@ def sort_records(records: List) -> List:


def first_word_exact_match(a: str, b: str) -> float:
+    """Score a payee match based on the first word.
+
+    We get a whole lot of good matches this way. Helps in the
+    situation where the first word or two of a transaction description
+    is useful and the rest is garbage.
+
+    """
    if len(a) == 0 or len(b) == 0:
        return 0.0
    first_a = a.split()[0].strip()
...
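
The body is truncated above, but assuming it compares first_a against the first word of b and returns 1.0 on an exact match (0.0 otherwise), the intended behaviour is roughly:

    first_word_exact_match('ACME payroll 0012-XJ', 'ACME 873 PPD ID 9111')  # 1.0
    first_word_exact_match('GITHUB INC pmt 881', 'Microsoft 4411')          # 0.0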
 
@@ -279,6 +290,7 @@ def first_word_exact_match(a: str, b: str) -> float:


def payee_match(a: str, b: str) -> float:
+    """Score a match between two payees."""
    fuzzy_match = float(fuzz.token_set_ratio(a, b) / 100.00)
    first_word_match = first_word_exact_match(a, b)
    return max(fuzzy_match, first_word_match)
...
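
payee_match takes the better of the two signals. Assuming fuzz here is thefuzz/fuzzywuzzy's token_set_ratio (returning 0-100), a word-subset payee that a plain ratio would punish still scores well:

    # token_set_ratio treats one payee's tokens as a subset of the
    # other's, so this should score at or near 1.0.
    payee_match('GITHUB, INC.', 'GitHub')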
 
@@ -286,7 +298,6 @@ def payee_match(a: str, b: str) -> float:


def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
    """Do these records represent the same transaction?"""

    date_score = date_proximity(r1['date'], r2['date'])
    if r1['date'] == r2['date']:
        date_message = ''
...
 
@@ -329,11 +340,20 @@ def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:


def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
-    """
-    Runs through all the statement transactions to find a matching transaction
-    in the books. If found, the books transaction is marked off so that it can
-    only be matched once. Some transactions will be matched, some will be on the
-    statement but not the books and some on the books but not the statement.
+    """Match transactions between the statement and books.
+
+    If matched, the books transaction is marked off so that it can
+    only be matched once. Some transactions will be matched, some will
+    be on the statement but not the books and some on the books but
+    not the statement.
+
+    Passes through any unmatched transactions.
+
+    Currently we use the same matching logic for all types of
+    statements. It's conceivable that you could have special cases to
+    accurately match some types of statements, but that would be more
+    work to maintain and test.
+
    """
    matches = []
    remaining_books_trans = []
...
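
A hedged sketch of how a caller consumes the three return values (mirroring what main() does further down):

    matches, st_unmatched, bk_unmatched = match_statement_and_books(
        statement_trans, books_trans)
    for statement_records, books_records, notes in matches:
        # Each match pairs one statement record with one books record,
        # plus any caveat notes from records_match.
        print(statement_records, books_records, notes)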
 
@@ -363,9 +383,58 @@ def match_statement_and_books(statement_trans: List[Dict], books_trans: List[Dic
    return matches, remaining_statement_trans, remaining_books_trans


+def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
+    """Match single statement transactions with multiple books transactions.
+
+    Works similarly to match_statement_and_books in that it returns a
+    list of matches and lists of remaining statement and books
+    transactions.
+
+    """
+    matches = []
+    remaining_books_trans = []
+    remaining_statement_trans = []
+
+    groups = itertools.groupby(books_trans, key=lambda x: (x['date'], x['payee']))
+    for _, group in groups:
+        best_match_score = 0.0
+        best_match_index = None
+        best_match_note = []
+        matches_found = 0
+
+        group_items = list(group)
+        total = sum(x['amount'] for x in group_items)
+        r2 = copy.copy(group_items[0])
+        r2['amount'] = total
+        for i, r1 in enumerate(statement_trans):
+            score, note = records_match(r1, r2)
+            if score >= 0.5 and score >= best_match_score:
+                matches_found += 1
+                best_match_score = score
+                best_match_index = i
+                best_match_note = note
+        if best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note or best_match_score > 0.8:
+            matches.append(([statement_trans[best_match_index]], group_items, best_match_note))
+            if best_match_index is not None:
+                del statement_trans[best_match_index]
+        else:
+            remaining_books_trans.append(r2)
+    for r1 in statement_trans:
+        remaining_statement_trans.append(r1)
+    return matches, remaining_statement_trans, remaining_books_trans
+
+
+# TODO: Return list of tuples (instead of list of lists).
+

def format_matches(matches: List, csv_statement: str, show_reconciled_matches: bool) -> List[List]:

    """Produce a list of body output lines from the given matches.

    The first column is a date so we can re-sort the list to put the
    missing entries in the right place. The second column is the text
    output.

    """
    match_output = []
    for r1s, r2s, note in matches:
        note = ', '.join(note)
...
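
To illustrate the subset pass with hypothetical records: one statement debit covering two same-day books postings to the same payee. Note that itertools.groupby only groups adjacent records, which the sort_records ordering of books_trans provides.

    books = [
        {'date': datetime.date(2022, 1, 10), 'amount': decimal.Decimal('100.00'),
         'payee': 'Acme', 'check_id': ''},
        {'date': datetime.date(2022, 1, 10), 'amount': decimal.Decimal('200.00'),
         'payee': 'Acme', 'check_id': ''},
    ]
    statement = [
        {'date': datetime.date(2022, 1, 10), 'amount': decimal.Decimal('300.00'),
         'payee': 'ACME', 'check_id': ''},
    ]
    # The group is summed to 300.00 and compared against each statement
    # record via records_match, so this should come back as one match.
    matches, st_left, bk_left = subset_match(statement, books)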
 
@@ -387,15 +456,19 @@ def format_matches(matches: List, csv_statement: str, show_reconciled_matches: b


def date_proximity(d1: datetime.date, d2: datetime.date) -> float:
    """Scores two days based on how close they are together."""
+    ZERO_CUTOFF = 60  # Score will be zero for this many days apart.
    diff = abs(int((d1 - d2).days))
-    if diff > 60:
+    if diff >= ZERO_CUTOFF:
        return 0.0
    else:
-        return 1.0 - (diff / 60.0)
+        return 1.0 - (diff / ZERO_CUTOFF)


def metadata_for_match(match: Tuple[List, List, List], statement_filename: str, csv_filename: str) -> List[Tuple[str, int, str]]:
-    # Can we really ever have multiple statement entries? Probably not.
+    """Returns the bank-statement metadata that should be applied for a match."""
+    # TODO: Our data structure would allow multiple statement entries
+    # for a match, but would this ever make sense? Probably not.
    statement_filename = get_repo_relative_path(statement_filename)
    csv_filename = get_repo_relative_path(csv_filename)
    metadata = []
...
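
The linear falloff in date_proximity works out to, for example:

    from datetime import date

    date_proximity(date(2022, 3, 1), date(2022, 3, 1))   # 1.0 (same day)
    date_proximity(date(2022, 3, 1), date(2022, 3, 16))  # 0.75 (15 of 60 days)
    date_proximity(date(2022, 3, 1), date(2022, 6, 1))   # 0.0 (past the cutoff)
    # The switch from "> 60" to ">= ZERO_CUTOFF" doesn't change behaviour:
    # at exactly 60 days the old code already returned 1.0 - 60/60 == 0.0.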
 
@@ -420,6 +493,9 @@ def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> No
        ...,
    ]

+    Beancount doesn't provide any infrastructure for programmatically
+    updating the books, only appending in the case of importers. So
+    we're on our own here.
    """
    file_contents: dict[str, list] = {}
    file_offsets: dict[str, int] = collections.defaultdict(int)
...
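
The implementation is truncated above, so here is a minimal sketch of the offset bookkeeping that file_offsets suggests, assuming metadata_to_apply is ordered by line number within each file and lines are inserted below the flagged transaction line:

    import collections

    def apply_metadata(metadata_to_apply):
        # metadata_to_apply: [(filename, line_number, metadata_line), ...]
        file_contents = {}
        file_offsets = collections.defaultdict(int)
        for filename, lineno, metadata in metadata_to_apply:
            if filename not in file_contents:
                with open(filename) as f:
                    file_contents[filename] = f.read().splitlines()
            # Each earlier insertion in this file shifts later line
            # numbers down by one, hence the running offset.
            file_contents[filename].insert(lineno + file_offsets[filename], metadata)
            file_offsets[filename] += 1
        return file_contents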
 
@@ -442,16 +518,29 @@ def write_metadata_to_books(metadata_to_apply: List[Tuple[str, int, str]]) -> No


def get_repo_relative_path(path: str) -> str:
    """Chop off the unique per-person CONSERVANCY_REPOSITORY.
+
+    CSV and PDF statement metadata should be relative to
+    CONSERVANCY_REPOSITORY, i.e. without regard to exactly where on
+    your computer all the files live.
+
    """
    return os.path.relpath(path, start=os.getenv('CONSERVANCY_REPOSITORY'))


def parse_path(path: str) -> str:
    """Validate that a file exists for use in argparse."""
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
    return path


+def parse_repo_relative_path(path: str) -> str:
+    """Validate that a file exists and is within $CONSERVANCY_REPOSITORY.
+
+    For use with argparse.
+
+    """
+    if not os.path.exists(path):
+        raise argparse.ArgumentTypeError(f'File {path} does not exist.')
+    repo = os.getenv('CONSERVANCY_REPOSITORY')
...
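
For example, with the environment variable set (hypothetical paths):

    os.environ['CONSERVANCY_REPOSITORY'] = '/home/ben/conservancy'
    get_repo_relative_path('/home/ben/conservancy/statements/AMEX/2022-01.csv')
    # -> 'statements/AMEX/2022-01.csv'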
 
@@ -484,6 +573,7 @@ def parse_arguments(argv: List[str]) -> argparse.Namespace:


def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, decimal.Decimal, decimal.Decimal]:
+    """Calculate the totals of transactions matched/not-matched."""
    total_matched = decimal.Decimal(0)
    total_missing_from_books = decimal.Decimal(0)
    total_missing_from_statement = decimal.Decimal(0)
...
 
@@ -497,41 +587,8 @@ def totals(matches: List[Tuple[List, List, List]]) -> Tuple[decimal.Decimal, dec
    return total_matched, total_missing_from_books, total_missing_from_statement


-def subset_match(statement_trans: List[dict], books_trans: List[dict]) -> Tuple[List[Tuple[List, List, List]], List[Dict], List[Dict]]:
-    matches = []
-    remaining_books_trans = []
-    remaining_statement_trans = []
-
-    groups = itertools.groupby(books_trans, key=lambda x: (x['date'], x['payee']))
-    for _, group in groups:
-        best_match_score = 0.0
-        best_match_index = None
-        best_match_note = []
-        matches_found = 0
-
-        group_items = list(group)
-        total = sum(x['amount'] for x in group_items)
-        r2 = copy.copy(group_items[0])
-        r2['amount'] = total
-        for i, r1 in enumerate(statement_trans):
-            score, note = records_match(r1, r2)
-            if score >= 0.5 and score >= best_match_score:
-                matches_found += 1
-                best_match_score = score
-                best_match_index = i
-                best_match_note = note
-        if best_match_score > 0.5 and matches_found == 1 and 'check-id mismatch' not in best_match_note or best_match_score > 0.8:
-            matches.append(([statement_trans[best_match_index]], group_items, best_match_note))
-            if best_match_index is not None:
-                del statement_trans[best_match_index]
-        else:
-            remaining_books_trans.append(r2)
-    for r1 in statement_trans:
-        remaining_statement_trans.append(r1)
-    return matches, remaining_statement_trans, remaining_books_trans
-
-
 def process_unmatched(statement_trans: List[dict], books_trans: List[dict]) -> List[Tuple[List, List, List]]:
    """Format the remaining unmatched transactions to be added to one single list of matches."""
    matches: List[Tuple[List, List, List]] = []
    for r1 in statement_trans:
        matches.append(([r1], [], ['no match']))
...
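
The books-side loop is truncated above, but by symmetry with the statement side, process_unmatched should produce output shaped like this (st1 and bk1 are hypothetical leftover records):

    process_unmatched([st1], [bk1])
    # -> [([st1], [], ['no match']), ([], [bk1], ['no match'])]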
 
@@ -551,11 +608,8 @@ def main(arglist: Optional[Sequence[str]] = None,
        config = configmod.Config()
        config.load_file()

-    # TODO: Should put in a sanity check to make sure the statement you're feeding
-    # in matches the account you've provided.
-
-    # TODO: Can we open the files first, then pass the streams on to the rest of the program?
-
+    # Validate and normalise the statement into our standard
+    # transaction data structure.
    if 'AMEX' in args.account:
        validate_csv = validate_amex_csv
        standardize_statement_record = standardize_amex_record
...
 
@@ -569,40 +623,43 @@ def main(arglist: Optional[Sequence[str]] = None,
        f.seek(0)
        statement_trans = read_transactions_from_csv(f, standardize_statement_record)

    # Dates are taken from the beginning/end of the statement.
    begin_date = statement_trans[0]['date']
    end_date = statement_trans[-1]['date']

-    # Do we traverse and filter the in-memory entries list and filter that, or do we
-    # use Beancount Query Language (BQL) to get a list of transactions? Currently
-    # using BQL.
+    # Query the Beancount books data for the above period.
+    #
+    # There are pros and cons for using Beancount's in-memory entries
+    # list directly and also for using Beancount Query Language (BQL)
+    # to get a list of transactions. Using BQL because it's
+    # convenient, but we don't have access to the full transaction
+    # entry objects. Feels a bit strange that these approaches are so
+    # disconnected.
+    #
+    # beancount.query.query_compile.compile() and
+    # beancount.query.query_execute.filter_entries() look useful in this respect,
+    # but I'm not clear on how to use compile(). An example would help.
    entries, _, options = loader.load_file(args.beancount_file)

    # books_balance_query = f"""SELECT sum(COST(position)) AS aa WHERE account = "{args.account}"
    #     AND date <= {end_date.isoformat()}"""
    # _, result_rows = run_query(entries, options, books_balance_query, numberify=True)
    # books_balance = result_rows[0][0] if result_rows else 0

    # String concatenation looks bad, but there's no SQL injection possible here
    # because BQL can't write back to the Beancount files. I hope!
    query = f'SELECT filename, META("lineno") AS line, META("bank-statement") AS bank_statement, date, number(cost(position)), payee, ENTRY_META("entity") as entity, ANY_META("check-id") as check_id, narration where account = "{args.account}" and date >= {begin_date} and date <= {end_date}'
    _, result_rows = run_query(entries, options, query)

    books_trans = sort_records([standardize_beancount_record(row) for row in result_rows])

    # Apply two passes of matching, one for standard matches and one
    # for subset matches.
    matches, remaining_statement_trans, remaining_books_trans = match_statement_and_books(statement_trans, books_trans)
-    subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(remaining_statement_trans, remaining_books_trans)
+    subset_matches, remaining_statement_trans, remaining_books_trans = subset_match(
+        remaining_statement_trans, remaining_books_trans)
    matches.extend(subset_matches)

    # Add the remaining unmatched to make one big list of matches, successful or not.
    unmatched = process_unmatched(remaining_statement_trans, remaining_books_trans)
    matches.extend(unmatched)

    # Print out results of our matching.
    match_output = format_matches(matches, args.csv_statement, args.show_reconciled_matches)

    _, total_missing_from_books, total_missing_from_statement = totals(matches)

    print('-' * 155)
    statement_heading = f'Statement transactions {begin_date} to {end_date}'
    print(f'{statement_heading:<52}            {"Books transactions":<58}   Notes')
...
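
On the entries-versus-BQL question raised in the comments above, a rough sketch of the in-memory alternative (untested; it yields full Transaction objects rather than BQL result rows):

    from beancount.core import data

    def account_entries(entries, account, begin_date, end_date):
        for entry in entries:
            if (isinstance(entry, data.Transaction)
                    and begin_date <= entry.date <= end_date
                    and any(p.account == account for p in entry.postings)):
                yield entry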
 
@@ -615,7 +672,7 @@ def main(arglist: Optional[Sequence[str]] = None,
    print(f'Total:                      {total_missing_from_statement + total_missing_from_books:12,.2f}')
    print('-' * 155)

-    # Write statement metadata back to books
+    # Write statement metadata back to the books.
    metadata_to_apply = []
    for match in matches:
        metadata_to_apply.extend(metadata_for_match(match, args.bank_statement, args.csv_statement))