From 86f4232df19e779d2a49bb38afd410b319d5fc87 2023-02-11 07:30:22
From: Ben Sturmfels <ben@sturm.com.au>
Date: 2023-02-11 07:30:22
Subject: [PATCH] reconciler: Move other score thresholds to constants

---

diff --git a/conservancy_beancount/reconcile/statement_reconciler.py b/conservancy_beancount/reconcile/statement_reconciler.py
index c874d4f12ebbaa42b6acd8c086d0ccc00ace46ed..bf83e56b68407a1f1921d4f94d77da327dc14968 100644
--- a/conservancy_beancount/reconcile/statement_reconciler.py
+++ b/conservancy_beancount/reconcile/statement_reconciler.py
@@ -152,8 +152,10 @@ JUNK_WORDS = [
 ]
 JUNK_WORDS_RES = [re.compile(word, re.IGNORECASE) for word in JUNK_WORDS]
 ZERO_RE = re.compile('^0+')
-FULL_MATCH_THRESHOLD = 0.8
-PARTIAL_MATCH_THRESHOLD = 0.4
+PAYEE_FULL_MATCH_THRESHOLD = 0.8
+PAYEE_PARTIAL_MATCH_THRESHOLD = 0.4
+OVERALL_EXCELLENT_MATCH_THRESHOLD = 0.8  # Clear winner
+OVERALL_ACCEPTABLE_MATCH_THRESHOLD = 0.5  # Acceptable if only one match found
 
 
 def remove_duplicate_words(text: str) -> str:
@@ -392,9 +394,9 @@ def records_match(r1: Dict, r2: Dict) -> Tuple[float, List[str]]:
     else:
         check_score = 0.0
         payee_score = payee_match(r1['payee'], r2['payee'])
-        if payee_score > FULL_MATCH_THRESHOLD:
+        if payee_score > PAYEE_FULL_MATCH_THRESHOLD:
             payee_message = ''
-        elif payee_score > PARTIAL_MATCH_THRESHOLD:
+        elif payee_score > PAYEE_PARTIAL_MATCH_THRESHOLD:
             payee_message = 'partial payee match'
         else:
             payee_message = 'payee mismatch'
@@ -435,16 +437,16 @@ def match_statement_and_books(
         matches_found = 0
         for i, r2 in enumerate(books_trans):
             score, note = records_match(r1, r2)
-            if score >= 0.5 and score >= best_match_score:
+            if score >= OVERALL_ACCEPTABLE_MATCH_THRESHOLD and score >= best_match_score:
                 matches_found += 1
                 best_match_score = score
                 best_match_index = i
                 best_match_note = note
         if (
-            best_match_score > 0.5
+            best_match_score > OVERALL_ACCEPTABLE_MATCH_THRESHOLD
             and matches_found == 1
             and 'check-id mismatch' not in best_match_note
-            or best_match_score > 0.8
+            or best_match_score > OVERALL_EXCELLENT_MATCH_THRESHOLD
         ):
             matches.append(([r1], [books_trans[best_match_index]], best_match_note))
             # Don't try to make a second match against this books entry.
@@ -484,16 +486,16 @@ def subset_match(
         r2['amount'] = total
         for i, r1 in enumerate(statement_trans):
             score, note = records_match(r1, r2)
-            if score >= 0.5 and score >= best_match_score:
+            if score >= OVERALL_ACCEPTABLE_MATCH_THRESHOLD and score >= best_match_score:
                 matches_found += 1
                 best_match_score = score
                 best_match_index = i
                 best_match_note = note
         if (
-            best_match_score > 0.5
+            best_match_score > OVERALL_ACCEPTABLE_MATCH_THRESHOLD
             and matches_found == 1
             and 'check-id mismatch' not in best_match_note
-            or best_match_score > 0.8
+            or best_match_score > OVERALL_EXCELLENT_MATCH_THRESHOLD
         ):
             matches.append(
                 ([statement_trans[best_match_index]], group_items, best_match_note)
@@ -795,6 +797,8 @@ def main(
         statement_trans = read_csv(f)
 
     # Dates are taken from the beginning/end of the statement.
+    # TODO: FR statements include the last day of the previous statement in the
+    # first row and the last day of this statement in the last row.
     begin_date = statement_trans[0]['date']
     end_date = statement_trans[-1]['date']