From 87f32091019ebfae9ce2ad4d03c74b4733139ad8 2017-12-19 17:43:59
From: Brett Smith <brettcsmith@brettcsmith.org>
Date: 2017-12-19 17:43:59
Subject: [PATCH] strparse: Improve extra text parsing in currency_decimal.

This allows a currency symbol and a currency code to appear in different places
around the number (e.g. '$5.00 USD'), as long as there's at most one of each.

---

diff --git a/import2ledger/strparse.py b/import2ledger/strparse.py
index e5ef5daca23dcf9d6945ddbcccc1c24070eb095d..d72e5786af7154994aab9b905f3812a4ed1dff00 100644
--- a/import2ledger/strparse.py
+++ b/import2ledger/strparse.py
@@ -6,8 +6,14 @@ import unicodedata
 
 import babel.numbers
 
+CURRENCY_SPEC_PATTERN = r'^{space}(?:|{symbol}{space}{code}|{code}{space}{symbol}){space}$'.format(
+    code=r'[A-Za-z]{,3}',
+    space=r'\s*',
+    symbol=r'(\W?)',
+)
+
 @functools.lru_cache()
-def _currency_pattern(locale):
+def _currency_amount_pattern(locale):
     minus = babel.numbers.get_minus_sign_symbol(locale)
     plus = babel.numbers.get_plus_sign_symbol(locale)
     dec_sym = babel.numbers.get_decimal_symbol(locale)
@@ -21,27 +27,22 @@ def _currency_pattern(locale):
 
 def currency_decimal(s, locale='en_US_POSIX'):
     try:
-        match = re.search(_currency_pattern(locale), s)
+        match = re.search(_currency_amount_pattern(locale), s)
     except TypeError:
         return decimal.Decimal(s)
     if not match:
         raise ValueError("no decimal found in {!r}".format(s))
-    # There may be extra symbols/text before the number, after the number,
-    # or between the number and its sign—but only in one of those places.
-    extra = None
-    for extra_s in [s[:match.start()], match.group(2), s[match.end():]]:
-        extra_s = extra_s.strip()
-        if extra and extra_s:
-            raise ValueError("too much extraneous text in {!r}".format(s))
-        extra = extra_s
-    # The only extra text allowed is currency specifiers like plain symbols,
-    # 'A$', 'US$', 'CAD', 'USD $', etc.
-    # Trim any currency symbol.
-    if extra and unicodedata.category(extra[-1]) == 'Sc':
-        extra = extra[:-1].strip()
-    # Anything remaining should look like currency specifier text.
-    if extra and ((len(extra) > 3) or (not extra.isalpha())):
-        raise ValueError("non-currency text in {!r}: {!r}".format(s, extra))
+    extra_s = ''.join([s[:match.start()], match.group(2), s[match.end():]])
+    # The only extra text allowed is currency specifiers:
+    # '€', 'A$', 'US$', 'CAD', '$USD', etc.
+    extra_match = re.match(CURRENCY_SPEC_PATTERN, extra_s)
+    if not extra_match:
+        extra_ok = False
+    else:
+        symbol = extra_match.group(1) or extra_match.group(2)
+        extra_ok = (not symbol) or (unicodedata.category(symbol) == 'Sc')
+    if not extra_ok:
+        raise ValueError("non-currency text in {!r}: {!r}".format(s, extra_s))
     return babel.numbers.parse_decimal(match.group(1) + match.group(3), locale)
 
 def date(date_s, date_fmt):