From 87f32091019ebfae9ce2ad4d03c74b4733139ad8 2017-12-19 17:43:59
From: Brett Smith
Date: 2017-12-19 17:43:59
Subject: [PATCH] strparse: Improve extra text parsing in currency_decimal.

This allows a symbol and a currency code to be in different parts of the
string, as long as there's at most one of each.
---

diff --git a/import2ledger/strparse.py b/import2ledger/strparse.py
index e5ef5daca23dcf9d6945ddbcccc1c24070eb095d..d72e5786af7154994aab9b905f3812a4ed1dff00 100644
--- a/import2ledger/strparse.py
+++ b/import2ledger/strparse.py
@@ -6,8 +6,14 @@ import unicodedata
 
 import babel.numbers
 
+CURRENCY_SPEC_PATTERN = r'^{space}(?:|{symbol}{space}{code}|{code}{space}{symbol}){space}$'.format(
+    code=r'[A-Za-z]{,3}',
+    space=r'\s*',
+    symbol=r'(\W?)',
+)
+
 @functools.lru_cache()
-def _currency_pattern(locale):
+def _currency_amount_pattern(locale):
     minus = babel.numbers.get_minus_sign_symbol(locale)
     plus = babel.numbers.get_plus_sign_symbol(locale)
     dec_sym = babel.numbers.get_decimal_symbol(locale)
@@ -21,27 +27,22 @@ def _currency_pattern(locale):
 
 def currency_decimal(s, locale='en_US_POSIX'):
     try:
-        match = re.search(_currency_pattern(locale), s)
+        match = re.search(_currency_amount_pattern(locale), s)
     except TypeError:
         return decimal.Decimal(s)
     if not match:
         raise ValueError("no decimal found in {!r}".format(s))
-    # There may be extra symbols/text before the number, after the number,
-    # or between the number and its sign—but only in one of those places.
-    extra = None
-    for extra_s in [s[:match.start()], match.group(2), s[match.end():]]:
-        extra_s = extra_s.strip()
-        if extra and extra_s:
-            raise ValueError("too much extraneous text in {!r}".format(s))
-        extra = extra_s
-    # The only extra text allowed is currency specifiers like plain symbols,
-    # 'A$', 'US$', 'CAD', 'USD $', etc.
-    # Trim any currency symbol.
-    if extra and unicodedata.category(extra[-1]) == 'Sc':
-        extra = extra[:-1].strip()
-    # Anything remaining should look like currency specifier text.
-    if extra and ((len(extra) > 3) or (not extra.isalpha())):
-        raise ValueError("non-currency text in {!r}: {!r}".format(s, extra))
+    extra_s = ''.join([s[:match.start()], match.group(2), s[match.end():]])
+    # The only extra text allowed is currency specifiers:
+    # '€', 'A$', 'US$', 'CAD', '$USD', etc.
+    extra_match = re.match(CURRENCY_SPEC_PATTERN, extra_s)
+    if not extra_match:
+        extra_ok = False
+    else:
+        symbol = extra_match.group(1) or extra_match.group(2)
+        extra_ok = (not symbol) or (unicodedata.category(symbol) == 'Sc')
+    if not extra_ok:
+        raise ValueError("non-currency text in {!r}: {!r}".format(s, extra_s))
     return babel.numbers.parse_decimal(match.group(1) + match.group(3), locale)
 
 def date(date_s, date_fmt):