From 499f18ff623c954558d59648653a8e0708baa460 2020-04-01 17:38:37 From: Brett Smith Date: 2020-04-01 17:38:37 Subject: [PATCH] meta_entity: Adjust what entities are allowed based on today's books. See the comments throughout for more discussion about what cases are or aren't allowed, and why. --- diff --git a/conservancy_beancount/plugin/meta_entity.py b/conservancy_beancount/plugin/meta_entity.py index 4096a65d8e170c432584e8feed157ec0677b8cca..ee655f41111fc60745f0a2241291bc752f47a26f 100644 --- a/conservancy_beancount/plugin/meta_entity.py +++ b/conservancy_beancount/plugin/meta_entity.py @@ -14,7 +14,12 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -import re +# Type stubs aren't available for regex. +# Fortunately, we're using it in a way that's API-compatible with the re +# module. We mitigate the lack of type stubs by providing type declarations +# for returned objects. This way, the only things that aren't type checked are +# the calls to regex functions. +import regex # type:ignore[import] from . import core from .. import data @@ -23,10 +28,25 @@ from ..beancount_types import ( Transaction, ) +from typing import ( + Pattern, +) + class MetaEntity(core.TransactionHook): METADATA_KEY = 'entity' HOOK_GROUPS = frozenset(['posting', 'metadata', METADATA_KEY]) - ENTITY_RE = re.compile(r'^[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*$') + + # alnum is the set of characters we always accept in entity metadata: + # letters and digits, minus the Latin-1 Supplement block (i.e., Roman letters + # with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.) + # See the tests for specific cases. + alnum = r'\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}' + # A reasonably stricter regexp would be: + # f'^[{alnum}][.{alnum}]*(?:-[.{alnum}]+)*$' + # However, current producers fail that regexp in a few different ways. + # See the tests for specific cases.
+ ENTITY_RE: Pattern[str] = regex.compile(f'^[{alnum}][-.{alnum}]*$', regex.VERSION1) + del alnum def run(self, txn: Transaction) -> errormod.Iter: txn_entity = txn.meta.get(self.METADATA_KEY) diff --git a/setup.py b/setup.py index c473325b952b3c274357359d02854c4d6cb8ffd2..565125c5fc5d61b790434ded4aa68df32a6486a7 100755 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ setup( install_requires=[ 'beancount>=2.2', + 'regex', 'rt>=2.0', ], setup_requires=[ diff --git a/tests/test_meta_entity.py b/tests/test_meta_entity.py index 0b755955fc824ffee81fd2e15096d27e90f6afac..f20e0abf5a09d4a35c81e05d49efa87d938612f8 100644 --- a/tests/test_meta_entity.py +++ b/tests/test_meta_entity.py @@ -21,19 +21,47 @@ from . import testutil from conservancy_beancount.plugin import meta_entity VALID_VALUES = { + # Classic entity: LastName-FirstName 'Smith-Alex', + # Various people and companies have one-word names + # Digits are allowed, as part of a name or standalone 'Company19', + 'Company-19', + # No case requirements 'boyd-danah', + # No limit on the number of parts of the name 'B-van-der-A', + # Names that have no ASCII are allowed, with or without dash separators + '田中流星', + '田中-流星', + 'スミスダコタ', + 'スミス-ダコタ', + 'Яшин-Данила', + # The PayPal importer produces . in entity metadata + 'Du-Bois-W.-E.-B.', + # import2ledger produces entities that end with - + # That's probably a bug, but allow it for now. + 'foo-', } INVALID_VALUES = { + # Starting with a - is not allowed '-foo', - 'foo-', '-', + # Names that can be reduced to ASCII should be + # Producers should change this to Uberentity or Ueberentity + # I am not wild about this rule and would like to relax it—it's mostly + # based on an expectation that entities are typed in by a human. That's true + # less and less and it seems like we should reduce the amount of mangling + # producers are expected to do. But it's the rule for today. 
'Überentity', + # Whitespace is never allowed 'Alex Smith', + '田中 流星', + 'スミス ダコタ', + 'Яшин Данила', ' ', + # An empty string is not valid '', }