Changeset - 56b644f1db9b
[Not reviewed]
0 2 0
Brett Smith - 4 years ago 2020-05-06 14:26:25
brettcsmith@brettcsmith.org
meta_entity: More battle testing.

See the test cases for examples of real entities in the books
that we should accept for now.
2 files changed with 81 insertions and 28 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/plugin/meta_entity.py
Show inline comments
...
 
@@ -25,39 +25,51 @@ from . import core
 
from .. import data
 
from .. import errors as errormod
 
from ..beancount_types import (
 
    MetaKey,
 
    MetaValue,
 
    Transaction,
 
)
 

	
 
from typing import (
 
    MutableMapping,
 
    Optional,
 
    Pattern,
 
    Tuple,
 
)
 

	
 
class MetaEntity(core.TransactionHook):
 
    METADATA_KEY = 'entity'
 
    HOOK_GROUPS = frozenset(['posting', 'metadata', METADATA_KEY])
 

	
 
    # alnum is the set of characters we always accept in entity metadata:
 
    # letters and digits, minus the Latin 1 supplement (i.e., Roman letters
 
    # with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.)
 
    # chars is the set of characters we always accept in entity metadata:
 
    # letters, digits, and ASCII punctuation, except `-` and the Latin 1 supplement
 
    # (i.e., Roman letters with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.)
 
    # See the tests for specific cases.
 
    alnum = r'\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}'
 
    # A regexp that would be reasonably stricter would be:
 
    #   f'^[{alnum}][.{alnum}]*(?:-[.{alnum}])*$'
 
    # However, current producers fail that regexp in a few different ways.
 
    # See the tests for specific cases.
 
    ENTITY_RE: Pattern[str] = regex.compile(f'^[{alnum}][-.{alnum}]*$', regex.VERSION1)
 
    del alnum
 
    chars = r'\u0021-\u002c\u002e-\u007e\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}'
 
    ENTITY_RE: Pattern[str] = regex.compile(f'^[{chars}][-{chars}]*$', regex.VERSION1)
 
    ANONYMOUS_RE: Pattern[str] = regex.compile(r'^[-_.?!\s]*$', regex.VERSION1)
 
    del chars
 

	
 
    def _check_entity(self,
 
                      meta: MutableMapping[MetaKey, MetaValue],
 
                      default: Optional[str]=None,
 
    ) -> Tuple[Optional[str], Optional[bool]]:
 
        entity = meta.get(self.METADATA_KEY, default)
 
        if entity is None:
 
            return None, None
 
        elif not isinstance(entity, str):
 
            return None, False
 
        elif self.ANONYMOUS_RE.match(entity):
 
            entity = 'Anonymous'
 
            meta[self.METADATA_KEY] = entity
 
            return entity, True
 
        else:
 
            return entity, self.ENTITY_RE.match(entity) is not None
 

	
 
    def run(self, txn: Transaction) -> errormod.Iter:
 
        if data.is_opening_balance_txn(txn):
 
            return
 
        txn_entity = txn.meta.get(self.METADATA_KEY, txn.payee)
 
        if txn_entity is None:
 
            txn_entity_ok = None
 
        elif isinstance(txn_entity, str):
 
            txn_entity_ok = bool(self.ENTITY_RE.match(txn_entity))
 
        else:
 
            txn_entity_ok = False
 
        txn_entity, txn_entity_ok = self._check_entity(txn.meta, txn.payee)
 
        if txn_entity_ok is False:
 
            yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, txn_entity)
 
        for post in data.Posting.from_txn(txn):
...
 
@@ -68,10 +80,8 @@ class MetaEntity(core.TransactionHook):
 
                    'Liabilities:Payable',
 
            ):
 
                continue
 
            entity = post.meta.get(self.METADATA_KEY)
 
            if entity is None:
 
                yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, entity, post)
 
            elif entity is txn_entity:
 
            entity, entity_ok = self._check_entity(post.meta, txn_entity)
 
            if entity is txn_entity and entity is not None:
 
                pass
 
            elif not self.ENTITY_RE.match(entity):
 
            elif not entity_ok:
 
                yield errormod.InvalidMetadataError(txn, self.METADATA_KEY, entity, post)
tests/test_meta_entity.py
Show inline comments
...
 
@@ -37,8 +37,14 @@ VALID_VALUES = {
 
    'スミスダコタ',
 
    'スミス-ダコタ',
 
    'Яшин-Данила',
 
    # The PayPal importer produces . in entity metadata
 
    # Governments, using : as a hierarchy separator
 
    'BE',
 
    'US:KY',
 
    'CA:ON',
 
    # The PayPal importer allows ASCII punctuation in entity metadata
 
    'Du-Bois-W.-E.-B.',
 
    "O'Malley-Thomas",
 
    'O`Malley-Thomas',
 
    # import2ledger produces entities that end with -
 
    # That's probably a bug, but allow it for now.
 
    'foo-',
...
 
@@ -47,7 +53,6 @@ VALID_VALUES = {
 
INVALID_VALUES = {
 
    # Starting with a - is not allowed
 
    '-foo',
 
    '-',
 
    # Names that can be reduced to ASCII should be
 
    # Producers should change this to Uberentity or Ueberentity
 
    # I am not wild about this rule and would like to relax it—it's mostly
...
 
@@ -56,14 +61,25 @@ INVALID_VALUES = {
 
    # mangling producers are expected to do. But it's the rule for today.
 
    'Überentity',
 
    # Whitespace is never allowed
 
    ' ',
 
    'Alex Smith',
 
    '田中\u00A0流星',  # Non-breaking space
 
    # The only punctuation allowed is - and .
 
    'スミス_ダコタ',
 
    # Non-ASCII punctuation is not allowed
 
    'Яшин—Данила',  # em dash
 
    # An empty string is not valid
 
    'O’Malley-Thomas',  # Right-angled apostrophe
 
    'Du-Bois-W。-E。-B。',  # Japanese period
 
}
 

	
 
ANONYMOUS_VALUES = {
 
    # Values produced by various importers that should be translated to
 
    # Anonymous.
 
    '',
 
    ' ',
 
    '-',
 
    '--',
 
    '-----',
 
    '_',
 
    ' _ ',
 
    '.',
 
}
 

	
 
TEST_KEY = 'entity'
...
 
@@ -81,6 +97,15 @@ def test_valid_values_on_postings(hook, src_value):
 
    ])
 
    assert not any(hook.run(txn))
 

	
 
@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES)
 
def test_anonymous_values_on_postings(hook, src_value):
 
    txn = testutil.Transaction(postings=[
 
        ('Assets:Cash', -25),
 
        ('Expenses:General', 25, {TEST_KEY: src_value}),
 
    ])
 
    assert not any(hook.run(txn))
 
    assert txn.postings[-1].meta[TEST_KEY] == 'Anonymous'
 

	
 
@pytest.mark.parametrize('src_value', INVALID_VALUES)
 
def test_invalid_values_on_postings(hook, src_value):
 
    txn = testutil.Transaction(postings=[
...
 
@@ -99,6 +124,15 @@ def test_valid_values_on_transactions(hook, src_value):
 
    ])
 
    assert not any(hook.run(txn))
 

	
 
@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES)
 
def test_anonymous_values_on_transactions(hook, src_value):
 
    txn = testutil.Transaction(**{TEST_KEY: src_value}, postings=[
 
        ('Assets:Cash', -25),
 
        ('Expenses:General', 25),
 
    ])
 
    assert not any(hook.run(txn))
 
    assert txn.meta[TEST_KEY] == 'Anonymous'
 

	
 
@pytest.mark.parametrize('src_value', INVALID_VALUES)
 
def test_invalid_values_on_transactions(hook, src_value):
 
    txn = testutil.Transaction(**{TEST_KEY: src_value}, postings=[
...
 
@@ -118,6 +152,15 @@ def test_valid_values_on_payee(hook, src_value):
 
    ])
 
    assert not any(hook.run(txn))
 

	
 
@pytest.mark.parametrize('src_value', ANONYMOUS_VALUES)
 
def test_anonymous_values_on_payee(hook, src_value):
 
    txn = testutil.Transaction(payee=src_value, postings=[
 
        ('Assets:Cash', -25),
 
        ('Expenses:General', 25),
 
    ])
 
    assert not any(hook.run(txn))
 
    assert txn.meta[TEST_KEY] == 'Anonymous'
 

	
 
@pytest.mark.parametrize('src_value', INVALID_VALUES)
 
def test_invalid_values_on_payee(hook, src_value):
 
    txn = testutil.Transaction(payee=src_value, postings=[
0 comments (0 inline, 0 general)