Changeset - 499f18ff623c
[Not reviewed]
0 3 0
Brett Smith - 4 years ago 2020-04-01 17:38:37
brettcsmith@brettcsmith.org
meta_entity: Adjust what entities are allowed based on today's books.

See the comments throughout for more discussion about what cases are
or aren't allowed, and why.
3 files changed with 52 insertions and 3 deletions:
0 comments (0 inline, 0 general)
conservancy_beancount/plugin/meta_entity.py
Show inline comments
...
 
@@ -11,25 +11,45 @@
 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
# GNU Affero General Public License for more details.
 
#
 
# You should have received a copy of the GNU Affero General Public License
 
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
 

	
 
import re
 
# Type stubs aren't available for regex.
 
# Fortunately, we're using it in a way that's API-compatible with the re
 
# module. We mitigate the lack of type stubs by providing type declarations
 
# for returned objects. This way, the only thing that isn't type checked are
 
# the calls to regex functions.
 
import regex  # type:ignore[import]
 

	
 
from . import core
 
from .. import data
 
from .. import errors as errormod
 
from ..beancount_types import (
 
    Transaction,
 
)
 

	
 
from typing import (
 
    Pattern,
 
)
 

	
 
class MetaEntity(core.TransactionHook):
 
    METADATA_KEY = 'entity'
 
    HOOK_GROUPS = frozenset(['posting', 'metadata', METADATA_KEY])
 
    ENTITY_RE = re.compile(r'^[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*$')
 

	
 
    # alnum is the set of characters we always accept in entity metadata:
 
    # letters and digits, minus the Latin 1 supplement (i.e., Roman letters
 
    # with diacritics: áÁàÀâÂåÅäÄãà çÇ ðÐ ñÑ øØ ß etc.)
 
    # See the tests for specific cases.
 
    alnum = r'\p{Letter}\p{Digit}--\p{Block=Latin_1_Supplement}'
 
    # A regexp that would be reasonably stricter would be:
 
    #   f'^[{alnum}][.{alnum}]*(?:-[.{alnum}]+)*$'
 
    # However, current producers fail that regexp in a few different ways.
 
    # See the tests for specific cases.
 
    ENTITY_RE: Pattern[str] = regex.compile(f'^[{alnum}][-.{alnum}]*$', regex.VERSION1)
 
    del alnum
 

	
 
    def run(self, txn: Transaction) -> errormod.Iter:
 
        txn_entity = txn.meta.get(self.METADATA_KEY)
 
        if txn_entity is None:
 
            txn_entity_ok = None
 
        elif isinstance(txn_entity, str):
setup.py
Show inline comments
...
 
@@ -9,12 +9,13 @@ setup(
 
    author='Software Freedom Conservancy',
 
    author_email='info@sfconservancy.org',
 
    license='GNU AGPLv3+',
 

	
 
    install_requires=[
 
        'beancount>=2.2',
 
        'regex',
 
        'rt>=2.0',
 
    ],
 
    setup_requires=[
 
        'pytest-mypy',
 
        'pytest-runner',
 
    ],
tests/test_meta_entity.py
Show inline comments
...
 
@@ -18,25 +18,53 @@ import pytest
 

	
 
from . import testutil
 

	
 
from conservancy_beancount.plugin import meta_entity
 

	
 
# Entity metadata values that the hook must accept.
VALID_VALUES = {
    # The classic form is LastName-FirstName. Case is not significant, and a
    # name may have any number of dash-separated parts.
    'Smith-Alex', 'boyd-danah', 'B-van-der-A',
    # One-word names are fine (various people and companies have them), and
    # digits may appear inside a part or as a part of their own.
    'Company19', 'Company-19',
    # Names with no ASCII at all are accepted, with or without dashes.
    '田中流星', '田中-流星',
    'スミスダコタ', 'スミス-ダコタ',
    'Яшин-Данила',
    # Quirks of current producers that are tolerated for now:
    # the PayPal importer emits "." inside entity metadata ...
    'Du-Bois-W.-E.-B.',
    # ... and import2ledger emits entities with a trailing "-". That is
    # probably a bug in the producer, but it is allowed for the time being.
    'foo-',
}
 

	
 
# Entity metadata values that the hook must reject.
#
# NOTE: a trailing dash ('foo-') is deliberately NOT listed here. The entity
# pattern introduced by this change accepts it, and the valid-value fixtures
# include it, so listing it as invalid as well would make the two fixture
# sets contradict each other.
INVALID_VALUES = {
    # Starting with a - is not allowed
    '-foo',
    '-',
    # Names that can be reduced to ASCII should be
    # Producers should change this to Uberentity or Ueberentity
    # I am not wild about this rule and would like to relax it—it's mostly
    # based on an expectation that entities are typed in by a human. That's true
    # less and less and it seems like we should reduce the amount of mangling
    # producers are expected to do. But it's the rule for today.
    'Überentity',
    # Whitespace is never allowed
    'Alex Smith',
    '田中 流星',
    'スミス ダコタ',
    'Яшин Данила',
    ' ',
    # An empty string is not valid
    '',
}
 

	
 
# Metadata key this test module targets. It is the same string as
# MetaEntity.METADATA_KEY in the plugin (presumably kept in sync by hand —
# verify if the plugin's key ever changes).
TEST_KEY = 'entity'
 

	
 
@pytest.fixture(scope='module')
0 comments (0 inline, 0 general)