Changeset - b33c83af0a0f
[Not reviewed]
0 2 0
Brett Smith - 6 years ago 2018-06-20 19:59:56
brettcsmith@brettcsmith.org
importers: Catch and handle CSV parse errors in can_import.

import2ledger bailed on one of these exceptions when trying to import
a recent XLS file that had a null byte on a line.
2 files changed with 7 insertions and 3 deletions:
0 comments (0 inline, 0 general)
import2ledger/importers/_csv.py
Show inline comments
...
 
@@ -18,65 +18,69 @@ class CSVImporterBase:
 
      will be copied directly to the entry data dict before _read_row is called.
 
      Fields named here must exist in the CSV for it to be imported.
 
    * _read_header(cls, input_file): Some CSVs include "headers" with smaller
 
      rows before they get to the "real" data.  This classmethod is expected to
 
      read those rows and return two values: a dict of entry data read from
 
      the headers, and a list of column names for the real data.  The method
 
      is expected to leave input_file at the position where the real data
 
      starts, so callers can run ``csv.DictReader(input_file, column_names)``
 
      after.
 
      The default implementation reads rows until it finds one long enough to
 
      include all of the columns required by NEEDED_FIELDS and COPIED_FIELDS,
 
      then returns ({}, that_row).
 
    * _read_header_row(cls, row): A classmethod that returns either a dict,
 
      or None.  The default implementation of _read_header calls this method
 
      on each row.  If it returns a dict, those keys and values will be
 
      included in the entry data returned by _read_header.  If it returns
 
      None, _read_header expects this is the row with column names for the
 
      real data, and uses it in its return value.
 
    * Reader: A class that accepts the input source and iterates over rows of
 
      formatted data.  Default csv.reader.
 
    * DictReader: A class that accepts the input source and iterates over rows
 
      of data organized into dictionaries.  Default csv.DictReader.
 
    """
 
    # Class-wide entry data.  __iter__ places this last in the ChainMap it
    # yields, so any other source of entry data overrides it.
    ENTRY_SEED = {}

    # Maps a CSV column name to the entry key its value is copied to
    # (see the comprehension in __iter__).
    COPIED_FIELDS = {}

    # Row reader used by _read_header to scan the rows before the real data.
    Reader = csv.reader

    # Dict-based row reader used by __init__ to read the real data.
    DictReader = csv.DictReader
 

	
 
    @classmethod
    def _read_header_row(cls, row):
        """Decide whether ``row`` is a preliminary header row.

        A row shorter than ``cls._HEADER_MAX_LEN`` (set by _read_header) is
        treated as a preliminary header that contributes no entry data, so
        an empty dict is returned.  Any other row is taken to be the row of
        column names, which is signalled by returning None.
        """
        if len(row) < cls._HEADER_MAX_LEN:
            return {}
        return None
 

	
 
    @classmethod
    def _read_header(cls, input_file):
        """Read the rows that precede the real data in ``input_file``.

        Rows are read with ``cls.Reader`` and passed to _read_header_row one
        at a time.  Each dict it returns is merged into the header data; the
        first row for which it returns None stops the scan.

        As a side effect this sets ``cls._NEEDED_KEYS`` (the union of
        NEEDED_FIELDS and COPIED_FIELDS) and ``cls._HEADER_MAX_LEN`` (the
        size of that union).

        Returns a 2-tuple ``(header_data, last_row)``.  ``last_row`` is the
        last row read, or None when the input yielded no rows at all.
        """
        needed_keys = cls.NEEDED_FIELDS.union(cls.COPIED_FIELDS)
        cls._NEEDED_KEYS = needed_keys
        cls._HEADER_MAX_LEN = len(needed_keys)

        header_data = {}
        last_row = None
        for last_row in cls.Reader(input_file):
            parsed = cls._read_header_row(last_row)
            if parsed is None:
                # This row holds the column names; stop before the real data.
                break
            header_data.update(parsed)
        return header_data, last_row
 

	
 
    @classmethod
    def can_import(cls, input_file):
        """Return True if ``input_file`` looks like a CSV this class can import.

        The header is read with _read_header, and the file is importable when
        the column names it returns include every key in ``cls._NEEDED_KEYS``.
        A header with no column names (None) is treated as an empty set.

        A ``csv.Error`` raised while reading the header means the input is
        not parseable as CSV, so it is reported as not importable instead of
        being allowed to propagate.
        """
        try:
            _, fields = cls._read_header(input_file)
        except csv.Error:
            return False
        else:
            return cls._NEEDED_KEYS.issubset(fields or ())
 

	
 
    def __init__(self, input_file):
        """Prepare to read entries from ``input_file``.

        The header is consumed first with _read_header: the entry data it
        returns is kept as ``self.entry_seed``, and the column names it
        returns are handed to ``self.DictReader`` to read the remaining rows.
        """
        header_data, column_names = self._read_header(input_file)
        self.entry_seed = header_data
        self.in_csv = self.DictReader(input_file, column_names)
 

	
 
    def __iter__(self):
        """Yield one mapping of entry data per importable CSV row.

        Each row is passed to ``self._read_row``; rows for which it returns
        None are skipped.  For the rest, the columns named in COPIED_FIELDS
        are copied under their entry keys, and a ChainMap is yielded that
        looks up keys in this order: the row data, the copied fields, the
        header data in ``self.entry_seed``, then the class-wide ENTRY_SEED.
        """
        for csv_row in self.in_csv:
            entry = self._read_row(csv_row)
            if entry is None:
                continue
            copied = {}
            for source_column, target_key in self.COPIED_FIELDS.items():
                copied[target_key] = csv_row[source_column]
            yield collections.ChainMap(
                entry, copied, self.entry_seed, self.ENTRY_SEED)
setup.py
Show inline comments
 
#!/usr/bin/env python3
 

	
 
import sys
 

	
 
from setuptools import setup, find_packages
 

	
 
# Dependency keyword arguments, later unpacked into the setup() call.
REQUIREMENTS = dict(
    install_requires=[
        'babel',
        'enum34;python_version<"3.4"',
    ],
    setup_requires=['pytest-runner'],
    # Optional dependencies, grouped by the importer that needs them.
    extras_require=dict(
        brightfunds=['xlrd'],
        nbpy2017=['beautifulsoup4', 'html5lib'],
    ),
)

# Flatten every importer's optional dependencies into a single list.  This
# runs before 'all_importers' is added, so that extra never includes itself.
all_extras_require = [
    requirement
    for importer_reqs in REQUIREMENTS['extras_require'].values()
    for requirement in importer_reqs
]

REQUIREMENTS['extras_require']['all_importers'] = all_extras_require
# The tests exercise every importer, so they need all the optional
# dependencies on top of the test tooling itself.
REQUIREMENTS['tests_require'] = ['pytest', 'PyYAML'] + all_extras_require
 

	
 
# Package metadata.  The version is passed exactly once: repeating a keyword
# argument in a call is a SyntaxError, so the superseded '0.3' is dropped.
setup(
    name='import2ledger',
    description="Import different sources of financial data to Ledger",
    version='0.4',
    author='Brett Smith',
    author_email='brettcsmith@brettcsmith.org',
    license='GNU AGPLv3+',

    packages=find_packages(include=['import2ledger', 'import2ledger.*']),
    entry_points={
        'console_scripts': ['import2ledger = import2ledger.__main__:main'],
    },

    # install_requires, setup_requires, extras_require and tests_require.
    **REQUIREMENTS,
)
0 comments (0 inline, 0 general)