mirror of https://github.com/kellyjonbrazil/jc.git

add csv streaming parser
@@ -1,10 +1,11 @@
 jc changelog
 
-20211019 v1.17.1 *** in progress ***
+20211024 v1.17.1 *** in progress ***
 - Fix file parser for gzip files
 - Fix uname parser for cases where the 'processor' and/or 'hardware_platform' fields are missing on linux
 - Fix uname parser on FreeBSD
 - Add lsusb parser tested on linux
+- Add CSV file streaming parser
 
 20210923 v1.17.0
 - Note to Package Maintainers: please see note at 20210720 v1.16.0
@@ -53,6 +53,7 @@ parsers = [
     'crontab',
     'crontab-u',
     'csv',
+    'csv-s',
     'date',
     'df',
     'dig',
168 jc/parsers/csv_s.py Normal file
@@ -0,0 +1,168 @@
"""jc - JSON CLI output utility `csv` file streaming parser

The `csv` parser will attempt to automatically detect the delimiter
character. If the delimiter cannot be detected, it will default to comma.
The first row of the file must be a header row.

Note: The first 100 rows are read into memory to enable delimiter
detection. The rest of the rows are then loaded lazily.

Usage (cli):

    $ cat file.csv | jc --csv-s

Usage (module):

    import jc.parsers.csv_s
    result = jc.parsers.csv_s.parse(csv_output.splitlines())    # result is an iterator object

Schema:

    csv file converted to a Dictionary: https://docs.python.org/3/library/csv.html

    {
      "column_name1":    string,
      "column_name2":    string
    }

Examples:

    $ cat homes.csv
    "Sell", "List", "Living", "Rooms", "Beds", "Baths", "Age", "Acres", "Taxes"
    142, 160, 28, 10, 5, 3, 60, 0.28, 3167
    175, 180, 18, 8, 4, 1, 12, 0.43, 4033
    129, 132, 13, 6, 3, 1, 41, 0.33, 1471
    ...

    $ cat homes.csv | jc --csv-s
    {"Sell":"142","List":"160","Living":"28","Rooms":"10","Beds":"5","Baths":"3","Age":"60","Acres":"0.28","Taxes":"3167"}
    {"Sell":"175","List":"180","Living":"18","Rooms":"8","Beds":"4","Baths":"1","Age":"12","Acres":"0.43","Taxes":"4033"}
    {"Sell":"129","List":"132","Living":"13","Rooms":"6","Beds":"3","Baths":"1","Age":"41","Acres":"0.33","Taxes":"1471"}
    ...
"""
import itertools
import csv
import jc.utils
from jc.utils import stream_success, stream_error
from jc.exceptions import ParseError


class info():
    """Provides parser metadata (version, author, etc.)"""
    version = '1.0'
    description = 'CSV file streaming parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    details = 'Using the python standard csv library'
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
    streaming = True


__version__ = info.version


def _process(proc_data):
    """
    Final processing to conform to the schema.

    Parameters:

        proc_data:   (List of Dictionaries) raw structured data to process

    Returns:

        List of Dictionaries. Each Dictionary represents a row in the csv file.
    """
    # No further processing
    return proc_data
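

# Note: old_parse() below is the original non-streaming implementation and is
# not called by the streaming parse() generator that follows it.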
def old_parse(data, raw=False, quiet=False):
    """
    Main text parsing function

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) output preprocessed JSON if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.
    """
    if not quiet:
        jc.utils.compatibility(__name__, info.compatible)

    raw_output = []
    cleandata = data.splitlines()

    # Clear any blank lines
    cleandata = list(filter(None, cleandata))

    if jc.utils.has_data(data):

        dialect = None
        try:
            dialect = csv.Sniffer().sniff(data[:1024])
        except Exception:
            pass

        reader = csv.DictReader(cleandata, dialect=dialect)

        for row in reader:
            raw_output.append(row)

    if raw:
        return raw_output
    else:
        return _process(raw_output)


def parse(data, raw=False, quiet=False, ignore_exceptions=False):
    """
    Main text parsing generator function. Returns an iterator object.

    Parameters:

        data:              (iterable)  line-based text data to parse (e.g. sys.stdin or str.splitlines())
        raw:               (boolean)   output preprocessed JSON if True
        quiet:             (boolean)   suppress warning messages if True
        ignore_exceptions: (boolean)   ignore parsing exceptions if True

    Yields:

        Dictionary. Raw or processed structured data.

    Returns:

        Iterator object
    """
    if not quiet:
        jc.utils.compatibility(__name__, info.compatible)

    temp_list = []

    # first, load the first 100 lines into a list to detect the CSV dialect
    for line in itertools.islice(data, 100):
        temp_list.append(line)
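
    # `data` may be a one-shot iterator (e.g. sys.stdin), so the buffered lines
    # above cannot be re-read from it; they are replayed below via
    # itertools.chain() so no rows are lost.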

    sniffdata = '\n'.join(temp_list)

    dialect = None
    try:
        dialect = csv.Sniffer().sniff(sniffdata[:1024])
    except Exception:
        pass
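
    # if the sniff fails, `dialect` stays None and csv.DictReader falls back to
    # its default comma-delimited behavior (the default documented in the
    # module docstring above)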

    # chain `temp_list` and `data` together to lazy load all of the CSV data
    new_data = itertools.chain(temp_list, data)
    reader = csv.DictReader(new_data, dialect=dialect)

    for row in reader:
        try:
            if row:
                yield stream_success(row, ignore_exceptions) if raw else stream_success(_process(row), ignore_exceptions)
            else:
                raise ParseError('Not CSV data')

        except Exception as e:
            yield stream_error(e, ignore_exceptions, row)
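
For a quick sanity check of the streaming behavior, here is a minimal usage sketch (not part of the commit; the inline `csv_text` sample and the `io.StringIO` wrapper are illustrative assumptions — any line-based iterable such as `sys.stdin` works the same way):

    import io
    import jc.parsers.csv_s

    # illustrative sample data; note the first row must be a header row
    csv_text = 'Sell,List,Living\n142,160,28\n175,180,18\n'

    # parse() is a generator: each iteration yields one row as a Dictionary
    for row in jc.parsers.csv_s.parse(io.StringIO(csv_text)):
        print(row)

    # should print something like:
    # {'Sell': '142', 'List': '160', 'Living': '28'}
    # {'Sell': '175', 'List': '180', 'Living': '18'}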
10001 tests/fixtures/generic/csv-10k-sales-records.csv (vendored, executable file)
File diff suppressed because it is too large