add csv streaming parser

2025-08-06 22:32:54 +02:00 · 2021-10-24 12:24:44 -07:00
parent 0313e3f8ca
commit 2b887debc6
4 changed files with 10172 additions and 1 deletions
--- a/3
+++ b/3
@ -1,10 +1,11 @@
 jc changelog

-20211019 v1.17.1  *** in progress ***
+20211024 v1.17.1  *** in progress ***
 - Fix file parser for gzip files
 - Fix uname parser for cases where the 'processor' and/or 'hardware_platform' fields are missing on linux
 - Fix uname parser on FreeBSD
 - Add lsusb parser tested on linux
+- Add CSV file streaming parser

 20210923 v1.17.0
 - Note to Package Maintainers: please see note at 20210720 v1.16.0
--- a/jc/cli.py
+++ b/jc/cli.py
@ -53,6 +53,7 @@ parsers = [
    'crontab',
    'crontab-u',
    'csv',
+    'csv-s',
    'date',
    'df',
    'dig',
--- a/jc/parsers/csv_s.py
+++ b/jc/parsers/csv_s.py
@ -0,0 +1,168 @@
+"""jc - JSON CLI output utility `csv` file streaming parser
+
+The `csv` parser will attempt to automatically detect the delimiter character. If the delimiter cannot be detected it will default to comma. The first row of the file must be a header row.
+
+Note: The first 100 rows are read into memory to enable delimiter detection. Then the rest of the rows are loaded lazily.
+
+Usage (cli):
+
+    $ cat file.csv | jc --csv-s
+
+Usage (module):
+
+    import jc.parsers.csv_s
+    result = jc.parsers.csv_s.parse(csv_output)
+
+Schema:
+
+    csv file converted to a Dictionary: https://docs.python.org/3/library/csv.html
+
+  {
+    "column_name1":     string,
+    "column_name2":     string
+  }
+
+Examples:
+
+    $ cat homes.csv
+    "Sell", "List", "Living", "Rooms", "Beds", "Baths", "Age", "Acres", "Taxes"
+    142, 160, 28, 10, 5, 3,  60, 0.28,  3167
+    175, 180, 18,  8, 4, 1,  12, 0.43,  4033
+    129, 132, 13,  6, 3, 1,  41, 0.33,  1471
+    ...
+
+    $ cat homes.csv | jc --csv-s
+    {"Sell":"142","List":"160","Living":"28","Rooms":"10","Beds":"5","Baths":"3","Age":"60","Acres":"0.28","Taxes":"3167"}
+    {"Sell":"175","List":"180","Living":"18","Rooms":"8","Beds":"4","Baths":"1","Age":"12","Acres":"0.43","Taxes":"4033"}
+    {"Sell":"129","List":"132","Living":"13","Rooms":"6","Beds":"3","Baths":"1","Age":"41","Acres":"0.33","Taxes":"1471"}
+    ...
+"""
+import itertools
+import csv
+import jc.utils
+from jc.utils import stream_success, stream_error
+from jc.exceptions import ParseError
+
+
+class info():
+    """Provides parser metadata (version, description, author, supported platforms, streaming flag)"""
+    version = '1.0'  # also exposed as the module-level `__version__`
+    description = 'CSV file streaming parser'
+    author = 'Kelly Brazil'
+    author_email = 'kellyjonbrazil@gmail.com'
+    details = 'Using the python standard csv library'
+    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']  # passed to jc.utils.compatibility()
+    streaming = True  # NOTE(review): presumably tells the jc cli that parse() is a generator - confirm
+
+
+__version__ = info.version
+
+
+def _process(proc_data):
+    """
+    Final processing to conform to the schema. Currently a pass-through.
+
+    Parameters:
+
+        proc_data:   (Dictionary) one raw csv row from `parse()`, or (List of Dictionaries) all rows from `old_parse()`
+
+    Returns:
+
+        The same object that was passed in, unmodified.
+    """
+
+    # No further processing
+    return proc_data
+
+
+def old_parse(data, raw=False, quiet=False):
+    """
+    Non-streaming csv parser that reads all of the text and returns every row at once.
+
+    The dialect is sniffed from the first 1024 characters of `data`. When sniffing
+    fails, the reader is created with `dialect=None`.
+
+    Parameters:
+
+        data:        (string)  text data to parse
+        raw:         (boolean) output preprocessed JSON if True
+        quiet:       (boolean) suppress warning messages if True
+
+    Returns:
+
+        List of Dictionaries. Raw or processed structured data.
+    """
+    if not quiet:
+        jc.utils.compatibility(__name__, info.compatible)
+
+    # Blank lines are dropped before the text reaches the csv reader
+    lines = [line for line in data.splitlines() if line]
+    rows = []
+
+    if jc.utils.has_data(data):
+
+        # Best effort detection: any sniffing failure leaves the dialect unset
+        try:
+            detected = csv.Sniffer().sniff(data[:1024])
+        except Exception:
+            detected = None
+
+        # The first non-blank line supplies the column names for each row
+        rows.extend(csv.DictReader(lines, dialect=detected))
+
+    if raw:
+        return rows
+
+    # Processed output goes through the schema conformance step
+    return _process(rows)
+
+
+def parse(data, raw=False, quiet=False, ignore_exceptions=False):
+    """
+    Main text parsing generator function. Returns an iterator object.
+
+    Parameters:
+
+        data:              (iterable)  line-based text data to parse (e.g. sys.stdin or str.splitlines())
+        raw:               (boolean)   output preprocessed JSON if True
+        quiet:             (boolean)   suppress warning messages if True
+        ignore_exceptions: (boolean)   ignore parsing exceptions if True
+
+    Yields:
+
+        Dictionary. Raw or processed structured data.
+
+    Returns:
+
+        Iterator object
+    """
+    if not quiet:
+        jc.utils.compatibility(__name__, info.compatible)
+
+    # convert `data` to an iterator so a sequence (e.g. the list from str.splitlines()) is consumed
+    # as it is read. Otherwise the first 100 lines would be parsed twice when chained below.
+    data = iter(data)
+
+    # first, load the first 100 lines into a list to detect the CSV dialect
+    temp_list = list(itertools.islice(data, 100))
+    sniffdata = '\n'.join(temp_list)
+
+    dialect = None
+    try:
+        dialect = csv.Sniffer().sniff(sniffdata[:1024])
+    except Exception:
+        pass  # best effort: detection failed, so the reader is created with dialect=None
+
+    # chain `temp_list` and the remainder of `data` together to lazy load all of the CSV data
+    new_data = itertools.chain(temp_list, data)
+    reader = csv.DictReader(new_data, dialect=dialect)
+
+    for row in reader:
+        try:
+            if row:
+                yield stream_success(row, ignore_exceptions) if raw else stream_success(_process(row), ignore_exceptions)
+            else:
+                raise ParseError('Not CSV data')
+
+        except Exception as e:
+            yield stream_error(e, ignore_exceptions, row)
--- a/tests/fixtures/generic/csv-10k-sales-records.csv
+++ b/tests/fixtures/generic/csv-10k-sales-records.csv