# jc/jc/parsers/csv_s.py

"""jc - JSON CLI output utility `csv` file streaming parser

> This streaming parser outputs JSON Lines

The `csv` streaming parser will attempt to automatically detect the
delimiter character. If the delimiter cannot be detected it will default
to comma. The first row of the file must be a header row.

Note: The first 100 rows are read into memory to enable delimiter detection,
then the rest of the rows are loaded lazily.

Usage (cli):

    $ cat file.csv | jc --csv-s

Usage (module):

    import jc
    # result is an iterable object (generator)
    result = jc.parse('csv_s', csv_output.splitlines())
    for item in result:
        # do something

    or

    import jc.parsers.csv_s
    # result is an iterable object (generator)
    result = jc.parsers.csv_s.parse(csv_output.splitlines())
    for item in result:
        # do something

Schema:

    csv file converted to a Dictionary:
    https://docs.python.org/3/library/csv.html

    {
      "column_name1":     string,
      "column_name2":     string,

      # below object only exists if using -qq or ignore_exceptions=True

      "_jc_meta":
        {
          "success":      boolean,     # false if error parsing
          "error":        string,      # exists if "success" is false
          "line":         string       # exists if "success" is false
        }
    }

Examples:

    $ cat homes.csv
    "Sell", "List", "Living", "Rooms", "Beds", "Baths", "Age", "Acres"...
    142, 160, 28, 10, 5, 3,  60, 0.28,  3167
    175, 180, 18,  8, 4, 1,  12, 0.43,  4033
    129, 132, 13,  6, 3, 1,  41, 0.33,  1471
    ...

    $ cat homes.csv | jc --csv-s
    {"Sell":"142","List":"160","Living":"28","Rooms":"10","Beds":"5"...}
    {"Sell":"175","List":"180","Living":"18","Rooms":"8","Beds":"4"...}
    {"Sell":"129","List":"132","Living":"13","Rooms":"6","Beds":"3"...}
    ...
"""
import itertools
import csv
import jc.utils
from jc.utils import ignore_exceptions_msg, add_jc_meta
from jc.exceptions import ParseError


class info:
    """Static metadata describing this parser (version, author, platforms)."""

    # release and human-readable identification
    version = '1.3'
    description = 'CSV file streaming parser'

    # maintainer contact details
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'

    # implementation note: parsing is delegated to the stdlib `csv` module
    details = 'Using the python standard csv library'

    # platform identifiers passed to the compatibility check in `parse()`
    compatible = [
        'linux',
        'darwin',
        'cygwin',
        'win32',
        'aix',
        'freebsd',
    ]

    # marks this module as a streaming (line-by-line) parser
    streaming = True


# Expose the parser version at module level, mirroring `info.version`.
__version__ = info.version


def _process(proc_data):
    """
    Final processing to conform to the schema.

    Parameters:

        proc_data:   (List of Dictionaries) raw structured data to process

    Returns:

        List of Dictionaries. Each Dictionary represents a row in the csv
        file.
    """
    # No further processing
    return proc_data


@add_jc_meta
def parse(data, raw=False, quiet=False, ignore_exceptions=False):
    """
    Main text parsing generator function. Returns an iterator object.

    Up to the first 100 input lines are buffered (with trailing whitespace
    stripped) so the csv dialect can be sniffed from at most the first
    1024 characters of that sample. The buffered lines are then chained
    back in front of the remaining input, which is consumed lazily. The
    first row is used as the header row by `csv.DictReader`.

    Parameters:

        data:              (iterable)  line-based text data to parse
                                       (e.g. sys.stdin or str.splitlines())

        raw:               (boolean)   unprocessed output if True
        quiet:             (boolean)   suppress warning messages if True
        ignore_exceptions: (boolean)   ignore parsing exceptions if True

    Yields:

        Dictionary. Raw or processed structured data, one per csv row.
        If an exception is caught while emitting a row and
        `ignore_exceptions` is True, a tuple of (exception, str(row)) is
        yielded instead.
        NOTE(review): presumably the `add_jc_meta` decorator converts that
        tuple into the `_jc_meta` object - confirm in jc.utils.

    Raises:

        ParseError: when exactly one line could be read from the input,
                    which is treated as a failure to detect line endings.

    Returns:

        Iterator object
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.streaming_input_type_check(data)

    # Work on a true iterator: a sequence such as a list would otherwise be
    # restarted from the top when chained back in below, double-processing
    # the buffered lines.
    line_iter = iter(data)

    # Buffer the head of the input; this sample drives dialect detection.
    head_lines = [text.rstrip() for text in itertools.islice(line_iter, 100)]

    # A lone line hints at the Python bug where `\r` newlines coming from
    # sys.stdin are not split correctly.
    # https://bugs.python.org/issue45617
    if len(head_lines) == 1:
        raise ParseError(
            'Unable to detect line endings. Please try the non-streaming CSV parser instead.'
        )

    sample = '\n'.join(head_lines)[:1024]

    # Fall back to the csv module's default dialect when sniffing fails.
    chosen_dialect = 'excel'
    try:
        chosen_dialect = csv.Sniffer().sniff(sample)
        if '""' in sample:
            # doubled quote characters in the sample: force doublequote on
            chosen_dialect.doublequote = True
    except Exception:
        # best effort only - keep whatever dialect is currently selected
        pass

    # Replay the buffered lines first, then continue lazily with the rest.
    reader = csv.DictReader(
        itertools.chain(head_lines, line_iter),
        dialect=chosen_dialect
    )

    for record in reader:
        try:
            if raw:
                yield record
            else:
                yield _process(record)
        except Exception as err:
            if ignore_exceptions:
                yield err, str(record)
            else:
                # append the hint about ignoring exceptions, then re-raise
                err.args = (str(err) + ignore_exceptions_msg,)
                raise err