1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-19 00:17:51 +02:00
Files
jc/jc/parsers/csv_s.py
2022-02-04 12:14:16 -08:00

166 lines
4.7 KiB
Python

"""jc - JSON CLI output utility `csv` file streaming parser
> This streaming parser outputs JSON Lines
The `csv` streaming parser will attempt to automatically detect the
delimiter character. If the delimiter cannot be detected it will default
to comma. The first row of the file must be a header row.
Note: The first 100 rows are read into memory to enable delimiter detection,
then the rest of the rows are loaded lazily.
Usage (cli):
$ cat file.csv | jc --csv-s
Usage (module):
import jc
# result is an iterable object (generator)
result = jc.parse('csv_s', csv_output.splitlines())
for item in result:
# do something
or
import jc.parsers.csv_s
# result is an iterable object (generator)
result = jc.parsers.csv_s.parse(csv_output.splitlines())
for item in result:
# do something
Schema:
csv file converted to a Dictionary:
https://docs.python.org/3/library/csv.html
{
"column_name1": string,
"column_name2": string,
# below object only exists if using -qq or ignore_exceptions=True
"_jc_meta":
{
"success": boolean, # false if error parsing
"error": string, # exists if "success" is false
"line": string # exists if "success" is false
}
}
Examples:
$ cat homes.csv
"Sell", "List", "Living", "Rooms", "Beds", "Baths", "Age", "Acres"...
142, 160, 28, 10, 5, 3, 60, 0.28, 3167
175, 180, 18, 8, 4, 1, 12, 0.43, 4033
129, 132, 13, 6, 3, 1, 41, 0.33, 1471
...
$ cat homes.csv | jc --csv-s
{"Sell":"142","List":"160","Living":"28","Rooms":"10","Beds":"5"...}
{"Sell":"175","List":"180","Living":"18","Rooms":"8","Beds":"4"...}
{"Sell":"129","List":"132","Living":"13","Rooms":"6","Beds":"3"...}
...
"""
import itertools
import csv
import jc.utils
from jc.utils import ignore_exceptions_msg, add_jc_meta
from jc.exceptions import ParseError
class info():
"""Provides parser metadata (version, author, etc.)"""
version = '1.3'
description = 'CSV file streaming parser'
author = 'Kelly Brazil'
author_email = 'kellyjonbrazil@gmail.com'
details = 'Using the python standard csv library'
compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
streaming = True
__version__ = info.version
def _process(proc_data):
"""
Final processing to conform to the schema.
Parameters:
proc_data: (List of Dictionaries) raw structured data to process
Returns:
List of Dictionaries. Each Dictionary represents a row in the csv
file.
"""
# No further processing
return proc_data
@add_jc_meta
def parse(data, raw=False, quiet=False, ignore_exceptions=False):
"""
Main text parsing generator function. Returns an iterator object.
Parameters:
data: (iterable) line-based text data to parse
(e.g. sys.stdin or str.splitlines())
raw: (boolean) unprocessed output if True
quiet: (boolean) suppress warning messages if True
ignore_exceptions: (boolean) ignore parsing exceptions if True
Yields:
Dictionary. Raw or processed structured data.
Returns:
Iterator object
"""
jc.utils.compatibility(__name__, info.compatible, quiet)
jc.utils.streaming_input_type_check(data)
# convert data to an iterable in case a sequence like a list is used as input.
# this allows the exhaustion of the input so we don't double-process later.
data = iter(data)
temp_list = []
# first, load the first 100 lines into a list to detect the CSV dialect
for line in itertools.islice(data, 100):
temp_list.append(line.rstrip())
# check for Python bug that does not split on `\r` newlines from sys.stdin correctly
# https://bugs.python.org/issue45617
if len(temp_list) == 1:
raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.')
sniffdata = '\n'.join(temp_list)[:1024]
dialect = 'excel' # default in csv module
try:
dialect = csv.Sniffer().sniff(sniffdata)
if '""' in sniffdata:
dialect.doublequote = True
except Exception:
pass
# chain `temp_list` and `data` together to lazy load the rest of the CSV data
new_data = itertools.chain(temp_list, data)
reader = csv.DictReader(new_data, dialect=dialect)
for row in reader:
try:
yield row if raw else _process(row)
except Exception as e:
if not ignore_exceptions:
e.args = (str(e) + ignore_exceptions_msg,)
raise e
yield e, str(row)