"""jc - JSON CLI output utility `csv` file streaming parser
The `csv` streaming parser will attempt to automatically detect the delimiter character. If the delimiter cannot be detected it will default to comma. The first row of the file must be a header row.
Note: The first 100 rows are read into memory to enable delimiter detection, then the rest of the rows are loaded lazily.
Usage (cli):
$ cat file.csv | jc --csv-s
Usage (module):
import jc.parsers.csv_s
result = jc.parsers.csv_s.parse(csv_output)
Schema:
csv file converted to a Dictionary: https://docs.python.org/3/library/csv.html
{
"column_name1": string,
"column_name2": string
}
Examples:
$ cat homes.csv
"Sell", "List", "Living", "Rooms", "Beds", "Baths", "Age", "Acres", "Taxes"
142, 160, 28, 10, 5, 3, 60, 0.28, 3167
175, 180, 18, 8, 4, 1, 12, 0.43, 4033
129, 132, 13, 6, 3, 1, 41, 0.33, 1471
...
$ cat homes.csv | jc --csv-s
{"Sell":"142","List":"160","Living":"28","Rooms":"10","Beds":"5","Baths":"3","Age":"60","Acres":"0.28","Taxes":"3167"}
{"Sell":"175","List":"180","Living":"18","Rooms":"8","Beds":"4","Baths":"1","Age":"12","Acres":"0.43","Taxes":"4033"}
{"Sell":"129","List":"132","Living":"13","Rooms":"6","Beds":"3","Baths":"1","Age":"41","Acres":"0.33","Taxes":"1471"}
...
"""
import itertools
import csv
import jc.utils
from jc.utils import stream_success, stream_error


class info():
    """Provides parser metadata (version, author, etc.)"""
    version = '1.0'
    description = 'CSV file streaming parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    details = 'Using the python standard csv library'
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
    streaming = True

__version__ = info.version


def _process(proc_data):
    """
    Final processing to conform to the schema.

    Parameters:

        proc_data:   (List of Dictionaries) raw structured data to process

    Returns:

        List of Dictionaries. Each Dictionary represents a row in the csv file.
    """
    # No further processing
    return proc_data


def parse(data, raw=False, quiet=False, ignore_exceptions=False):
    """
    Main text parsing generator function. Returns an iterator object.

    Parameters:

        data:              (iterable)  line-based text data to parse
                                       (e.g. sys.stdin or str.splitlines())
        raw:               (boolean)   output preprocessed JSON if True
        quiet:             (boolean)   suppress warning messages if True
        ignore_exceptions: (boolean)   ignore parsing exceptions if True

    Yields:

        Dictionary. Raw or processed structured data.

    Returns:

        Iterator object
    """
    if not quiet:
        jc.utils.compatibility(__name__, info.compatible)
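    # note: jc.utils.compatibility() only emits a warning when run on a
    # platform that is not in info.compatible; it does not stop parsing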

    # convert data to an iterable in case a sequence like a list is used as input.
    # this allows the exhaustion of the input so we don't double-process later.
    data = iter(data)
    temp_list = []

    # first, load the first 100 lines into a list to detect the CSV dialect
    for line in itertools.islice(data, 100):
        temp_list.append(line)
    sniffdata = '\n'.join(temp_list)
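    # csv.Sniffer().sniff() expects a single sample string, so the buffered
    # lines are joined back together before dialect detection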

    dialect = None
    try:
        dialect = csv.Sniffer().sniff(sniffdata)
    except Exception:
        pass
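    # if sniffing fails, dialect stays None and DictReader falls back to the
    # default comma delimiter described in the module docstring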

    # chain `temp_list` and `data` together to lazy load the rest of the CSV data
    new_data = itertools.chain(temp_list, data)
    reader = csv.DictReader(new_data, dialect=dialect)
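
    # yield one dictionary per row; stream_success/stream_error wrap the output
    # (or re-raise the exception) based on the ignore_exceptions setting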
    for row in reader:
        try:
            yield stream_success(row, ignore_exceptions) if raw else stream_success(_process(row), ignore_exceptions)
        except Exception as e:
            yield stream_error(e, ignore_exceptions, row)