1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-21 00:19:42 +02:00

add csv streaming parser

This commit is contained in:
Kelly Brazil
2021-10-24 12:24:44 -07:00
parent 0313e3f8ca
commit 2b887debc6
4 changed files with 10172 additions and 1 deletions

View File

@ -1,10 +1,11 @@
jc changelog
20211019 v1.17.1 *** in progress ***
20211024 v1.17.1 *** in progress ***
- Fix file parser for gzip files
- Fix uname parser for cases where the 'processor' and/or 'hardware_platform' fields are missing on linux
- Fix uname parser on FreeBSD
- Add lsusb parser tested on linux
- Add CSV file streaming parser
20210923 v1.17.0
- Note to Package Maintainers: please see note at 20210720 v1.16.0

View File

@ -53,6 +53,7 @@ parsers = [
'crontab',
'crontab-u',
'csv',
'csv-s',
'date',
'df',
'dig',

168
jc/parsers/csv_s.py Normal file
View File

@ -0,0 +1,168 @@
"""jc - JSON CLI output utility `csv` file streaming parser
The `csv` parser will attempt to automatically detect the delimiter character. If the delimiter cannot be detected it will default to comma. The first row of the file must be a header row.
Note: The first 100 rows are read into memory to enable delimiter detection. Then the rest of the rows are loaded lazily.
Usage (cli):
$ cat file.csv | jc --csv-s
Usage (module):
import jc.parsers.csv_s
result = jc.parsers.csv_s.parse(csv_output)
Schema:
csv file converted to a Dictionary: https://docs.python.org/3/library/csv.html
{
"column_name1": string,
"column_name2": string
}
Examples:
$ cat homes.csv
"Sell", "List", "Living", "Rooms", "Beds", "Baths", "Age", "Acres", "Taxes"
142, 160, 28, 10, 5, 3, 60, 0.28, 3167
175, 180, 18, 8, 4, 1, 12, 0.43, 4033
129, 132, 13, 6, 3, 1, 41, 0.33, 1471
...
$ cat homes.csv | jc --csv-s
{"Sell":"142","List":"160","Living":"28","Rooms":"10","Beds":"5","Baths":"3","Age":"60","Acres":"0.28","Taxes":"3167"}
{"Sell":"175","List":"180","Living":"18","Rooms":"8","Beds":"4","Baths":"1","Age":"12","Acres":"0.43","Taxes":"4033"}
{"Sell":"129","List":"132","Living":"13","Rooms":"6","Beds":"3","Baths":"1","Age":"41","Acres":"0.33","Taxes":"1471"}
...
"""
import itertools
import csv
import jc.utils
from jc.utils import stream_success, stream_error
from jc.exceptions import ParseError
class info:
    """Static metadata describing this parser (version, author, platforms)."""
    version = '1.0'
    description = 'CSV file streaming parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    details = 'Using the python standard csv library'

    # platform names handed to jc.utils.compatibility() by the parse functions
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']

    # marks this module as a streaming parser
    streaming = True


# expose the parser version at module level
__version__ = info.version
def _process(proc_data):
    """
    Final processing to conform to the schema.

    The schema for this parser is all strings, so no conversion is performed
    and the input object is handed back unchanged.

    Parameters:

        proc_data:   raw structured data to process. `parse()` passes a
                     single row Dictionary at a time, while `old_parse()`
                     passes a List of Dictionaries.

    Returns:

        The same object that was passed in.
    """
    # Nothing to convert: pass the data straight through
    processed = proc_data
    return processed
def old_parse(data, raw=False, quiet=False):
    """
    Non-streaming text parsing function: reads the whole CSV document at once.

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) output preprocessed JSON if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.
    """
    if not quiet:
        jc.utils.compatibility(__name__, info.compatible)

    # Split into lines and drop the empty ones before they reach the reader
    lines = [line for line in data.splitlines() if line]
    rows = []

    if jc.utils.has_data(data):
        # Best-effort dialect detection on the first 1024 characters; on any
        # failure the reader is created with dialect=None.
        try:
            detected = csv.Sniffer().sniff(data[:1024])
        except Exception:
            detected = None

        # The first line is consumed by DictReader as the header row
        rows.extend(csv.DictReader(lines, dialect=detected))

    return rows if raw else _process(rows)
def parse(data, raw=False, quiet=False, ignore_exceptions=False):
    """
    Main text parsing generator function. Returns an iterator object.

    The first 100 lines are buffered in memory so the CSV dialect can be
    sniffed; the buffered lines and the remainder of `data` are then fed
    lazily to `csv.DictReader`. The first row is used as the header row.

    Parameters:

        data:              (iterable)  line-based text data to parse
                                       (e.g. sys.stdin or str.splitlines())
        raw:               (boolean)   output preprocessed JSON if True
        quiet:             (boolean)   suppress warning messages if True
        ignore_exceptions: (boolean)   ignore parsing exceptions if True

    Yields:

        Dictionary. Raw or processed structured data.

    Returns:

        Iterator object
    """
    if not quiet:
        jc.utils.compatibility(__name__, info.compatible)

    # Normalize the input to a one-shot iterator. A re-iterable input such as
    # a list is NOT consumed by islice() below, so chaining it after
    # `temp_list` would emit the first 100 lines twice (the header row would
    # come back as a data row). iter() on an iterator returns it unchanged.
    data = iter(data)

    # first, load the first 100 lines into a list to detect the CSV dialect
    temp_list = list(itertools.islice(data, 100))

    # Only the first 1024 characters of the buffered lines are sniffed.
    # Detection is deliberately best-effort: on any failure the reader is
    # created with dialect=None.
    sniffdata = '\n'.join(temp_list)
    try:
        dialect = csv.Sniffer().sniff(sniffdata[:1024])
    except Exception:
        dialect = None

    # chain `temp_list` and `data` together to lazy load all of the CSV data
    new_data = itertools.chain(temp_list, data)
    reader = csv.DictReader(new_data, dialect=dialect)

    for row in reader:
        try:
            if not row:
                raise ParseError('Not CSV data')

            output = row if raw else _process(row)
            yield stream_success(output, ignore_exceptions)

        except Exception as e:
            yield stream_error(e, ignore_exceptions, row)

10001
tests/fixtures/generic/csv-10k-sales-records.csv vendored Executable file

File diff suppressed because it is too large Load Diff