mirror of https://github.com/kellyjonbrazil/jc.git

add csv streaming parser
@@ -1,10 +1,11 @@
 jc changelog
 
-20211019 v1.17.1 *** in progress ***
+20211024 v1.17.1 *** in progress ***
 - Fix file parser for gzip files
 - Fix uname parser for cases where the 'processor' and/or 'hardware_platform' fields are missing on linux
 - Fix uname parser on FreeBSD
 - Add lsusb parser tested on linux
+- Add CSV file streaming parser
 
 20210923 v1.17.0
 - Note to Package Maintainers: please see note at 20210720 v1.16.0
@@ -53,6 +53,7 @@ parsers = [
     'crontab',
     'crontab-u',
     'csv',
+    'csv-s',
     'date',
     'df',
     'dig',
168 jc/parsers/csv_s.py Normal file
@@ -0,0 +1,168 @@
"""jc - JSON CLI output utility `csv` file streaming parser

The `csv` parser will attempt to automatically detect the delimiter
character. If the delimiter cannot be detected, it will default to comma.
The first row of the file must be a header row.

Note: The first 100 rows are read into memory to enable delimiter
detection. The rest of the rows are then loaded lazily.

Usage (cli):

    $ cat file.csv | jc --csv-s

Usage (module):

    import jc.parsers.csv_s
    result = jc.parsers.csv_s.parse(csv_output.splitlines())    # result is an iterator object

Schema:

    csv file converted to a Dictionary: https://docs.python.org/3/library/csv.html

    {
      "column_name1":    string,
      "column_name2":    string
    }

Examples:

    $ cat homes.csv
    "Sell", "List", "Living", "Rooms", "Beds", "Baths", "Age", "Acres", "Taxes"
    142, 160, 28, 10, 5, 3, 60, 0.28, 3167
    175, 180, 18, 8, 4, 1, 12, 0.43, 4033
    129, 132, 13, 6, 3, 1, 41, 0.33, 1471
    ...

    $ cat homes.csv | jc --csv-s
    {"Sell":"142","List":"160","Living":"28","Rooms":"10","Beds":"5","Baths":"3","Age":"60","Acres":"0.28","Taxes":"3167"}
    {"Sell":"175","List":"180","Living":"18","Rooms":"8","Beds":"4","Baths":"1","Age":"12","Acres":"0.43","Taxes":"4033"}
    {"Sell":"129","List":"132","Living":"13","Rooms":"6","Beds":"3","Baths":"1","Age":"41","Acres":"0.33","Taxes":"1471"}
    ...
"""
import itertools
import csv
import jc.utils
from jc.utils import stream_success, stream_error
from jc.exceptions import ParseError


class info():
    """Provides parser metadata (version, author, etc.)"""
    version = '1.0'
    description = 'CSV file streaming parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    details = 'Using the python standard csv library'
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
    streaming = True


__version__ = info.version


def _process(proc_data):
    """
    Final processing to conform to the schema.

    Parameters:

        proc_data:   (List of Dictionaries) raw structured data to process

    Returns:

        List of Dictionaries. Each Dictionary represents a row in the csv file.
    """
    # No further processing
    return proc_data
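

# Note: old_parse() below is the original non-streaming implementation and is
# not called by the streaming parse() generator that follows it.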
def old_parse(data, raw=False, quiet=False):
    """
    Main text parsing function

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) output preprocessed JSON if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.
    """
    if not quiet:
        jc.utils.compatibility(__name__, info.compatible)

    raw_output = []
    cleandata = data.splitlines()

    # Clear any blank lines
    cleandata = list(filter(None, cleandata))

    if jc.utils.has_data(data):

        dialect = None
        try:
            dialect = csv.Sniffer().sniff(data[:1024])
        except Exception:
            pass

        reader = csv.DictReader(cleandata, dialect=dialect)

        for row in reader:
            raw_output.append(row)

    if raw:
        return raw_output
    else:
        return _process(raw_output)


def parse(data, raw=False, quiet=False, ignore_exceptions=False):
    """
    Main text parsing generator function. Returns an iterator object.

    Parameters:

        data:              (iterable)  line-based text data to parse (e.g. sys.stdin or str.splitlines())
        raw:               (boolean)   output preprocessed JSON if True
        quiet:             (boolean)   suppress warning messages if True
        ignore_exceptions: (boolean)   ignore parsing exceptions if True

    Yields:

        Dictionary. Raw or processed structured data.

    Returns:

        Iterator object
    """
    if not quiet:
        jc.utils.compatibility(__name__, info.compatible)

    temp_list = []

    # first, load the first 100 lines into a list to detect the CSV dialect
    for line in itertools.islice(data, 100):
        temp_list.append(line)
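
    # `data` may be a one-shot iterator (e.g. sys.stdin), so the buffered lines
    # above cannot be re-read from it; they are replayed below via
    # itertools.chain() so no rows are lost.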

    sniffdata = '\n'.join(temp_list)

    dialect = None
    try:
        dialect = csv.Sniffer().sniff(sniffdata[:1024])
    except Exception:
        pass
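
    # if the sniff fails, `dialect` stays None and csv.DictReader falls back to
    # its default comma-delimited behavior (the default documented in the
    # module docstring above)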

    # chain `temp_list` and `data` together to lazy load all of the CSV data
    new_data = itertools.chain(temp_list, data)
    reader = csv.DictReader(new_data, dialect=dialect)

    for row in reader:
        try:
            if row:
                yield stream_success(row, ignore_exceptions) if raw else stream_success(_process(row), ignore_exceptions)
            else:
                raise ParseError('Not CSV data')

        except Exception as e:
            yield stream_error(e, ignore_exceptions, row)
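
For a quick sanity check of the streaming behavior, here is a minimal usage sketch (not part of the commit; the inline `csv_text` sample and the `io.StringIO` wrapper are illustrative assumptions — any line-based iterable such as `sys.stdin` works the same way):

    import io
    import jc.parsers.csv_s

    # illustrative sample data; note the first row must be a header row
    csv_text = 'Sell,List,Living\n142,160,28\n175,180,18\n'

    # parse() is a generator: each iteration yields one row as a Dictionary
    for row in jc.parsers.csv_s.parse(io.StringIO(csv_text)):
        print(row)

    # should print something like:
    # {'Sell': '142', 'List': '160', 'Living': '28'}
    # {'Sell': '175', 'List': '180', 'Living': '18'}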
10001 tests/fixtures/generic/csv-10k-sales-records.csv (vendored, executable file)
File diff suppressed because it is too large