1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-23 00:29:59 +02:00

Merge pull request #193 from kellyjonbrazil/dev

Dev add csv doublequote fix
This commit is contained in:
Kelly Brazil
2022-01-03 09:12:50 -08:00
committed by GitHub
7 changed files with 42 additions and 6 deletions

View File

@ -75,7 +75,7 @@ import csv
class info(): class info():
"""Provides parser metadata (version, author, etc.)""" """Provides parser metadata (version, author, etc.)"""
version = '1.3' version = '1.4'
description = 'CSV file parser' description = 'CSV file parser'
author = 'Kelly Brazil' author = 'Kelly Brazil'
author_email = 'kellyjonbrazil@gmail.com' author_email = 'kellyjonbrazil@gmail.com'
@ -130,9 +130,11 @@ def parse(data, raw=False, quiet=False):
if jc.utils.has_data(data): if jc.utils.has_data(data):
dialect = None dialect = 'excel' # default in csv module
try: try:
dialect = csv.Sniffer().sniff(data[:1024]) dialect = csv.Sniffer().sniff(data[:1024])
if '""' in data:
dialect.doublequote = True
except Exception: except Exception:
pass pass

View File

@ -56,7 +56,7 @@ from jc.exceptions import ParseError
class info(): class info():
"""Provides parser metadata (version, author, etc.)""" """Provides parser metadata (version, author, etc.)"""
version = '1.1' version = '1.2'
description = 'CSV file streaming parser' description = 'CSV file streaming parser'
author = 'Kelly Brazil' author = 'Kelly Brazil'
author_email = 'kellyjonbrazil@gmail.com' author_email = 'kellyjonbrazil@gmail.com'
@ -113,18 +113,20 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False):
# first, load the first 100 lines into a list to detect the CSV dialect # first, load the first 100 lines into a list to detect the CSV dialect
for line in itertools.islice(data, 100): for line in itertools.islice(data, 100):
temp_list.append(line) temp_list.append(line.rstrip())
# check for Python bug that does not split on `\r` newlines from sys.stdin correctly # check for Python bug that does not split on `\r` newlines from sys.stdin correctly
# https://bugs.python.org/issue45617 # https://bugs.python.org/issue45617
if len(temp_list) == 1: if len(temp_list) == 1:
raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.') raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.')
sniffdata = '\n'.join(temp_list) sniffdata = '\n'.join(temp_list)[:1024]
dialect = 'excel' # default in csv module
dialect = None
try: try:
dialect = csv.Sniffer().sniff(sniffdata) dialect = csv.Sniffer().sniff(sniffdata)
if '""' in sniffdata:
dialect.doublequote = True
except Exception: except Exception:
pass pass

View File

@ -0,0 +1 @@
[{"A":"1","B":"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},{"A":"2","B":"this is a field with \" in it\""}]

View File

@ -0,0 +1,3 @@
A,B
1,"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"
2,"this is a field with "" in it"
1 A B
2 1 this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this
3 2 this is a field with " in it

View File

@ -0,0 +1,4 @@
[
{"A": "1", "B": "this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},
{"A": "2", "B": "this is a field with \" in it"}
]

View File

@ -37,6 +37,9 @@ class MyTests(unittest.TestCase):
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.csv'), 'r', encoding='utf-8') as f: with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.csv'), 'r', encoding='utf-8') as f:
self.generic_csv_insurance = f.read() self.generic_csv_insurance = f.read()
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f:
self.generic_csv_doubleqouted = f.read()
# output # output
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats.json'), 'r', encoding='utf-8') as f: with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats.json'), 'r', encoding='utf-8') as f:
self.generic_csv_biostats_json = json.loads(f.read()) self.generic_csv_biostats_json = json.loads(f.read())
@ -65,6 +68,9 @@ class MyTests(unittest.TestCase):
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.json'), 'r', encoding='utf-8') as f: with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.json'), 'r', encoding='utf-8') as f:
self.generic_csv_insurance_json = json.loads(f.read()) self.generic_csv_insurance_json = json.loads(f.read())
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.json'), 'r', encoding='utf-8') as f:
self.generic_csv_doubleqouted_json = json.loads(f.read())
def test_csv_nodata(self): def test_csv_nodata(self):
""" """
Test with no data Test with no data
@ -125,6 +131,12 @@ class MyTests(unittest.TestCase):
""" """
self.assertEqual(jc.parsers.csv.parse(self.generic_csv_insurance, quiet=True), self.generic_csv_insurance_json) self.assertEqual(jc.parsers.csv.parse(self.generic_csv_insurance, quiet=True), self.generic_csv_insurance_json)
def test_doubleqouted(self):
"""
Test 'csv-doubleqouted.csv' file
"""
self.assertEqual(jc.parsers.csv.parse(self.generic_csv_doubleqouted, quiet=True), self.generic_csv_doubleqouted_json)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -42,6 +42,9 @@ class MyTests(unittest.TestCase):
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records.csv'), 'r', encoding='utf-8') as f: with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records.csv'), 'r', encoding='utf-8') as f:
self.generic_csv_10k_sales_records = f.read() self.generic_csv_10k_sales_records = f.read()
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f:
self.generic_csv_doubleqouted = f.read()
# output # output
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats-streaming.json'), 'r', encoding='utf-8') as f: with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats-streaming.json'), 'r', encoding='utf-8') as f:
self.generic_csv_biostats_streaming_json = json.loads(f.read()) self.generic_csv_biostats_streaming_json = json.loads(f.read())
@ -70,6 +73,9 @@ class MyTests(unittest.TestCase):
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records-streaming.json'), 'r', encoding='utf-8') as f: with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records-streaming.json'), 'r', encoding='utf-8') as f:
self.generic_csv_10k_sales_records_streaming_json = json.loads(f.read()) self.generic_csv_10k_sales_records_streaming_json = json.loads(f.read())
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted-streaming.json'), 'r', encoding='utf-8') as f:
self.generic_csv_doublequoted_streaming_json = json.loads(f.read())
def test_csv_s_nodata(self): def test_csv_s_nodata(self):
""" """
Test CSV parser with no data Test CSV parser with no data
@ -141,6 +147,12 @@ class MyTests(unittest.TestCase):
""" """
self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_10k_sales_records.splitlines(), quiet=True)), self.generic_csv_10k_sales_records_streaming_json) self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_10k_sales_records.splitlines(), quiet=True)), self.generic_csv_10k_sales_records_streaming_json)
def test_csv_s_doublequoted(self):
"""
Test 'doublequoted.csv' file
"""
self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_doubleqouted.splitlines(), quiet=True)), self.generic_csv_doublequoted_streaming_json)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()