1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-23 00:29:59 +02:00

Merge pull request #193 from kellyjonbrazil/dev

Dev add csv doublequote fix
This commit is contained in:
Kelly Brazil
2022-01-03 09:12:50 -08:00
committed by GitHub
7 changed files with 42 additions and 6 deletions

View File

@ -75,7 +75,7 @@ import csv
class info():
"""Provides parser metadata (version, author, etc.)"""
version = '1.3'
version = '1.4'
description = 'CSV file parser'
author = 'Kelly Brazil'
author_email = 'kellyjonbrazil@gmail.com'
@ -130,9 +130,11 @@ def parse(data, raw=False, quiet=False):
if jc.utils.has_data(data):
dialect = None
dialect = 'excel' # default in csv module
try:
dialect = csv.Sniffer().sniff(data[:1024])
if '""' in data:
dialect.doublequote = True
except Exception:
pass

View File

@ -56,7 +56,7 @@ from jc.exceptions import ParseError
class info():
"""Provides parser metadata (version, author, etc.)"""
version = '1.1'
version = '1.2'
description = 'CSV file streaming parser'
author = 'Kelly Brazil'
author_email = 'kellyjonbrazil@gmail.com'
@ -113,18 +113,20 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False):
# first, load the first 100 lines into a list to detect the CSV dialect
for line in itertools.islice(data, 100):
temp_list.append(line)
temp_list.append(line.rstrip())
# check for Python bug that does not split on `\r` newlines from sys.stdin correctly
# https://bugs.python.org/issue45617
if len(temp_list) == 1:
raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.')
sniffdata = '\n'.join(temp_list)
sniffdata = '\n'.join(temp_list)[:1024]
dialect = 'excel' # default in csv module
dialect = None
try:
dialect = csv.Sniffer().sniff(sniffdata)
if '""' in sniffdata:
dialect.doublequote = True
except Exception:
pass

View File

@ -0,0 +1 @@
[{"A":"1","B":"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},{"A":"2","B":"this is a field with \" in it\""}]

View File

@ -0,0 +1,3 @@
A,B
1,"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"
2,"this is a field with "" in it"
1 A B
2 1 this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this
3 2 this is a field with " in it

View File

@ -0,0 +1,4 @@
[
{"A": "1", "B": "this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},
{"A": "2", "B": "this is a field with \" in it"}
]

View File

@ -37,6 +37,9 @@ class MyTests(unittest.TestCase):
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.csv'), 'r', encoding='utf-8') as f:
self.generic_csv_insurance = f.read()
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f:
self.generic_csv_doubleqouted = f.read()
# output
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats.json'), 'r', encoding='utf-8') as f:
self.generic_csv_biostats_json = json.loads(f.read())
@ -65,6 +68,9 @@ class MyTests(unittest.TestCase):
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.json'), 'r', encoding='utf-8') as f:
self.generic_csv_insurance_json = json.loads(f.read())
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.json'), 'r', encoding='utf-8') as f:
self.generic_csv_doubleqouted_json = json.loads(f.read())
def test_csv_nodata(self):
"""
Test with no data
@ -125,6 +131,12 @@ class MyTests(unittest.TestCase):
"""
self.assertEqual(jc.parsers.csv.parse(self.generic_csv_insurance, quiet=True), self.generic_csv_insurance_json)
def test_doubleqouted(self):
"""
Test 'csv-doubleqouted.csv' file
"""
self.assertEqual(jc.parsers.csv.parse(self.generic_csv_doubleqouted, quiet=True), self.generic_csv_doubleqouted_json)
if __name__ == '__main__':
unittest.main()

View File

@ -42,6 +42,9 @@ class MyTests(unittest.TestCase):
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records.csv'), 'r', encoding='utf-8') as f:
self.generic_csv_10k_sales_records = f.read()
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f:
self.generic_csv_doubleqouted = f.read()
# output
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats-streaming.json'), 'r', encoding='utf-8') as f:
self.generic_csv_biostats_streaming_json = json.loads(f.read())
@ -70,6 +73,9 @@ class MyTests(unittest.TestCase):
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records-streaming.json'), 'r', encoding='utf-8') as f:
self.generic_csv_10k_sales_records_streaming_json = json.loads(f.read())
with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted-streaming.json'), 'r', encoding='utf-8') as f:
self.generic_csv_doublequoted_streaming_json = json.loads(f.read())
def test_csv_s_nodata(self):
"""
Test CSV parser with no data
@ -141,6 +147,12 @@ class MyTests(unittest.TestCase):
"""
self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_10k_sales_records.splitlines(), quiet=True)), self.generic_csv_10k_sales_records_streaming_json)
def test_csv_s_doublequoted(self):
"""
Test 'doublequoted.csv' file
"""
self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_doubleqouted.splitlines(), quiet=True)), self.generic_csv_doublequoted_streaming_json)
if __name__ == '__main__':
unittest.main()