mirror of https://github.com/kellyjonbrazil/jc.git, synced 2025-06-23 00:29:59 +02:00
Merge pull request #193 from kellyjonbrazil/dev
Dev add csv doublequote fix
@@ -75,7 +75,7 @@ import csv
 
 class info():
     """Provides parser metadata (version, author, etc.)"""
-    version = '1.3'
+    version = '1.4'
     description = 'CSV file parser'
     author = 'Kelly Brazil'
     author_email = 'kellyjonbrazil@gmail.com'
@@ -130,9 +130,11 @@ def parse(data, raw=False, quiet=False):
 
     if jc.utils.has_data(data):
 
-        dialect = None
+        dialect = 'excel'  # default in csv module
         try:
             dialect = csv.Sniffer().sniff(data[:1024])
+            if '""' in data:
+                dialect.doublequote = True
         except Exception:
             pass
 
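
The change above is the heart of the fix: csv.Sniffer() only inspects the first 1,024 characters of the data, so a doubled quote ("") that falls outside that window can leave the sniffed dialect with doublequote set to False, and escaped quotes are then mangled on read. Forcing doublequote to True whenever '""' appears anywhere in the input works around that. A minimal standalone sketch of the technique using only the standard library (toy data, not the jc parser itself):

import csv
import io

# toy input: a doubled quote ("") is the CSV escape for a literal quote character
data = 'A,B\n1,"plain field"\n2,"a field with "" in it"\n'

dialect = 'excel'  # default in the csv module
try:
    dialect = csv.Sniffer().sniff(data[:1024])
    if '""' in data:
        # the Sniffer's sample may have missed the doubled quote; override it
        dialect.doublequote = True
except Exception:
    pass

reader = csv.DictReader(io.StringIO(data), dialect=dialect)
print([dict(row) for row in reader])
# [{'A': '1', 'B': 'plain field'}, {'A': '2', 'B': 'a field with " in it'}]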

@@ -56,7 +56,7 @@ from jc.exceptions import ParseError
 
 class info():
     """Provides parser metadata (version, author, etc.)"""
-    version = '1.1'
+    version = '1.2'
     description = 'CSV file streaming parser'
     author = 'Kelly Brazil'
     author_email = 'kellyjonbrazil@gmail.com'
@@ -113,18 +113,20 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False):
 
     # first, load the first 100 lines into a list to detect the CSV dialect
     for line in itertools.islice(data, 100):
-        temp_list.append(line)
+        temp_list.append(line.rstrip())
 
     # check for Python bug that does not split on `\r` newlines from sys.stdin correctly
     # https://bugs.python.org/issue45617
     if len(temp_list) == 1:
         raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.')
 
-    sniffdata = '\n'.join(temp_list)
+    sniffdata = '\n'.join(temp_list)[:1024]
+    dialect = 'excel'  # default in csv module
 
-    dialect = None
     try:
         dialect = csv.Sniffer().sniff(sniffdata)
+        if '""' in sniffdata:
+            dialect.doublequote = True
     except Exception:
         pass
 
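
The streaming parser cannot slice the input the same way, since data is an iterable of lines (for example sys.stdin), so it buffers up to 100 lines, strips their line endings (the new rstrip() call), joins them, and sniffs only the first 1,024 characters of that sample before applying the same '""' override. A rough self-contained sketch of that detection step; the ParseError class here is a stand-in for jc.exceptions.ParseError:

import csv
import itertools


class ParseError(Exception):
    """Stand-in for jc.exceptions.ParseError used by the real parser."""


def detect_dialect(lines):
    """Buffer the start of a line iterable and sniff the CSV dialect."""
    temp_list = []

    # buffer up to the first 100 lines, dropping trailing '\r' / '\n'
    for line in itertools.islice(lines, 100):
        temp_list.append(line.rstrip())

    # a single buffered "line" usually means the input was never split on line endings
    if len(temp_list) == 1:
        raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.')

    sniffdata = '\n'.join(temp_list)[:1024]

    dialect = 'excel'  # default in csv module
    try:
        dialect = csv.Sniffer().sniff(sniffdata)
        if '""' in sniffdata:
            dialect.doublequote = True
    except Exception:
        pass

    # return the buffered lines too, since a stream can only be read once
    return dialect, temp_list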

tests/fixtures/generic/csv-doubleqouted-streaming.json (new file, 1 line):
[{"A":"1","B":"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},{"A":"2","B":"this is a field with \" in it\""}]

tests/fixtures/generic/csv-doubleqouted.csv (new file, 3 lines):
A,B
1,"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"
2,"this is a field with "" in it"

tests/fixtures/generic/csv-doubleqouted.json (new file, 4 lines):
[
{"A": "1", "B": "this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},
{"A": "2", "B": "this is a field with \" in it"}
]

@@ -37,6 +37,9 @@ class MyTests(unittest.TestCase):
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.csv'), 'r', encoding='utf-8') as f:
             self.generic_csv_insurance = f.read()
 
+        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f:
+            self.generic_csv_doubleqouted = f.read()
+
         # output
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats.json'), 'r', encoding='utf-8') as f:
             self.generic_csv_biostats_json = json.loads(f.read())
@@ -65,6 +68,9 @@ class MyTests(unittest.TestCase):
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.json'), 'r', encoding='utf-8') as f:
             self.generic_csv_insurance_json = json.loads(f.read())
 
+        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.json'), 'r', encoding='utf-8') as f:
+            self.generic_csv_doubleqouted_json = json.loads(f.read())
+
     def test_csv_nodata(self):
         """
         Test with no data
@@ -125,6 +131,12 @@ class MyTests(unittest.TestCase):
         """
         self.assertEqual(jc.parsers.csv.parse(self.generic_csv_insurance, quiet=True), self.generic_csv_insurance_json)
 
+    def test_doubleqouted(self):
+        """
+        Test 'csv-doubleqouted.csv' file
+        """
+        self.assertEqual(jc.parsers.csv.parse(self.generic_csv_doubleqouted, quiet=True), self.generic_csv_doubleqouted_json)
+
 
 if __name__ == '__main__':
     unittest.main()
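
For a quick manual check of what the new test asserts, the non-streaming parser can be called directly on a doublequoted string (assuming jc is installed; the commented output is the expected result for this short input, not the 1,024-byte fixture):

import jc.parsers.csv

data = 'A,B\n1,"first"\n2,"a field with "" in it"\n'
print(jc.parsers.csv.parse(data, quiet=True))
# expected: [{'A': '1', 'B': 'first'}, {'A': '2', 'B': 'a field with " in it'}]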

@@ -42,6 +42,9 @@ class MyTests(unittest.TestCase):
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records.csv'), 'r', encoding='utf-8') as f:
             self.generic_csv_10k_sales_records = f.read()
 
+        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f:
+            self.generic_csv_doubleqouted = f.read()
+
         # output
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats-streaming.json'), 'r', encoding='utf-8') as f:
             self.generic_csv_biostats_streaming_json = json.loads(f.read())
@@ -70,6 +73,9 @@ class MyTests(unittest.TestCase):
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records-streaming.json'), 'r', encoding='utf-8') as f:
             self.generic_csv_10k_sales_records_streaming_json = json.loads(f.read())
 
+        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted-streaming.json'), 'r', encoding='utf-8') as f:
+            self.generic_csv_doublequoted_streaming_json = json.loads(f.read())
+
     def test_csv_s_nodata(self):
         """
         Test CSV parser with no data
@@ -141,6 +147,12 @@ class MyTests(unittest.TestCase):
         """
         self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_10k_sales_records.splitlines(), quiet=True)), self.generic_csv_10k_sales_records_streaming_json)
 
+    def test_csv_s_doublequoted(self):
+        """
+        Test 'doublequoted.csv' file
+        """
+        self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_doubleqouted.splitlines(), quiet=True)), self.generic_csv_doublequoted_streaming_json)
+
 
 if __name__ == '__main__':
     unittest.main()
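
The streaming counterpart takes an iterable of lines and yields one dictionary per row, which is why the test wraps the call in list(). A comparable manual check, again assuming jc is installed:

import jc.parsers.csv_s

data = 'A,B\n1,"first"\n2,"a field with "" in it"\n'
for row in jc.parsers.csv_s.parse(data.splitlines(), quiet=True):
    print(row)
# expected:
# {'A': '1', 'B': 'first'}
# {'A': '2', 'B': 'a field with " in it'}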