From 2a40f842743a8745d651b4b6cae645045e6c6aab Mon Sep 17 00:00:00 2001 From: shaik Date: Sun, 2 Jan 2022 17:11:20 +0200 Subject: [PATCH 1/5] fix doubleqoute in csv --- jc/parsers/csv.py | 4 +++- tests/fixtures/generic/csv-doubleqouted.csv | 3 +++ tests/fixtures/generic/csv-doubleqouted.json | 4 ++++ tests/test_csv.py | 12 ++++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 tests/fixtures/generic/csv-doubleqouted.csv create mode 100644 tests/fixtures/generic/csv-doubleqouted.json diff --git a/jc/parsers/csv.py b/jc/parsers/csv.py index 924d731c..731e0ddb 100644 --- a/jc/parsers/csv.py +++ b/jc/parsers/csv.py @@ -130,9 +130,11 @@ def parse(data, raw=False, quiet=False): if jc.utils.has_data(data): - dialect = None + dialect = "excel" # default in csv module try: dialect = csv.Sniffer().sniff(data[:1024]) + if '""' in data: + dialect.doublequote = True except Exception: pass diff --git a/tests/fixtures/generic/csv-doubleqouted.csv b/tests/fixtures/generic/csv-doubleqouted.csv new file mode 100644 index 00000000..9b2ef10f --- /dev/null +++ b/tests/fixtures/generic/csv-doubleqouted.csv @@ -0,0 +1,3 @@ +A,B +1,"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this" +2,"this is a field with "" in it" diff --git a/tests/fixtures/generic/csv-doubleqouted.json b/tests/fixtures/generic/csv-doubleqouted.json new file mode 100644 index 00000000..a1dab8e1 --- /dev/null +++ b/tests/fixtures/generic/csv-doubleqouted.json @@ -0,0 +1,4 @@ +[ + {"A": "1", "B": "this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"}, + {"A": "2", "B": "this is a field with \" in it"} +] diff --git a/tests/test_csv.py b/tests/test_csv.py index 6d3b2d91..a62cf0f2 100644 --- a/tests/test_csv.py +++ b/tests/test_csv.py @@ -37,6 +37,9 @@ class MyTests(unittest.TestCase): with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.csv'), 'r', encoding='utf-8') as f: self.generic_csv_insurance = f.read() + with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f: + self.generic_csv_doubleqouted = f.read() + # output with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats.json'), 'r', encoding='utf-8') as f: self.generic_csv_biostats_json = json.loads(f.read()) @@ -65,6 +68,9 @@ class MyTests(unittest.TestCase): with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.json'), 'r', encoding='utf-8') as f: self.generic_csv_insurance_json = json.loads(f.read()) + with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.json'), 'r', encoding='utf-8') as f: + self.generic_csv_doubleqouted_json = json.loads(f.read()) + def test_csv_nodata(self): """ Test with no data @@ -125,6 +131,12 @@ class MyTests(unittest.TestCase): """ self.assertEqual(jc.parsers.csv.parse(self.generic_csv_insurance, quiet=True), self.generic_csv_insurance_json) + def test_doubleqouted(self): + """ + Test 'csv-doubleqouted.csv' file + """ + self.assertEqual(jc.parsers.csv.parse(self.generic_csv_doubleqouted, quiet=True), self.generic_csv_doubleqouted_json) + if __name__ == '__main__': unittest.main() From 9c887a36a804d817c3f669a55b9b1566d09d645d Mon Sep 17 00:00:00 2001 From: Kelly Brazil Date: Sun, 2 Jan 2022 11:44:18 -0800 Subject: [PATCH 2/5] update csv_s parser with csv changes --- jc/parsers/csv_s.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/jc/parsers/csv_s.py b/jc/parsers/csv_s.py index e237b28a..7b894f80 100644 --- a/jc/parsers/csv_s.py +++ b/jc/parsers/csv_s.py @@ -56,7 +56,7 @@ from jc.exceptions import ParseError class info(): """Provides parser metadata (version, author, etc.)""" - version = '1.1' + version = '1.2' description = 'CSV file streaming parser' author = 'Kelly Brazil' author_email = 'kellyjonbrazil@gmail.com' @@ -122,9 +122,11 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False): sniffdata = '\n'.join(temp_list) - dialect = None + dialect = 'excel' # default in csv module try: dialect = csv.Sniffer().sniff(sniffdata) + if '""' in sniffdata: + dialect.doublequote = True except Exception: pass From 3a4a27e1f94ee07352c7616c57ec655c1aea04f6 Mon Sep 17 00:00:00 2001 From: Kelly Brazil Date: Sun, 2 Jan 2022 11:44:25 -0800 Subject: [PATCH 3/5] version bump --- jc/parsers/csv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jc/parsers/csv.py b/jc/parsers/csv.py index 731e0ddb..539d961b 100644 --- a/jc/parsers/csv.py +++ b/jc/parsers/csv.py @@ -75,7 +75,7 @@ import csv class info(): """Provides parser metadata (version, author, etc.)""" - version = '1.3' + version = '1.4' description = 'CSV file parser' author = 'Kelly Brazil' author_email = 'kellyjonbrazil@gmail.com' @@ -130,7 +130,7 @@ def parse(data, raw=False, quiet=False): if jc.utils.has_data(data): - dialect = "excel" # default in csv module + dialect = 'excel' # default in csv module try: dialect = csv.Sniffer().sniff(data[:1024]) if '""' in data: From 5563829df2849a899df2e9211d6c92bddc695f9b Mon Sep 17 00:00:00 2001 From: Kelly Brazil Date: Mon, 3 Jan 2022 08:48:23 -0800 Subject: [PATCH 4/5] make dialect sniff behavior match non-streaming parser --- jc/parsers/csv_s.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jc/parsers/csv_s.py b/jc/parsers/csv_s.py index 7b894f80..f152bdc7 100644 --- a/jc/parsers/csv_s.py +++ b/jc/parsers/csv_s.py @@ -113,20 +113,20 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False): # first, load the first 100 lines into a list to detect the CSV dialect for line in itertools.islice(data, 100): - temp_list.append(line) + temp_list.append(line.rstrip()) # check for Python bug that does not split on `\r` newlines from sys.stdin correctly # https://bugs.python.org/issue45617 if len(temp_list) == 1: raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.') - sniffdata = '\n'.join(temp_list) - + sniffdata = '\n'.join(temp_list)[:1024] dialect = 'excel' # default in csv module + try: dialect = csv.Sniffer().sniff(sniffdata) if '""' in sniffdata: - dialect.doublequote = True + dialect.doublequote = True except Exception: pass From 65d96e26b59e231c77c1dbba1dc91708c33de30c Mon Sep 17 00:00:00 2001 From: Kelly Brazil Date: Mon, 3 Jan 2022 09:06:00 -0800 Subject: [PATCH 5/5] add streaming tests --- .../fixtures/generic/csv-doubleqouted-streaming.json | 1 + tests/test_csv_s.py | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 tests/fixtures/generic/csv-doubleqouted-streaming.json diff --git a/tests/fixtures/generic/csv-doubleqouted-streaming.json b/tests/fixtures/generic/csv-doubleqouted-streaming.json new file mode 100644 index 00000000..9f0e1ded --- /dev/null +++ b/tests/fixtures/generic/csv-doubleqouted-streaming.json @@ -0,0 +1 @@ +[{"A":"1","B":"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},{"A":"2","B":"this is a field with \" in it\""}] diff --git a/tests/test_csv_s.py b/tests/test_csv_s.py index 5a9707ce..757d7cfc 100644 --- a/tests/test_csv_s.py +++ b/tests/test_csv_s.py @@ -42,6 +42,9 @@ class MyTests(unittest.TestCase): with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records.csv'), 'r', encoding='utf-8') as f: self.generic_csv_10k_sales_records = f.read() + with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f: + self.generic_csv_doubleqouted = f.read() + # output with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats-streaming.json'), 'r', encoding='utf-8') as f: self.generic_csv_biostats_streaming_json = json.loads(f.read()) @@ -70,6 +73,9 @@ class MyTests(unittest.TestCase): with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records-streaming.json'), 'r', encoding='utf-8') as f: self.generic_csv_10k_sales_records_streaming_json = json.loads(f.read()) + with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted-streaming.json'), 'r', encoding='utf-8') as f: + self.generic_csv_doublequoted_streaming_json = json.loads(f.read()) + def test_csv_s_nodata(self): """ Test CSV parser with no data @@ -141,6 +147,12 @@ class MyTests(unittest.TestCase): """ self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_10k_sales_records.splitlines(), quiet=True)), self.generic_csv_10k_sales_records_streaming_json) + def test_csv_s_doublequoted(self): + """ + Test 'doublequoted.csv' file + """ + self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_doubleqouted.splitlines(), quiet=True)), self.generic_csv_doublequoted_streaming_json) + if __name__ == '__main__': unittest.main()