diff --git a/jc/parsers/csv.py b/jc/parsers/csv.py index 924d731c..539d961b 100644 --- a/jc/parsers/csv.py +++ b/jc/parsers/csv.py @@ -75,7 +75,7 @@ import csv class info(): """Provides parser metadata (version, author, etc.)""" - version = '1.3' + version = '1.4' description = 'CSV file parser' author = 'Kelly Brazil' author_email = 'kellyjonbrazil@gmail.com' @@ -130,9 +130,11 @@ def parse(data, raw=False, quiet=False): if jc.utils.has_data(data): - dialect = None + dialect = 'excel' # default in csv module try: dialect = csv.Sniffer().sniff(data[:1024]) + if '""' in data: + dialect.doublequote = True except Exception: pass diff --git a/jc/parsers/csv_s.py b/jc/parsers/csv_s.py index e237b28a..f152bdc7 100644 --- a/jc/parsers/csv_s.py +++ b/jc/parsers/csv_s.py @@ -56,7 +56,7 @@ from jc.exceptions import ParseError class info(): """Provides parser metadata (version, author, etc.)""" - version = '1.1' + version = '1.2' description = 'CSV file streaming parser' author = 'Kelly Brazil' author_email = 'kellyjonbrazil@gmail.com' @@ -113,18 +113,20 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False): # first, load the first 100 lines into a list to detect the CSV dialect for line in itertools.islice(data, 100): - temp_list.append(line) + temp_list.append(line.rstrip()) # check for Python bug that does not split on `\r` newlines from sys.stdin correctly # https://bugs.python.org/issue45617 if len(temp_list) == 1: raise ParseError('Unable to detect line endings. 
Please try the non-streaming CSV parser instead.') - sniffdata = '\n'.join(temp_list) + sniffdata = '\n'.join(temp_list)[:1024] + dialect = 'excel' # default in csv module - dialect = None try: dialect = csv.Sniffer().sniff(sniffdata) + if '""' in sniffdata: + dialect.doublequote = True except Exception: pass diff --git a/tests/fixtures/generic/csv-doubleqouted-streaming.json b/tests/fixtures/generic/csv-doubleqouted-streaming.json new file mode 100644 index 00000000..9f0e1ded --- /dev/null +++ b/tests/fixtures/generic/csv-doubleqouted-streaming.json @@ -0,0 +1 @@ +[{"A":"1","B":"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},{"A":"2","B":"this is a field with \" in it\""}] diff --git a/tests/fixtures/generic/csv-doubleqouted.csv b/tests/fixtures/generic/csv-doubleqouted.csv new file mode 100644 index 00000000..9b2ef10f --- /dev/null +++ b/tests/fixtures/generic/csv-doubleqouted.csv @@ -0,0 +1,3 @@ +A,B +1,"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 
1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this" +2,"this is a field with "" in it" diff --git a/tests/fixtures/generic/csv-doubleqouted.json b/tests/fixtures/generic/csv-doubleqouted.json new file mode 100644 index 00000000..a1dab8e1 --- /dev/null +++ b/tests/fixtures/generic/csv-doubleqouted.json @@ -0,0 +1,4 @@ +[ + {"A": "1", "B": "this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field 
this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"}, + {"A": "2", "B": "this is a field with \" in it"} +] diff --git a/tests/test_csv.py b/tests/test_csv.py index 6d3b2d91..a62cf0f2 100644 --- a/tests/test_csv.py +++ b/tests/test_csv.py @@ -37,6 +37,9 @@ class MyTests(unittest.TestCase): with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.csv'), 'r', encoding='utf-8') as f: self.generic_csv_insurance = f.read() + with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f: + self.generic_csv_doubleqouted = f.read() + # output with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats.json'), 'r', encoding='utf-8') as f: self.generic_csv_biostats_json = json.loads(f.read()) @@ -65,6 +68,9 @@ class MyTests(unittest.TestCase): with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.json'), 'r', encoding='utf-8') as f: self.generic_csv_insurance_json = json.loads(f.read()) + with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.json'), 'r', encoding='utf-8') as f: + self.generic_csv_doubleqouted_json = json.loads(f.read()) + def test_csv_nodata(self): """ Test with no data @@ -125,6 +131,12 @@ class MyTests(unittest.TestCase): """ self.assertEqual(jc.parsers.csv.parse(self.generic_csv_insurance, quiet=True), self.generic_csv_insurance_json) + def test_doubleqouted(self): + """ + Test 'csv-doubleqouted.csv' file + """ + self.assertEqual(jc.parsers.csv.parse(self.generic_csv_doubleqouted, quiet=True), self.generic_csv_doubleqouted_json) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_csv_s.py b/tests/test_csv_s.py index 5a9707ce..757d7cfc 100644 --- a/tests/test_csv_s.py +++ 
b/tests/test_csv_s.py @@ -42,6 +42,9 @@ class MyTests(unittest.TestCase): with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records.csv'), 'r', encoding='utf-8') as f: self.generic_csv_10k_sales_records = f.read() + with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f: + self.generic_csv_doubleqouted = f.read() + # output with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats-streaming.json'), 'r', encoding='utf-8') as f: self.generic_csv_biostats_streaming_json = json.loads(f.read()) @@ -70,6 +73,9 @@ class MyTests(unittest.TestCase): with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records-streaming.json'), 'r', encoding='utf-8') as f: self.generic_csv_10k_sales_records_streaming_json = json.loads(f.read()) + with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted-streaming.json'), 'r', encoding='utf-8') as f: + self.generic_csv_doublequoted_streaming_json = json.loads(f.read()) + def test_csv_s_nodata(self): """ Test CSV parser with no data @@ -141,6 +147,12 @@ class MyTests(unittest.TestCase): """ self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_10k_sales_records.splitlines(), quiet=True)), self.generic_csv_10k_sales_records_streaming_json) + def test_csv_s_doublequoted(self): + """ + Test 'csv-doubleqouted.csv' file + """ + self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_doubleqouted.splitlines(), quiet=True)), self.generic_csv_doublequoted_streaming_json) + if __name__ == '__main__': unittest.main()