Merge pull request #193 from kellyjonbrazil/dev

Dev add csv doublequote fix
2025-08-06 22:32:54 +02:00 · 2022-01-03 09:12:50 -08:00
parent 241d53af9a 78672bd7ad
commit 360154559c
7 changed files with 42 additions and 6 deletions
--- a/jc/parsers/csv.py
+++ b/jc/parsers/csv.py
@ -75,7 +75,7 @@ import csv
 class info():
    """Provides parser metadata (version, author, etc.)"""
-    version = '1.3'
+    version = '1.4'
    description = 'CSV file parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
@ -130,9 +130,11 @@ def parse(data, raw=False, quiet=False):
    if jc.utils.has_data(data):
-        dialect = None
+        dialect = 'excel'  # default in csv module
        try:
            dialect = csv.Sniffer().sniff(data[:1024])
+            if '""' in data:
+                dialect.doublequote = True
        except Exception:
            pass
--- a/jc/parsers/csv_s.py
+++ b/jc/parsers/csv_s.py
@ -56,7 +56,7 @@ from jc.exceptions import ParseError
 class info():
    """Provides parser metadata (version, author, etc.)"""
-    version = '1.1'
+    version = '1.2'
    description = 'CSV file streaming parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
@ -113,18 +113,20 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False):
    # first, load the first 100 lines into a list to detect the CSV dialect
    for line in itertools.islice(data, 100):
-        temp_list.append(line)
+        temp_list.append(line.rstrip())
    # check for Python bug that does not split on `\r` newlines from sys.stdin correctly
    # https://bugs.python.org/issue45617
    if len(temp_list) == 1:
        raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.')
-    sniffdata = '\n'.join(temp_list)
+    sniffdata = '\n'.join(temp_list)[:1024]
+    dialect = 'excel'  # default in csv module
-    dialect = None
    try:
        dialect = csv.Sniffer().sniff(sniffdata)
+        if '""' in sniffdata:
+            dialect.doublequote = True
    except Exception:
        pass
--- a/tests/fixtures/generic/csv-doubleqouted-streaming.json
+++ b/tests/fixtures/generic/csv-doubleqouted-streaming.json
@ -0,0 +1 @@
 [{"A":"1","B":"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},{"A":"2","B":"this is a field with \" in it\""}]
--- a/tests/fixtures/generic/csv-doubleqouted.csv
+++ b/tests/fixtures/generic/csv-doubleqouted.csv
@ -0,0 +1,3 @@
 A,B
 1,"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"
 2,"this is a field with "" in it"
--- a/tests/fixtures/generic/csv-doubleqouted.json
+++ b/tests/fixtures/generic/csv-doubleqouted.json
@ -0,0 +1,4 @@
 [
    {"A": "1", "B": "this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},
    {"A": "2", "B": "this is a field with \" in it"}
 ]
--- a/tests/test_csv.py
+++ b/tests/test_csv.py
@ -37,6 +37,9 @@ class MyTests(unittest.TestCase):
        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.csv'), 'r', encoding='utf-8') as f:
            self.generic_csv_insurance = f.read()
        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f:
            self.generic_csv_doubleqouted = f.read()
        # output
        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats.json'), 'r', encoding='utf-8') as f:
            self.generic_csv_biostats_json = json.loads(f.read())
@ -65,6 +68,9 @@ class MyTests(unittest.TestCase):
        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.json'), 'r', encoding='utf-8') as f:
            self.generic_csv_insurance_json = json.loads(f.read())
        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.json'), 'r', encoding='utf-8') as f:
            self.generic_csv_doubleqouted_json = json.loads(f.read())
    def test_csv_nodata(self):
        """
        Test with no data
@ -125,6 +131,12 @@ class MyTests(unittest.TestCase):
        """
        self.assertEqual(jc.parsers.csv.parse(self.generic_csv_insurance, quiet=True), self.generic_csv_insurance_json)
    def test_doubleqouted(self):
        """
        Test 'csv-doubleqouted.csv' file
        """
        self.assertEqual(jc.parsers.csv.parse(self.generic_csv_doubleqouted, quiet=True), self.generic_csv_doubleqouted_json)
 if __name__ == '__main__':
    unittest.main()
--- a/tests/test_csv_s.py
+++ b/tests/test_csv_s.py
@ -42,6 +42,9 @@ class MyTests(unittest.TestCase):
        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records.csv'), 'r', encoding='utf-8') as f:
            self.generic_csv_10k_sales_records = f.read()
        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f:
            self.generic_csv_doubleqouted = f.read()
        # output
        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats-streaming.json'), 'r', encoding='utf-8') as f:
            self.generic_csv_biostats_streaming_json = json.loads(f.read())
@ -70,6 +73,9 @@ class MyTests(unittest.TestCase):
        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records-streaming.json'), 'r', encoding='utf-8') as f:
            self.generic_csv_10k_sales_records_streaming_json = json.loads(f.read())
        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted-streaming.json'), 'r', encoding='utf-8') as f:
            self.generic_csv_doublequoted_streaming_json = json.loads(f.read())
    def test_csv_s_nodata(self):
        """
        Test CSV parser with no data
@ -141,6 +147,12 @@ class MyTests(unittest.TestCase):
        """
        self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_10k_sales_records.splitlines(), quiet=True)), self.generic_csv_10k_sales_records_streaming_json)
    def test_csv_s_doublequoted(self):
        """
        Test 'doublequoted.csv' file
        """
        self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_doubleqouted.splitlines(), quiet=True)), self.generic_csv_doublequoted_streaming_json)
 if __name__ == '__main__':
    unittest.main()
		`@ -0,0 +1 @@`
							[{"A":"1","B":"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},{"A":"2","B":"this is a field with \" in it\""}]