From 2a40f842743a8745d651b4b6cae645045e6c6aab Mon Sep 17 00:00:00 2001
From: shaik <shai.kustin@torq.io>
Date: Sun, 2 Jan 2022 17:11:20 +0200
Subject: [PATCH 1/5] fix doublequote handling in csv parser

---
 jc/parsers/csv.py                            |  4 +++-
 tests/fixtures/generic/csv-doubleqouted.csv  |  3 +++
 tests/fixtures/generic/csv-doubleqouted.json |  4 ++++
 tests/test_csv.py                            | 12 ++++++++++++
 4 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 tests/fixtures/generic/csv-doubleqouted.csv
 create mode 100644 tests/fixtures/generic/csv-doubleqouted.json

diff --git a/jc/parsers/csv.py b/jc/parsers/csv.py
index 924d731c..731e0ddb 100644
--- a/jc/parsers/csv.py
+++ b/jc/parsers/csv.py
@@ -130,9 +130,11 @@ def parse(data, raw=False, quiet=False):
 
     if jc.utils.has_data(data):
 
-        dialect = None
+        dialect = "excel" # default in csv module
         try:
             dialect = csv.Sniffer().sniff(data[:1024])
+            if '""' in data:
+                dialect.doublequote = True
         except Exception:
             pass
 
diff --git a/tests/fixtures/generic/csv-doubleqouted.csv b/tests/fixtures/generic/csv-doubleqouted.csv
new file mode 100644
index 00000000..9b2ef10f
--- /dev/null
+++ b/tests/fixtures/generic/csv-doubleqouted.csv
@@ -0,0 +1,3 @@
+A,B
+1,"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"
+2,"this is a field with "" in it"
diff --git a/tests/fixtures/generic/csv-doubleqouted.json b/tests/fixtures/generic/csv-doubleqouted.json
new file mode 100644
index 00000000..a1dab8e1
--- /dev/null
+++ b/tests/fixtures/generic/csv-doubleqouted.json
@@ -0,0 +1,4 @@
+[
+    {"A": "1", "B": "this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},
+    {"A": "2", "B": "this is a field with \" in it"}
+]
diff --git a/tests/test_csv.py b/tests/test_csv.py
index 6d3b2d91..a62cf0f2 100644
--- a/tests/test_csv.py
+++ b/tests/test_csv.py
@@ -37,6 +37,9 @@ class MyTests(unittest.TestCase):
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.csv'), 'r', encoding='utf-8') as f:
             self.generic_csv_insurance = f.read()
 
+        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f:
+            self.generic_csv_doubleqouted = f.read()
+
         # output
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats.json'), 'r', encoding='utf-8') as f:
             self.generic_csv_biostats_json = json.loads(f.read())
@@ -65,6 +68,9 @@ class MyTests(unittest.TestCase):
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-insurance.json'), 'r', encoding='utf-8') as f:
             self.generic_csv_insurance_json = json.loads(f.read())
 
+        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.json'), 'r', encoding='utf-8') as f:
+            self.generic_csv_doubleqouted_json = json.loads(f.read())
+
     def test_csv_nodata(self):
         """
         Test with no data
@@ -125,6 +131,12 @@ class MyTests(unittest.TestCase):
         """
         self.assertEqual(jc.parsers.csv.parse(self.generic_csv_insurance, quiet=True), self.generic_csv_insurance_json)
 
+    def test_doubleqouted(self):
+        """
+        Test 'csv-doubleqouted.csv' file
+        """
+        self.assertEqual(jc.parsers.csv.parse(self.generic_csv_doubleqouted, quiet=True), self.generic_csv_doubleqouted_json)
+
 
 if __name__ == '__main__':
     unittest.main()

From 9c887a36a804d817c3f669a55b9b1566d09d645d Mon Sep 17 00:00:00 2001
From: Kelly Brazil <kellyjonbrazil@gmail.com>
Date: Sun, 2 Jan 2022 11:44:18 -0800
Subject: [PATCH 2/5] update csv_s parser with csv changes

---
 jc/parsers/csv_s.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/jc/parsers/csv_s.py b/jc/parsers/csv_s.py
index e237b28a..7b894f80 100644
--- a/jc/parsers/csv_s.py
+++ b/jc/parsers/csv_s.py
@@ -56,7 +56,7 @@ from jc.exceptions import ParseError
 
 class info():
     """Provides parser metadata (version, author, etc.)"""
-    version = '1.1'
+    version = '1.2'
     description = 'CSV file streaming parser'
     author = 'Kelly Brazil'
     author_email = 'kellyjonbrazil@gmail.com'
@@ -122,9 +122,11 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False):
 
     sniffdata = '\n'.join(temp_list)
 
-    dialect = None
+    dialect = 'excel'  # default in csv module
     try:
         dialect = csv.Sniffer().sniff(sniffdata)
+        if '""' in sniffdata:
+                dialect.doublequote = True
     except Exception:
         pass
 

From 3a4a27e1f94ee07352c7616c57ec655c1aea04f6 Mon Sep 17 00:00:00 2001
From: Kelly Brazil <kellyjonbrazil@gmail.com>
Date: Sun, 2 Jan 2022 11:44:25 -0800
Subject: [PATCH 3/5] version bump

---
 jc/parsers/csv.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/jc/parsers/csv.py b/jc/parsers/csv.py
index 731e0ddb..539d961b 100644
--- a/jc/parsers/csv.py
+++ b/jc/parsers/csv.py
@@ -75,7 +75,7 @@ import csv
 
 class info():
     """Provides parser metadata (version, author, etc.)"""
-    version = '1.3'
+    version = '1.4'
     description = 'CSV file parser'
     author = 'Kelly Brazil'
     author_email = 'kellyjonbrazil@gmail.com'
@@ -130,7 +130,7 @@ def parse(data, raw=False, quiet=False):
 
     if jc.utils.has_data(data):
 
-        dialect = "excel" # default in csv module
+        dialect = 'excel'  # default in csv module
         try:
             dialect = csv.Sniffer().sniff(data[:1024])
             if '""' in data:

From 5563829df2849a899df2e9211d6c92bddc695f9b Mon Sep 17 00:00:00 2001
From: Kelly Brazil <kellyjonbrazil@gmail.com>
Date: Mon, 3 Jan 2022 08:48:23 -0800
Subject: [PATCH 4/5] make dialect sniff behavior match non-streaming parser

---
 jc/parsers/csv_s.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/jc/parsers/csv_s.py b/jc/parsers/csv_s.py
index 7b894f80..f152bdc7 100644
--- a/jc/parsers/csv_s.py
+++ b/jc/parsers/csv_s.py
@@ -113,20 +113,20 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False):
 
     # first, load the first 100 lines into a list to detect the CSV dialect
     for line in itertools.islice(data, 100):
-        temp_list.append(line)
+        temp_list.append(line.rstrip())
 
     # check for Python bug that does not split on `\r` newlines from sys.stdin correctly
     # https://bugs.python.org/issue45617
     if len(temp_list) == 1:
         raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.')
 
-    sniffdata = '\n'.join(temp_list)
-
+    sniffdata = '\n'.join(temp_list)[:1024]
     dialect = 'excel'  # default in csv module
+
     try:
         dialect = csv.Sniffer().sniff(sniffdata)
         if '""' in sniffdata:
-                dialect.doublequote = True
+            dialect.doublequote = True
     except Exception:
         pass
 

From 65d96e26b59e231c77c1dbba1dc91708c33de30c Mon Sep 17 00:00:00 2001
From: Kelly Brazil <kellyjonbrazil@gmail.com>
Date: Mon, 3 Jan 2022 09:06:00 -0800
Subject: [PATCH 5/5] add streaming tests

---
 .../fixtures/generic/csv-doubleqouted-streaming.json |  1 +
 tests/test_csv_s.py                                  | 12 ++++++++++++
 2 files changed, 13 insertions(+)
 create mode 100644 tests/fixtures/generic/csv-doubleqouted-streaming.json

diff --git a/tests/fixtures/generic/csv-doubleqouted-streaming.json b/tests/fixtures/generic/csv-doubleqouted-streaming.json
new file mode 100644
index 00000000..9f0e1ded
--- /dev/null
+++ b/tests/fixtures/generic/csv-doubleqouted-streaming.json
@@ -0,0 +1 @@
+[{"A":"1","B":"this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this is 1024 bytes long field this"},{"A":"2","B":"this is a field with \" in it\""}]
diff --git a/tests/test_csv_s.py b/tests/test_csv_s.py
index 5a9707ce..757d7cfc 100644
--- a/tests/test_csv_s.py
+++ b/tests/test_csv_s.py
@@ -42,6 +42,9 @@ class MyTests(unittest.TestCase):
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records.csv'), 'r', encoding='utf-8') as f:
             self.generic_csv_10k_sales_records = f.read()
 
+        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted.csv'), 'r', encoding='utf-8') as f:
+            self.generic_csv_doubleqouted = f.read()
+
         # output
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-biostats-streaming.json'), 'r', encoding='utf-8') as f:
             self.generic_csv_biostats_streaming_json = json.loads(f.read())
@@ -70,6 +73,9 @@ class MyTests(unittest.TestCase):
         with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-10k-sales-records-streaming.json'), 'r', encoding='utf-8') as f:
             self.generic_csv_10k_sales_records_streaming_json = json.loads(f.read())
 
+        with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/csv-doubleqouted-streaming.json'), 'r', encoding='utf-8') as f:
+            self.generic_csv_doublequoted_streaming_json = json.loads(f.read())
+
     def test_csv_s_nodata(self):
         """
         Test CSV parser with no data
@@ -141,6 +147,12 @@ class MyTests(unittest.TestCase):
         """
         self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_10k_sales_records.splitlines(), quiet=True)), self.generic_csv_10k_sales_records_streaming_json)
 
+    def test_csv_s_doublequoted(self):
+        """
+        Test 'csv-doubleqouted.csv' file
+        """
+        self.assertEqual(list(jc.parsers.csv_s.parse(self.generic_csv_doubleqouted.splitlines(), quiet=True)), self.generic_csv_doublequoted_streaming_json)
+
 
 if __name__ == '__main__':
     unittest.main()