Fix for UTF-8 CSV files with leading BOM bytes

2026-04-24 20:56:11 +02:00 · 2022-10-25 11:18:22 -07:00
parent 45859b01e5
commit 888b6bd6d5
2 changed files with 16 additions and 3 deletions
@@ -78,7 +78,7 @@ import csv

 class info():
    """Provides parser metadata (version, author, etc.)"""
-    version = '1.4'
+    version = '1.5'
    description = 'CSV file parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
@@ -124,6 +124,12 @@ def parse(data, raw=False, quiet=False):
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

+    # strip a leading UTF-8 BOM, if present (str is re-encoded so 'utf-8-sig' can drop it)
+    if isinstance(data, str):
+        data = data.encode('utf-8')
+
+    data = data.decode('utf-8-sig')
+
    raw_output = []
    cleandata = data.splitlines()

@@ -63,7 +63,7 @@ from jc.exceptions import ParseError

 class info():
    """Provides parser metadata (version, author, etc.)"""
-    version = '1.3'
+    version = '1.4'
    description = 'CSV file streaming parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
@@ -127,7 +127,14 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False):
    if len(temp_list) == 1:
        raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.')

-    sniffdata = '\n'.join(temp_list)[:1024]
+    # strip a leading UTF-8 BOM from the first row, if present (via a 'utf-8-sig' decode)
+    if temp_list:
+        if isinstance(temp_list[0], str):
+            temp_list[0] = temp_list[0].encode('utf-8')
+
+        temp_list[0] = temp_list[0].decode('utf-8-sig')
+
+    sniffdata = '\r\n'.join(temp_list)[:1024]
    dialect = 'excel'  # default in csv module

    try: