fix for UTF-8 csv files with leading BOM bytes

2025-07-13 01:20:24 +02:00 · 2022-10-25 11:18:22 -07:00
parent 45859b01e5
commit 888b6bd6d5
2 changed files with 16 additions and 3 deletions
--- a/jc/parsers/csv.py
+++ b/jc/parsers/csv.py
@@ -78,7 +78,7 @@ import csv
 class info():
    """Provides parser metadata (version, author, etc.)"""
-    version = '1.4'
+    version = '1.5'
    description = 'CSV file parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
@@ -124,6 +124,12 @@ def parse(data, raw=False, quiet=False):
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)
+    # remove BOM bytes, if present
+    if isinstance(data, str):
+        data = data.encode('utf-8')
+    data = data.decode('utf-8-sig')
    raw_output = []
    cleandata = data.splitlines()
--- a/jc/parsers/csv_s.py
+++ b/jc/parsers/csv_s.py
@@ -63,7 +63,7 @@ from jc.exceptions import ParseError
 class info():
    """Provides parser metadata (version, author, etc.)"""
-    version = '1.3'
+    version = '1.4'
    description = 'CSV file streaming parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
@@ -127,7 +127,14 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False):
    if len(temp_list) == 1:
        raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.')
-    sniffdata = '\n'.join(temp_list)[:1024]
+    # remove BOM bytes from first row, if present
+    if temp_list:
+        if isinstance(temp_list[0], str):
+            temp_list[0] = temp_list[0].encode('utf-8')
+        temp_list[0] = temp_list[0].decode('utf-8-sig')
+    sniffdata = '\r\n'.join(temp_list)[:1024]
    dialect = 'excel'  # default in csv module
    try: