1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-07-13 01:20:24 +02:00

fix for UTF-8 csv files with leading BOM bytes

This commit is contained in:
Kelly Brazil
2022-10-25 11:18:22 -07:00
parent 45859b01e5
commit 888b6bd6d5
2 changed files with 16 additions and 3 deletions

View File

@@ -78,7 +78,7 @@ import csv
class info():
"""Provides parser metadata (version, author, etc.)"""
version = '1.4'
version = '1.5'
description = 'CSV file parser'
author = 'Kelly Brazil'
author_email = 'kellyjonbrazil@gmail.com'
@@ -124,6 +124,12 @@ def parse(data, raw=False, quiet=False):
jc.utils.compatibility(__name__, info.compatible, quiet)
jc.utils.input_type_check(data)
# remove BOM bytes, if present
if isinstance(data, str):
data = data.encode('utf-8')
data = data.decode('utf-8-sig')
raw_output = []
cleandata = data.splitlines()

View File

@@ -63,7 +63,7 @@ from jc.exceptions import ParseError
class info():
"""Provides parser metadata (version, author, etc.)"""
version = '1.3'
version = '1.4'
description = 'CSV file streaming parser'
author = 'Kelly Brazil'
author_email = 'kellyjonbrazil@gmail.com'
@@ -127,7 +127,14 @@ def parse(data, raw=False, quiet=False, ignore_exceptions=False):
if len(temp_list) == 1:
raise ParseError('Unable to detect line endings. Please try the non-streaming CSV parser instead.')
sniffdata = '\n'.join(temp_list)[:1024]
# remove BOM bytes from first row, if present
if temp_list:
if isinstance(temp_list[0], str):
temp_list[0] = temp_list[0].encode('utf-8')
temp_list[0] = temp_list[0].decode('utf-8-sig')
sniffdata = '\r\n'.join(temp_list)[:1024]
dialect = 'excel' # default in csv module
try: