1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-21 00:19:42 +02:00
Files
jc/jc/parsers/universal.py
2022-03-24 11:57:01 -07:00

159 lines
5.9 KiB
Python

"""jc - JSON Convert universal parsers"""
from typing import Iterable, List, Dict
def simple_table_parse(data: Iterable[str]) -> List[Dict]:
    """
    Parse simple tables. There should be no blank cells. The last column
    may contain data with spaces.

    Example Table:

        col_1     col_2     col_3     col_4     col_5

        apple     orange    pear      banana    my favorite fruits
        carrot    squash    celery    spinach   my favorite veggies
        chicken   beef      pork      eggs      my favorite proteins

        [{'col_1': 'apple', 'col_2': 'orange', 'col_3': 'pear', 'col_4':
        'banana', 'col_5': 'my favorite fruits'}, {'col_1': 'carrot',
        'col_2': 'squash', 'col_3': 'celery', 'col_4': 'spinach', 'col_5':
        'my favorite veggies'}, {'col_1': 'chicken', 'col_2': 'beef',
        'col_3': 'pork', 'col_4': 'eggs', 'col_5': 'my favorite proteins'}]

    Parameters:

        data:   (iter)   Text data to parse that has been split into lines
                         via .splitlines(). Item 0 must be the header row.
                         Any spaces in header names should be changed to
                         underscore '_'. You should also ensure headers are
                         lowercase by using .lower().

                         Also, ensure there are no blank lines (list items)
                         in the data.

    Returns:

        List of Dictionaries. One dictionary per data row, keyed by the
        header names. An empty list is returned when `data` is empty or
        contains only the header row.
    """
    # code adapted from Conor Heine at:
    # https://gist.github.com/cahna/43a1a3ff4d075bcd71f9d7120037a501

    # Materialize the iterable so it can be indexed and sliced. Working on
    # a copy also keeps the caller's object from being mutated.
    rows = list(data)

    # Nothing to parse: avoid an IndexError on the header lookup below.
    if not rows:
        return []

    # str.split() with no arguments already discards leading/trailing
    # whitespace and collapses runs of whitespace between names.
    headers = rows[0].split()

    # Limit the number of splits so everything after the second-to-last
    # column lands in the last column, spaces included. The explicit
    # strip() is required because a limited split keeps trailing
    # whitespace on the final field.
    max_splits = len(headers) - 1

    return [
        dict(zip(headers, row.strip().split(None, max_splits)))
        for row in rows[1:]
    ]
def sparse_table_parse(data: Iterable[str], delim: str = '\u2063') -> List[Dict]:
    """
    Parse tables with missing column data or with spaces in column data.
    Blank cells are converted to None in the resulting dictionary. Data
    elements must line up within column boundaries.

    Example Table:

        col_1        col_2     col_3     col_4         col_5

        apple        orange              fuzzy peach   my favorite fruits
        green beans            celery    spinach       my favorite veggies
        chicken      beef                brown eggs    my favorite proteins

        [{'col_1': 'apple', 'col_2': 'orange', 'col_3': None, 'col_4':
        'fuzzy peach', 'col_5': 'my favorite fruits'}, {'col_1':
        'green beans', 'col_2': None, 'col_3': 'celery', 'col_4': 'spinach',
        'col_5': 'my favorite veggies'}, {'col_1': 'chicken', 'col_2':
        'beef', 'col_3': None, 'col_4': 'brown eggs', 'col_5':
        'my favorite proteins'}]

    Parameters:

        data:   (iter)   An iterable of string lines (e.g. str.splitlines())
                         Item 0 must be the header row. Any spaces in header
                         names should be changed to underscore '_'. You
                         should also ensure headers are lowercase by using
                         .lower(). Do not change the position of header
                         names as the positions are used to find the data.

                         Also, ensure there are no blank line items.

        delim:  (string) Delimiter to use. By default `\\u2063`
                         (invisible separator) is used since it is unlikely
                         to ever be seen in terminal output. You can change
                         this for troubleshooting purposes or if there is a
                         delimiter conflict with your data.

    Returns:

        List of Dictionaries. One dictionary per data row, keyed by the
        header names. An empty list is returned when `data` is empty or
        contains only the header row.

    Raises:

        IndexError when the header row (item 0) contains no header names.
    """
    # Materialize the iterable so it can be measured and sliced. Working on
    # a copy also keeps the caller's object from being mutated.
    lines = list(data)

    # Nothing to parse: max() below would raise ValueError on an empty list.
    if not lines:
        return []

    # Pad every line with spaces to the length of the longest line so the
    # column end positions can be indexed safely on every row.
    max_len = max(len(line) for line in lines)
    lines = [line.ljust(max_len) for line in lines]

    # The trailing space lets the last header name match the
    # ' name ' search pattern built below.
    header_text: str = lines[0] + ' '
    rows = lines[1:]
    header_list: List = header_text.split()

    # Search patterns used to locate where each column starts. Every name
    # but the first is wrapped in spaces so it only matches a whole name.
    header_search = [header_list[0]]
    header_search.extend(' ' + name + ' ' for name in header_list[1:])

    # For every column except the last, record where it ends: the position
    # of the space directly in front of the next column's header name.
    header_spec_list = [
        {'name': column, 'end': header_text.find(header_search[i + 1])}
        for i, column in enumerate(header_list[:-1])
    ]

    output: List = []

    for entry in rows:
        # Insert a delimiter at each column boundary since the data itself
        # can contain spaces. Working right to left keeps the positions of
        # the boundaries further left valid even if delim is longer than
        # one character.
        for col in reversed(header_list):
            for h_spec in header_spec_list:
                if h_spec['name'] != col:
                    continue

                h_end = h_spec['end']

                # If the boundary position holds data instead of
                # whitespace, move left until whitespace is found.
                while h_end > 0 and not entry[h_end].isspace():
                    h_end -= 1

                # Replace the boundary character with the delimiter.
                entry = entry[:h_end] + delim + entry[h_end + 1:]

        # Split into at most one cell per header.
        cells = entry.split(delim, maxsplit=len(header_list) - 1)

        # Trim surrounding spaces; blank cells become None.
        cleaned = [cell.strip() or None for cell in cells]

        output.append(dict(zip(header_list, cleaned)))

    return output