mirror of
https://github.com/kellyjonbrazil/jc.git
synced 2025-06-21 00:19:42 +02:00
159 lines
5.9 KiB
Python
159 lines
5.9 KiB
Python
"""jc - JSON Convert universal parsers"""
|
|
from typing import Iterable, List, Dict
|
|
|
|
|
|
def simple_table_parse(data: Iterable[str]) -> List[Dict]:
    """
    Parse simple tables. There should be no blank cells. The last column
    may contain data with spaces.

    Example Table:

        col_1     col_2     col_3     col_4     col_5
        apple     orange    pear      banana    my favorite fruits
        carrot    squash    celery    spinach   my favorite veggies
        chicken   beef      pork      eggs      my favorite proteins

        [{'col_1': 'apple', 'col_2': 'orange', 'col_3': 'pear', 'col_4':
        'banana', 'col_5': 'my favorite fruits'}, {'col_1': 'carrot',
        'col_2': 'squash', 'col_3': 'celery', 'col_4': 'spinach', 'col_5':
        'my favorite veggies'}, {'col_1': 'chicken', 'col_2': 'beef',
        'col_3': 'pork', 'col_4': 'eggs', 'col_5': 'my favorite proteins'}]

    Parameters:

        data:   (iter)   Text data to parse that has been split into lines
                         via .splitlines(). Item 0 must be the header row.
                         Any spaces in header names should be changed to
                         underscore '_'. You should also ensure headers are
                         lowercase by using .lower().

                         Also, ensure there are no blank lines (list items)
                         in the data.

    Returns:

        List of Dictionaries. An empty iterable, or one that contains only
        the header row, produces an empty list.
    """
    # code adapted from Conor Heine at:
    # https://gist.github.com/cahna/43a1a3ff4d075bcd71f9d7120037a501

    # cast iterable to a list so generators can be indexed and the caller's
    # object is never mutated
    lines = list(data)

    # no header row means there is nothing to parse (avoids IndexError)
    if not lines:
        return []

    # str.split() with no arguments already collapses runs of whitespace and
    # never yields empty strings
    headers = lines[0].split()

    # every column but the last is a single word; whatever remains after
    # len(headers) - 1 splits belongs to the last column, spaces included
    max_split = len(headers) - 1

    return [
        dict(zip(headers, line.strip().split(None, max_split)))
        for line in lines[1:]
    ]
|
|
|
|
|
|
def sparse_table_parse(data: Iterable[str], delim: str = '\u2063') -> List[Dict]:
    """
    Parse tables with missing column data or with spaces in column data.
    Blank cells are converted to None in the resulting dictionary. Data
    elements must line up within column boundaries.

    Example Table:

        col_1        col_2     col_3     col_4         col_5
        apple        orange              fuzzy peach   my favorite fruits
        green beans            celery    spinach       my favorite veggies
        chicken      beef                brown eggs    my favorite proteins

        [{'col_1': 'apple', 'col_2': 'orange', 'col_3': None, 'col_4':
        'fuzzy peach', 'col_5': 'my favorite fruits'}, {'col_1':
        'green beans', 'col_2': None, 'col_3': 'celery', 'col_4': 'spinach',
        'col_5': 'my favorite veggies'}, {'col_1': 'chicken', 'col_2':
        'beef', 'col_3': None, 'col_4': 'brown eggs', 'col_5':
        'my favorite proteins'}]

    Parameters:

        data:   (iter)   An iterable of string lines (e.g. str.splitlines())
                         Item 0 must be the header row. Any spaces in header
                         names should be changed to underscore '_'. You
                         should also ensure headers are lowercase by using
                         .lower(). Do not change the position of header
                         names as the positions are used to find the data.

                         Also, ensure there are no blank line items.

        delim:  (string) Delimiter to use. By default `\\u2063`
                         (invisible separator) is used since it is unlikely
                         to ever be seen in terminal output. You can change
                         this for troubleshooting purposes or if there is a
                         delimiter conflict with your data.

    Returns:

        List of Dictionaries. An empty iterable, or one that contains only
        the header row, produces an empty list.
    """
    # cast iterable to a list so generators can be measured and indexed and
    # the caller's object is never mutated
    lines = list(data)

    # nothing to parse; max() below would raise ValueError on an empty list
    if not lines:
        return []

    # pad every line with spaces to the length of the longest line so each
    # column boundary found in the header can be indexed on any row
    max_len = max(len(line) for line in lines)
    padded = [line.ljust(max_len) for line in lines]

    header_text = padded[0]
    rows = padded[1:]
    header_list: List[str] = header_text.split()

    # A column ends on the character just before the next header name
    # starts. Scanning left to right from the end of the previous name finds
    # the right offset even when header names repeat or are separated by
    # whitespace other than a plain space.
    boundaries: List[int] = []
    cursor = 0
    for idx, name in enumerate(header_list):
        cursor = header_text.index(name, cursor)
        if idx > 0:
            boundaries.append(cursor - 1)
        cursor += len(name)

    output: List[Dict] = []
    max_split = len(header_list) - 1

    for entry in rows:
        # Insert the delimiter at each boundary since data can contain
        # spaces. Work right to left so a delimiter longer than one
        # character never shifts the boundaries still to be processed.
        for h_end in reversed(boundaries):
            # if the boundary lands inside a word, move left until
            # whitespace is found so the word stays in one piece
            while h_end > 0 and not entry[h_end].isspace():
                h_end -= 1

            entry = entry[:h_end] + delim + entry[h_end + 1:]

        # split on the delimiter, trim each cell, and map blank cells to None
        cells = [
            cell.strip() or None
            for cell in entry.split(delim, maxsplit=max_split)
        ]
        output.append(dict(zip(header_list, cells)))

    return output
|