mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-07-13 01:20:24 +02:00

new streamlined parser

This commit is contained in:
Kelly Brazil
2022-03-21 13:06:34 -07:00
parent 9ecbdb0916
commit 51ae5ebcac


@@ -44,7 +44,7 @@ Examples:
     []
 """

 import re
-from typing import Iterable, List, Dict, Optional, Generator
+from typing import Iterable, Tuple, List, Dict
 import jc.utils
 from jc.exceptions import ParseError
@@ -73,11 +73,6 @@ def _process(proc_data: List[Dict]) -> List[Dict]:
         List of Dictionaries. Structured to conform to the schema.
     """
-    # remove newlines from values
-    # for item in proc_data:
-    #     for k, v in item.items():
-    #         item[k] = v.replace('\n', '')
-
     return proc_data
@@ -115,213 +110,191 @@ def _table_sniff(string: str) -> str:
     return 'simple'

-def _pretty_set_separators(table_lines: Iterable, separator: str) -> Generator[str, None, None]:
-    """Return a generator that yields rows standardized separators"""
-    for line in table_lines:
-        strip_line = line.strip()
-
-        # skip any blanks
-        if not strip_line:
-            continue
-
-        # yield row separators as a sentinel string
-        if strip_line.startswith('╒═') and strip_line.endswith('═╕')\
-           or strip_line.startswith('╞═') and strip_line.endswith('═╡')\
-           or strip_line.startswith('╘═') and strip_line.endswith('═╛')\
-           or strip_line.startswith('┌─') and strip_line.endswith('─┐')\
-           or strip_line.startswith('├─') and strip_line.endswith('─┤')\
-           or strip_line.startswith('└─') and strip_line.endswith('─┘')\
-           or strip_line.startswith('+=') and strip_line.endswith('=+')\
-           or strip_line.startswith('+-') and strip_line.endswith('-+'):
-            yield separator
-            continue
-
-        # remove the table column separator characters and yield the line
-        # line = line.replace('|', ' ').replace('│', ' ')
-        yield line
+def _is_separator(line: str) -> bool:
+    """Returns true if a table separator line is found"""
+    strip_line = line.strip()
+    if strip_line.startswith('╒═') and strip_line.endswith('═╕')\
+       or strip_line.startswith('╞═') and strip_line.endswith('═╡')\
+       or strip_line.startswith('╘═') and strip_line.endswith('═╛')\
+       or strip_line.startswith('┌─') and strip_line.endswith('─┐')\
+       or strip_line.startswith('├─') and strip_line.endswith('─┤')\
+       or strip_line.startswith('└─') and strip_line.endswith('─┘')\
+       or strip_line.startswith('+=') and strip_line.endswith('=+')\
+       or strip_line.startswith('+-') and strip_line.endswith('-+'):
+        return True
+    return False
-def _pretty_normalize_rows(table_lines: Iterable,
-                           separator: str,
-                           data_separator: str) -> Generator[str, None, None]:
-    """
-    Return a generator that yields header and data rows with different separators.
-    Also removes spaces from headers.
-    """
-    header_found = False
-    data_found = False
-
-    # Removes initial table lines, finds the header row(s) and separates
-    # the header from the data rows with different separator characters.
-    for i in table_lines:
-        if separator in i and not header_found and not data_found:
-            # top table frame
-            continue
-
-        if not separator in i and not header_found and not data_found:
-            header_found = True
-            # first header data found
-            # remove spaces from header
-            i = re.sub(r'\b \b', '_', i)
-            yield i
-            continue
-
-        if not separator in i and header_found and not data_found:
-            # subsequent header data found
-            # remove spaces from header
-            i = re.sub(r'\b \b', '_', i)
-            yield i
-            continue
-
-        if separator in i and header_found and not data_found:
-            data_found = True
-            # table separator found - this is a header separator
-            yield separator
-            continue
-
-        if not separator in i and header_found and data_found:
-            # subsequent data row found
-            yield i
-            continue
-
-        if separator in i and header_found and data_found:
-            # table separator found - this is a data separator
-            yield data_separator
-            continue
-
-
-def _pretty_table_parse(table: Iterable) -> List[Dict]:
-    temp_table = []
-    for line in table:
-        # normalize separator
-        line = line.replace('│', '|')
-
-        # remove first separator if it is the first char in the line
-        if line[0] == '|':
-            line = line.replace('|', ' ', 1)
-
-        # remove last separator if it is the last char in the line
-        if line[-1] == '|':
-            line = line[::-1].replace('|', ' ', 1)[::-1]
-
-        temp_table.append([x.strip() for x in line.split('|')])
-
-    headers = temp_table[0]
-    raw_data = temp_table[1:]
-    result = [dict.fromkeys(headers, None)]
-    result.extend([dict(zip(headers, r)) for r in raw_data])
-
-    return result
+def _snake_case(line: str) -> str:
+    """replace spaces between words with an underscores and set to lowercase"""
+    return re.sub(r'\b \b', '_', line).lower()
+
+
+def _fixup_separators(line: str) -> str:
+    """Normalize separators, and remove first and last separators"""
+    # normalize separator
+    line = line.replace('│', '|')
+
+    # remove first separator if it is the first char in the line
+    if line[0] == '|':
+        line = line.replace('|', ' ', 1)
+
+    # remove last separator if it is the last char in the line
+    if line[-1] == '|':
+        line = line[::-1].replace('|', ' ', 1)[::-1]
+
+    return line
+
+
+def _normalize_rows(table_lines: Iterable[str]) -> List[Tuple[int, List[str]]]:
+    """
+    Return a List of tuples of a row counters and data lines.
+    """
+    result = []
+    header_found = False
+    data_found = False
+    row_counter = 0
+
+    for line in table_lines:
+        # skip blank lines
+        if not line.strip():
+            continue
+
+        # skip top table frame
+        if _is_separator(line) and not header_found and not data_found:
+            continue
+
+        # first header row found
+        if not _is_separator(line) and not header_found and not data_found:
+            header_found = True
+            line = _snake_case(line)
+            line = _fixup_separators(line)
+            line_list = line.split('|')
+            line_list = [x.strip() for x in line_list]
+            result.append((row_counter, line_list))
+            continue
+
+        # subsequent header row found
+        if not _is_separator(line) and header_found and not data_found:
+            line = _snake_case(line)
+            line = _fixup_separators(line)
+            line_list = line.split('|')
+            line_list = [x.strip() for x in line_list]
+            result.append((row_counter, line_list))
+            continue
+
+        # table separator found - this is a header separator
+        if _is_separator(line) and header_found and not data_found:
+            data_found = True
+            row_counter += 1
+            continue
+
+        # subsequent data row found
+        if not _is_separator(line) and header_found and data_found:
+            line = _fixup_separators(line)
+            line_list = line.split('|')
+            line_list = [x.strip() for x in line_list]
+            result.append((row_counter, line_list))
+            continue
+
+        # table separator found - this is a data separator
+        if _is_separator(line) and header_found and data_found:
+            row_counter += 1
+            continue
+
+    return result
-def _pretty_remove_header_rows(table: List[Dict], sep: str, data_sep: str) -> List[Optional[Dict]]:
-    """return a table with only data rows."""
-    # create a new list of row objects with new key names
-    data_obj_list: List[Optional[Dict]] = []
-    sep_found = False
-    data_sep_found = False
-
-    for obj in table:
-        #skip to data
-        for v in obj.values():
-            if not sep_found and not str(v).strip() == sep:
-                continue
-            if not sep_found and str(v).strip() == sep:
-                sep_found = True
-                continue
-
-        # append data row objects or None for separators
-        if sep_found:
-            for k, v in obj.items():
-                if str(v).strip() == data_sep:
-                    data_sep_found = True
-                    break
-                else:
-                    data_sep_found = False
-            if data_sep_found:
-                data_obj_list.append(None)
-            else:
-                data_obj_list.append(obj)
-
-    # remove first item, which is a separator
-    return data_obj_list[1:]
-
-
-def _pretty_map_new_keynames(table: List[Dict], sep: str) -> Dict:
-    """
-    returns a dict of old keyname to new keyname mappings by consolidating
-    multiline keynames from the input list of dictionaries.
-    """
-    # first get all header objects to find full keynames. Stop when data rows are found.
-    header_obj_list = []
-    sep_found = False
-
-    for obj in table:
-        for v in obj.values():
-            if str(v).strip() == sep:
-                sep_found = True
-                break
-        if sep_found:
-            break
-        header_obj_list.append(obj)
-
-    if not header_obj_list:
-        header_obj_list = [{key: None for key in table[0]}]
-
-    # create an old-key to new-key name mapping dict
-    new_keynames_dict = dict.fromkeys([key for key in header_obj_list[0]], '')
-    for item in new_keynames_dict:
-        new_keynames_dict[item] = item
-
-    for obj in header_obj_list:
-        for k, v in obj.items():
-            if v:
-                new_keynames_dict[k] = new_keynames_dict[k] + '_' + v
-
-    # normalize keynames so they are lowercase, no spaces, and no redundat '_'s
-    for k, v in new_keynames_dict.items():
-        new_keynames_dict[k] = v.replace(' ', '_').lower()
-        new_keynames_dict[k] = re.sub(r'__+', '_', v)
-
-    return new_keynames_dict
-
-
-def _pretty_rename_keys(table: List, new_keynames: Dict) -> List[Optional[Dict]]:
-    """rename all of the keys in the table based on the new_keynames mapping"""
-    renamed_key_table: List[Optional[Dict]] = []
-    for item in table:
-        if item:
-            renamed_key_table.append({new_keynames[k]:v for k, v in item.items()})
-        else:
-            renamed_key_table.append(None)
-
-    return renamed_key_table
-
-
-def _pretty_consolidate_rows(table: List) -> List[Dict]:
-    """go through all data objects and combine values between data separators"""
-    consolidated_rows = []
-    current_obj = dict.fromkeys([key for key in table[0]], '')
-
-    for item in table:
-        if not item:
-            consolidated_rows.append(current_obj)
-            current_obj = dict.fromkeys([key for key in table[0]], '')
-            continue
-        else:
-            for k, v in item.items():
-                if v:
-                    if not current_obj[k]:
-                        current_obj[k] = v
-                    else:
-                        current_obj[k] = current_obj[k] + '\n' + v
-
-    return consolidated_rows
+def _get_headers(table: Iterable[Tuple[int, List]]) -> List[List[str]]:
+    """
+    return a list of all of the header rows (which are lists of strings.
+
+    [ # headers
+        ['str', 'str', 'str'],    # header rows
+        ['str', 'str', 'str']
+    ]
+    """
+    result = []
+    for row_num, line in table:
+        if row_num == 0:
+            result.append(line)
+
+    return result
+
+
+def _get_data(table: Iterable[Tuple[int, List]]) -> List[List[List[str]]]:
+    """
+    return a list of rows, which are lists made up of lists of strings:
+
+    [           # data
+        [       # rows
+            ['str', 'str', 'str']   # lines
+        ]
+    ]
+    """
+    result: List[List[List[str]]] = []
+    current_row = 1
+    this_line: List[List[str]] = []
+
+    for row_num, line in table:
+        if row_num != 0:
+            if row_num != current_row:
+                result.append(this_line)
+                current_row = row_num
+                this_line = []
+            this_line.append(line)
+
+    if this_line:
+        result.append(this_line)
+
+    return result
+
+
+def _collapse_headers(table: List[List[str]]) -> List[str]:
+    """append each column string to return the full header list"""
+    result = table[0]
+
+    for line in table[1:]:
+        new_line: List[str] = []
+        for i, header in enumerate(line):
+            if header:
+                new_header = result[i] + '_' + header
+                new_header = re.sub(r'__+', '_', new_header)
+                new_line.append(new_header)
+            else:
+                new_line.append(result[i])
+        result = new_line
+
+    return result
+
+
+def _collapse_data(table: List[List[List[str]]]) -> List[List[str]]:
+    """combine data rows to return a simple list of lists"""
+    result: List[List[str]] = []
+
+    for row in table:
+        new_row: List[str] = []
+        for line in row:
+            if new_row:
+                for i, item in enumerate(line):
+                    new_row[i] = (new_row[i] + '\n' + item).strip()
+            else:
+                new_row = line
+        result.append(new_row)
+
+    return result
+
+
+def _create_table_dict(header: List[str], data: List[List[str]]) -> List[Dict[str, str]]:
+    return [dict(zip(header, r)) for r in data]
 def _parse_pretty(string: str) -> List:
-    string_lines = string.splitlines()
-    sep = '~~~'
-    data_sep = '==='
-    separator = ' ' + sep + ' '
-    data_separator = ' ' + data_sep + ' '
-
-    clean: Generator = _pretty_set_separators(string_lines, separator)
-    normalized: Generator = _pretty_normalize_rows(clean, separator, data_separator)
-    raw_table: List[Dict] = _pretty_table_parse(normalized)  # was sparse_table_parse()
-    new_keynames: Dict = _pretty_map_new_keynames(raw_table, sep)
-    data_table: List[Optional[Dict]] = _pretty_remove_header_rows(raw_table, sep, data_sep)
-    table_with_renamed_keys: List[Optional[Dict]] = _pretty_rename_keys(data_table, new_keynames)
-    final_table: List[Dict] = _pretty_consolidate_rows(table_with_renamed_keys)
+    string_lines: List[str] = string.splitlines()
+    clean: List[Tuple[int, List[str]]] = _normalize_rows(string_lines)
+    raw_headers: List[List[str]] = _get_headers(clean)
+    raw_data: List[List[List[str]]] = _get_data(clean)
+
+    new_headers: List[str] = _collapse_headers(raw_headers)
+    new_data: List[List[str]] = _collapse_data(raw_data)
+    final_table: List[Dict[str, str]] = _create_table_dict(new_headers, new_data)

     return final_table
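
For context, a minimal usage sketch of the streamlined pipeline this commit introduces. The changed file is not named on this page, but the function names match jc's multi-line ASCII/Unicode table parser (asciitable-m), so the parser name, the sample table, and the expected output below are illustrative assumptions rather than something taken from this commit.

# A minimal sketch, assuming this diff is jc's asciitable-m parser and that a
# jc release shipping that parser is installed. Sample input/output are
# illustrative only.
import jc

table = '''
╒══════════╤══════════╤════════╕
│ column 1 │ column   │ col 3  │
│          │ 2        │        │
╞══════════╪══════════╪════════╡
│ first    │ 12345    │ yes    │
│ line     │          │        │
├──────────┼──────────┼────────┤
│ second   │          │ no     │
╘══════════╧══════════╧════════╛
'''

# _normalize_rows() tags each table line with a row counter, _collapse_headers()
# joins multi-line headers with '_' (e.g. 'column' + '2' -> 'column_2'), and
# _collapse_data() joins multi-line cells with '\n' before _create_table_dict()
# zips headers and rows into dictionaries.
result = jc.parse('asciitable_m', table)
print(result)
# roughly: [{'column_1': 'first\nline', 'column_2': '12345', 'col_3': 'yes'},
#           {'column_1': 'second', 'column_2': ..., 'col_3': 'no'}]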