mirror of
https://github.com/kellyjonbrazil/jc.git
synced 2025-07-13 01:20:24 +02:00
new streamlined parser
This commit is contained in:
@ -44,7 +44,7 @@ Examples:
|
|||||||
[]
|
[]
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
from typing import Iterable, List, Dict, Optional, Generator
|
from typing import Iterable, Tuple, List, Dict
|
||||||
import jc.utils
|
import jc.utils
|
||||||
from jc.exceptions import ParseError
|
from jc.exceptions import ParseError
|
||||||
|
|
||||||
@ -73,11 +73,6 @@ def _process(proc_data: List[Dict]) -> List[Dict]:
|
|||||||
|
|
||||||
List of Dictionaries. Structured to conform to the schema.
|
List of Dictionaries. Structured to conform to the schema.
|
||||||
"""
|
"""
|
||||||
# remove newlines from values
|
|
||||||
# for item in proc_data:
|
|
||||||
# for k, v in item.items():
|
|
||||||
# item[k] = v.replace('\n', '')
|
|
||||||
|
|
||||||
return proc_data
|
return proc_data
|
||||||
|
|
||||||
|
|
||||||
@ -115,213 +110,191 @@ def _table_sniff(string: str) -> str:
|
|||||||
return 'simple'
|
return 'simple'
|
||||||
|
|
||||||
|
|
||||||
def _pretty_set_separators(table_lines: Iterable, separator: str) -> Generator[str, None, None]:
|
def _is_separator(line: str) -> bool:
|
||||||
"""Return a generator that yields rows standardized separators"""
|
"""Returns true if a table separator line is found"""
|
||||||
for line in table_lines:
|
strip_line = line.strip()
|
||||||
strip_line = line.strip()
|
if strip_line.startswith('╒═') and strip_line.endswith('═╕')\
|
||||||
|
or strip_line.startswith('╞═') and strip_line.endswith('═╡')\
|
||||||
# skip any blanks
|
or strip_line.startswith('╘═') and strip_line.endswith('═╛')\
|
||||||
if not strip_line:
|
or strip_line.startswith('┌─') and strip_line.endswith('─┐')\
|
||||||
continue
|
or strip_line.startswith('├─') and strip_line.endswith('─┤')\
|
||||||
|
or strip_line.startswith('└─') and strip_line.endswith('─┘')\
|
||||||
# yield row separators as a sentinel string
|
or strip_line.startswith('+=') and strip_line.endswith('=+')\
|
||||||
if strip_line.startswith('╒═') and strip_line.endswith('═╕')\
|
or strip_line.startswith('+-') and strip_line.endswith('-+'):
|
||||||
or strip_line.startswith('╞═') and strip_line.endswith('═╡')\
|
return True
|
||||||
or strip_line.startswith('╘═') and strip_line.endswith('═╛')\
|
return False
|
||||||
or strip_line.startswith('┌─') and strip_line.endswith('─┐')\
|
|
||||||
or strip_line.startswith('├─') and strip_line.endswith('─┤')\
|
|
||||||
or strip_line.startswith('└─') and strip_line.endswith('─┘')\
|
|
||||||
or strip_line.startswith('+=') and strip_line.endswith('=+')\
|
|
||||||
or strip_line.startswith('+-') and strip_line.endswith('-+'):
|
|
||||||
yield separator
|
|
||||||
continue
|
|
||||||
|
|
||||||
# remove the table column separator characters and yield the line
|
|
||||||
# line = line.replace('|', ' ').replace('│', ' ')
|
|
||||||
yield line
|
|
||||||
|
|
||||||
|
|
||||||
def _pretty_normalize_rows(table_lines: Iterable,
|
def _snake_case(line: str) -> str:
|
||||||
separator: str,
|
"""replace spaces between words with an underscores and set to lowercase"""
|
||||||
data_separator: str) -> Generator[str, None, None]:
|
return re.sub(r'\b \b', '_', line).lower()
|
||||||
|
|
||||||
|
|
||||||
|
def _fixup_separators(line: str) -> str:
|
||||||
|
"""Normalize separators, and remove first and last separators"""
|
||||||
|
# normalize separator
|
||||||
|
line = line.replace('│', '|')
|
||||||
|
|
||||||
|
# remove first separator if it is the first char in the line
|
||||||
|
if line[0] == '|':
|
||||||
|
line = line.replace('|', ' ', 1)
|
||||||
|
|
||||||
|
# remove last separator if it is the last char in the line
|
||||||
|
if line[-1] == '|':
|
||||||
|
line = line[::-1].replace('|', ' ', 1)[::-1]
|
||||||
|
|
||||||
|
return line
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_rows(table_lines: Iterable[str]) -> List[Tuple[int, List[str]]]:
|
||||||
"""
|
"""
|
||||||
Return a generator that yields header and data rows with different separators.
|
Return a list of tuples, each holding a row counter and a data line.
|
||||||
Also removes spaces from headers.
|
|
||||||
"""
|
"""
|
||||||
|
result = []
|
||||||
header_found = False
|
header_found = False
|
||||||
data_found = False
|
data_found = False
|
||||||
|
row_counter = 0
|
||||||
|
|
||||||
# Removes initial table lines, finds the header row(s) and separates
|
for line in table_lines:
|
||||||
# the header from the data rows with different separator characters.
|
# skip blank lines
|
||||||
for i in table_lines:
|
if not line.strip():
|
||||||
if separator in i and not header_found and not data_found:
|
|
||||||
# top table frame
|
|
||||||
continue
|
continue
|
||||||
if not separator in i and not header_found and not data_found:
|
|
||||||
|
# skip top table frame
|
||||||
|
if _is_separator(line) and not header_found and not data_found:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# first header row found
|
||||||
|
if not _is_separator(line) and not header_found and not data_found:
|
||||||
header_found = True
|
header_found = True
|
||||||
# first header data found
|
line = _snake_case(line)
|
||||||
# remove spaces from header
|
line = _fixup_separators(line)
|
||||||
i = re.sub(r'\b \b', '_', i)
|
line_list = line.split('|')
|
||||||
yield i
|
line_list = [x.strip() for x in line_list]
|
||||||
|
result.append((row_counter, line_list))
|
||||||
continue
|
continue
|
||||||
if not separator in i and header_found and not data_found:
|
|
||||||
# subsequent header data found
|
# subsequent header row found
|
||||||
# remove spaces from header
|
if not _is_separator(line) and header_found and not data_found:
|
||||||
i = re.sub(r'\b \b', '_', i)
|
line = _snake_case(line)
|
||||||
yield i
|
line = _fixup_separators(line)
|
||||||
|
line_list = line.split('|')
|
||||||
|
line_list = [x.strip() for x in line_list]
|
||||||
|
result.append((row_counter, line_list))
|
||||||
continue
|
continue
|
||||||
if separator in i and header_found and not data_found:
|
|
||||||
|
# table separator found - this is a header separator
|
||||||
|
if _is_separator(line) and header_found and not data_found:
|
||||||
data_found = True
|
data_found = True
|
||||||
# table separator found - this is a header separator
|
row_counter += 1
|
||||||
yield separator
|
|
||||||
continue
|
|
||||||
if not separator in i and header_found and data_found:
|
|
||||||
# subsequent data row found
|
|
||||||
yield i
|
|
||||||
continue
|
|
||||||
if separator in i and header_found and data_found:
|
|
||||||
# table separator found - this is a data separator
|
|
||||||
yield data_separator
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# subsequent data row found
|
||||||
|
if not _is_separator(line) and header_found and data_found:
|
||||||
|
line = _fixup_separators(line)
|
||||||
|
line_list = line.split('|')
|
||||||
|
line_list = [x.strip() for x in line_list]
|
||||||
|
result.append((row_counter, line_list))
|
||||||
|
continue
|
||||||
|
|
||||||
def _pretty_table_parse(table: Iterable) -> List[Dict]:
|
# table separator found - this is a data separator
|
||||||
temp_table = []
|
if _is_separator(line) and header_found and data_found:
|
||||||
for line in table:
|
row_counter += 1
|
||||||
# normalize separator
|
continue
|
||||||
line = line.replace('│', '|')
|
|
||||||
|
|
||||||
# remove first separator if it is the first char in the line
|
|
||||||
if line[0] == '|':
|
|
||||||
line = line.replace('|', ' ', 1)
|
|
||||||
|
|
||||||
# remove last separator if it is the last char in the line
|
|
||||||
if line[-1] == '|':
|
|
||||||
line = line[::-1].replace('|', ' ', 1)[::-1]
|
|
||||||
|
|
||||||
temp_table.append([x.strip() for x in line.split('|')])
|
|
||||||
|
|
||||||
headers = temp_table[0]
|
|
||||||
raw_data = temp_table[1:]
|
|
||||||
result = [dict.fromkeys(headers, None)]
|
|
||||||
result.extend([dict(zip(headers, r)) for r in raw_data])
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def _pretty_remove_header_rows(table: List[Dict], sep: str, data_sep: str) -> List[Optional[Dict]]:
|
def _get_headers(table: Iterable[Tuple[int, List]]) -> List[List[str]]:
|
||||||
"""return a table with only data rows."""
|
"""
|
||||||
# create a new list of row objects with new key names
|
return a list of all of the header rows (which are lists of strings).
|
||||||
data_obj_list: List[Optional[Dict]] = []
|
[ # headers
|
||||||
sep_found = False
|
['str', 'str', 'str'], # header rows
|
||||||
data_sep_found = False
|
['str', 'str', 'str']
|
||||||
for obj in table:
|
]
|
||||||
#skip to data
|
"""
|
||||||
for v in obj.values():
|
result = []
|
||||||
if not sep_found and not str(v).strip() == sep:
|
for row_num, line in table:
|
||||||
continue
|
if row_num == 0:
|
||||||
if not sep_found and str(v).strip() == sep:
|
result.append(line)
|
||||||
sep_found = True
|
return result
|
||||||
continue
|
|
||||||
# append data row objects or None for separators
|
|
||||||
if sep_found:
|
def _get_data(table: Iterable[Tuple[int, List]]) -> List[List[List[str]]]:
|
||||||
for k, v in obj.items():
|
"""
|
||||||
if str(v).strip() == data_sep:
|
return a list of rows, which are lists made up of lists of strings:
|
||||||
data_sep_found = True
|
[ # data
|
||||||
break
|
[ # rows
|
||||||
else:
|
['str', 'str', 'str'] # lines
|
||||||
data_sep_found = False
|
]
|
||||||
if data_sep_found:
|
]
|
||||||
data_obj_list.append(None)
|
"""
|
||||||
|
result: List[List[List[str]]] = []
|
||||||
|
current_row = 1
|
||||||
|
this_line: List[List[str]] = []
|
||||||
|
for row_num, line in table:
|
||||||
|
if row_num != 0:
|
||||||
|
if row_num != current_row:
|
||||||
|
result.append(this_line)
|
||||||
|
current_row = row_num
|
||||||
|
this_line = []
|
||||||
|
|
||||||
|
this_line.append(line)
|
||||||
|
|
||||||
|
if this_line:
|
||||||
|
result.append(this_line)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _collapse_headers(table: List[List[str]]) -> List[str]:
|
||||||
|
"""append each column string to return the full header list"""
|
||||||
|
result = table[0]
|
||||||
|
for line in table[1:]:
|
||||||
|
new_line: List[str] = []
|
||||||
|
for i, header in enumerate(line):
|
||||||
|
if header:
|
||||||
|
new_header = result[i] + '_' + header
|
||||||
|
new_header = re.sub(r'__+', '_', new_header)
|
||||||
|
new_line.append(new_header)
|
||||||
else:
|
else:
|
||||||
data_obj_list.append(obj)
|
new_line.append(result[i])
|
||||||
|
result = new_line
|
||||||
|
|
||||||
# remove first item, which is a separator
|
return result
|
||||||
return data_obj_list[1:]
|
|
||||||
|
|
||||||
|
|
||||||
def _pretty_map_new_keynames(table: List[Dict], sep: str) -> Dict:
|
def _collapse_data(table: List[List[List[str]]]) -> List[List[str]]:
|
||||||
"""
|
"""combine data rows to return a simple list of lists"""
|
||||||
returns a dict of old keyname to new keyname mappings by consolidating
|
result: List[List[str]] = []
|
||||||
multiline keynames from the input list of dictionaries.
|
|
||||||
"""
|
|
||||||
# first get all header objects to find full keynames. Stop when data rows are found.
|
|
||||||
header_obj_list = []
|
|
||||||
sep_found = False
|
|
||||||
for obj in table:
|
|
||||||
for v in obj.values():
|
|
||||||
if str(v).strip() == sep:
|
|
||||||
sep_found = True
|
|
||||||
break
|
|
||||||
if sep_found:
|
|
||||||
break
|
|
||||||
header_obj_list.append(obj)
|
|
||||||
|
|
||||||
if not header_obj_list:
|
for row in table:
|
||||||
header_obj_list = [{key: None for key in table[0]}]
|
new_row: List[str] = []
|
||||||
|
for line in row:
|
||||||
|
if new_row:
|
||||||
|
for i, item in enumerate(line):
|
||||||
|
new_row[i] = (new_row[i] + '\n' + item).strip()
|
||||||
|
else:
|
||||||
|
new_row = line
|
||||||
|
|
||||||
# create an old-key to new-key name mapping dict
|
result.append(new_row)
|
||||||
new_keynames_dict = dict.fromkeys([key for key in header_obj_list[0]], '')
|
|
||||||
for item in new_keynames_dict:
|
|
||||||
new_keynames_dict[item] = item
|
|
||||||
for obj in header_obj_list:
|
|
||||||
for k, v in obj.items():
|
|
||||||
if v:
|
|
||||||
new_keynames_dict[k] = new_keynames_dict[k] + '_' + v
|
|
||||||
|
|
||||||
# normalize keynames so they are lowercase, no spaces, and no redundant '_'s
|
return result
|
||||||
for k, v in new_keynames_dict.items():
|
|
||||||
new_keynames_dict[k] = v.replace(' ', '_').lower()
|
|
||||||
new_keynames_dict[k] = re.sub(r'__+', '_', v)
|
|
||||||
|
|
||||||
return new_keynames_dict
|
|
||||||
|
|
||||||
|
|
||||||
def _pretty_rename_keys(table: List, new_keynames: Dict) -> List[Optional[Dict]]:
|
def _create_table_dict(header: List[str], data: List[List[str]]) -> List[Dict[str, str]]:
|
||||||
"""rename all of the keys in the table based on the new_keynames mapping"""
|
return [dict(zip(header, r)) for r in data]
|
||||||
renamed_key_table: List[Optional[Dict]] = []
|
|
||||||
for item in table:
|
|
||||||
if item:
|
|
||||||
renamed_key_table.append({new_keynames[k]:v for k, v in item.items()})
|
|
||||||
else:
|
|
||||||
renamed_key_table.append(None)
|
|
||||||
|
|
||||||
return renamed_key_table
|
|
||||||
|
|
||||||
|
|
||||||
def _pretty_consolidate_rows(table: List) -> List[Dict]:
|
|
||||||
"""go through all data objects and combine values between data separators"""
|
|
||||||
consolidated_rows = []
|
|
||||||
current_obj = dict.fromkeys([key for key in table[0]], '')
|
|
||||||
for item in table:
|
|
||||||
if not item:
|
|
||||||
consolidated_rows.append(current_obj)
|
|
||||||
current_obj = dict.fromkeys([key for key in table[0]], '')
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
for k, v in item.items():
|
|
||||||
if v:
|
|
||||||
if not current_obj[k]:
|
|
||||||
current_obj[k] = v
|
|
||||||
else:
|
|
||||||
current_obj[k] = current_obj[k] + '\n' + v
|
|
||||||
|
|
||||||
return consolidated_rows
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_pretty(string: str) -> List:
|
def _parse_pretty(string: str) -> List:
|
||||||
string_lines = string.splitlines()
|
string_lines: List[str] = string.splitlines()
|
||||||
sep = '~~~'
|
clean: List[Tuple[int, List[str]]] = _normalize_rows(string_lines)
|
||||||
data_sep = '==='
|
raw_headers: List[List[str]] = _get_headers(clean)
|
||||||
separator = ' ' + sep + ' '
|
raw_data: List[List[List[str]]] = _get_data(clean)
|
||||||
data_separator = ' ' + data_sep + ' '
|
|
||||||
|
|
||||||
clean: Generator = _pretty_set_separators(string_lines, separator)
|
new_headers: List[str] = _collapse_headers(raw_headers)
|
||||||
normalized: Generator = _pretty_normalize_rows(clean, separator, data_separator)
|
new_data: List[List[str]] = _collapse_data(raw_data)
|
||||||
raw_table: List[Dict] = _pretty_table_parse(normalized) # was sparse_table_parse()
|
final_table: List[Dict[str, str]] = _create_table_dict(new_headers, new_data)
|
||||||
new_keynames: Dict = _pretty_map_new_keynames(raw_table, sep)
|
|
||||||
data_table: List[Optional[Dict]] = _pretty_remove_header_rows(raw_table, sep, data_sep)
|
|
||||||
table_with_renamed_keys: List[Optional[Dict]] = _pretty_rename_keys(data_table, new_keynames)
|
|
||||||
final_table: List[Dict] = _pretty_consolidate_rows(table_with_renamed_keys)
|
|
||||||
|
|
||||||
return final_table
|
return final_table
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user