"""jc - JSON Convert `asciitable-m` parser
This parser converts ASCII and Unicode text tables with multi-line rows.
Tables must have some sort of separator line between rows.
For example:
foo bar baz fiz
buz
good day 12345
mate
hi there abc def 3.14
Usage (cli):
$ cat table.txt | jc --asciitable-m
Usage (module):
import jc
result = jc.parse('asciitable_m', asciitable-string)
Schema:
[
{
"asciitable-m": string,
"bar": boolean,
"baz": integer
}
]
Examples:
$ asciitable-m | jc --asciitable-m -p
[]
$ asciitable-m | jc --asciitable-m -p -r
[]
"""
import re
from typing import Iterable, List, Dict, Optional, Generator
import jc.utils
from jc.exceptions import ParseError
class info():
    """Provides parser metadata (version, author, etc.)"""
    # version of this parser (independent of the jc package version)
    version = '1.0'
    description = 'multi-line ASCII and Unicode table parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    # platforms this parser is compatible with
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
__version__ = info.version
def _process(proc_data: List[Dict]) -> List[Dict]:
"""
Final processing to conform to the schema.
Parameters:
proc_data: (List of Dictionaries) raw structured data to process
Returns:
List of Dictionaries. Structured to conform to the schema.
"""
# remove newlines from values
# for item in proc_data:
# for k, v in item.items():
# item[k] = v.replace('\n', '')
return proc_data
def _remove_ansi(string: str) -> str:
ansi_escape =re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]')
return ansi_escape.sub('', string)
def _lstrip(string: str) -> str:
"""find the leftmost non-whitespace character and lstrip to that index"""
lstrip_list = [x for x in string.splitlines() if not len(x.strip()) == 0]
start_points = (len(x) - len(x.lstrip()) for x in lstrip_list)
min_point = min(start_points)
new_lstrip_list = (x[min_point:] for x in lstrip_list)
return '\n'.join(new_lstrip_list)
def _table_sniff(string: str) -> str:
"""Find the table-type via heuristics"""
# pretty tables
for line in string.splitlines():
line = line.strip()
if line.startswith('╞═') and line.endswith('═╡')\
or line.startswith('├─') and line.endswith('─┤')\
or line.startswith('+=') and line.endswith('=+')\
or line.startswith('+-') and line.endswith('-+'):
return 'pretty'
# markdown tables
second_line = string.splitlines()[1]
if second_line.startswith('|-') and second_line.endswith('-|'):
return 'markdown'
# simple tables
return 'simple'
def _pretty_set_separators(table_lines: Iterable, separator: str) -> Generator[str, None, None]:
"""Return a generator that yields rows standardized separators"""
for line in table_lines:
strip_line = line.strip()
# skip any blanks
if not strip_line:
continue
# yield row separators as a sentinel string
if strip_line.startswith('╒═') and strip_line.endswith('═╕')\
or strip_line.startswith('╞═') and strip_line.endswith('═╡')\
or strip_line.startswith('╘═') and strip_line.endswith('═╛')\
or strip_line.startswith('┌─') and strip_line.endswith('─┐')\
or strip_line.startswith('├─') and strip_line.endswith('─┤')\
or strip_line.startswith('└─') and strip_line.endswith('─┘')\
or strip_line.startswith('+=') and strip_line.endswith('=+')\
or strip_line.startswith('+-') and strip_line.endswith('-+'):
yield separator
continue
# remove the table column separator characters and yield the line
2022-03-18 16:53:23 -07:00
# line = line.replace('|', ' ').replace('│', ' ')
2022-03-18 13:05:57 -07:00
yield line
def _pretty_normalize_rows(table_lines: Iterable,
separator: str,
data_separator: str) -> Generator[str, None, None]:
"""
Return a generator that yields header and data rows with different separators.
Also removes spaces from headers.
"""
header_found = False
data_found = False
# Removes initial table lines, finds the header row(s) and separates
# the header from the data rows with different separator characters.
for i in table_lines:
if separator in i and not header_found and not data_found:
# top table frame
continue
if not separator in i and not header_found and not data_found:
header_found = True
# first header data found
# remove spaces from header
i = re.sub(r'\b \b', '_', i)
yield i
continue
if not separator in i and header_found and not data_found:
# subsequent header data found
# remove spaces from header
i = re.sub(r'\b \b', '_', i)
yield i
continue
if separator in i and header_found and not data_found:
data_found = True
# table separator found - this is a header separator
yield separator
continue
if not separator in i and header_found and data_found:
# subsequent data row found
yield i
continue
if separator in i and header_found and data_found:
# table separator found - this is a data separator
yield data_separator
continue
def _pretty_table_parse(table: Iterable) -> List[Dict]:
temp_table = []
for line in table:
# normalize separator
line = line.replace('', '|')
# remove first separator if it is the first char in the line
if line[0] == '|':
line = line.replace('|', ' ', 1)
# remove last separator if it is the last char in the line
if line[-1] == '|':
line = line[::-1].replace('|', ' ', 1)[::-1]
temp_table.append([x.strip() for x in line.split('|')])
headers = temp_table[0]
raw_data = temp_table[1:]
result = [dict.fromkeys(headers, None)]
result.extend([dict(zip(headers, r)) for r in raw_data])
return result
def _pretty_remove_header_rows(table: List[Dict], sep: str, data_sep: str) -> List[Optional[Dict]]:
    """
    Return a table with only data rows.

    Rows before the first `sep` (header separator) row are dropped. In the
    result, `data_sep` (data separator) rows are replaced with None
    sentinels so row boundaries survive for later consolidation.
    """
    # create a new list of row objects with new key names
    data_obj_list: List[Optional[Dict]] = []
    sep_found = False
    data_sep_found = False
    for obj in table:
        # skip to data: flip sep_found once any cell value equals the
        # header separator sentinel
        for v in obj.values():
            if not sep_found and not str(v).strip() == sep:
                continue
            if not sep_found and str(v).strip() == sep:
                sep_found = True
                continue
        # append data row objects or None for separators
        if sep_found:
            # a row is a data separator if any of its cells equals data_sep
            for k, v in obj.items():
                if str(v).strip() == data_sep:
                    data_sep_found = True
                    break
                else:
                    data_sep_found = False
            if data_sep_found:
                data_obj_list.append(None)
            else:
                data_obj_list.append(obj)
    # remove first item, which is the header separator row itself
    # (it was appended in the same iteration that set sep_found)
    return data_obj_list[1:]
def _pretty_map_new_keynames(table: List[Dict], sep: str) -> Dict:
"""
returns a dict of old keyname to new keyname mappings by consolidating
multiline keynames from the input list of dictionaries.
"""
# first get all header objects to find full keynames. Stop when data rows are found.
header_obj_list = []
sep_found = False
for obj in table:
for v in obj.values():
if str(v).strip() == sep:
sep_found = True
break
if sep_found:
break
header_obj_list.append(obj)
if not header_obj_list:
header_obj_list = [{key: None for key in table[0]}]
# create an old-key to new-key name mapping dict
new_keynames_dict = dict.fromkeys([key for key in header_obj_list[0]], '')
for item in new_keynames_dict:
new_keynames_dict[item] = item
for obj in header_obj_list:
for k, v in obj.items():
if v:
new_keynames_dict[k] = new_keynames_dict[k] + '_' + v
# normalize keynames so they are lowercase, no spaces, and no redundat '_'s
for k, v in new_keynames_dict.items():
new_keynames_dict[k] = v.replace(' ', '_').lower()
new_keynames_dict[k] = re.sub(r'__+', '_', v)
return new_keynames_dict
def _pretty_rename_keys(table: List, new_keynames: Dict) -> List[Optional[Dict]]:
"""rename all of the keys in the table based on the new_keynames mapping"""
renamed_key_table: List[Optional[Dict]] = []
for item in table:
if item:
renamed_key_table.append({new_keynames[k]:v for k, v in item.items()})
else:
renamed_key_table.append(None)
return renamed_key_table
def _pretty_consolidate_rows(table: List) -> List[Dict]:
"""go through all data objects and combine values between data separators"""
consolidated_rows = []
current_obj = dict.fromkeys([key for key in table[0]], '')
for item in table:
if not item:
consolidated_rows.append(current_obj)
current_obj = dict.fromkeys([key for key in table[0]], '')
continue
else:
for k, v in item.items():
if v:
if not current_obj[k]:
current_obj[k] = v
else:
current_obj[k] = current_obj[k] + '\n' + v
return consolidated_rows
def _parse_pretty(string: str) -> List:
    """Parse a 'pretty' table string into a list of row dictionaries."""
    sep = '~~~'
    data_sep = '==='
    separator = ' ' + sep + ' '
    data_separator = ' ' + data_sep + ' '
    lines = string.splitlines()

    # pipeline: standardize separators -> split header/data separators ->
    # parse pipe-delimited rows -> build key-name mapping -> drop header
    # rows -> rename keys -> merge multi-line rows
    clean: Generator = _pretty_set_separators(lines, separator)
    normalized: Generator = _pretty_normalize_rows(clean, separator, data_separator)
    raw_table: List[Dict] = _pretty_table_parse(normalized)
    new_keynames: Dict = _pretty_map_new_keynames(raw_table, sep)
    data_table: List[Optional[Dict]] = _pretty_remove_header_rows(raw_table, sep, data_sep)
    renamed: List[Optional[Dict]] = _pretty_rename_keys(data_table, new_keynames)
    return _pretty_consolidate_rows(renamed)
def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
) -> List[Dict]:
    """
    Main text parsing function

    Parameters:

        data:    (string)  text data to parse
        raw:     (boolean) unprocessed output if True
        quiet:   (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    raw_output: List = []

    if jc.utils.has_data(data):
        # strip ANSI codes and common indentation before sniffing
        cleaned = _lstrip(_remove_ansi(data))
        table_type = _table_sniff(cleaned)

        if table_type == 'markdown':
            raise ParseError('Only "pretty" tables supported with multiline. "markdown" table detected. Please try the "asciitable" parser.')

        if table_type == 'simple':
            raise ParseError('Only "pretty" tables supported with multiline. "simple" table detected. Please try the "asciitable" parser.')

        raw_output = _parse_pretty(cleaned)

    return raw_output if raw else _process(raw_output)