1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-19 00:17:51 +02:00
Files
jc/jc/parsers/asciitable_m.py
2022-03-30 14:39:56 -07:00

462 lines
16 KiB
Python

"""jc - JSON Convert `asciitable-m` parser
This parser converts various styles of ASCII and Unicode text tables with
multi-line rows. Tables must have a header row and separator line between
rows.
For example:
╒══════════╤═════════╤════════╕
│ foo │ bar baz │ fiz │
│ │ │ buz │
╞══════════╪═════════╪════════╡
│ good day │ 12345 │ │
│ mate │ │ │
├──────────┼─────────┼────────┤
│ hi there │ abc def │ 3.14 │
│ │ │ │
╘══════════╧═════════╧════════╛
Cells with multiple lines within rows will be joined with a newline
character ('\\n').
Headers (keys) are converted to snake-case and newlines between multi-line
headers are joined with an underscore. All values are returned as strings,
except empty strings, which are converted to None/null.
Usage (cli):
$ cat table.txt | jc --asciitable-m
Usage (module):
import jc
result = jc.parse('asciitable_m', asciitable-string)
Schema:
[
{
"column_name1": string, # empty string is null
"column_name2": string # empty string is null
}
]
Examples:
$ echo '
> +----------+---------+--------+
> | foo | bar | baz |
> | | | buz |
> +==========+=========+========+
> | good day | 12345 | |
> | mate | | |
> +----------+---------+--------+
> | hi there | abc def | 3.14 |
> | | | |
> +==========+=========+========+' | jc --asciitable-m -p
[
{
"foo": "good day\\nmate",
"bar": "12345",
"baz_buz": null
},
{
"foo": "hi there",
"bar": "abc def",
"baz_buz": "3.14"
}
]
$ echo '
> ╒══════════╤═════════╤════════╕
> │ foo │ bar │ baz │
> │ │ │ buz │
> ╞══════════╪═════════╪════════╡
> │ good day │ 12345 │ │
> │ mate │ │ │
> ├──────────┼─────────┼────────┤
> │ hi there │ abc def │ 3.14 │
> │ │ │ │
> ╘══════════╧═════════╧════════╛' | jc --asciitable-m -p
[
{
"foo": "good day\\nmate",
"bar": "12345",
"baz_buz": null
},
{
"foo": "hi there",
"bar": "abc def",
"baz_buz": "3.14"
}
]
"""
import re
from functools import lru_cache
from typing import Iterable, Tuple, List, Dict, Optional
import jc.utils
from jc.exceptions import ParseError
class info():
    """Provides parser metadata (version, author, etc.)"""
    # version of this parser (independent of the jc package version)
    version = '1.0'
    description = 'multi-line ASCII and Unicode table parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    # platforms this parser is expected to work on
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
__version__ = info.version  # exposed at module level per jc parser convention
def _process(proc_data: List[Dict]) -> List[Dict]:
"""
Final processing to conform to the schema.
Parameters:
proc_data: (List of Dictionaries) raw structured data to process
Returns:
List of Dictionaries. Structured to conform to the schema.
"""
return proc_data
def _remove_ansi(string: str) -> str:
ansi_escape = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]')
return ansi_escape.sub('', string)
def _lstrip(string: str) -> str:
"""find the leftmost non-whitespace character and lstrip to that index"""
lstrip_list = [x for x in string.splitlines() if not len(x.strip()) == 0]
start_points = (len(x) - len(x.lstrip()) for x in lstrip_list)
min_point = min(start_points)
new_lstrip_list = (x[min_point:] for x in lstrip_list)
return '\n'.join(new_lstrip_list)
def _rstrip(string: str) -> str:
"""find the rightmost non-whitespace character and rstrip and pad to that index"""
rstrip_list = [x for x in string.splitlines() if not len(x.strip()) == 0]
end_points = (len(x.rstrip()) for x in rstrip_list)
max_point = max(end_points)
new_rstrip_list = ((x + ' ' * max_point)[:max_point] for x in rstrip_list)
return '\n'.join(new_rstrip_list)
def _strip(string: str) -> str:
    """Normalize the table block: left-align, then right-pad/trim all lines."""
    return _rstrip(_lstrip(string))
def _table_sniff(string: str) -> str:
"""find the table-type via heuristics"""
# pretty tables
for line in string.splitlines():
line = line.strip()
if any((
line.startswith('') and line.endswith(''),
line.startswith('') and line.endswith(''),
line.startswith('') and line.endswith(''),
line.startswith('') and line.endswith(''),
line.startswith('') and line.endswith(''),
line.startswith('') and line.endswith(''),
line.startswith('') and line.endswith(''),
line.startswith('') and line.endswith(''),
line.startswith('') and line.endswith(''),
line.startswith('') and line.endswith(''),
line.startswith('') and line.endswith(''),
line.startswith('+=') and line.endswith('=+'),
line.startswith('+-') and line.endswith('-+')
)):
return 'pretty'
# markdown tables
second_line = string.splitlines()[1]
if second_line.startswith('|-') and second_line.endswith('-|'):
return 'markdown'
# simple tables
return 'simple'
@lru_cache(maxsize=32)
def _is_separator(line: str) -> bool:
"""returns true if a table separator line is found"""
# This function is cacheable since tables have identical separators
strip_line = line.strip()
if any((
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('') and strip_line.endswith(''),
strip_line.startswith('+=') and strip_line.endswith('=+'),
strip_line.startswith('+-') and strip_line.endswith('-+')
)):
return True
return False
def _snake_case(line: str) -> str:
"""
replace spaces between words and special characters with an underscore
and set to lowercase
"""
# must include all column separator characters in regex
line = re.sub(r'[^a-zA-Z0-9 |│┃┆┇┊┋╎╏║]', '_', line)
return re.sub(r'\b \b', '_', line).lower()
def _fixup_separators(line: str) -> str:
"""normalize separators, and remove first and last separators"""
# normalize separator
line = line.replace('', '|')\
.replace('', '|')\
.replace('', '|')\
.replace('', '|')\
.replace('', '|')\
.replace('', '|')\
.replace('', '|')\
.replace('', '|')\
.replace('', '|')
# remove first separator if it is the first char in the line
if line[0] == '|':
line = line.replace('|', ' ', 1)
# remove last separator if it is the last char in the line
if line[-1] == '|':
line = line[::-1].replace('|', ' ', 1)[::-1]
return line
def _normalize_rows(table_lines: Iterable[str]) -> List[Tuple[int, List[str]]]:
    """
    return a List of tuples of row-counters and data lines.

    Row counter 0 is the header; each separator row after the header
    starts a new data row. Header lines are snake_cased; all lines are
    split into stripped cell strings.
    """
    result: List[Tuple[int, List[str]]] = []
    header_found = False
    data_found = False
    row_counter = 0

    for raw_line in table_lines:
        # blank lines are ignored entirely
        if not raw_line.strip():
            continue

        if _is_separator(raw_line):
            # separators above the header are the top table frame and are
            # skipped; every separator after the header advances the row
            if header_found:
                data_found = True
                row_counter += 1
            continue

        # content line: rows before the first post-header separator are
        # header rows and get snake_cased; later rows are data rows
        line = raw_line
        if not data_found:
            header_found = True
            line = _snake_case(line)

        line = _fixup_separators(line)
        cells = [cell.strip() for cell in line.split('|')]
        result.append((row_counter, cells))

    return result
def _get_headers(table: Iterable[Tuple[int, List]]) -> List[List[str]]:
"""
return a list of all of the header rows (which are lists of strings.
[ # headers
['str', 'str', 'str'], # header rows
['str', 'str', 'str']
]
"""
result = []
for row_num, line in table:
if row_num == 0:
result.append(line)
return result
def _get_data(table: Iterable[Tuple[int, List]]) -> List[List[List[str]]]:
"""
return a list of rows, which are lists made up of lists of strings:
[ # data
[ # data rows
['str', 'str', 'str'], # data lines
['str', 'str', 'str']
]
]
"""
result: List[List[List[str]]] = []
current_row = 1
this_line: List[List[str]] = []
for row_num, line in table:
if row_num != 0:
if row_num != current_row:
result.append(this_line)
current_row = row_num
this_line = []
this_line.append(line)
if this_line:
result.append(this_line)
return result
def _collapse_headers(table: List[List[str]]) -> List[str]:
"""append each column string to return the full header list"""
result = table[0]
for line in table[1:]:
new_line: List[str] = []
for i, header in enumerate(line):
if header:
new_header = result[i] + '_' + header
# remove consecutive underscores
new_header = re.sub(r'__+', '_', new_header)
new_line.append(new_header)
else:
new_line.append(result[i])
result = new_line
return result
def _collapse_data(table: List[List[List[str]]]) -> List[List[str]]:
"""combine data rows to return a simple list of lists"""
result: List[List[str]] = []
for row in table:
new_row: List[str] = []
for line in row:
if new_row:
for i, item in enumerate(line):
new_row[i] = (new_row[i] + '\n' + item).strip()
else:
new_row = line
result.append(new_row)
return result
def _create_table_dict(header: List[str], data: List[List[str]]) -> List[Dict[str, Optional[str]]]:
"""
zip the headers and data to create a list of dictionaries. Also convert
empty strings to None.
"""
table_list_dict: List[Dict[str, Optional[str]]] = [dict(zip(header, r)) for r in data]
for row in table_list_dict:
for k, v in row.items():
if v == '':
row[k] = None
return table_list_dict
def _parse_pretty(string: str) -> List[Dict[str, Optional[str]]]:
    """Parse a 'pretty' (framed) table string into a list of dictionaries."""
    # normalize each line into (row_counter, cells) tuples, then collapse
    # the header rows and data rows separately before zipping them up
    normalized = _normalize_rows(string.splitlines())
    headers = _collapse_headers(_get_headers(normalized))
    rows = _collapse_data(_get_data(normalized))
    return _create_table_dict(headers, rows)
def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
) -> List[Dict]:
    """
    Main text parsing function

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    raw_output: List = []

    if jc.utils.has_data(data):
        # strip ANSI codes and normalize the block's edges before sniffing
        cleaned = _strip(_remove_ansi(data))
        table_type = _table_sniff(cleaned)

        # only framed ('pretty') tables carry the separator rows this
        # multi-line parser requires; point users elsewhere otherwise
        if table_type == 'markdown':
            raise ParseError('Only "pretty" tables supported with multiline. "markdown" table detected. Please try the "asciitable" parser.')

        if table_type != 'pretty':
            raise ParseError('Only "pretty" tables supported with multiline. "simple" table detected. Please try the "asciitable" parser.')

        raw_output = _parse_pretty(cleaned)

    return raw_output if raw else _process(raw_output)