1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-25 00:37:31 +02:00
Files
jc/jc/parsers/asciitable_m.py

407 lines
12 KiB
Python
Raw Normal View History

2022-03-18 13:05:57 -07:00
"""jc - JSON Convert `asciitable-m` parser
2022-03-22 07:05:14 -07:00
This parser converts various styles of ASCII and Unicode text tables with
multi-line rows. Tables must have a header row and separator line between
rows.
2022-03-18 13:05:57 -07:00
For example:
foo bar baz fiz
buz
good day 12345
mate
hi there abc def 3.14
2022-03-22 07:05:14 -07:00
Cells with multiple lines within rows will be joined with a new-line
character ('\n').
Headers (keys) are converted to snake case and newlines between multi-line
headers are joined with an underscore. All values are returned as strings.
2022-03-18 13:05:57 -07:00
Usage (cli):
$ cat table.txt | jc --asciitable-m
Usage (module):
import jc
result = jc.parse('asciitable_m', asciitable_string)
Schema:
[
{
2022-03-22 07:05:14 -07:00
"column_name1": string,
"column_name2": string
2022-03-18 13:05:57 -07:00
}
]
Examples:
2022-03-22 07:05:14 -07:00
$ echo '
> +----------+---------+--------+
> | foo | bar | baz |
> | | | buz |
> +==========+=========+========+
> | good day | 12345 | |
> | mate | | |
> +----------+---------+--------+
> | hi there | abc def | 3.14 |
> | | | |
> +==========+=========+========+' | jc --asciitable-m -p
[
{
"foo": "good day\nmate",
"bar": "12345",
"baz_buz": ""
},
{
"foo": "hi there",
"bar": "abc def",
"baz_buz": "3.14"
}
]
2022-03-18 13:05:57 -07:00
2022-03-22 07:05:14 -07:00
$ echo '
>
> foo bar baz
> buz
>
> good day 12345
> mate
>
> hi there abc def 3.14
>
> ' | jc --asciitable-m -p
[
{
"foo": "good day\nmate",
"bar": "12345",
"baz_buz": ""
},
{
"foo": "hi there",
"bar": "abc def",
"baz_buz": "3.14"
}
]
2022-03-18 13:05:57 -07:00
"""
import re
2022-03-21 13:06:34 -07:00
from typing import Iterable, Tuple, List, Dict
2022-03-18 13:05:57 -07:00
import jc.utils
from jc.exceptions import ParseError
class info:
    """Static metadata for this parser: version, description, author and platforms."""
    version = '1.0'
    description = 'multi-line ASCII and Unicode table parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    # platform identifiers this parser declares itself compatible with
    compatible = [
        'linux',
        'darwin',
        'cygwin',
        'win32',
        'aix',
        'freebsd',
    ]


# module-level version mirrors the parser metadata
__version__ = info.version
def _process(proc_data: List[Dict]) -> List[Dict]:
"""
Final processing to conform to the schema.
Parameters:
proc_data: (List of Dictionaries) raw structured data to process
Returns:
List of Dictionaries. Structured to conform to the schema.
"""
return proc_data
def _remove_ansi(string: str) -> str:
2022-03-21 13:09:50 -07:00
ansi_escape = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]')
2022-03-18 13:05:57 -07:00
return ansi_escape.sub('', string)
def _lstrip(string: str) -> str:
"""find the leftmost non-whitespace character and lstrip to that index"""
lstrip_list = [x for x in string.splitlines() if not len(x.strip()) == 0]
start_points = (len(x) - len(x.lstrip()) for x in lstrip_list)
min_point = min(start_points)
new_lstrip_list = (x[min_point:] for x in lstrip_list)
return '\n'.join(new_lstrip_list)
2022-03-21 17:57:14 -07:00
def _rstrip(string: str) -> str:
2022-03-22 07:05:14 -07:00
"""find the rightmost non-whitespace character and rstrip and pad to that index"""
2022-03-21 17:57:14 -07:00
rstrip_list = [x for x in string.splitlines() if not len(x.strip()) == 0]
end_points = (len(x.rstrip()) for x in rstrip_list)
max_point = max(end_points)
2022-03-21 19:10:02 -07:00
new_rstrip_list = ((x + ' ' * max_point)[:max_point] for x in rstrip_list)
2022-03-21 17:57:14 -07:00
return '\n'.join(new_rstrip_list)
def _strip(string: str) -> str:
    """Apply `_lstrip` and then `_rstrip` to `string` and return the result."""
    return _rstrip(_lstrip(string))
2022-03-22 06:30:07 -07:00
2022-03-18 13:05:57 -07:00
def _table_sniff(string: str) -> str:
"""Find the table-type via heuristics"""
# pretty tables
for line in string.splitlines():
line = line.strip()
2022-03-22 07:05:14 -07:00
if any((
line.startswith('╞═') and line.endswith('═╡'),
line.startswith('├─') and line.endswith('─┤'),
line.startswith('+=') and line.endswith('=+'),
line.startswith('+-') and line.endswith('-+')
)):
2022-03-18 13:05:57 -07:00
return 'pretty'
# markdown tables
second_line = string.splitlines()[1]
if second_line.startswith('|-') and second_line.endswith('-|'):
return 'markdown'
# simple tables
return 'simple'
2022-03-21 13:06:34 -07:00
def _is_separator(line: str) -> bool:
"""Returns true if a table separator line is found"""
strip_line = line.strip()
2022-03-22 06:30:07 -07:00
if any((
strip_line.startswith('╒═') and strip_line.endswith('═╕'),
strip_line.startswith('╞═') and strip_line.endswith('═╡'),
strip_line.startswith('╘═') and strip_line.endswith('═╛'),
strip_line.startswith('┌─') and strip_line.endswith('─┐'),
strip_line.startswith('├─') and strip_line.endswith('─┤'),
strip_line.startswith('└─') and strip_line.endswith('─┘'),
strip_line.startswith('+=') and strip_line.endswith('=+'),
strip_line.startswith('+-') and strip_line.endswith('-+')
)):
2022-03-21 13:06:34 -07:00
return True
return False
2022-03-18 13:05:57 -07:00
2022-03-21 13:06:34 -07:00
def _snake_case(line: str) -> str:
2022-03-21 13:27:44 -07:00
"""replace spaces between words with an underscore and set to lowercase"""
2022-03-21 13:06:34 -07:00
return re.sub(r'\b \b', '_', line).lower()
2022-03-18 13:05:57 -07:00
2022-03-21 13:06:34 -07:00
def _fixup_separators(line: str) -> str:
"""Normalize separators, and remove first and last separators"""
# normalize separator
line = line.replace('', '|')
2022-03-18 13:05:57 -07:00
2022-03-21 13:06:34 -07:00
# remove first separator if it is the first char in the line
if line[0] == '|':
line = line.replace('|', ' ', 1)
2022-03-18 13:05:57 -07:00
2022-03-21 13:06:34 -07:00
# remove last separator if it is the last char in the line
if line[-1] == '|':
line = line[::-1].replace('|', ' ', 1)[::-1]
return line
def _normalize_rows(table_lines: Iterable[str]) -> List[Tuple[int, List[str]]]:
    """
    Tag every content line of the table with the number of the row it
    belongs to and split it into stripped cells.

    Blank lines are skipped. Separator lines are never emitted; they only
    advance the row counter once a header line has been seen. Header lines
    therefore carry row number 0 and are snake-cased before splitting; lines
    after the first post-header separator are data lines.

    Returns:

        List of (row number, list of cell strings) tuples.
    """
    rows: List[Tuple[int, List[str]]] = []
    seen_header = False
    in_data = False
    row_number = 0

    for line in table_lines:
        # skip blank lines
        if not line.strip():
            continue

        if _is_separator(line):
            # a separator before any header line is the top table frame and
            # is ignored; every later one closes the current row
            if seen_header:
                in_data = True
                row_number += 1
            continue

        if not in_data:
            # still in the header section: only header text is snake-cased
            seen_header = True
            line = _snake_case(line)

        cells = [cell.strip() for cell in _fixup_separators(line).split('|')]
        rows.append((row_number, cells))

    return rows
2022-03-18 13:05:57 -07:00
2022-03-21 13:06:34 -07:00
def _get_headers(table: Iterable[Tuple[int, List]]) -> List[List[str]]:
"""
return a list of all of the header rows (which are lists of strings.
[ # headers
['str', 'str', 'str'], # header rows
['str', 'str', 'str']
]
"""
result = []
for row_num, line in table:
if row_num == 0:
result.append(line)
return result
2022-03-18 16:53:23 -07:00
2022-03-21 13:06:34 -07:00
def _get_data(table: Iterable[Tuple[int, List]]) -> List[List[List[str]]]:
"""
return a list of rows, which are lists made up of lists of strings:
[ # data
2022-03-21 13:33:35 -07:00
[ # data rows
['str', 'str', 'str'], # data lines
['str', 'str', 'str']
2022-03-21 13:06:34 -07:00
]
]
"""
result: List[List[List[str]]] = []
current_row = 1
this_line: List[List[str]] = []
for row_num, line in table:
if row_num != 0:
if row_num != current_row:
result.append(this_line)
current_row = row_num
this_line = []
this_line.append(line)
2022-03-18 16:53:23 -07:00
2022-03-21 13:06:34 -07:00
if this_line:
result.append(this_line)
2022-03-18 16:53:23 -07:00
return result
2022-03-21 13:06:34 -07:00
def _collapse_headers(table: List[List[str]]) -> List[str]:
"""append each column string to return the full header list"""
result = table[0]
for line in table[1:]:
new_line: List[str] = []
for i, header in enumerate(line):
if header:
new_header = result[i] + '_' + header
new_header = re.sub(r'__+', '_', new_header)
new_line.append(new_header)
2022-03-18 13:05:57 -07:00
else:
2022-03-21 13:06:34 -07:00
new_line.append(result[i])
result = new_line
2022-03-18 13:05:57 -07:00
2022-03-21 13:06:34 -07:00
return result
2022-03-18 13:05:57 -07:00
2022-03-21 13:06:34 -07:00
def _collapse_data(table: List[List[List[str]]]) -> List[List[str]]:
"""combine data rows to return a simple list of lists"""
result: List[List[str]] = []
for row in table:
new_row: List[str] = []
for line in row:
if new_row:
for i, item in enumerate(line):
new_row[i] = (new_row[i] + '\n' + item).strip()
else:
new_row = line
result.append(new_row)
return result
2022-03-18 13:05:57 -07:00
2022-03-21 13:06:34 -07:00
def _create_table_dict(header: List[str], data: List[List[str]]) -> List[Dict[str, str]]:
return [dict(zip(header, r)) for r in data]
2022-03-18 13:05:57 -07:00
2022-03-21 13:36:54 -07:00
def _parse_pretty(string: str) -> List[Dict[str, str]]:
    """
    Parse a "pretty" (boxed) table into a list of dictionaries.

    The text is split into lines and tagged with row numbers, the header
    lines are merged into column names, the lines of each data row are
    merged into cell values, and the two are zipped into dictionaries.
    """
    rows: List[Tuple[int, List[str]]] = _normalize_rows(string.splitlines())
    headers: List[str] = _collapse_headers(_get_headers(rows))
    data: List[List[str]] = _collapse_data(_get_data(rows))
    return _create_table_dict(headers, data)
def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
) -> List[Dict]:
    """
    Main text parsing function

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.

    Raises:

        ParseError when the detected table type is not "pretty".
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    raw_output: List = []

    if jc.utils.has_data(data):
        # clean up the text before sniffing: drop ANSI escapes, then
        # remove the common margins
        cleaned = _strip(_remove_ansi(data))
        table_type = _table_sniff(cleaned)

        # only the "pretty" style is supported for multi-line rows
        if table_type == 'markdown':
            raise ParseError('Only "pretty" tables supported with multiline. "markdown" table detected. Please try the "asciitable" parser.')

        if table_type != 'pretty':
            raise ParseError('Only "pretty" tables supported with multiline. "simple" table detected. Please try the "asciitable" parser.')

        raw_output = _parse_pretty(cleaned)

    if raw:
        return raw_output

    return _process(raw_output)