2022-03-22 12:25:24 -07:00
|
|
|
"""jc - JSON Convert `asciitable` parser
|
|
|
|
|
|
|
|
This parser converts ASCII and Unicode text tables with single-line rows.
|
|
|
|
|
|
|
|
Column headers must be at least two spaces apart from each other and must
|
2022-05-26 11:37:16 -07:00
|
|
|
be unique. For best results, column headers should be left-justified. If
|
|
|
|
column separators are present, then non-left-justified headers will be fixed
|
|
|
|
automatically.
|
|
|
|
|
|
|
|
Row separators are optional and are ignored. Each non-row-separator line is
|
|
|
|
considered a separate row in the table.
|
2022-03-22 12:25:24 -07:00
|
|
|
|
|
|
|
For example:
|
|
|
|
|
|
|
|
╒══════════╤═════════╤════════╕
|
|
|
|
│ foo │ bar │ baz │
|
|
|
|
╞══════════╪═════════╪════════╡
|
|
|
|
│ good day │ │ 12345 │
|
|
|
|
├──────────┼─────────┼────────┤
|
|
|
|
│ hi there │ abc def │ 3.14 │
|
|
|
|
╘══════════╧═════════╧════════╛
|
|
|
|
|
2022-03-29 09:35:54 -07:00
|
|
|
or
|
2022-03-22 12:25:24 -07:00
|
|
|
|
|
|
|
+-----------------------------+
|
|
|
|
| foo bar baz |
|
|
|
|
+-----------------------------+
|
|
|
|
| good day 12345 |
|
|
|
|
| hi there abc def 3.14 |
|
|
|
|
+-----------------------------+
|
|
|
|
|
2022-03-29 09:35:54 -07:00
|
|
|
or
|
2022-03-22 12:25:24 -07:00
|
|
|
|
|
|
|
| foo | bar | baz |
|
|
|
|
|----------|---------|--------|
|
|
|
|
| good day | | 12345 |
|
|
|
|
| hi there | abc def | 3.14 |
|
|
|
|
|
2022-03-29 09:35:54 -07:00
|
|
|
or
|
2022-03-22 12:25:24 -07:00
|
|
|
|
|
|
|
foo bar baz
|
|
|
|
--------- -------- ------
|
|
|
|
good day 12345
|
2022-03-22 12:42:07 -07:00
|
|
|
hi there abc def 3.14
|
2022-03-22 12:25:24 -07:00
|
|
|
|
2022-03-29 09:58:44 -07:00
|
|
|
or
|
2022-03-24 11:58:13 -07:00
|
|
|
|
|
|
|
foo bar baz
|
|
|
|
good day 12345
|
|
|
|
hi there abc def 3.14
|
|
|
|
|
2022-03-29 09:35:54 -07:00
|
|
|
etc...
|
2022-03-22 12:25:24 -07:00
|
|
|
|
2022-03-29 09:35:54 -07:00
|
|
|
Headers (keys) are converted to snake-case. All values are returned as
|
|
|
|
strings, except empty strings, which are converted to None/null.
|
2022-03-22 13:21:10 -07:00
|
|
|
|
2022-06-15 11:12:43 -07:00
|
|
|
> Note: To preserve the case of the keys use the `-r` cli option or
|
|
|
|
> `raw=True` argument in `parse()`.
|
|
|
|
|
2022-03-22 12:25:24 -07:00
|
|
|
Usage (cli):
|
|
|
|
|
|
|
|
$ cat table.txt | jc --asciitable
|
|
|
|
|
|
|
|
Usage (module):
|
|
|
|
|
|
|
|
import jc
|
|
|
|
result = jc.parse('asciitable', asciitable_string)
|
|
|
|
|
|
|
|
Schema:
|
|
|
|
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"column_name1": string, # empty string is null
|
|
|
|
"column_name2": string # empty string is null
|
|
|
|
}
|
|
|
|
]
|
|
|
|
|
|
|
|
Examples:
|
|
|
|
|
2022-03-22 12:35:56 -07:00
|
|
|
$ echo '
|
|
|
|
> ╒══════════╤═════════╤════════╕
|
|
|
|
> │ foo │ bar │ baz │
|
|
|
|
> ╞══════════╪═════════╪════════╡
|
|
|
|
> │ good day │ │ 12345 │
|
|
|
|
> ├──────────┼─────────┼────────┤
|
|
|
|
> │ hi there │ abc def │ 3.14 │
|
|
|
|
> ╘══════════╧═════════╧════════╛' | jc --asciitable -p
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"foo": "good day",
|
|
|
|
"bar": null,
|
|
|
|
"baz": "12345"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"foo": "hi there",
|
|
|
|
"bar": "abc def",
|
|
|
|
"baz": "3.14"
|
|
|
|
}
|
|
|
|
]
|
|
|
|
|
|
|
|
$ echo '
|
|
|
|
> foo bar baz
|
|
|
|
> --------- -------- ------
|
|
|
|
> good day 12345
|
2022-03-22 12:42:07 -07:00
|
|
|
> hi there abc def 3.14' | jc --asciitable -p
|
2022-03-22 12:35:56 -07:00
|
|
|
[
|
|
|
|
{
|
|
|
|
"foo": "good day",
|
|
|
|
"bar": null,
|
|
|
|
"baz": "12345"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"foo": "hi there",
|
|
|
|
"bar": "abc def",
|
2022-03-22 12:42:07 -07:00
|
|
|
"baz": "3.14"
|
2022-03-22 12:35:56 -07:00
|
|
|
}
|
|
|
|
]
|
2022-03-22 12:25:24 -07:00
|
|
|
"""
|
|
|
|
import re
|
2022-03-24 11:58:13 -07:00
|
|
|
from functools import lru_cache
|
2022-03-22 12:25:24 -07:00
|
|
|
from typing import List, Dict
|
|
|
|
import jc.utils
|
|
|
|
from jc.parsers.universal import sparse_table_parse
|
|
|
|
|
|
|
|
|
|
|
|
class info:
    """Metadata for the `asciitable` parser (version, author, etc.)."""
    version = '1.2'
    description = 'ASCII and Unicode table parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    # platform identifiers this parser declares itself compatible with
    compatible = [
        'linux',
        'darwin',
        'cygwin',
        'win32',
        'aix',
        'freebsd',
    ]
    # category tags attached to this parser
    tags = ['generic', 'string']


# module-level alias of the parser version
__version__ = info.version
|
|
|
|
|
|
|
|
|
|
|
|
def _process(proc_data: List[Dict]) -> List[Dict]:
|
|
|
|
"""
|
|
|
|
Final processing to conform to the schema.
|
|
|
|
|
|
|
|
Parameters:
|
|
|
|
|
|
|
|
proc_data: (List of Dictionaries) raw structured data to process
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
|
|
List of Dictionaries. Structured to conform to the schema.
|
|
|
|
"""
|
2022-06-15 11:12:43 -07:00
|
|
|
# normalize keys: convert to lowercase
|
|
|
|
for item in proc_data:
|
|
|
|
for key in item.copy():
|
|
|
|
k_new = key.lower()
|
|
|
|
item[k_new] = item.pop(key)
|
|
|
|
|
2022-03-22 12:25:24 -07:00
|
|
|
return proc_data
|
|
|
|
|
|
|
|
|
|
|
|
def _remove_ansi(string: str) -> str:
|
|
|
|
ansi_escape = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]')
|
|
|
|
return ansi_escape.sub('', string)
|
|
|
|
|
|
|
|
|
|
|
|
def _lstrip(string: str) -> str:
|
|
|
|
"""find the leftmost non-whitespace character and lstrip to that index"""
|
|
|
|
lstrip_list = [x for x in string.splitlines() if not len(x.strip()) == 0]
|
|
|
|
start_points = (len(x) - len(x.lstrip()) for x in lstrip_list)
|
|
|
|
min_point = min(start_points)
|
|
|
|
new_lstrip_list = (x[min_point:] for x in lstrip_list)
|
|
|
|
return '\n'.join(new_lstrip_list)
|
|
|
|
|
|
|
|
|
|
|
|
def _rstrip(string: str) -> str:
|
|
|
|
"""find the rightmost non-whitespace character and rstrip and pad to that index"""
|
|
|
|
rstrip_list = [x for x in string.splitlines() if not len(x.strip()) == 0]
|
|
|
|
end_points = (len(x.rstrip()) for x in rstrip_list)
|
|
|
|
max_point = max(end_points)
|
|
|
|
new_rstrip_list = ((x + ' ' * max_point)[:max_point] for x in rstrip_list)
|
|
|
|
return '\n'.join(new_rstrip_list)
|
|
|
|
|
|
|
|
|
|
|
|
def _strip(string: str) -> str:
    """Apply `_lstrip` and then `_rstrip` to `string` and return the result."""
    return _rstrip(_lstrip(string))
|
|
|
|
|
2022-03-24 11:58:13 -07:00
|
|
|
@lru_cache(maxsize=32)
|
2022-03-22 12:25:24 -07:00
|
|
|
def _is_separator(line: str) -> bool:
|
2022-03-24 09:39:53 -07:00
|
|
|
"""returns true if a table separator line is found"""
|
2022-03-24 11:58:13 -07:00
|
|
|
# This function is cacheable since tables have identical separators
|
2022-03-22 12:25:24 -07:00
|
|
|
strip_line = line.strip()
|
|
|
|
if any((
|
2022-03-24 12:37:46 -07:00
|
|
|
strip_line.startswith('|-') and strip_line.endswith('-|'),
|
|
|
|
strip_line.startswith('━━') and strip_line.endswith('━━'),
|
|
|
|
strip_line.startswith('──') and strip_line.endswith('──'),
|
|
|
|
strip_line.startswith('┄┄') and strip_line.endswith('┄┄'),
|
|
|
|
strip_line.startswith('┅┅') and strip_line.endswith('┅┅'),
|
|
|
|
strip_line.startswith('┈┈') and strip_line.endswith('┈┈'),
|
|
|
|
strip_line.startswith('┉┉') and strip_line.endswith('┉┉'),
|
|
|
|
strip_line.startswith('══') and strip_line.endswith('══'),
|
|
|
|
strip_line.startswith('--') and strip_line.endswith('--'),
|
|
|
|
strip_line.startswith('==') and strip_line.endswith('=='),
|
|
|
|
strip_line.startswith('+=') and strip_line.endswith('=+'),
|
|
|
|
strip_line.startswith('+-') and strip_line.endswith('-+'),
|
2022-03-24 09:31:12 -07:00
|
|
|
strip_line.startswith('╒') and strip_line.endswith('╕'),
|
|
|
|
strip_line.startswith('╞') and strip_line.endswith('╡'),
|
|
|
|
strip_line.startswith('╘') and strip_line.endswith('╛'),
|
|
|
|
strip_line.startswith('┏') and strip_line.endswith('┓'),
|
|
|
|
strip_line.startswith('┣') and strip_line.endswith('┫'),
|
|
|
|
strip_line.startswith('┗') and strip_line.endswith('┛'),
|
|
|
|
strip_line.startswith('┡') and strip_line.endswith('┩'),
|
|
|
|
strip_line.startswith('┢') and strip_line.endswith('┪'),
|
|
|
|
strip_line.startswith('┟') and strip_line.endswith('┧'),
|
|
|
|
strip_line.startswith('┞') and strip_line.endswith('┦'),
|
|
|
|
strip_line.startswith('┠') and strip_line.endswith('┨'),
|
|
|
|
strip_line.startswith('┝') and strip_line.endswith('┥'),
|
|
|
|
strip_line.startswith('┍') and strip_line.endswith('┑'),
|
|
|
|
strip_line.startswith('┕') and strip_line.endswith('┙'),
|
|
|
|
strip_line.startswith('┎') and strip_line.endswith('┒'),
|
|
|
|
strip_line.startswith('┖') and strip_line.endswith('┚'),
|
|
|
|
strip_line.startswith('╓') and strip_line.endswith('╖'),
|
|
|
|
strip_line.startswith('╟') and strip_line.endswith('╢'),
|
|
|
|
strip_line.startswith('╙') and strip_line.endswith('╜'),
|
|
|
|
strip_line.startswith('╔') and strip_line.endswith('╗'),
|
|
|
|
strip_line.startswith('╠') and strip_line.endswith('╣'),
|
|
|
|
strip_line.startswith('╚') and strip_line.endswith('╝'),
|
|
|
|
strip_line.startswith('┌') and strip_line.endswith('┐'),
|
|
|
|
strip_line.startswith('├') and strip_line.endswith('┤'),
|
|
|
|
strip_line.startswith('└') and strip_line.endswith('┘'),
|
|
|
|
strip_line.startswith('╭') and strip_line.endswith('╮'),
|
2022-03-24 12:37:46 -07:00
|
|
|
strip_line.startswith('╰') and strip_line.endswith('╯')
|
2022-03-22 12:25:24 -07:00
|
|
|
)):
|
|
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
def _snake_case(line: str) -> str:
|
2022-03-23 15:08:33 -07:00
|
|
|
"""
|
2022-06-15 11:15:26 -07:00
|
|
|
Replace spaces between words and special characters with an underscore.
|
|
|
|
Ignore the replacement char (�) used for header padding.
|
2022-03-23 15:08:33 -07:00
|
|
|
"""
|
2022-05-26 08:57:35 -07:00
|
|
|
line = re.sub(r'[^a-zA-Z0-9� ]', '_', line) # special characters
|
2022-06-15 11:12:43 -07:00
|
|
|
line = re.sub(r'\b \b', '_', line) # spaces between words
|
2022-05-26 08:57:35 -07:00
|
|
|
return line
|
2022-03-22 12:25:24 -07:00
|
|
|
|
|
|
|
|
|
|
|
def _normalize_rows(table: str) -> List[str]:
    """
    Split table text into a list of row strings.

    Blank lines and separator lines are skipped. The first remaining line is
    treated as the header: its column separators are blanked out, headers
    that sit more than one space after a separator are front-padded with the
    replacement char (�), and the line is snake-cased. Every later line is a
    data row and only has its column separators replaced with spaces.
    """
    box_separators = '│┃┆┇┊┋╎╏║'
    to_pipe = str.maketrans(box_separators, '|' * len(box_separators))
    to_space = str.maketrans(
        '|' + box_separators, ' ' * (len(box_separators) + 1)
    )

    # captures the extra spaces and the header text that follow "| "
    padded_header = re.compile(r'(?:\| )( +)([^|]+)')

    rows: List[str] = []

    for line in table.splitlines():
        # blank lines and separator lines carry no row data
        if not line.strip() or _is_separator(line):
            continue

        if rows:
            # header already captured, so this is a data row
            rows.append(line.translate(to_space))
            continue

        # header row: normalize every column separator to '|' first
        header = line.translate(to_pipe)

        # Replace the spaces in front of headers that are too far away from
        # the separator with the unicode char: �
        # This char is removed from the headers after sparse_table_parse.
        for padding, name in padded_header.findall(header):
            header = header.replace(padding + name, '�' * len(padding) + name)

        rows.append(_snake_case(header.replace('|', ' ')))

    return rows
|
|
|
|
|
|
|
|
|
2022-03-23 15:08:33 -07:00
|
|
|
def _fixup_headers(table: List[Dict]) -> List[Dict]:
|
|
|
|
"""remove consecutive underscores and any trailing underscores"""
|
|
|
|
new_table = []
|
|
|
|
for row in table:
|
|
|
|
new_row = row.copy()
|
2022-03-24 09:39:53 -07:00
|
|
|
for k in row:
|
2022-05-26 08:57:35 -07:00
|
|
|
# remove replacement character
|
|
|
|
k_new = k.replace('�', '')
|
2022-03-23 15:08:33 -07:00
|
|
|
# remove consecutive underscores
|
|
|
|
k_new = re.sub(r'__+', '_', k_new)
|
|
|
|
# remove trailing underscores
|
|
|
|
k_new = re.sub(r'_+$', '', k_new)
|
|
|
|
new_row[k_new] = new_row.pop(k)
|
|
|
|
new_table.append(new_row)
|
|
|
|
|
|
|
|
return new_table
|
|
|
|
|
2022-03-24 09:39:53 -07:00
|
|
|
|
2022-03-22 12:25:24 -07:00
|
|
|
def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
) -> List[Dict]:
    """
    Main text parsing function

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data. An empty
        list is returned when the input contains no table rows.
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    raw_output: List = []

    if jc.utils.has_data(data):
        data = _remove_ansi(data)

        # Input made only of ANSI escape sequences is blank once they are
        # removed; _strip() would then call min() on an empty sequence.
        data_list = _normalize_rows(_strip(data)) if data.strip() else []

        # A table consisting only of separator lines yields no rows, so
        # there is nothing to hand to the table parser.
        if data_list:
            raw_table = sparse_table_parse(data_list)
            raw_output = _fixup_headers(raw_table)

    return raw_output if raw else _process(raw_output)
|