add multiline asciitable parser

2025-08-08 22:36:48 +02:00 · 2022-03-18 13:05:57 -07:00
parent 4a3a4e10df
commit 5a248a8fc5
1 changed files with 344 additions and 0 deletions
--- a/jc/parsers/asciitable_m.py
+++ b/jc/parsers/asciitable_m.py
@@ -0,0 +1,344 @@
+"""jc - JSON Convert `asciitable-m` parser
+
+This parser converts ASCII and Unicode text tables with multi-line rows.
+Tables must have some sort of separator line between rows.
+
+For example:
+
+    ╒══════════╤═════════╤════════╕
+    │ foo      │ bar baz │ fiz    │
+    │          │         │ buz    │
+    ╞══════════╪═════════╪════════╡
+    │ good day │ 12345   │        │
+    │ mate     │         │        │
+    ├──────────┼─────────┼────────┤
+    │ hi there │ abc def │ 3.14   │
+    │          │         │        │
+    ╘══════════╧═════════╧════════╛
+
+Usage (cli):
+
+    $ cat table.txt | jc --asciitable-m
+
+Usage (module):
+
+    import jc
+    result = jc.parse('asciitable_m', asciitable-string)
+
+Schema:
+
+    [
+      {
+        "asciitable-m":     string,
+        "bar":     boolean,
+        "baz":     integer
+      }
+    ]
+
+Examples:
+
+    $ asciitable-m | jc --asciitable-m -p
+    []
+
+    $ asciitable-m | jc --asciitable-m -p -r
+    []
+"""
+from os import sep
+import re
+from typing import Iterable, List, Dict, Tuple, Optional, Union, Generator
+import jc.utils
+from jc.exceptions import ParseError
+from jc.parsers.universal import sparse_table_parse
+
+
+class info():
+    """Provides parser metadata (version, author, etc.)"""
+    version = '1.0'
+    description = 'multi-line ASCII and Unicode table parser'
+    author = 'Kelly Brazil'
+    author_email = 'kellyjonbrazil@gmail.com'
+    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
+
+
+__version__ = info.version
+
+
+def _process(proc_data: List[Dict]) -> List[Dict]:
+    """
+    Final processing to conform to the schema.
+
+    Parameters:
+
+        proc_data:   (List of Dictionaries) raw structured data to process
+
+    Returns:
+
+        List of Dictionaries. Structured to conform to the schema.
+    """
+    # remove newlines from values
+    # for item in proc_data:
+    #     for k, v in item.items():
+    #         item[k] = v.replace('\n', '')
+
+    return proc_data
+
+
+def _remove_ansi(string: str) -> str:
+    ansi_escape =re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]')
+    return ansi_escape.sub('', string)
+
+
+def _lstrip(string: str) -> str:
+    """find the leftmost non-whitespace character and lstrip to that index"""
+    lstrip_list = [x for x in string.splitlines() if not len(x.strip()) == 0]
+    start_points = (len(x) - len(x.lstrip()) for x in lstrip_list)
+    min_point = min(start_points)
+    new_lstrip_list = (x[min_point:] for x in lstrip_list)
+    return '\n'.join(new_lstrip_list)
+
+
+def _table_sniff(string: str) -> str:
+    """Find the table-type via heuristics"""
+    # pretty tables
+    for line in string.splitlines():
+        line = line.strip()
+        if   line.startswith('╞═') and line.endswith('═╡')\
+          or line.startswith('├─') and line.endswith('─┤')\
+          or line.startswith('+=') and line.endswith('=+')\
+          or line.startswith('+-') and line.endswith('-+'):
+            return 'pretty'
+
+    # markdown tables
+    second_line = string.splitlines()[1]
+    if second_line.startswith('|-') and second_line.endswith('-|'):
+        return 'markdown'
+
+    # simple tables
+    return 'simple'
+
+
+def _pretty_set_separators(table_lines: Iterable, separator: str) -> Generator[str, None, None]:
+    """Return a generator that yields rows standardized separators"""
+    for line in table_lines:
+        strip_line = line.strip()
+
+        # skip any blanks
+        if not strip_line:
+            continue
+
+        # yield row separators as a sentinel string
+        if   strip_line.startswith('╒═') and strip_line.endswith('═╕')\
+          or strip_line.startswith('╞═') and strip_line.endswith('═╡')\
+          or strip_line.startswith('╘═') and strip_line.endswith('═╛')\
+          or strip_line.startswith('┌─') and strip_line.endswith('─┐')\
+          or strip_line.startswith('├─') and strip_line.endswith('─┤')\
+          or strip_line.startswith('└─') and strip_line.endswith('─┘')\
+          or strip_line.startswith('+=') and strip_line.endswith('=+')\
+          or strip_line.startswith('+-') and strip_line.endswith('-+'):
+            yield separator
+            continue
+
+        # remove the table column separator characters and yield the line
+        line = line.replace('|', ' ').replace('│', ' ')
+        yield line
+
+
+def _pretty_normalize_rows(table_lines: Iterable,
+                           separator: str,
+                           data_separator: str) -> Generator[str, None, None]:
+    """
+    Return a generator that yields header and data rows with different separators.
+    Also removes spaces from headers.
+    """
+    header_found = False
+    data_found = False
+
+    # Removes initial table lines, finds the header row(s) and separates
+    # the header from the data rows with different separator characters.
+    for i in table_lines:
+        if separator in i and not header_found and not data_found:
+            # top table frame
+            continue
+        if not separator in i and not header_found and not data_found:
+            header_found = True
+            # first header data found
+            # remove spaces from header
+            i = re.sub(r'\b \b', '_', i)
+            yield i
+            continue
+        if not separator in i and header_found and not data_found:
+            # subsequent header data found
+            # remove spaces from header
+            i = re.sub(r'\b \b', '_', i)
+            yield i
+            continue
+        if separator in i and header_found and not data_found:
+            data_found = True
+            # table separator found - this is a header separator
+            yield separator
+            continue
+        if not separator in i and header_found and data_found:
+            # subsequent data row found
+            yield i
+            continue
+        if separator in i and header_found and data_found:
+            # table separator found - this is a data separator
+            yield data_separator
+            continue
+
+
+def _pretty_remove_header_rows(table: List[Dict], sep: str, data_sep: str) -> List[Optional[Dict]]:
+    """return a table with only data rows."""
+    # create a new list of row objects with new key names
+    data_obj_list: List[Optional[Dict]] = []
+    sep_found = False
+    data_sep_found = False
+    for obj in table:
+        #skip to data
+        for v in obj.values():
+            if not sep_found and not str(v).strip() == sep:
+                continue
+            if not sep_found and str(v).strip() == sep:
+                sep_found = True
+                continue
+        # append data row objects or None for separators
+        if sep_found:
+            for k, v in obj.items():
+                if str(v).strip() == data_sep:
+                    data_sep_found = True
+                    break
+                else:
+                    data_sep_found = False
+            if data_sep_found:
+                data_obj_list.append(None)
+            else:
+                data_obj_list.append(obj)
+
+    # remove first item, which is a separator
+    return data_obj_list[1:]
+
+
+def _pretty_map_new_keynames(table: List[Dict], sep: str) -> Dict:
+    """
+    returns a dict of old keyname to new keyname mappings by consolidating
+    multiline keynames from the input list of dictionaries.
+    """
+    # first get all header objects to find full keynames. Stop when data rows are found.
+    header_obj_list = []
+    sep_found = False
+    for obj in table:
+        for v in obj.values():
+            if str(v).strip() == sep:
+                sep_found = True
+                break
+        if sep_found:
+            break
+        header_obj_list.append(obj)
+
+    if not header_obj_list:
+        header_obj_list = [{key: None for key in table[0]}]
+
+    # create an old-key to new-key name mapping dict
+    new_keynames_dict = dict.fromkeys([key for key in header_obj_list[0]], '')
+    for item in new_keynames_dict:
+        new_keynames_dict[item] = item
+    for obj in header_obj_list:
+        for k, v in obj.items():
+            if v:
+                new_keynames_dict[k] = new_keynames_dict[k] + '_' + v
+
+    # normalize keynames so they are lowercase, no spaces, and no redundat '_'s
+    for k, v in new_keynames_dict.items():
+        new_keynames_dict[k] = v.replace(' ', '_').lower()
+        new_keynames_dict[k] = re.sub(r'__+', '_', v)
+
+    return new_keynames_dict
+
+
+def _pretty_rename_keys(table: List, new_keynames: Dict) -> List[Optional[Dict]]:
+    """rename all of the keys in the table based on the new_keynames mapping"""
+    renamed_key_table: List[Optional[Dict]] = []
+    for item in table:
+        if item:
+            renamed_key_table.append({new_keynames[k]:v for k, v in item.items()})
+        else:
+            renamed_key_table.append(None)
+
+    return renamed_key_table
+
+
+def _pretty_consolidate_rows(table: List) -> List[Dict]:
+    """go through all data objects and combine values between data separators"""
+    consolidated_rows = []
+    current_obj = dict.fromkeys([key for key in table[0]], '')
+    for item in table:
+        if not item:
+            consolidated_rows.append(current_obj)
+            current_obj = dict.fromkeys([key for key in table[0]], '')
+            continue
+        else:
+            for k, v in item.items():
+                if v:
+                    if not current_obj[k]:
+                        current_obj[k] = v
+                    else:
+                        current_obj[k] = current_obj[k] + '\n' + v
+
+    return consolidated_rows
+
+
+def _parse_pretty(string: str) -> List:
+    string_lines = string.splitlines()
+    sep = '~~~'
+    data_sep = '==='
+    separator = '  ' + sep + ' '
+    data_separator = '  ' + data_sep + ' '
+
+    clean_gen = _pretty_set_separators(string_lines, separator)
+    normalized_gen = _pretty_normalize_rows(clean_gen, separator, data_separator)
+    raw_table = sparse_table_parse(normalized_gen)
+    new_keynames_dict = _pretty_map_new_keynames(raw_table, sep)
+    data_table = _pretty_remove_header_rows(raw_table, sep, data_sep)
+    table_with_renamed_keys = _pretty_rename_keys(data_table, new_keynames_dict)
+    final_table = _pretty_consolidate_rows(table_with_renamed_keys)
+
+    return final_table
+
+
+def parse(
+    data: str,
+    raw: bool = False,
+    quiet: bool = False
+) -> List[Dict]:
+    """
+    Main text parsing function
+
+    Parameters:
+
+        data:        (string)  text data to parse
+        raw:         (boolean) unprocessed output if True
+        quiet:       (boolean) suppress warning messages if True
+
+    Returns:
+
+        List of Dictionaries. Raw or processed structured data.
+    """
+    jc.utils.compatibility(__name__, info.compatible, quiet)
+    jc.utils.input_type_check(data)
+
+    raw_output: List = []
+    table_type = 'unknown'
+
+    if jc.utils.has_data(data):
+        data = _remove_ansi(data)
+        data = _lstrip(data)
+        table_type = _table_sniff(data)
+
+        if table_type == 'pretty':
+            raw_output = _parse_pretty(data)
+        elif table_type == 'markdown':
+            raise ParseError('Only "pretty" tables supported with multiline. "markdown" table detected. Please try the "asciitable" parser.')
+        else:
+            raise ParseError('Only "pretty" tables supported with multiline. "simple" table detected. Please try the "asciitable" parser.')
+
+    return raw_output if raw else _process(raw_output)