"""jc - JSON Convert `asciitable-m` parser
This parser converts ASCII and Unicode text tables with multi-line rows.
Tables must have some sort of separator line between rows.
For example:
foo bar baz fiz
buz
good day 12345
mate
hi there abc def 3.14
Usage (cli):
$ cat table.txt | jc --asciitable-m
Usage (module):
import jc
result = jc.parse('asciitable_m', asciitable-string)
Schema:
[
{
"asciitable-m": string,
"bar": boolean,
"baz": integer
}
]
Examples:
$ asciitable-m | jc --asciitable-m -p
[]
$ asciitable-m | jc --asciitable-m -p -r
[]
"""
import re
from typing import Iterable, List, Dict, Optional, Generator
import jc.utils
from jc.exceptions import ParseError
class info():
    """Provides parser metadata (version, author, etc.)"""
    # version of this parser (independent of the jc package version)
    version = '1.0'
    description = 'multi-line ASCII and Unicode table parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    # platforms this parser is compatible with
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
__version__ = info.version
def _process(proc_data: List[Dict]) -> List[Dict]:
"""
Final processing to conform to the schema.
Parameters:
proc_data: (List of Dictionaries) raw structured data to process
Returns:
List of Dictionaries. Structured to conform to the schema.
"""
# remove newlines from values
# for item in proc_data:
# for k, v in item.items():
# item[k] = v.replace('\n', '')
return proc_data
def _remove_ansi(string: str) -> str:
ansi_escape =re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]')
return ansi_escape.sub('', string)
def _lstrip(string: str) -> str:
"""find the leftmost non-whitespace character and lstrip to that index"""
lstrip_list = [x for x in string.splitlines() if not len(x.strip()) == 0]
start_points = (len(x) - len(x.lstrip()) for x in lstrip_list)
min_point = min(start_points)
new_lstrip_list = (x[min_point:] for x in lstrip_list)
return '\n'.join(new_lstrip_list)
def _table_sniff(string: str) -> str:
"""Find the table-type via heuristics"""
# pretty tables
for line in string.splitlines():
line = line.strip()
if line.startswith('╞═') and line.endswith('═╡')\
or line.startswith('├─') and line.endswith('─┤')\
or line.startswith('+=') and line.endswith('=+')\
or line.startswith('+-') and line.endswith('-+'):
return 'pretty'
# markdown tables
second_line = string.splitlines()[1]
if second_line.startswith('|-') and second_line.endswith('-|'):
return 'markdown'
# simple tables
return 'simple'
def _pretty_set_separators(table_lines: Iterable, separator: str) -> Generator[str, None, None]:
"""Return a generator that yields rows standardized separators"""
for line in table_lines:
strip_line = line.strip()
# skip any blanks
if not strip_line:
continue
# yield row separators as a sentinel string
if strip_line.startswith('╒═') and strip_line.endswith('═╕')\
or strip_line.startswith('╞═') and strip_line.endswith('═╡')\
or strip_line.startswith('╘═') and strip_line.endswith('═╛')\
or strip_line.startswith('┌─') and strip_line.endswith('─┐')\
or strip_line.startswith('├─') and strip_line.endswith('─┤')\
or strip_line.startswith('└─') and strip_line.endswith('─┘')\
or strip_line.startswith('+=') and strip_line.endswith('=+')\
or strip_line.startswith('+-') and strip_line.endswith('-+'):
yield separator
continue
# remove the table column separator characters and yield the line
2022-03-18 16:53:23 -07:00
# line = line.replace('|', ' ').replace('│', ' ')
2022-03-18 13:05:57 -07:00
yield line
def _pretty_normalize_rows(table_lines: Iterable,
separator: str,
data_separator: str) -> Generator[str, None, None]:
"""
Return a generator that yields header and data rows with different separators.
Also removes spaces from headers.
"""
header_found = False
data_found = False
# Removes initial table lines, finds the header row(s) and separates
# the header from the data rows with different separator characters.
for i in table_lines:
if separator in i and not header_found and not data_found:
# top table frame
continue
if not separator in i and not header_found and not data_found:
header_found = True
# first header data found
# remove spaces from header
i = re.sub(r'\b \b', '_', i)
yield i
continue
if not separator in i and header_found and not data_found:
# subsequent header data found
# remove spaces from header
i = re.sub(r'\b \b', '_', i)
yield i
continue
if separator in i and header_found and not data_found:
data_found = True
# table separator found - this is a header separator
yield separator
continue
if not separator in i and header_found and data_found:
# subsequent data row found
yield i
continue
if separator in i and header_found and data_found:
# table separator found - this is a data separator
yield data_separator
continue
def _pretty_table_parse(table: Iterable) -> List[Dict]:
temp_table = []
for line in table:
# normalize separator
line = line.replace('', '|')
# remove first separator if it is the first char in the line
if line[0] == '|':
line = line.replace('|', ' ', 1)
# remove last separator if it is the last char in the line
if line[-1] == '|':
line = line[::-1].replace('|', ' ', 1)[::-1]
temp_table.append([x.strip() for x in line.split('|')])
headers = temp_table[0]
raw_data = temp_table[1:]
result = [dict.fromkeys(headers, None)]
result.extend([dict(zip(headers, r)) for r in raw_data])
return result
def _pretty_remove_header_rows(table: List[Dict], sep: str, data_sep: str) -> List[Optional[Dict]]:
    """
    Return a table with only data rows.

    Rows before the first `sep` (header separator) row are dropped. In the
    result, `data_sep` (data separator) rows are replaced with None
    sentinels so row boundaries survive for later consolidation.
    """
    # create a new list of row objects with new key names
    data_obj_list: List[Optional[Dict]] = []
    sep_found = False
    data_sep_found = False
    for obj in table:
        # skip to data: flip sep_found once any cell value equals the
        # header separator sentinel
        for v in obj.values():
            if not sep_found and not str(v).strip() == sep:
                continue
            if not sep_found and str(v).strip() == sep:
                sep_found = True
                continue
        # append data row objects or None for separators
        if sep_found:
            # a row is a data separator if any of its cells equals data_sep
            for k, v in obj.items():
                if str(v).strip() == data_sep:
                    data_sep_found = True
                    break
                else:
                    data_sep_found = False
            if data_sep_found:
                data_obj_list.append(None)
            else:
                data_obj_list.append(obj)
    # remove first item, which is the header separator row itself
    # (it was appended in the same iteration that set sep_found)
    return data_obj_list[1:]
def _pretty_map_new_keynames(table: List[Dict], sep: str) -> Dict:
"""
returns a dict of old keyname to new keyname mappings by consolidating
multiline keynames from the input list of dictionaries.
"""
# first get all header objects to find full keynames. Stop when data rows are found.
header_obj_list = []
sep_found = False
for obj in table:
for v in obj.values():
if str(v).strip() == sep:
sep_found = True
break
if sep_found:
break
header_obj_list.append(obj)
if not header_obj_list:
header_obj_list = [{key: None for key in table[0]}]
# create an old-key to new-key name mapping dict
new_keynames_dict = dict.fromkeys([key for key in header_obj_list[0]], '')
for item in new_keynames_dict:
new_keynames_dict[item] = item
for obj in header_obj_list:
for k, v in obj.items():
if v:
new_keynames_dict[k] = new_keynames_dict[k] + '_' + v
# normalize keynames so they are lowercase, no spaces, and no redundat '_'s
for k, v in new_keynames_dict.items():
new_keynames_dict[k] = v.replace(' ', '_').lower()
new_keynames_dict[k] = re.sub(r'__+', '_', v)
return new_keynames_dict
def _pretty_rename_keys(table: List, new_keynames: Dict) -> List[Optional[Dict]]:
"""rename all of the keys in the table based on the new_keynames mapping"""
renamed_key_table: List[Optional[Dict]] = []
for item in table:
if item:
renamed_key_table.append({new_keynames[k]:v for k, v in item.items()})
else:
renamed_key_table.append(None)
return renamed_key_table
def _pretty_consolidate_rows(table: List) -> List[Dict]:
"""go through all data objects and combine values between data separators"""
consolidated_rows = []
current_obj = dict.fromkeys([key for key in table[0]], '')
for item in table:
if not item:
consolidated_rows.append(current_obj)
current_obj = dict.fromkeys([key for key in table[0]], '')
continue
else:
for k, v in item.items():
if v:
if not current_obj[k]:
current_obj[k] = v
else:
current_obj[k] = current_obj[k] + '\n' + v
return consolidated_rows
def _parse_pretty(string: str) -> List:
    """Parse a 'pretty' table string into a list of row dictionaries."""
    sep = '~~~'
    data_sep = '==='
    separator = ' ' + sep + ' '
    data_separator = ' ' + data_sep + ' '
    lines = string.splitlines()

    # pipeline: standardize separators -> split header/data separators ->
    # parse pipe-delimited rows -> build key-name mapping -> drop header
    # rows -> rename keys -> merge multi-line rows
    clean: Generator = _pretty_set_separators(lines, separator)
    normalized: Generator = _pretty_normalize_rows(clean, separator, data_separator)
    raw_table: List[Dict] = _pretty_table_parse(normalized)
    new_keynames: Dict = _pretty_map_new_keynames(raw_table, sep)
    data_table: List[Optional[Dict]] = _pretty_remove_header_rows(raw_table, sep, data_sep)
    renamed: List[Optional[Dict]] = _pretty_rename_keys(data_table, new_keynames)
    return _pretty_consolidate_rows(renamed)
def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
) -> List[Dict]:
    """
    Main text parsing function

    Parameters:

        data:    (string)  text data to parse
        raw:     (boolean) unprocessed output if True
        quiet:   (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    raw_output: List = []

    if jc.utils.has_data(data):
        # strip ANSI codes and common indentation before sniffing
        cleaned = _lstrip(_remove_ansi(data))
        table_type = _table_sniff(cleaned)

        if table_type == 'markdown':
            raise ParseError('Only "pretty" tables supported with multiline. "markdown" table detected. Please try the "asciitable" parser.')

        if table_type == 'simple':
            raise ParseError('Only "pretty" tables supported with multiline. "simple" table detected. Please try the "asciitable" parser.')

        raw_output = _parse_pretty(cleaned)

    return raw_output if raw else _process(raw_output)