jc/jc/parsers/universal.py

"""jc - JSON CLI output utility universal Parsers"""


import string
from typing import List, Dict, Optional


def simple_table_parse(data: List[str]) -> List[Dict]:
    """
    Parse simple tables. The last column may contain data with spaces.

    Each data row is split on runs of whitespace into at most as many
    fields as there are headers, so any extra words on a row stay together
    in the last column. A row with fewer fields than headers produces a
    dictionary that only contains the leading headers.

    Parameters:

        data:   (list)   Text data to parse that has been split into lines
                         via .splitlines(). Item 0 must be the header row.
                         Any spaces in header names should be changed to
                         underscore '_'. You should also ensure headers are
                         lowercase by using .lower().

                         Also, ensure there are no blank lines (list items)
                         in the data.

                         An empty list returns an empty list.

    Returns:

        List of Dictionaries
    """
    # code adapted from Conor Heine at:
    # https://gist.github.com/cahna/43a1a3ff4d075bcd71f9d7120037a501

    # nothing to parse: avoids an IndexError on the header lookup below
    if not data:
        return []

    # str.split() with no arguments already trims the ends and collapses
    # runs of whitespace, so no extra normalization is needed
    headers = data[0].split()

    # limiting the number of splits keeps any remaining words (and the
    # spaces between them) together in the last column
    max_split = len(headers) - 1

    return [
        dict(zip(headers, row.strip().split(None, max_split)))
        for row in data[1:]
    ]


def sparse_table_parse(data: List[str], delim: Optional[str] = '\u2063') -> List[Dict]:
    """
    Parse tables with missing column data or with spaces in column data.

    Column boundaries are taken from the position of each header name in
    the header row. For every data row a delimiter is written at each
    boundary (moving left to the nearest whitespace if a value overlaps
    the boundary) and the row is then split on that delimiter.

    Parameters:

        data:   (list)   Text data to parse that has been split into lines
                         via .splitlines(). Item 0 must be the header row.
                         Any spaces in header names should be changed to
                         underscore '_'. You should also ensure headers are
                         lowercase by using .lower(). Do not change the
                         position of header names as the positions are used
                         to find the data.

                         Also, ensure there are no blank lines (list items)
                         in the data.

                         The list is not modified. An empty list returns an
                         empty list. Rows that are shorter than the header
                         row are treated as if they were padded with spaces
                         on the right.

        delim:  (string) Delimiter to use. By default `u\\2063`
                         (invisible separator) is used since it is unlikely
                         to ever be seen in terminal output. You can change
                         this for troubleshooting purposes or if there is a
                         delimiter conflict with your data. None selects
                         the default delimiter.

    Returns:

        List of Dictionaries. Values are stripped of surrounding
        whitespace and empty values are returned as None.
    """
    if not data:
        return []

    if delim is None:
        delim = '\u2063'

    # read the header by index and the rows by slice so the caller's list
    # is left untouched. The trailing space lets the last header name be
    # found with the same ' name ' search pattern as the others.
    header_text = data[0] + ' '
    header_list = header_text.split()

    # every column except the last one ends at the space that sits just
    # before the next header name
    column_ends = [
        header_text.find(' ' + next_name + ' ')
        for next_name in header_list[1:]
    ]

    # all column end positions are smaller than this, so rows padded to
    # this width can always be indexed at a column end
    min_width = len(header_text)

    output = []

    for entry in data[1:]:
        # pad short rows so missing trailing columns read as empty instead
        # of raising an IndexError. The padding is stripped again below.
        entry = entry.ljust(min_width)

        # insert new separator since data can contain spaces. Work from
        # right to left so positions further left are not shifted when
        # the delimiter is longer than one character.
        for h_end in reversed(column_ends):
            # check if the location contains whitespace. if not
            # then move to the left until a space is found
            while h_end > 0 and entry[h_end] not in string.whitespace:
                h_end -= 1

            # the delimiter replaces the whitespace character at h_end
            entry = entry[:h_end] + delim + entry[h_end + 1:]

        # create the entry list from the new custom delimiter
        entry_list = entry.split(delim, maxsplit=len(header_list) - 1)

        # clean up leading and trailing spaces; empty columns become None
        clean_entry_list = [col.strip() or None for col in entry_list]

        output.append(dict(zip(header_list, clean_entry_list)))

    return output
new universal parsers to limit code duplication 2019-12-09 14:01:47 -08:00			`"""jc - JSON CLI output utility universal Parsers"""`


			`import string`
add type hints 2022-01-26 17:08:03 -08:00			`from typing import List, Dict, Optional`
new universal parsers to limit code duplication 2019-12-09 14:01:47 -08:00

add type hints 2022-01-26 17:08:03 -08:00			`def simple_table_parse(data: List[str]) -> List[Dict]:`
use universal simple table parser 2019-12-11 17:27:48 -08:00			`"""`
doc fix 2022-01-20 09:59:23 -08:00			`Parse simple tables. The last column may contain data with spaces.`
doc update 2019-12-12 15:59:36 -08:00
use universal simple table parser 2019-12-11 17:27:48 -08:00			`Parameters:`

formatting 2022-01-19 17:29:22 -08:00			`data: (list) Text data to parse that has been split into lines`
			`via .splitlines(). Item 0 must be the header row.`
			`Any spaces in header names should be changed to`
			`underscore '_'. You should also ensure headers are`
			`lowercase by using .lower().`
use universal simple table parser 2019-12-11 17:27:48 -08:00
formatting 2022-01-19 17:29:22 -08:00			`Also, ensure there are no blank lines (list items)`
			`in the data.`
use universal simple table parser 2019-12-11 17:27:48 -08:00
			`Returns:`

formatting 2022-01-19 17:29:22 -08:00			`List of Dictionaries`
use universal simple table parser 2019-12-11 17:27:48 -08:00			`"""`
doc fix 2022-01-20 09:59:23 -08:00			`# code adapted from Conor Heine at:`
			`# https://gist.github.com/cahna/43a1a3ff4d075bcd71f9d7120037a501`
use universal simple table parser 2019-12-11 17:27:48 -08:00			`headers = [h for h in ' '.join(data[0].strip().split()).split() if h]`
			`raw_data = map(lambda s: s.strip().split(None, len(headers) - 1), data[1:])`
			`raw_output = [dict(zip(headers, r)) for r in raw_data]`

			`return raw_output`


add type hints 2022-01-26 17:08:03 -08:00			`def sparse_table_parse(data: List[str], delim: Optional[str] ='\u2063') -> List[Dict]:`
new universal parsers to limit code duplication 2019-12-09 14:01:47 -08:00			`"""`
			`Parse tables with missing column data or with spaces in column data.`

			`Parameters:`

formatting 2022-01-19 17:29:22 -08:00			`data: (list) Text data to parse that has been split into lines`
			`via .splitlines(). Item 0 must be the header row.`
			`Any spaces in header names should be changed to`
			`underscore '_'. You should also ensure headers are`
			`lowercase by using .lower(). Do not change the`
			`position of header names as the positions are used`
			`to find the data.`
new universal parsers to limit code duplication 2019-12-09 14:01:47 -08:00
formatting 2022-01-19 17:29:22 -08:00			`Also, ensure there are no blank lines (list items)`
			`in the data.`
new universal parsers to limit code duplication 2019-12-09 14:01:47 -08:00
add get_help 2022-01-19 18:47:44 -08:00			delim: (string) Delimiter to use. By default `u\\2063`
docstring update 2022-01-20 09:40:08 -08:00			`(invisible separator) is used since it is unlikely`
add get_help 2022-01-19 18:47:44 -08:00			`to ever be seen in terminal output. You can change`
			`this for troubleshooting purposes or if there is a`
			`delimiter conflict with your data.`
new universal parsers to limit code duplication 2019-12-09 14:01:47 -08:00
			`Returns:`

formatting 2022-01-19 17:29:22 -08:00			`List of Dictionaries`
new universal parsers to limit code duplication 2019-12-09 14:01:47 -08:00			`"""`
			`output = []`
			`header_text = data.pop(0)`
			`header_text = header_text + ' '`
			`header_list = header_text.split()`

			`# find each column index and end position`
			`header_search = [header_list[0]]`
			`for h in header_list[1:]:`
			`header_search.append(' ' + h + ' ')`

			`header_spec_list = []`
			`for i, column in enumerate(header_list[0:len(header_list) - 1]):`
			`header_spec = {`
			`'name': column,`
			`'end': header_text.find(header_search[i + 1])`
			`}`

			`header_spec_list.append(header_spec)`

			`# parse lines`
			`if data:`
			`for entry in data:`
			`output_line = {}`

			`# insert new separator since data can contain spaces`
			`for col in reversed(header_list):`
			`# find the right header_spec`
			`for h_spec in header_spec_list:`
			`if h_spec['name'] == col:`
			`h_end = h_spec['end']`
			`# check if the location contains whitespace. if not`
			`# then move to the left until a space is found`
			`while h_end > 0 and entry[h_end] not in string.whitespace:`
			`h_end -= 1`

			`# insert custom delimiter`
			`entry = entry[:h_end] + delim + entry[h_end + 1:]`

			`# create the entry list from the new custom delimiter`
			`entry_list = entry.split(delim, maxsplit=len(header_list) - 1)`

			`# clean up leading and trailing spaces in entry`
			`clean_entry_list = []`
			`for col in entry_list:`
			`clean_entry = col.strip()`
			`if clean_entry == '':`
			`clean_entry = None`

			`clean_entry_list.append(clean_entry)`

			`output_line = dict(zip(header_list, clean_entry_list))`
			`output.append(output_line)`

			`return output`