From b9fb7fad9ceebf64059b380680a2eed086c90abe Mon Sep 17 00:00:00 2001 From: Andreas Weiden Date: Mon, 12 Dec 2022 15:10:59 +0100 Subject: [PATCH] Add parser for cbt --- jc/parsers/cbt.py | 175 ++++++++++++++++++++++++++++++++++++++++++++++ tests/test_cbt.py | 145 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 320 insertions(+) create mode 100644 jc/parsers/cbt.py create mode 100644 tests/test_cbt.py diff --git a/jc/parsers/cbt.py b/jc/parsers/cbt.py new file mode 100644 index 00000000..a0a25b07 --- /dev/null +++ b/jc/parsers/cbt.py @@ -0,0 +1,175 @@ +"""jc - JSON Convert `foo` command output parser + +Parses the human-, but not machine-, readable output of the cbt command (for Google's BigTable). + +No effort is made to convert the data types of the values in the cells. + +The timestamps of the cells are converted to Python's isoformat. + +Raw output contains all cells for each column (including timestamps in converted to Python's isoformat), +while the normal output contains only the latest value for each column. + +Usage (cli): + + $ cbt | jc --cbt + +or + + $ jc cbt + +Usage (module): + + import jc + result = jc.parse('cbt', cbt_command_output) + +Schema: + + [ + { + "key": string, + "cells": { + string: { + string: string + } + } + } + ] + +Examples: + + $ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p + [ + { + "key": "foo", + "cells": { + "foo": { + "bar": "baz" + } + } + } + ] + + $ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p -r + [ + { + "key": "foo", + "cells": [ + { + "column_family": "foo", + "column": "bar", + "timestamp": "1970-01-01T01:00:00", + "value": "baz" + } + ] + } + ] +""" +import datetime +from itertools import groupby +from typing import List, Dict +from jc.jc_types import JSONDictType +import jc.utils + + +class info(): + """Provides parser metadata (version, author, etc.)""" + version = '1.0' + description = '`cbt` command parser' + author = 'Andreas Weiden' + author_email = 'andreas.weiden@gmail.com' + # details = 'enter any other details here' + + # compatible options: linux, darwin, cygwin, win32, aix, freebsd + compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd'] + magic_commands = ['cbt'] + + +__version__ = info.version + + +def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]: + """ + Final processing to conform to the schema. + + Parameters: + + proc_data: (List of Dictionaries) raw structured data to process + + Returns: + + List of Dictionaries. Structured to conform to the schema. + """ + + # process the data here + # rebuild output for added semantic information + # use helper functions in jc.utils for int, float, bool + # conversions and timestamps + out_data = [] + for row in proc_data: + cells = {} + key_func = lambda cell: (cell["column_family"], cell["column"]) + all_cells = sorted(row["cells"], key=key_func) + for (column_family, column), group in groupby(all_cells, key=key_func): + group = sorted(group, key=lambda cell: cell["timestamp"], reverse=True) + if column_family not in cells: + cells[column_family] = {} + cells[column_family][column] = group[0]["value"] + row["cells"] = cells + out_data.append(row) + return out_data + + +def parse( + data: str, + raw: bool = False, + quiet: bool = False +) -> List[JSONDictType]: + """ + Main text parsing function + + Parameters: + + data: (string) text data to parse + raw: (boolean) unprocessed output if True + quiet: (boolean) suppress warning messages if True + + Returns: + + List of Dictionaries. Raw or processed structured data. + """ + jc.utils.compatibility(__name__, info.compatible, quiet) + jc.utils.input_type_check(data) + + raw_output: List[Dict] = [] + + if jc.utils.has_data(data): + for line in filter(None, data.split("-" * 40)): + # parse the content here + # check out helper functions in jc.utils + # and jc.parsers.universal + key = None + cells = [] + column_name = "" + timestamp = None + value_next = False + for field in line.splitlines(): + if not field.strip(): + continue + if field.startswith(" " * 4): + value = field.strip(' "') + if value_next: + cells.append({ + "column_family": column_name.split(":", 1)[0], + "column": column_name.split(":", 1)[1], + "timestamp": datetime.datetime.strptime(timestamp, "%Y/%m/%d-%H:%M:%S.%f").isoformat(), + "value": value + }) + elif field.startswith(" " * 2): + column_name, timestamp = map(str.strip, field.split("@")) + value_next = True + else: + key = field + if key is not None: + raw_output.append({"key": key, "cells": cells}) + + return raw_output if raw else _process(raw_output) diff --git a/tests/test_cbt.py b/tests/test_cbt.py new file mode 100644 index 00000000..a3e3ad3b --- /dev/null +++ b/tests/test_cbt.py @@ -0,0 +1,145 @@ +import os +import unittest +from jc.exceptions import ParseError +import jc.parsers.cbt + +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) + + +class MyTests(unittest.TestCase): + + def test_cbt_nodata(self): + """ + Test 'cbt' with no data + """ + self.assertEqual(jc.parsers.cbt.parse('', quiet=True), []) + + def test_cbt_single_row(self): + """ + Test 'cbt' with a single row + """ + input = ''' +---------------------------------------- +foo + foo:bar @ 1970/01/01-01:00:00.000000 + "baz" + ''' + expected = [ + { + "key": "foo", + "cells": { + "foo": { + "bar": "baz" + } + } + } + ] + self.assertEqual(jc.parsers.cbt.parse(input, quiet=True), expected) + + def test_cbt_multiple_column_families(self): + """ + Test 'cbt' with multiple column families + """ + input = ''' +---------------------------------------- +foo + foo:bar1 @ 1970/01/01-01:00:00.000000 + "baz1" + foo:bar2 @ 1970/01/01-01:00:00.000000 + "baz2" + bat:bar @ 1970/01/01-01:00:00.000000 + "baz" + ''' + expected = [ + { + "key": "foo", + "cells": { + "foo": { + "bar1": "baz1", + "bar2": "baz2", + }, + "bat": { + "bar": "baz" + } + } + } + ] + self.assertEqual(jc.parsers.cbt.parse(input, quiet=True), expected) + + def test_cbt_multiple_rows(self): + """ + Test 'cbt' with multiple rows + """ + input = ''' +---------------------------------------- +foo + foo:bar @ 1970/01/01-01:00:00.000000 + "baz1" +---------------------------------------- +bar + foo:bar @ 1970/01/01-01:00:00.000000 + "baz2" + ''' + expected = [ + { + "key": "foo", + "cells": { + "foo": { + "bar": "baz1", + } + } + }, + { + "key": "bar", + "cells": { + "foo": { + "bar": "baz2", + } + } + } + ] + self.assertEqual(jc.parsers.cbt.parse(input, quiet=True), expected) + + def test_cbt_multiple_rows_raw(self): + """ + Test 'cbt' with multiple rows raw + """ + input = ''' +---------------------------------------- +foo + foo:bar @ 1970/01/01-01:00:00.000000 + "baz1" +---------------------------------------- +bar + foo:bar @ 1970/01/01-01:00:00.000000 + "baz2" + ''' + expected = [ + { + "key": "foo", + "cells": [ + { + "column_family": "foo", + "column": "bar", + "timestamp": "1970-01-01T01:00:00", + "value": "baz1", + } + ] + }, + { + "key": "bar", + "cells": [ + { + "column_family": "foo", + "column": "bar", + "timestamp": "1970-01-01T01:00:00", + "value": "baz2", + } + ] + } + ] + self.assertEqual(jc.parsers.cbt.parse(input, quiet=True, raw=True), expected) + + +if __name__ == '__main__': + unittest.main()