diff --git a/README.md b/README.md index c0a92c7a..86f25a7a 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,7 @@ option. | ` --asciitable` | ASCII and Unicode table parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/asciitable) | | ` --asciitable-m` | multi-line ASCII and Unicode table parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/asciitable_m) | | ` --blkid` | `blkid` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/blkid) | +| ` --cbt` | `cbt` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cbt) | | ` --cef` | CEF string parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef) | | ` --cef-s` | CEF string streaming parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef_s) | | ` --chage` | `chage --list` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage) | diff --git a/jc/parsers/cbt.py b/jc/parsers/cbt.py new file mode 100644 index 00000000..e0559485 --- /dev/null +++ b/jc/parsers/cbt.py @@ -0,0 +1,191 @@ +"""jc - JSON Convert `cbt` command output parser + +Parses the human-, but not machine-, friendly output of the cbt command (for Google's BigTable). + +No effort is made to convert the data types of the values in the cells. + +The timestamps of the cells are converted to Python's isoformat. + +Raw output contains all cells for each column (including timestamps converted to Python's isoformat), +while the normal output contains only the latest value for each column. 
class info:
    """Static metadata for the `cbt` parser: version, description, author."""
    version = '1.0'
    description = '`cbt` command parser'
    author = 'Andreas Weiden'
    author_email = 'andreas.weiden@gmail.com'

    # Platform names handed to jc.utils.compatibility() by parse().
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
    # Command names tied to this parser (cf. the `jc cbt` usage form).
    magic_commands = ['cbt']


# The module version simply mirrors the parser metadata.
__version__ = info.version
+ """ + + # process the data here + # rebuild output for added semantic information + # use helper functions in jc.utils for int, float, bool + # conversions and timestamps + out_data = [] + for row in proc_data: + cells = {} + key_func = lambda cell: (cell["column_family"], cell["column"]) + all_cells = sorted(row["cells"], key=key_func) + for (column_family, column), group in groupby(all_cells, key=key_func): + group = sorted(group, key=lambda cell: cell["timestamp"], reverse=True) + if column_family not in cells: + cells[column_family] = {} + cells[column_family][column] = group[0]["value"] + row["cells"] = cells + out_data.append(row) + return out_data + + +def parse( + data: str, + raw: bool = False, + quiet: bool = False +) -> List[JSONDictType]: + """ + Main text parsing function + + Parameters: + + data: (string) text data to parse + raw: (boolean) unprocessed output if True + quiet: (boolean) suppress warning messages if True + + Returns: + + List of Dictionaries. Raw or processed structured data. 
def _cbt_sections(data: str) -> List[List[str]]:
    """Group the non-blank lines of `data` into one list per row section.

    A section ends at a line that consists solely of 40 dashes. Dashes that
    occur inside a key or a value therefore never split a row.
    """
    separator = "-" * 40
    sections: List[List[str]] = [[]]
    for line in data.splitlines():
        if line.rstrip() == separator:
            sections.append([])
        elif line.strip():
            sections[-1].append(line)
    return [section for section in sections if section]


def _cbt_unquote(field: str) -> str:
    """Strip the indentation and exactly one pair of enclosing double quotes.

    Quotes and spaces that belong to the value itself are preserved; escape
    sequences inside the value are left as printed.
    """
    text = field.strip()
    if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
        return text[1:-1]
    return text


def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
) -> "List[JSONDictType]":
    """
    Main text parsing function

    The input is read as a sequence of row sections separated by lines of
    40 dashes. Inside a section an unindented line is the row key, a line
    indented by two spaces is a `family:column @ timestamp` header and a
    line indented by four spaces is the value of the preceding header.

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.

    Raises:

        ValueError if a cell timestamp does not match the
        `%Y/%m/%d-%H:%M:%S.%f` format.
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    raw_output: List[Dict] = []

    if jc.utils.has_data(data):
        for section in _cbt_sections(data):
            key = None
            cells = []
            # (column name, timestamp) of a header still waiting for its value
            pending = None

            for field in section:
                if field.startswith(" " * 4):
                    if pending is None:
                        # A value line without a header carries no column
                        # information; emitting it would duplicate stale data.
                        continue
                    column_name, timestamp = pending
                    pending = None
                    # partition() tolerates a name without a family prefix.
                    column_family, _, column = column_name.partition(":")
                    cells.append({
                        "column_family": column_family,
                        "column": column,
                        "timestamp": datetime.datetime.strptime(
                            timestamp, "%Y/%m/%d-%H:%M:%S.%f"
                        ).isoformat(),
                        "value": _cbt_unquote(field)
                    })
                elif field.startswith(" " * 2):
                    # Split on the last "@": the timestamp never contains one,
                    # while a column qualifier may.
                    column_name, _, timestamp = field.rpartition("@")
                    pending = (column_name.strip(), timestamp.strip())
                else:
                    key = field

            if key is not None:
                raw_output.append({"key": key, "cells": cells})

    return raw_output if raw else _process(raw_output)
THIS_DIR = os.path.dirname(os.path.abspath(__file__))


def _read_fixture(name):
    """Return the text of the generic `cbt` fixture file called `name`."""
    path = os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic', name)
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


class MyTests(unittest.TestCase):
    """Compare the `cbt` parser output with the stored JSON fixtures."""

    # captured `cbt` command output
    single = _read_fixture('cbt-single.out')
    multiple_columns = _read_fixture('cbt-multiple-columns.out')
    multiple_rows = _read_fixture('cbt-multiple-rows.out')

    # expected parser results
    single_json = json.loads(_read_fixture('cbt-single.json'))
    multiple_columns_json = json.loads(_read_fixture('cbt-multiple-columns.json'))
    multiple_rows_json = json.loads(_read_fixture('cbt-multiple-rows.json'))
    multiple_rows_raw_json = json.loads(_read_fixture('cbt-multiple-rows-raw.json'))

    def test_cbt_nodata(self):
        """
        Test 'cbt' with no data
        """
        self.assertEqual(jc.parsers.cbt.parse('', quiet=True), [])

    def test_cbt_single_row(self):
        """
        Test 'cbt' with a single row
        """
        self.assertEqual(jc.parsers.cbt.parse(self.single, quiet=True), self.single_json)

    def test_cbt_multiple_column_families(self):
        """
        Test 'cbt' with multiple columns from multiple column families
        """
        self.assertEqual(jc.parsers.cbt.parse(self.multiple_columns, quiet=True), self.multiple_columns_json)

    def test_cbt_multiple_rows(self):
        """
        Test 'cbt' with multiple rows
        """
        self.assertEqual(jc.parsers.cbt.parse(self.multiple_rows, quiet=True), self.multiple_rows_json)

    def test_cbt_multiple_rows_raw(self):
        """
        Test 'cbt' with multiple rows raw
        """
        self.assertEqual(jc.parsers.cbt.parse(self.multiple_rows, quiet=True, raw=True), self.multiple_rows_raw_json)


if __name__ == '__main__':
    unittest.main()