Merge pull request #327 from graipher/master

Add parser for cbt
2025-08-06 22:32:54 +02:00 · 2022-12-13 10:40:46 -06:00
parent be85d78f55 fd61e19135
commit b8ef583b93
10 changed files with 280 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -161,6 +161,7 @@ option.
 | `   --asciitable` | ASCII and Unicode table parser                          | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/asciitable)     |
 | ` --asciitable-m` | multi-line ASCII and Unicode table parser               | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/asciitable_m)   |
 | `        --blkid` | `blkid` command parser                                  | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/blkid)          |
+| `          --cbt` | `cbt` command parser                                    | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cbt)            |
 | `          --cef` | CEF string parser                                       | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef)            |
 | `        --cef-s` | CEF string streaming parser                             | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef_s)          |
 | `        --chage` | `chage --list` command parser                           | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage)          |
--- a/jc/parsers/cbt.py
+++ b/jc/parsers/cbt.py
@ -0,0 +1,191 @@
+"""jc - JSON Convert `cbt` command output parser
+
+Parses the human-friendly (but not machine-friendly) output of the cbt command (for Google's Bigtable).
+
+No effort is made to convert the data types of the values in the cells.
+
+The timestamps of the cells are converted to Python's isoformat.
+
+Raw output contains all cells for each column (including timestamps converted to Python's isoformat),
+while the normal output contains only the latest value for each column.
+
+Usage (cli):
+
+    $ cbt | jc --cbt
+
+or
+
+    $ jc cbt
+
+Usage (module):
+
+    import jc
+    result = jc.parse('cbt', cbt_command_output)
+
+Schema:
+
+    [
+      {
+        "key":     string,
+        "cells": {
+            string: {
+                string: string
+            }
+        }
+      }
+    ]
+
+Schema (raw):
+
+    [
+      {
+        "key": string,
+        "cells": [
+          {
+            "column_family": string,
+            "column": string,
+            "timestamp": string,
+            "value": string
+          }
+        ]
+      }
+    ]
+
+Examples:
+
+    $ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p
+    [
+        {
+            "key": "foo",
+            "cells": {
+                "foo": {
+                    "bar": "baz"
+                }
+            }
+        }
+    ]
+
+    $ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p -r
+    [
+        {
+            "key": "foo",
+            "cells": [
+                {
+                    "column_family": "foo",
+                    "column": "bar",
+                    "timestamp": "1970-01-01T01:00:00",
+                    "value": "baz"
+                }
+            ]
+        }
+    ]
+"""
+import datetime
+from itertools import groupby
+from typing import List, Dict
+from jc.jc_types import JSONDictType
+import jc.utils
+
+
+class info():
+    """Provides parser metadata (version, author, etc.)"""
+    version = '1.0'
+    description = '`cbt` command parser'
+    author = 'Andreas Weiden'
+    author_email = 'andreas.weiden@gmail.com'
+    # details = 'enter any other details here'
+
+    # compatible options: linux, darwin, cygwin, win32, aix, freebsd
+    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
+    magic_commands = ['cbt']
+
+
+__version__ = info.version
+
+
+def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]:
+    """
+    Final processing to conform to the schema.
+
+    Parameters:
+
+        proc_data:   (List of Dictionaries) raw structured data to process
+
+    Returns:
+
+        List of Dictionaries. Structured to conform to the schema.
+    """
+
+    # collapse each row's flat list of raw cells into a nested
+    # {column_family: {column: value}} dictionary; when a column has
+    # several cells, keep only the value with the newest timestamp
+    # (timestamps are compared as strings, newest first)
+    out_data = []
+    for row in proc_data:
+        cells = {}
+        key_func = lambda cell: (cell["column_family"], cell["column"])
+        all_cells = sorted(row["cells"], key=key_func)
+        for (column_family, column), group in groupby(all_cells, key=key_func):
+            group = sorted(group, key=lambda cell: cell["timestamp"], reverse=True)
+            if column_family not in cells:
+                cells[column_family] = {}
+            cells[column_family][column] = group[0]["value"]
+        row["cells"] = cells
+        out_data.append(row)
+    return out_data
+
+
+def parse(
+        data: str,
+        raw: bool = False,
+        quiet: bool = False
+) -> List[JSONDictType]:
+    """
+    Main text parsing function
+
+    Parameters:
+
+        data:        (string)  text data to parse
+        raw:         (boolean) unprocessed output if True
+        quiet:       (boolean) suppress warning messages if True
+
+    Returns:
+
+        List of Dictionaries. Raw or processed structured data.
+    """
+    jc.utils.compatibility(__name__, info.compatible, quiet)
+    jc.utils.input_type_check(data)
+
+    raw_output: List[Dict] = []
+
+    if jc.utils.has_data(data):
+        for line in filter(None, data.split("-" * 40)):
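+            # each chunk between separator lines describes one row: an unindented
+            # line is the row key, a 2-space indented line is "family:column @ timestamp",
+            # and a 4-space indented line is the cell value (spaces and quotes stripped)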
+            key = None
+            cells = []
+            column_name = ""
+            timestamp = None
+            value_next = False
+            for field in line.splitlines():
+                if not field.strip():
+                    continue
+                if field.startswith(" " * 4):
+                    value = field.strip(' "')
+                    if value_next:
+                        cells.append({
+                            "column_family": column_name.split(":", 1)[0],
+                            "column": column_name.split(":", 1)[1],
+                            "timestamp": datetime.datetime.strptime(timestamp, "%Y/%m/%d-%H:%M:%S.%f").isoformat(),
+                            "value": value
+                        })
+                elif field.startswith(" " * 2):
+                    column_name, timestamp = map(str.strip, field.split("@"))
+                    value_next = True
+                else:
+                    key = field
+            if key is not None:
+                raw_output.append({"key": key, "cells": cells})
+
+    return raw_output if raw else _process(raw_output)
--- a/tests/fixtures/generic/cbt-multiple-columns.json
+++ b/tests/fixtures/generic/cbt-multiple-columns.json
@ -0,0 +1 @@
+[{"key":"foo","cells":{"bat":{"bar":"baz"},"foo":{"bar1":"baz1","bar2":"baz2"}}}]
--- a/tests/fixtures/generic/cbt-multiple-columns.out
+++ b/tests/fixtures/generic/cbt-multiple-columns.out
@ -0,0 +1,8 @@
+----------------------------------------
+foo
+  foo:bar1                                 @ 1970/01/01-01:00:00.000000
+    "baz1"
+  foo:bar2                                 @ 1970/01/01-01:00:00.000000
+    "baz2"
+  bat:bar                                  @ 1970/01/01-01:00:00.000000
+    "baz"
--- a/tests/fixtures/generic/cbt-multiple-rows-raw.json
+++ b/tests/fixtures/generic/cbt-multiple-rows-raw.json
@ -0,0 +1 @@
+[{"key":"foo","cells":[{"column_family":"foo","column":"bar","timestamp":"1970-01-01T01:00:00","value":"baz1"}]},{"key":"bar","cells":[{"column_family":"foo","column":"bar","timestamp":"1970-01-01T01:00:00","value":"baz2"}]}]
--- a/tests/fixtures/generic/cbt-multiple-rows.json
+++ b/tests/fixtures/generic/cbt-multiple-rows.json
@ -0,0 +1 @@
+[{"key":"foo","cells":{"foo":{"bar":"baz1"}}},{"key":"bar","cells":{"foo":{"bar":"baz2"}}}]
--- a/tests/fixtures/generic/cbt-multiple-rows.out
+++ b/tests/fixtures/generic/cbt-multiple-rows.out
@ -0,0 +1,8 @@
+----------------------------------------
+foo
+  foo:bar                                  @ 1970/01/01-01:00:00.000000
+    "baz1"
+----------------------------------------
+bar
+  foo:bar                                  @ 1970/01/01-01:00:00.000000
+    "baz2"
--- a/tests/fixtures/generic/cbt-single.json
+++ b/tests/fixtures/generic/cbt-single.json
@ -0,0 +1 @@
+[{"key":"foo","cells":{"foo":{"bar":"baz"}}}]
--- a/tests/fixtures/generic/cbt-single.out
+++ b/tests/fixtures/generic/cbt-single.out
@ -0,0 +1,4 @@
+----------------------------------------
+foo
+  foo:bar                                  @ 1970/01/01-01:00:00.000000
+    "baz"
--- a/tests/test_cbt.py
+++ b/tests/test_cbt.py
@ -0,0 +1,64 @@
+import json
+import os
+import unittest
+from jc.exceptions import ParseError
+import jc.parsers.cbt
+
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class MyTests(unittest.TestCase):
+    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-single.out'), 'r', encoding='utf-8') as f:
+        single = f.read()
+
+    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-multiple-columns.out'), 'r', encoding='utf-8') as f:
+        multiple_columns = f.read()
+
+    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-multiple-rows.out'), 'r', encoding='utf-8') as f:
+        multiple_rows = f.read()
+
+    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-single.json'), 'r', encoding='utf-8') as f:
+        single_json = json.loads(f.read())
+
+    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-multiple-columns.json'), 'r', encoding='utf-8') as f:
+        multiple_columns_json = json.loads(f.read())
+
+    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-multiple-rows.json'), 'r', encoding='utf-8') as f:
+        multiple_rows_json = json.loads(f.read())
+
+    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-multiple-rows-raw.json'), 'r', encoding='utf-8') as f:
+        multiple_rows_raw_json = json.loads(f.read())
+
+    def test_cbt_nodata(self):
+        """
+        Test 'cbt' with no data
+        """
+        self.assertEqual(jc.parsers.cbt.parse('', quiet=True), [])
+
+    def test_cbt_single_row(self):
+        """
+        Test 'cbt' with a single row
+        """
+        self.assertEqual(jc.parsers.cbt.parse(self.single, quiet=True), self.single_json)
+
+    def test_cbt_multiple_column_families(self):
+        """
+        Test 'cbt' with multiple columns from multiple column families
+        """
+        self.assertEqual(jc.parsers.cbt.parse(self.multiple_columns, quiet=True), self.multiple_columns_json)
+
+    def test_cbt_multiple_rows(self):
+        """
+        Test 'cbt' with multiple rows
+        """
+        self.assertEqual(jc.parsers.cbt.parse(self.multiple_rows, quiet=True), self.multiple_rows_json)
+
+    def test_cbt_multiple_rows_raw(self):
+        """
+        Test 'cbt' with multiple rows raw
+        """
+        self.assertEqual(jc.parsers.cbt.parse(self.multiple_rows, quiet=True, raw=True), self.multiple_rows_raw_json)
+
+
+if __name__ == '__main__':
+    unittest.main()
				`@ -0,0 +1 @@`
				`[{"key":"foo","cells":{"bat":{"bar":"baz"},"foo":{"bar1":"baz1","bar2":"baz2"}}}]`
				`@ -0,0 +1 @@`
				`[{"key":"foo","cells":[{"column_family":"foo","column":"bar","timestamp":"1970-01-01T01:00:00","value":"baz1"}]},{"key":"bar","cells":[{"column_family":"foo","column":"bar","timestamp":"1970-01-01T01:00:00","value":"baz2"}]}]`
				`@ -0,0 +1 @@`
				`[{"key":"foo","cells":{"foo":{"bar":"baz1"}}},{"key":"bar","cells":{"foo":{"bar":"baz2"}}}]`