Add parser for cbt

2026-06-13 21:54:29 +02:00 · 2022-12-12 15:10:59 +01:00
parent 299b0faf7c
commit b9fb7fad9c
2 changed files with 320 additions and 0 deletions
@@ -0,0 +1,175 @@
+"""jc - JSON Convert `foo` command output parser
+
+Parses the human-, but not machine-, readable output of the cbt command (for Google's BigTable).
+
+No effort is made to convert the data types of the values in the cells.
+
+The timestamps of the cells are converted to Python's isoformat.
+
+Raw output contains all cells for each column (including timestamps in converted to Python's isoformat),
+while the normal output contains only the latest value for each column.
+
+Usage (cli):
+
+    $ cbt | jc --cbt
+
+or
+
+    $ jc cbt
+
+Usage (module):
+
+    import jc
+    result = jc.parse('cbt', cbt_command_output)
+
+Schema:
+
+    [
+      {
+        "key":     string,
+        "cells": {
+            string: {
+                string: string
+            }
+        }
+      }
+    ]
+
+Examples:
+
+    $ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p
+    [
+        {
+            "key": "foo",
+            "cells": {
+                "foo": {
+                    "bar": "baz"
+                }
+            }
+        }
+    ]
+
+    $ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p -r
+    [
+        {
+            "key": "foo",
+            "cells": [
+                {
+                    "column_family": "foo",
+                    "column": "bar",
+                    "timestamp": "1970-01-01T01:00:00",
+                    "value": "baz"
+                }
+            ]
+        }
+    ]
+"""
+import datetime
+from itertools import groupby
+from typing import List, Dict
+from jc.jc_types import JSONDictType
+import jc.utils
+
+
+class info():
+    """Provides parser metadata (version, author, etc.)"""
+    version = '1.0'
+    description = '`cbt` command parser'
+    author = 'Andreas Weiden'
+    author_email = 'andreas.weiden@gmail.com'
+    # details = 'enter any other details here'
+
+    # compatible options: linux, darwin, cygwin, win32, aix, freebsd
+    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
+    magic_commands = ['cbt']
+
+
+__version__ = info.version
+
+
+def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]:
+    """
+    Final processing to conform to the schema.
+
+    Parameters:
+
+        proc_data:   (List of Dictionaries) raw structured data to process
+
+    Returns:
+
+        List of Dictionaries. Structured to conform to the schema.
+    """
+
+    # process the data here
+    # rebuild output for added semantic information
+    # use helper functions in jc.utils for int, float, bool
+    # conversions and timestamps
+    out_data = []
+    for row in proc_data:
+        cells = {}
+        key_func = lambda cell: (cell["column_family"], cell["column"])
+        all_cells = sorted(row["cells"], key=key_func)
+        for (column_family, column), group in groupby(all_cells, key=key_func):
+            group = sorted(group, key=lambda cell: cell["timestamp"], reverse=True)
+            if column_family not in cells:
+                cells[column_family] = {}
+            cells[column_family][column] = group[0]["value"]
+        row["cells"] = cells
+        out_data.append(row)
+    return out_data
+
+
+def parse(
+        data: str,
+        raw: bool = False,
+        quiet: bool = False
+) -> List[JSONDictType]:
+    """
+    Main text parsing function
+
+    Parameters:
+
+        data:        (string)  text data to parse
+        raw:         (boolean) unprocessed output if True
+        quiet:       (boolean) suppress warning messages if True
+
+    Returns:
+
+        List of Dictionaries. Raw or processed structured data.
+    """
+    jc.utils.compatibility(__name__, info.compatible, quiet)
+    jc.utils.input_type_check(data)
+
+    raw_output: List[Dict] = []
+
+    if jc.utils.has_data(data):
+        for line in filter(None, data.split("-" * 40)):
+            # parse the content here
+            # check out helper functions in jc.utils
+            # and jc.parsers.universal
+            key = None
+            cells = []
+            column_name = ""
+            timestamp = None
+            value_next = False
+            for field in line.splitlines():
+                if not field.strip():
+                    continue
+                if field.startswith(" " * 4):
+                    value = field.strip(' "')
+                    if value_next:
+                        cells.append({
+                            "column_family": column_name.split(":", 1)[0],
+                            "column": column_name.split(":", 1)[1],
+                            "timestamp": datetime.datetime.strptime(timestamp, "%Y/%m/%d-%H:%M:%S.%f").isoformat(),
+                            "value": value
+                        })
+                elif field.startswith(" " * 2):
+                    column_name, timestamp = map(str.strip, field.split("@"))
+                    value_next = True
+                else:
+                    key = field
+            if key is not None:
+                raw_output.append({"key": key, "cells": cells})
+
+    return raw_output if raw else _process(raw_output)
@@ -0,0 +1,145 @@
+import os
+import unittest
+from jc.exceptions import ParseError
+import jc.parsers.cbt
+
+THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class MyTests(unittest.TestCase):
+
+    def test_cbt_nodata(self):
+        """
+        Test 'cbt' with no data
+        """
+        self.assertEqual(jc.parsers.cbt.parse('', quiet=True), [])
+
+    def test_cbt_single_row(self):
+        """
+        Test 'cbt' with a single row
+        """
+        input = '''
+----------------------------------------
+foo
+  foo:bar                                  @ 1970/01/01-01:00:00.000000
+    "baz"
+        '''
+        expected = [
+            {
+                "key": "foo",
+                "cells": {
+                    "foo": {
+                        "bar": "baz"
+                    }
+                }
+            }
+        ]
+        self.assertEqual(jc.parsers.cbt.parse(input, quiet=True), expected)
+
+    def test_cbt_multiple_column_families(self):
+        """
+        Test 'cbt' with multiple column families
+        """
+        input = '''
+----------------------------------------
+foo
+  foo:bar1                                 @ 1970/01/01-01:00:00.000000
+    "baz1"
+  foo:bar2                                 @ 1970/01/01-01:00:00.000000
+    "baz2"
+  bat:bar                                  @ 1970/01/01-01:00:00.000000
+    "baz"
+            '''
+        expected = [
+            {
+                "key": "foo",
+                "cells": {
+                    "foo": {
+                        "bar1": "baz1",
+                        "bar2": "baz2",
+                    },
+                    "bat": {
+                        "bar": "baz"
+                    }
+                }
+            }
+        ]
+        self.assertEqual(jc.parsers.cbt.parse(input, quiet=True), expected)
+
+    def test_cbt_multiple_rows(self):
+        """
+        Test 'cbt' with multiple rows
+        """
+        input = '''
+----------------------------------------
+foo
+  foo:bar                                  @ 1970/01/01-01:00:00.000000
+    "baz1"
+----------------------------------------
+bar
+  foo:bar                                  @ 1970/01/01-01:00:00.000000
+    "baz2"
+            '''
+        expected = [
+            {
+                "key": "foo",
+                "cells": {
+                    "foo": {
+                        "bar": "baz1",
+                    }
+                }
+            },
+            {
+                "key": "bar",
+                "cells": {
+                    "foo": {
+                        "bar": "baz2",
+                    }
+                }
+            }
+        ]
+        self.assertEqual(jc.parsers.cbt.parse(input, quiet=True), expected)
+
+    def test_cbt_multiple_rows_raw(self):
+        """
+        Test 'cbt' with multiple rows raw
+        """
+        input = '''
+----------------------------------------
+foo
+  foo:bar                                  @ 1970/01/01-01:00:00.000000
+    "baz1"
+----------------------------------------
+bar
+  foo:bar                                  @ 1970/01/01-01:00:00.000000
+    "baz2"
+            '''
+        expected = [
+            {
+                "key": "foo",
+                "cells": [
+                    {
+                        "column_family": "foo",
+                        "column": "bar",
+                        "timestamp": "1970-01-01T01:00:00",
+                        "value": "baz1",
+                    }
+                ]
+            },
+            {
+                "key": "bar",
+                "cells": [
+                    {
+                        "column_family": "foo",
+                        "column": "bar",
+                        "timestamp": "1970-01-01T01:00:00",
+                        "value": "baz2",
+                    }
+                ]
+            }
+        ]
+        self.assertEqual(jc.parsers.cbt.parse(input, quiet=True, raw=True), expected)
+
+
+if __name__ == '__main__':
+    unittest.main()