Merge pull request #327 from graipher/master

Add parser for cbt
2025-07-15 01:24:29 +02:00 · 2022-12-13 10:40:46 -06:00
parent be85d78f55 fd61e19135
commit b8ef583b93
10 changed files with 280 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -161,6 +161,7 @@ option.
 | `   --asciitable` | ASCII and Unicode table parser                          | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/asciitable)     |
 | ` --asciitable-m` | multi-line ASCII and Unicode table parser               | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/asciitable_m)   |
 | `        --blkid` | `blkid` command parser                                  | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/blkid)          |
 | `          --cbt` | `cbt` command parser                                    | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cbt)            |
 | `          --cef` | CEF string parser                                       | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef)            |
 | `        --cef-s` | CEF string streaming parser                             | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef_s)          |
 | `        --chage` | `chage --list` command parser                           | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage)          |
--- a/jc/parsers/cbt.py
+++ b/jc/parsers/cbt.py
@ -0,0 +1,191 @@
 """jc - JSON Convert `cbt` command output parser
 Parses the human-, but not machine-, friendly output of the cbt command (for Google's BigTable).
 No effort is made to convert the data types of the values in the cells.
 The timestamps of the cells are converted to Python's isoformat.
 Raw output contains all cells for each column (including timestamps converted to Python's isoformat),
 while the normal output contains only the latest value for each column.
 Usage (cli):
    $ cbt | jc --cbt
 or
    $ jc cbt
 Usage (module):
    import jc
    result = jc.parse('cbt', cbt_command_output)
 Schema:
    [
      {
        "key":     string,
        "cells": {
            string: {
                string: string
            }
        }
      }
    ]
 Schema (raw):
    [
      {
        "key": string,
        "cells": [
          {
            "column_family": string,
            "column": string,
            "timestamp": string,
            "value": string
          }
        ]
      }
    ]
 Examples:
    $ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p
    [
        {
            "key": "foo",
            "cells": {
                "foo": {
                    "bar": "baz"
                }
            }
        }
    ]
    $ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p -r
    [
        {
            "key": "foo",
            "cells": [
                {
                    "column_family": "foo",
                    "column": "bar",
                    "timestamp": "1970-01-01T01:00:00",
                    "value": "baz"
                }
            ]
        }
    ]
 """
 import datetime
 from itertools import groupby
 from typing import List, Dict
 from jc.jc_types import JSONDictType
 import jc.utils
 class info:
    """Static metadata describing this parser: version, author, platforms."""
    description = '`cbt` command parser'
    version = '1.0'
    author_email = 'andreas.weiden@gmail.com'
    author = 'Andreas Weiden'
    # optional free-form field, currently unused:
    # details = 'enter any other details here'

    # platform identifiers handed to jc.utils.compatibility() by parse()
    # possible values: linux, darwin, cygwin, win32, aix, freebsd
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
    # command names associated with this parser (cf. the `jc cbt` usage form)
    magic_commands = ['cbt']
 __version__ = info.version
 def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]:
    """
    Reshape raw rows into the default (non-raw) schema.

    Each row's flat list of cells is collapsed into a nested mapping of
    `column_family -> column -> value`, keeping only the value of the cell
    with the greatest timestamp for every (column_family, column) pair.

    Parameters:

        proc_data:   (List of Dictionaries) rows with a "key" and a list of
                     cell dictionaries under "cells"

    Returns:

        List of Dictionaries. The same row dictionaries, with "cells"
        replaced in place by the nested mapping.
    """
    def family_and_column(cell):
        return (cell["column_family"], cell["column"])

    def cell_timestamp(cell):
        return cell["timestamp"]

    processed = []
    for row in proc_data:
        latest = {}
        # groupby only merges adjacent items, so order by the grouping key first
        ordered = sorted(row["cells"], key=family_and_column)
        for (family, column), versions in groupby(ordered, key=family_and_column):
            # max() returns the first of several equally-new cells, which is
            # the same pick as a stable descending sort followed by [0]
            newest = max(versions, key=cell_timestamp)
            latest.setdefault(family, {})[column] = newest["value"]
        row["cells"] = latest
        processed.append(row)
    return processed
 def parse(
        data: str,
        raw: bool = False,
        quiet: bool = False
 ) -> List[JSONDictType]:
    """
    Main text parsing function

    The input is split into rows on a run of 40 dashes. Within a row, an
    unindented line is the row key, a line indented by two spaces is a
    `family:column @ timestamp` header and a line indented by four spaces is
    the quoted value belonging to the preceding header.

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.

    Raises:

        ValueError:  if a column header line has no "@" separator or its
                     timestamp does not match `%Y/%m/%d-%H:%M:%S.%f`
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)
    raw_output: List[Dict] = []
    if jc.utils.has_data(data):
        # filter(None, ...) drops the empty chunk in front of the first separator
        for line in filter(None, data.split("-" * 40)):
            key = None
            cells = []
            column_name = ""
            timestamp = None
            value_next = False
            for field in line.splitlines():
                if not field.strip():
                    continue
                if field.startswith(" " * 4):
                    # a value line is only meaningful right after a column header
                    if not value_next:
                        continue
                    value = field.strip()
                    # remove exactly one pair of enclosing quotes so that spaces
                    # and quote characters belonging to the value are kept
                    if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
                        value = value[1:-1]
                    column_family, _, column = column_name.partition(":")
                    cells.append({
                        "column_family": column_family,
                        "column": column,
                        "timestamp": datetime.datetime.strptime(timestamp, "%Y/%m/%d-%H:%M:%S.%f").isoformat(),
                        "value": value
                    })
                    # each header owns a single value line; without this reset
                    # any further indented line would add a duplicate cell
                    value_next = False
                elif field.startswith(" " * 2):
                    # split on the last "@" only: the column qualifier may
                    # itself contain "@", the timestamp never does
                    column_name, timestamp = map(str.strip, field.rsplit("@", 1))
                    value_next = True
                else:
                    key = field
            if key is not None:
                raw_output.append({"key": key, "cells": cells})
    return raw_output if raw else _process(raw_output)
--- a/tests/fixtures/generic/cbt-multiple-columns.json
+++ b/tests/fixtures/generic/cbt-multiple-columns.json
@ -0,0 +1 @@
 [{"key":"foo","cells":{"bat":{"bar":"baz"},"foo":{"bar1":"baz1","bar2":"baz2"}}}]
--- a/tests/fixtures/generic/cbt-multiple-columns.out
+++ b/tests/fixtures/generic/cbt-multiple-columns.out
@ -0,0 +1,8 @@
 ----------------------------------------
 foo
  foo:bar1                                 @ 1970/01/01-01:00:00.000000
    "baz1"
  foo:bar2                                 @ 1970/01/01-01:00:00.000000
    "baz2"
  bat:bar                                  @ 1970/01/01-01:00:00.000000
    "baz"
--- a/tests/fixtures/generic/cbt-multiple-rows-raw.json
+++ b/tests/fixtures/generic/cbt-multiple-rows-raw.json
@ -0,0 +1 @@
 [{"key":"foo","cells":[{"column_family":"foo","column":"bar","timestamp":"1970-01-01T01:00:00","value":"baz1"}]},{"key":"bar","cells":[{"column_family":"foo","column":"bar","timestamp":"1970-01-01T01:00:00","value":"baz2"}]}]
--- a/tests/fixtures/generic/cbt-multiple-rows.json
+++ b/tests/fixtures/generic/cbt-multiple-rows.json
@ -0,0 +1 @@
 [{"key":"foo","cells":{"foo":{"bar":"baz1"}}},{"key":"bar","cells":{"foo":{"bar":"baz2"}}}]
--- a/tests/fixtures/generic/cbt-multiple-rows.out
+++ b/tests/fixtures/generic/cbt-multiple-rows.out
@ -0,0 +1,8 @@
 ----------------------------------------
 foo
  foo:bar                                  @ 1970/01/01-01:00:00.000000
    "baz1"
 ----------------------------------------
 bar
  foo:bar                                  @ 1970/01/01-01:00:00.000000
    "baz2"
--- a/tests/fixtures/generic/cbt-single.json
+++ b/tests/fixtures/generic/cbt-single.json
@ -0,0 +1 @@
 [{"key":"foo","cells":{"foo":{"bar":"baz"}}}]
--- a/tests/fixtures/generic/cbt-single.out
+++ b/tests/fixtures/generic/cbt-single.out
@ -0,0 +1,4 @@
 ----------------------------------------
 foo
  foo:bar                                  @ 1970/01/01-01:00:00.000000
    "baz"
--- a/tests/test_cbt.py
+++ b/tests/test_cbt.py
@ -0,0 +1,64 @@
 import json
 import os
 import unittest
 from jc.exceptions import ParseError
 import jc.parsers.cbt
 THIS_DIR = os.path.dirname(os.path.abspath(__file__))
 class MyTests(unittest.TestCase):
    # Fixtures are read once, while the class body executes, and kept as
    # class attributes shared by all test methods.

    # raw command output (*.out) fed to the parser
    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-single.out'), 'r', encoding='utf-8') as f:
        single = f.read()
    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-multiple-columns.out'), 'r', encoding='utf-8') as f:
        multiple_columns = f.read()
    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-multiple-rows.out'), 'r', encoding='utf-8') as f:
        multiple_rows = f.read()
    # expected parser results (*.json), decoded to Python objects
    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-single.json'), 'r', encoding='utf-8') as f:
        single_json = json.loads(f.read())
    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-multiple-columns.json'), 'r', encoding='utf-8') as f:
        multiple_columns_json = json.loads(f.read())
    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-multiple-rows.json'), 'r', encoding='utf-8') as f:
        multiple_rows_json = json.loads(f.read())
    # expected result for the multiple-rows input when parsed with raw=True
    with open(os.path.join(THIS_DIR, os.pardir, 'tests/fixtures/generic/cbt-multiple-rows-raw.json'), 'r', encoding='utf-8') as f:
        multiple_rows_raw_json = json.loads(f.read())
    def test_cbt_nodata(self):
        """
        Test 'cbt' with no data
        """
        self.assertEqual(jc.parsers.cbt.parse('', quiet=True), [])
    def test_cbt_single_row(self):
        """
        Test 'cbt' with a single row
        """
        self.assertEqual(jc.parsers.cbt.parse(self.single, quiet=True), self.single_json)
    def test_cbt_multiple_column_families(self):
        """
        Test 'cbt' with multiple columns from multiple column families
        """
        self.assertEqual(jc.parsers.cbt.parse(self.multiple_columns, quiet=True), self.multiple_columns_json)
    def test_cbt_multiple_rows(self):
        """
        Test 'cbt' with multiple rows
        """
        self.assertEqual(jc.parsers.cbt.parse(self.multiple_rows, quiet=True), self.multiple_rows_json)
    def test_cbt_multiple_rows_raw(self):
        """
        Test 'cbt' with multiple rows raw
        """
        # same input as test_cbt_multiple_rows, compared against the raw schema
        self.assertEqual(jc.parsers.cbt.parse(self.multiple_rows, quiet=True, raw=True), self.multiple_rows_raw_json)
 # run the test cases when this file is executed directly as a script
 if __name__ == '__main__':
    unittest.main()
		`@ -0,0 +1 @@`
							`[{"key":"foo","cells":{"bat":{"bar":"baz"},"foo":{"bar1":"baz1","bar2":"baz2"}}}]`
		`@ -0,0 +1 @@`
							`[{"key":"foo","cells":[{"column_family":"foo","column":"bar","timestamp":"1970-01-01T01:00:00","value":"baz1"}]},{"key":"bar","cells":[{"column_family":"foo","column":"bar","timestamp":"1970-01-01T01:00:00","value":"baz2"}]}]`
		`@ -0,0 +1 @@`
							`[{"key":"foo","cells":{"foo":{"bar":"baz1"}}},{"key":"bar","cells":{"foo":{"bar":"baz2"}}}]`