Add parser for cbt

2025-07-13 01:20:24 +02:00 · 2022-12-12 15:10:59 +01:00
parent 299b0faf7c
commit b9fb7fad9c
2 changed files with 320 additions and 0 deletions
--- a/jc/parsers/cbt.py
+++ b/jc/parsers/cbt.py
@ -0,0 +1,175 @@
 """jc - JSON Convert `cbt` command output parser
 Parses the human-readable (but not machine-readable) output of the `cbt` command (for Google's Bigtable).
 No effort is made to convert the data types of the values in the cells.
 The timestamps of the cells are converted to Python's isoformat.
 Raw output contains all cells for each column (including their timestamps, converted to Python's isoformat),
 while the normal output contains only the latest value for each column.
 Usage (cli):
    $ cbt | jc --cbt
 or
    $ jc cbt
 Usage (module):
    import jc
    result = jc.parse('cbt', cbt_command_output)
 Schema:
    [
      {
        "key":     string,
        "cells": {
            string: {
                string: string
            }
        }
      }
    ]
 Examples:
    $ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p
    [
        {
            "key": "foo",
            "cells": {
                "foo": {
                    "bar": "baz"
                }
            }
        }
    ]
    $ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p -r
    [
        {
            "key": "foo",
            "cells": [
                {
                    "column_family": "foo",
                    "column": "bar",
                    "timestamp": "1970-01-01T01:00:00",
                    "value": "baz"
                }
            ]
        }
    ]
 """
 import datetime
 from itertools import groupby
 from typing import List, Dict
 from jc.jc_types import JSONDictType
 import jc.utils
 class info:
    """Static metadata describing the `cbt` parser (version, author, platforms, magic command)."""
    version = '1.0'
    description = '`cbt` command parser'
    author = 'Andreas Weiden'
    author_email = 'andreas.weiden@gmail.com'
    # Platforms on which this parser is declared compatible.
    compatible = [
        'linux',
        'darwin',
        'cygwin',
        'win32',
        'aix',
        'freebsd',
    ]
    # Command names that select this parser when using the `jc cbt ...` syntax.
    magic_commands = ['cbt']
 # Expose the parser version from `info` as a module-level attribute.
 __version__ = info.version
 def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]:
    """
    Collapse each row's flat cell list into a nested mapping of latest values.
    Every row's "cells" list is replaced (in place) by a dictionary of the form
    {column_family: {column: value}}, where the value is taken from the cell
    with the greatest "timestamp" for that column. When several cells of one
    column share the greatest timestamp, the one listed first wins.
    Parameters:
        proc_data:   (List of Dictionaries) raw structured data to process
    Returns:
        List of Dictionaries. Structured to conform to the schema.
    """
    def column_id(cell):
        # Cells are grouped by their (family, qualifier) pair.
        return cell["column_family"], cell["column"]
    processed = []
    for row in proc_data:
        latest = {}
        # groupby only merges adjacent items, so order the cells by column first.
        ordered = sorted(row["cells"], key=column_id)
        for (family, qualifier), versions in groupby(ordered, key=column_id):
            newest = max(versions, key=lambda cell: cell["timestamp"])
            latest.setdefault(family, {})[qualifier] = newest["value"]
        row["cells"] = latest
        processed.append(row)
    return processed
 def parse(
        data: str,
        raw: bool = False,
        quiet: bool = False
 ) -> List[JSONDictType]:
    """
    Main text parsing function
    The input is split into rows on the 40-dash separator line. Within a row,
    an unindented line is the row key, a line indented by two spaces is a cell
    header of the form `family:qualifier @ timestamp`, and a line indented by
    four spaces is the value of the preceding cell header.
    Parameters:
        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True
    Returns:
        List of Dictionaries. Raw or processed structured data.
    Raises:
        ValueError:  if a cell header has no "@" separator or its timestamp
                     does not match the expected `%Y/%m/%d-%H:%M:%S.%f` format
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)
    row_separator = "-" * 40
    timestamp_format = "%Y/%m/%d-%H:%M:%S.%f"
    raw_output: List[Dict] = []
    if jc.utils.has_data(data):
        for row_text in filter(None, data.split(row_separator)):
            key = None
            cells: List[Dict] = []
            column_name = ""
            timestamp = ""
            value_next = False
            for field in row_text.splitlines():
                if not field.strip():
                    continue
                if field.startswith(" " * 4):
                    if not value_next:
                        # No pending cell header: ignore stray indented lines
                        # instead of emitting a duplicate cell.
                        continue
                    # Each header owns exactly one value line.
                    value_next = False
                    value = field.strip(" ")
                    # Remove only the single surrounding pair of quotes so
                    # quotes and spaces belonging to the value are preserved.
                    if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
                        value = value[1:-1]
                    # partition tolerates a header without ":" (empty column)
                    # rather than raising IndexError.
                    column_family, _, column = column_name.partition(":")
                    cells.append({
                        "column_family": column_family,
                        "column": column,
                        "timestamp": datetime.datetime.strptime(timestamp, timestamp_format).isoformat(),
                        "value": value
                    })
                elif field.startswith(" " * 2):
                    # Split on the LAST "@": the qualifier may itself contain
                    # "@", while the timestamp never does.
                    column_name, timestamp = map(str.strip, field.rsplit("@", 1))
                    value_next = True
                else:
                    key = field
            if key is not None:
                raw_output.append({"key": key, "cells": cells})
    return raw_output if raw else _process(raw_output)
--- a/tests/test_cbt.py
+++ b/tests/test_cbt.py
@ -0,0 +1,145 @@
 import os
 import unittest
 from jc.exceptions import ParseError
 import jc.parsers.cbt
 THIS_DIR = os.path.dirname(os.path.abspath(__file__))
 class MyTests(unittest.TestCase):
    """Unit tests for the `cbt` parser, covering processed and raw output."""
    def test_cbt_nodata(self):
        """
        Empty input yields an empty list
        """
        actual = jc.parsers.cbt.parse('', quiet=True)
        self.assertEqual(actual, [])
    def test_cbt_single_row(self):
        """
        One row with one cell is parsed into the nested schema
        """
        cbt_output = '''
 ----------------------------------------
 foo
  foo:bar                                  @ 1970/01/01-01:00:00.000000
    "baz"
        '''
        expected = [
            {
                "key": "foo",
                "cells": {
                    "foo": {
                        "bar": "baz"
                    }
                }
            }
        ]
        actual = jc.parsers.cbt.parse(cbt_output, quiet=True)
        self.assertEqual(actual, expected)
    def test_cbt_multiple_column_families(self):
        """
        One row whose cells span several column families and columns
        """
        cbt_output = '''
 ----------------------------------------
 foo
  foo:bar1                                 @ 1970/01/01-01:00:00.000000
    "baz1"
  foo:bar2                                 @ 1970/01/01-01:00:00.000000
    "baz2"
  bat:bar                                  @ 1970/01/01-01:00:00.000000
    "baz"
            '''
        expected = [
            {
                "key": "foo",
                "cells": {
                    "foo": {
                        "bar1": "baz1",
                        "bar2": "baz2",
                    },
                    "bat": {
                        "bar": "baz"
                    }
                }
            }
        ]
        actual = jc.parsers.cbt.parse(cbt_output, quiet=True)
        self.assertEqual(actual, expected)
    def test_cbt_multiple_rows(self):
        """
        Two rows separated by the dashed line produce two entries
        """
        cbt_output = '''
 ----------------------------------------
 foo
  foo:bar                                  @ 1970/01/01-01:00:00.000000
    "baz1"
 ----------------------------------------
 bar
  foo:bar                                  @ 1970/01/01-01:00:00.000000
    "baz2"
            '''
        expected = [
            {
                "key": "foo",
                "cells": {
                    "foo": {
                        "bar": "baz1",
                    }
                }
            },
            {
                "key": "bar",
                "cells": {
                    "foo": {
                        "bar": "baz2",
                    }
                }
            }
        ]
        actual = jc.parsers.cbt.parse(cbt_output, quiet=True)
        self.assertEqual(actual, expected)
    def test_cbt_multiple_rows_raw(self):
        """
        Two rows in raw mode keep the flat cell list with timestamps
        """
        cbt_output = '''
 ----------------------------------------
 foo
  foo:bar                                  @ 1970/01/01-01:00:00.000000
    "baz1"
 ----------------------------------------
 bar
  foo:bar                                  @ 1970/01/01-01:00:00.000000
    "baz2"
            '''
        expected = [
            {
                "key": "foo",
                "cells": [
                    {
                        "column_family": "foo",
                        "column": "bar",
                        "timestamp": "1970-01-01T01:00:00",
                        "value": "baz1",
                    }
                ]
            },
            {
                "key": "bar",
                "cells": [
                    {
                        "column_family": "foo",
                        "column": "bar",
                        "timestamp": "1970-01-01T01:00:00",
                        "value": "baz2",
                    }
                ]
            }
        ]
        actual = jc.parsers.cbt.parse(cbt_output, quiet=True, raw=True)
        self.assertEqual(actual, expected)
 # Run the test suite when this file is executed directly as a script.
 if __name__ == '__main__':
    unittest.main()