1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-07-15 01:24:29 +02:00

Merge pull request #327 from graipher/master

Add parser for cbt
This commit is contained in:
Kelly Brazil
2022-12-13 10:40:46 -06:00
committed by GitHub
10 changed files with 280 additions and 0 deletions

View File

@ -161,6 +161,7 @@ option.
| ` --asciitable` | ASCII and Unicode table parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/asciitable) |
| ` --asciitable-m` | multi-line ASCII and Unicode table parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/asciitable_m) |
| ` --blkid` | `blkid` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/blkid) |
| ` --cbt` | `cbt` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cbt) |
| ` --cef` | CEF string parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef) |
| ` --cef-s` | CEF string streaming parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef_s) |
| ` --chage` | `chage --list` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage) |

191
jc/parsers/cbt.py Normal file
View File

@ -0,0 +1,191 @@
"""jc - JSON Convert `cbt` command output parser
Parses the human-friendly (but not machine-friendly) output of the `cbt` command (for Google's Bigtable).
No effort is made to convert the data types of the values in the cells.
The timestamps of the cells are converted to Python's isoformat.
Raw output contains all cells for each column (including timestamps converted to Python's isoformat),
while the normal output contains only the latest value for each column.
Usage (cli):
$ cbt | jc --cbt
or
$ jc cbt
Usage (module):
import jc
result = jc.parse('cbt', cbt_command_output)
Schema:
[
{
"key": string,
"cells": {
string: {
string: string
}
}
}
]
Schema (raw):
[
{
"key": string,
"cells": [
{
"column_family": string,
"column": string,
"timestamp": string,
"value": string
}
]
}
]
Examples:
$ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p
[
{
"key": "foo",
"cells": {
"foo": {
"bar": "baz"
}
}
}
]
$ cbt -project=$PROJECT -instance=$INSTANCE lookup $TABLE foo | jc --cbt -p -r
[
{
"key": "foo",
"cells": [
{
"column_family": "foo",
"column": "bar",
"timestamp": "1970-01-01T01:00:00",
"value": "baz"
}
]
}
]
"""
import datetime
from itertools import groupby
from typing import List, Dict
from jc.jc_types import JSONDictType
import jc.utils
class info:
    """Parser metadata: version, description, author and platform support."""
    version = '1.0'
    description = '`cbt` command parser'
    author = 'Andreas Weiden'
    author_email = 'andreas.weiden@gmail.com'
    # platforms handed to jc.utils.compatibility() by parse()
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
    # command names listed as magic commands for this parser
    magic_commands = ['cbt']


# module version mirrors the parser metadata
__version__ = info.version
def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]:
    """
    Final processing to conform to the schema.

    Each row's flat list of cells is replaced by a nested mapping of
    column family -> column -> value. When a column has several cells,
    the value of the cell with the greatest timestamp is kept.

    Parameters:

        proc_data:   (List of Dictionaries) raw structured data to process

    Returns:

        List of Dictionaries. Structured to conform to the schema.
    """
    def column_id(cell):
        return (cell["column_family"], cell["column"])

    def cell_time(cell):
        return cell["timestamp"]

    processed = []

    for entry in proc_data:
        latest = {}

        # groupby() only merges adjacent items, so order by column first
        ordered = sorted(entry["cells"], key=column_id)

        for (family, qualifier), versions in groupby(ordered, key=column_id):
            # max() returns the first of several equally-new cells, the same
            # element a stable descending sort would place at index 0
            newest = max(versions, key=cell_time)
            latest.setdefault(family, {})[qualifier] = newest["value"]

        # the row dictionary is updated in place and reused in the result
        entry["cells"] = latest
        processed.append(entry)

    return processed
def _split_rows(data: str):
    """Yield the lines of each row; rows are delimited by lines of exactly 40 dashes."""
    separator = "-" * 40
    row_lines: List[str] = []

    for line in data.splitlines():
        # Only a whole separator line starts a new row. A run of dashes inside
        # a row key or a quoted value must not split the row.
        if line.rstrip() == separator:
            yield row_lines
            row_lines = []
        else:
            row_lines.append(line)

    yield row_lines


def _unquote(text: str) -> str:
    """Remove surrounding whitespace and one pair of enclosing double quotes."""
    text = text.strip()

    if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        # drop only the outer quotes so spaces and escaped quotes that belong
        # to the value itself survive
        return text[1:-1]

    # not a fully quoted value: keep the historical lenient stripping
    return text.strip(' "')


def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
) -> List[JSONDictType]:
    """
    Main text parsing function

    The input is expected to consist of rows introduced by a line of 40
    dashes. Within a row, an unindented line is the row key, a line
    indented by two spaces is `family:column @ timestamp`, and the
    following line indented by four spaces is the quoted cell value.

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.

    Raises:

        ValueError:  if a column line has no `@` separator or a cell
                     timestamp does not match `%Y/%m/%d-%H:%M:%S.%f`
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    raw_output: List[Dict] = []

    if jc.utils.has_data(data):
        for row_lines in _split_rows(data):
            key = None
            cells = []
            column_name = ""
            timestamp = ""
            value_next = False

            for field in row_lines:
                if not field.strip():
                    continue

                if field.startswith(" " * 4):
                    if value_next:
                        # a missing ':' yields an empty column name
                        column_family, _, column = column_name.partition(":")
                        cells.append({
                            "column_family": column_family,
                            "column": column,
                            "timestamp": datetime.datetime.strptime(
                                timestamp, "%Y/%m/%d-%H:%M:%S.%f"
                            ).isoformat(),
                            "value": _unquote(field)
                        })
                        # one value per column header: a stray indented line
                        # must not emit a duplicate cell
                        value_next = False

                elif field.startswith(" " * 2):
                    # split on the last '@' so qualifiers containing '@' work;
                    # the timestamp itself never contains one
                    column_name, timestamp = map(str.strip, field.rsplit("@", 1))
                    value_next = True

                else:
                    key = field

            # chunks without a key (e.g. text before the first separator that
            # is blank) do not describe a row
            if key is not None:
                raw_output.append({"key": key, "cells": cells})

    return raw_output if raw else _process(raw_output)

View File

@ -0,0 +1 @@
[{"key":"foo","cells":{"bat":{"bar":"baz"},"foo":{"bar1":"baz1","bar2":"baz2"}}}]

View File

@ -0,0 +1,8 @@
----------------------------------------
foo
foo:bar1 @ 1970/01/01-01:00:00.000000
"baz1"
foo:bar2 @ 1970/01/01-01:00:00.000000
"baz2"
bat:bar @ 1970/01/01-01:00:00.000000
"baz"

View File

@ -0,0 +1 @@
[{"key":"foo","cells":[{"column_family":"foo","column":"bar","timestamp":"1970-01-01T01:00:00","value":"baz1"}]},{"key":"bar","cells":[{"column_family":"foo","column":"bar","timestamp":"1970-01-01T01:00:00","value":"baz2"}]}]

View File

@ -0,0 +1 @@
[{"key":"foo","cells":{"foo":{"bar":"baz1"}}},{"key":"bar","cells":{"foo":{"bar":"baz2"}}}]

View File

@ -0,0 +1,8 @@
----------------------------------------
foo
foo:bar @ 1970/01/01-01:00:00.000000
"baz1"
----------------------------------------
bar
foo:bar @ 1970/01/01-01:00:00.000000
"baz2"

View File

@ -0,0 +1 @@
[{"key":"foo","cells":{"foo":{"bar":"baz"}}}]

4
tests/fixtures/generic/cbt-single.out vendored Normal file
View File

@ -0,0 +1,4 @@
----------------------------------------
foo
foo:bar @ 1970/01/01-01:00:00.000000
"baz"

64
tests/test_cbt.py Normal file
View File

@ -0,0 +1,64 @@
import json
import os
import unittest
from jc.exceptions import ParseError
import jc.parsers.cbt
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
class MyTests(unittest.TestCase):
    """Fixture-based tests for the `cbt` parser."""

    def _fixture(relative_path, decode_json=False):
        """Read a fixture while the class body runs; decode JSON on request."""
        full_path = os.path.join(THIS_DIR, os.pardir, relative_path)
        with open(full_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
        return json.loads(text) if decode_json else text

    # command output samples
    single = _fixture('tests/fixtures/generic/cbt-single.out')
    multiple_columns = _fixture('tests/fixtures/generic/cbt-multiple-columns.out')
    multiple_rows = _fixture('tests/fixtures/generic/cbt-multiple-rows.out')

    # expected parser results
    single_json = _fixture('tests/fixtures/generic/cbt-single.json', decode_json=True)
    multiple_columns_json = _fixture('tests/fixtures/generic/cbt-multiple-columns.json', decode_json=True)
    multiple_rows_json = _fixture('tests/fixtures/generic/cbt-multiple-rows.json', decode_json=True)
    multiple_rows_raw_json = _fixture('tests/fixtures/generic/cbt-multiple-rows-raw.json', decode_json=True)

    # the loader is only needed while the class body executes
    del _fixture

    def test_cbt_nodata(self):
        """
        Empty input produces an empty list
        """
        parsed = jc.parsers.cbt.parse('', quiet=True)
        self.assertEqual(parsed, [])

    def test_cbt_single_row(self):
        """
        A single row with a single cell
        """
        parsed = jc.parsers.cbt.parse(self.single, quiet=True)
        self.assertEqual(parsed, self.single_json)

    def test_cbt_multiple_column_families(self):
        """
        One row holding several columns spread over several column families
        """
        parsed = jc.parsers.cbt.parse(self.multiple_columns, quiet=True)
        self.assertEqual(parsed, self.multiple_columns_json)

    def test_cbt_multiple_rows(self):
        """
        Several rows, processed output
        """
        parsed = jc.parsers.cbt.parse(self.multiple_rows, quiet=True)
        self.assertEqual(parsed, self.multiple_rows_json)

    def test_cbt_multiple_rows_raw(self):
        """
        Several rows, raw output
        """
        parsed = jc.parsers.cbt.parse(self.multiple_rows, quiet=True, raw=True)
        self.assertEqual(parsed, self.multiple_rows_raw_json)
# run the suite when this file is executed directly
if __name__ == '__main__':
    unittest.main()