From b087e712ca1199b7270a73b995443a218043be5e Mon Sep 17 00:00:00 2001 From: Kelly Brazil Date: Tue, 9 Aug 2022 20:25:54 -0700 Subject: [PATCH] initial cef parser --- CHANGELOG | 1 + jc/lib.py | 1 + jc/parsers/cef.py | 226 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 228 insertions(+) create mode 100644 jc/parsers/cef.py diff --git a/CHANGELOG b/CHANGELOG index a7d7cf56..2ee8d2e5 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ jc changelog xxxxxxxx v1.20.5 - Add IP Address string parser +- Add CEF string parser - Add PLIST file parser (XML and binary support) - Add `mdadm` command parser tested on linux (IN PROGRESS) - Add `--time-out` or `-t` option to add a UTC timestamp to the JSON output diff --git a/jc/lib.py b/jc/lib.py index 40314c82..955b4706 100644 --- a/jc/lib.py +++ b/jc/lib.py @@ -16,6 +16,7 @@ parsers = [ 'asciitable', 'asciitable-m', 'blkid', + 'cef', 'chage', 'cksum', 'crontab', diff --git a/jc/parsers/cef.py b/jc/parsers/cef.py new file mode 100644 index 00000000..c3a53941 --- /dev/null +++ b/jc/parsers/cef.py @@ -0,0 +1,226 @@ +"""jc - JSON Convert CEF string parser + +This is a best-effort parser since there are so many variations to CEF +formatting from different vendors. If you require special handling for your +CEF input, you can copy this parser code to the `jc` plugin directory for +your system and modify it to suit your needs. + +This parser will accept a single CEF string or multiple CEF string lines. +Any text before "CEF" will be ignored. + +Usage (cli): + + $ echo 'CEF:0|Vendor|Product|3.2.0|1|SYSTEM|1|... 
class info:
    """Static metadata describing this parser (version, author, platforms)."""
    # Parser version; also exposed at module level as `__version__`.
    version = '1.0'
    description = 'CEF string parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    # Attribution for the bundled third-party parsing code.
    details = (
        'Using the pycef library at '
        'https://github.com/DavidJBianco/pycef/releases/tag/v1.11-2'
    )
    # Platform identifiers handed to jc.utils.compatibility() by parse().
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
+""" + +def _pycef_parse(str_input): + """ + Parse a string in CEF format and return a dict with the header values + and the extension data. + """ + + # Create the empty dict we'll return later + values = dict() + + # This regex separates the string into the CEF header and the extension + # data. Once we do this, it's easier to use other regexes to parse each + # part. + header_re = r'((CEF:\d+)([^=\\]+\|){,7})(.*)' + + res = re.search(header_re, str_input) + + if res: + header = res.group(1) + extension = res.group(4) + + # Split the header on the "|" char. Uses a negative lookbehind + # assertion to ensure we don't accidentally split on escaped chars, + # though. + spl = re.split(r'(? 6: + values["Severity"] = spl[6] + + # The first value is actually the CEF version, formatted like + # "CEF:#". Ignore anything before that (like a date from a syslog message). + # We then split on the colon and use the second value as the + # version number. + cef_start = spl[0].find('CEF') + if cef_start == -1: + raise ParseError('Invalid CEF string.') + (cef, version) = spl[0][cef_start:].split(':') + values["CEFVersion"] = version + + # The ugly, gnarly regex here finds a single key=value pair, + # taking into account multiple whitespaces, escaped '=' and '|' + # chars. It returns an iterator of tuples. + spl = re.findall(r'([^=\s]+)=((?:[\\]=|[^=])+)(?:\s|$)', extension) + + for i in spl: + # Split the tuples and put them into the dictionary + values[i[0]] = i[1] + + # Process custom field labels + for key in list(values.keys()): + # If the key string ends with Label, replace it in the appropriate + # custom field + if key[-5:] == "Label": + customlabel = key[:-5] + # Find the corresponding customfield and replace with the label + for customfield in list(values.keys()): + if customfield == customlabel: + values[values[key]] = values[customfield] + del values[customfield] + del values[key] + else: + raise ParseError('Could not parse record. 
def _process(proc_data: List[Dict]) -> List[Dict]:
    r"""
    Final processing to conform to the schema.

    Each dictionary in `proc_data` is updated in place: surrounding
    whitespace is stripped from every value, the backslash escapes `\\`,
    `\"` and `\]` are decoded, and double quotes are removed from key names.

    Parameters:

        proc_data:   (List of Dictionaries) raw structured data to process.
                     Every value must be a string (`str.strip` is called
                     on it).

    Returns:

        List of Dictionaries. Structured to conform to the schema. This is
        the same list object that was passed in.
    """
    # Escape sequences specified in syslog RFC 5424:
    # https://www.rfc-editor.org/rfc/rfc5424.html#section-6
    # They are decoded in a single left-to-right pass so the backslash
    # produced by decoding `\\` is never re-read as the start of another
    # escape. Chained str.replace() calls would turn `\\"` into `"`.
    escape_re = re.compile(r'\\([\\"\]])')

    for item in proc_data:
        cleaned = {}
        for key, value in item.items():
            # strip spaces around the value, decode escapes, and drop any
            # quotation marks from the key name
            cleaned[key.replace('"', '')] = escape_re.sub(r'\1', value.strip())

        # Refill the original dictionary so the update stays in place and
        # renamed keys keep their original position.
        item.clear()
        item.update(cleaned)

    return proc_data


def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
) -> List[Dict]:
    """
    Main text parsing function

    Every non-empty line of `data` is parsed as one CEF record.

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    raw_output: List[Dict] = []

    if jc.utils.has_data(data):
        # splitlines() yields '' for blank lines; those are skipped
        raw_output = [
            _pycef_parse(line) for line in data.splitlines() if line
        ]

    return raw_output if raw else _process(raw_output)