diff --git a/jc/lib.py b/jc/lib.py
index 6bfa4c81..39065997 100644
--- a/jc/lib.py
+++ b/jc/lib.py
@@ -23,6 +23,7 @@ parsers: List[str] = [
     'cef-s',
     'chage',
     'cksum',
+    'clf',
     'crontab',
     'crontab-u',
     'csv',
diff --git a/jc/parsers/clf.py b/jc/parsers/clf.py
new file mode 100644
index 00000000..577de90a
--- /dev/null
+++ b/jc/parsers/clf.py
@@ -0,0 +1,264 @@
+"""jc - JSON Convert Common Log Format file parser
+
+This parser will handle the Common Log Format standard as specified at
+https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format.
+Extra fields may be present and will be enclosed in the `extra` field as
+a single string.
+
+Lines that do not match the format are skipped.
+
+Usage (cli):
+
+    $ cat file.log | jc --clf
+
+Usage (module):
+
+    import jc
+    result = jc.parse('clf', common_log_file_output)
+
+Schema:
+
+    [
+      {
+        "host":             string,
+        "ident":            string,
+        "authuser":         string,
+        "date":             string,
+        "day":              integer,
+        "month":            string,
+        "year":             integer,
+        "hour":             integer,
+        "minute":           integer,
+        "second":           integer,
+        "tz":               string,
+        "request":          string,
+        "request_method":   string,
+        "request_url":      string,
+        "request_version":  string/null,
+        "status":           integer/null,   # [2]
+        "bytes":            integer/null,   # [2]
+        "extra":            string,
+        "epoch":            integer/null,   # [0]
+        "epoch_utc":        integer/null    # [1]
+      }
+    ]
+
+    [0] naive timestamp (the timezone field is ignored and the time is
+        interpreted in the local timezone of the system running jc)
+    [1] timezone-aware timestamp. Only available if timezone field is UTC
+    [2] null if the field is `-` in the log entry
+
+Examples:
+
+    $ cat file.log | jc --clf -p
+    [
+      {
+        "host": "127.0.0.1",
+        "ident": "user-identifier",
+        "authuser": "frank",
+        "date": "10/Oct/2000:13:55:36 +0000",
+        "day": 10,
+        "month": "Oct",
+        "year": 2000,
+        "hour": 13,
+        "minute": 55,
+        "second": 36,
+        "tz": "+0000",
+        "request": "GET /apache_pb.gif HTTP/1.0",
+        "request_method": "GET",
+        "request_url": "/apache_pb.gif",
+        "request_version": "HTTP/1.0",
+        "status": 200,
+        "bytes": 2326,
+        "extra": "",
+        "epoch": 971186136,
+        "epoch_utc": 971186136
+      }
+    ]
+
+    $ cat file.log | jc --clf -p -r
+    [
+      {
+        "host": "127.0.0.1",
+        "ident": "user-identifier",
+        "authuser": "frank",
+        "date": "10/Oct/2000:13:55:36 +0000",
+        "day": "10",
+        "month": "Oct",
+        "year": "2000",
+        "hour": "13",
+        "minute": "55",
+        "second": "36",
+        "tz": "+0000",
+        "request": "GET /apache_pb.gif HTTP/1.0",
+        "request_method": "GET",
+        "request_url": "/apache_pb.gif",
+        "request_version": "HTTP/1.0",
+        "status": "200",
+        "bytes": "2326",
+        "extra": ""
+      }
+    ]
+"""
+import re
+from datetime import datetime, timezone
+from typing import List, Dict, Optional, Tuple
+from jc.jc_types import JSONDictType
+import jc.utils
+
+
+class info():
+    """Provides parser metadata (version, author, etc.)"""
+    version = '1.0'
+    description = 'Common Log Format file parser'
+    author = 'Kelly Brazil'
+    author_email = 'kellyjonbrazil@gmail.com'
+    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
+
+
+__version__ = info.version
+
+
+# Fields that _process() converts from string to integer.
+_INT_FIELDS = ('day', 'year', 'hour', 'minute', 'second', 'status', 'bytes')
+
+# English month abbreviations, mapped explicitly so that date handling does
+# not depend on the process locale (as strptime's %b would).
+_MONTHS = {
+    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
+    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
+}
+
+# Timezone field values treated as UTC when computing `epoch_utc`.
+_UTC_OFFSETS = ('+0000', '-0000')
+
+# Matches one Common Log Format entry. Whitespace inside the pattern is
+# ignored (re.VERBOSE), so literal spaces are written as \s.
+_CLF_PATTERN = re.compile(r'''
+    ^(?P<host>-|\S+)\s
+    (?P<ident>-|\S+)\s
+    (?P<authuser>-|\S+)\s
+    \[
+    (?P<date>
+        (?P<day>\d+)/
+        (?P<month>\S\S\S)/
+        (?P<year>\d\d\d\d):
+        (?P<hour>\d\d):
+        (?P<minute>\d\d):
+        (?P<second>\d\d)\s
+        (?P<tz>\S+)
+    )
+    \]\s
+    \"(?P<request>
+        (?P<request_method>\S+)\s
+        (?P<request_url>.*?(?=\sHTTPS?/|\"))\s?  # positive lookahead for HTTP or quote mark
+        (?P<request_version>HTTPS?/\d\.\d)?)\"\s
+    (?P<status>-|\d\d\d)\s
+    (?P<bytes>-|\d+)\s?
+    (?P<extra>.*)
+    ''', re.VERBOSE
+)
+
+
+def _to_int(value: Optional[str]) -> Optional[int]:
+    """Return `value` as an integer, or None if it is not numeric."""
+    if value is None:
+        return None
+
+    try:
+        return int(value)
+    except ValueError:
+        # e.g. a `-` placeholder in the status or bytes field
+        return None
+
+
+def _epochs(log: JSONDictType) -> Tuple[Optional[int], Optional[int]]:
+    """
+    Compute the `epoch` and `epoch_utc` values for one processed entry.
+
+    Parameters:
+
+        log:         (Dictionary) entry whose date fields have already
+                     been converted to integers
+
+    Returns:
+
+        Tuple of (epoch, epoch_utc). Either value is None if it cannot
+        be computed.
+    """
+    try:
+        naive = datetime(
+            log['year'], _MONTHS[log['month'].lower()], log['day'],
+            log['hour'], log['minute'], log['second']
+        )
+    except (AttributeError, KeyError, TypeError, ValueError):
+        # missing or out-of-range date fields (e.g. a leap second of 60)
+        return None, None
+
+    try:
+        epoch: Optional[int] = int(naive.timestamp())
+    except (OverflowError, OSError, ValueError):
+        # the platform cannot represent this local time
+        epoch = None
+
+    epoch_utc = None
+    if log.get('tz') in _UTC_OFFSETS:
+        epoch_utc = int(naive.replace(tzinfo=timezone.utc).timestamp())
+
+    return epoch, epoch_utc
+
+
+def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]:
+    """
+    Final processing to conform to the schema.
+
+    Parameters:
+
+        proc_data:   (List of Dictionaries) raw structured data to process
+
+    Returns:
+
+        List of Dictionaries. Structured to conform to the schema.
+    """
+    for log in proc_data:
+        for key in _INT_FIELDS:
+            if key in log:
+                log[key] = _to_int(log[key])
+
+        log['epoch'], log['epoch_utc'] = _epochs(log)
+
+    return proc_data
+
+
+def parse(
+    data: str,
+    raw: bool = False,
+    quiet: bool = False
+) -> List[JSONDictType]:
+    """
+    Main text parsing function
+
+    Parameters:
+
+        data:        (string)  text data to parse
+        raw:         (boolean) unprocessed output if True
+        quiet:       (boolean) suppress warning messages if True
+
+    Returns:
+
+        List of Dictionaries. Raw or processed structured data.
+    """
+    jc.utils.compatibility(__name__, info.compatible, quiet)
+    jc.utils.input_type_check(data)
+
+    raw_output: List[Dict] = []
+
+    if jc.utils.has_data(data):
+
+        for line in filter(None, data.splitlines()):
+            # lines that do not match the format are skipped
+            clf_match = _CLF_PATTERN.match(line)
+            if clf_match:
+                raw_output.append(clf_match.groupdict())
+
+    return raw_output if raw else _process(raw_output)