mirror of
https://github.com/kellyjonbrazil/jc.git
synced 2025-06-19 00:17:51 +02:00
initial common log format parser
This commit is contained in:
@ -23,6 +23,7 @@ parsers: List[str] = [
|
||||
'cef-s',
|
||||
'chage',
|
||||
'cksum',
|
||||
'clf',
|
||||
'crontab',
|
||||
'crontab-u',
|
||||
'csv',
|
||||
|
149
jc/parsers/clf.py
Normal file
149
jc/parsers/clf.py
Normal file
@ -0,0 +1,149 @@
|
||||
"""jc - JSON Convert Common Log Format file parser
|
||||
|
||||
This parser will handle the Common Log Format standard as specified at
|
||||
https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format.
|
||||
Extra fields may be present and will be enclosed in the `extra` field as
|
||||
a single string.
|
||||
|
||||
Usage (cli):
|
||||
|
||||
$ cat file.log | jc --clf
|
||||
|
||||
Usage (module):
|
||||
|
||||
import jc
|
||||
result = jc.parse('clf', common_log_file_output)
|
||||
|
||||
Schema:
|
||||
|
||||
[
|
||||
{
|
||||
"host": string,
|
||||
"ident": string,
|
||||
"authuser": string,
|
||||
"date": string,
|
||||
"day": integer,
|
||||
"month": string,
|
||||
"year": integer,
|
||||
"hour": integer,
|
||||
"minute": integer,
|
||||
"second": integer,
|
||||
"tz": string,
|
||||
"request": string,
|
||||
"request_method": string,
|
||||
"request_url": string,
|
||||
"request_version": string,
|
||||
"status": integer,
|
||||
"bytes": integer,
|
||||
"extra": string,
|
||||
"epoch": integer, # [0]
|
||||
"epoch_utc": integer # [1]
|
||||
}
|
||||
]
|
||||
|
||||
[0] naive timestamp
|
||||
[1] timezone-aware timestamp. Only available if timezone field is UTC
|
||||
|
||||
Examples:
|
||||
|
||||
$ cat file.log | jc --clf -p
|
||||
[]
|
||||
|
||||
$ cat file.log | jc --clf -p -r
|
||||
[]
|
||||
"""
|
||||
import re
|
||||
from typing import List, Dict
|
||||
from jc.jc_types import JSONDictType
|
||||
import jc.utils
|
||||
|
||||
|
||||
class info:
    """Static metadata describing this parser (version, author, platforms)."""
    # Parser release number; re-exported below as the module's __version__.
    version = '1.0'
    # One-line, human-readable summary of what the parser handles.
    description = 'Common Log Format file parser'
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    # Platform identifiers handed to jc.utils.compatibility() by parse().
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']


__version__ = info.version
|
||||
|
||||
|
||||
def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]:
    """
    Final processing to conform to the schema.

    Converts the numeric fields of each log entry (`day`, `year`, `hour`,
    `minute`, `second`, `status`, `bytes`) from strings to integers. The
    entries are modified in place and the same list is returned.

    A value that cannot be converted to an integer - most commonly the
    `-` placeholder that the log format permits for `status` and
    `bytes` - is set to `None` instead of raising an exception.

    NOTE(review): the module schema also lists `epoch` and `epoch_utc`
    fields; this function does not add them.

    Parameters:

        proc_data:   (List of Dictionaries) raw structured data to process

    Returns:

        List of Dictionaries. Structured to conform to the schema.
    """
    int_list = {'day', 'year', 'hour', 'minute', 'second', 'status', 'bytes'}

    for log in proc_data:
        # Only visit the integer fields that are actually present. Keys are
        # reassigned, never added or removed, so the dict size is stable.
        for key in int_list & log.keys():
            try:
                log[key] = int(log[key])
            except (TypeError, ValueError):
                # '-' (field not available), blank or missing value
                log[key] = None

    return proc_data
|
||||
|
||||
|
||||
def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
) -> List[JSONDictType]:
    """
    Main text parsing function

    Matches each non-empty input line against the Common Log Format
    pattern and collects the named groups of every matching line into a
    dictionary. Lines that do not match the pattern are skipped.

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    # Compiled in VERBOSE mode: whitespace inside the pattern is ignored and
    # `#` starts a comment, so the layout below is purely for readability.
    line_re = re.compile(r'''
        ^(?P<host>-|\S+)\s
        (?P<ident>-|\S+)\s
        (?P<authuser>-|\S+)\s
        \[
        (?P<date>
            (?P<day>\d+)/
            (?P<month>\S\S\S)/
            (?P<year>\d\d\d\d):
            (?P<hour>\d\d):
            (?P<minute>\d\d):
            (?P<second>\d\d)\s
            (?P<tz>\S+)
        )
        \]\s
        \"(?P<request>
            (?P<request_method>\S+)\s
            (?P<request_url>.*?(?=\sHTTPS?/|\"))\s? # positive lookahead for HTTP or quote mark
            (?P<request_version>HTTPS?/\d\.\d)?)\"\s
        (?P<status>-|\d\d\d)\s
        (?P<bytes>-|\d+)\s?
        (?P<extra>.*)
        ''', re.VERBOSE
    )

    raw_output: List[Dict] = []

    if jc.utils.has_data(data):
        for entry in data.splitlines():
            # blank lines carry no log record
            if not entry:
                continue

            found = line_re.match(entry)
            if found is not None:
                # every named group becomes a key; optional groups that did
                # not participate in the match are None
                raw_output.append(found.groupdict())

    if raw:
        return raw_output

    return _process(raw_output)
|
Reference in New Issue
Block a user