1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-06-19 00:17:51 +02:00

add clf-s streaming parser

This commit is contained in:
Kelly Brazil
2022-11-21 16:54:13 -08:00
parent 3f13c70dfa
commit 9f4327f517
8 changed files with 429 additions and 6 deletions

View File

@ -166,6 +166,7 @@ option.
| ` --chage` | `chage --list` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage) |
| ` --cksum` | `cksum` and `sum` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cksum) |
| ` --clf` | Common and Combined Log Format file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf) |
| ` --clf-s` | Common and Combined Log Format file streaming parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf_s) |
| ` --crontab` | `crontab` command and file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab) |
| ` --crontab-u` | `crontab` file parser with user support | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab_u) |
| ` --csv` | CSV file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/csv) |

View File

@ -4,7 +4,7 @@ _jc()
jc_about_options jc_about_mod_options jc_help_options jc_special_options
jc_commands=(acpi airport arp blkid chage cksum crontab date df dig dmidecode dpkg du env file findmnt finger free git gpg hciconfig id ifconfig iostat iptables iw jobs last lastb ls lsblk lsmod lsof lspci lsusb md5 md5sum mdadm mount mpstat netstat nmcli ntpq os-prober pidstat ping ping6 pip pip3 postconf printenv ps route rpm rsync sfdisk sha1sum sha224sum sha256sum sha384sum sha512sum shasum ss sshd stat sum sysctl systemctl systeminfo timedatectl top tracepath tracepath6 traceroute traceroute6 udevadm ufw uname update-alternatives upower uptime vdir vmstat w wc who xrandr zipinfo)
jc_parsers=(--acpi --airport --airport-s --arp --asciitable --asciitable-m --blkid --cef --cef-s --chage --cksum --clf --crontab --crontab-u --csv --csv-s --date --datetime-iso --df --dig --dir --dmidecode --dpkg-l --du --email-address --env --file --findmnt --finger --free --fstab --git-log --git-log-s --git-ls-remote --gpg --group --gshadow --hash --hashsum --hciconfig --history --hosts --id --ifconfig --ini --iostat --iostat-s --ip-address --iptables --iw-scan --jar-manifest --jobs --jwt --kv --last --ls --ls-s --lsblk --lsmod --lsof --lspci --lsusb --m3u --mdadm --mount --mpstat --mpstat-s --netstat --nmcli --ntpq --os-prober --passwd --pci-ids --pidstat --pidstat-s --ping --ping-s --pip-list --pip-show --plist --postconf --proc --proc-buddyinfo --proc-consoles --proc-cpuinfo --proc-crypto --proc-devices --proc-diskstats --proc-filesystems --proc-interrupts --proc-iomem --proc-ioports --proc-loadavg --proc-locks --proc-meminfo --proc-modules --proc-mtrr --proc-pagetypeinfo --proc-partitions --proc-slabinfo --proc-softirqs --proc-stat --proc-swaps --proc-uptime --proc-version --proc-vmallocinfo --proc-vmstat --proc-zoneinfo --proc-driver-rtc --proc-net-arp --proc-net-dev --proc-net-dev-mcast --proc-net-if-inet6 --proc-net-igmp --proc-net-igmp6 --proc-net-ipv6-route --proc-net-netlink --proc-net-netstat --proc-net-packet --proc-net-protocols --proc-net-route --proc-net-unix --proc-pid-fdinfo --proc-pid-io --proc-pid-maps --proc-pid-mountinfo --proc-pid-numa-maps --proc-pid-smaps --proc-pid-stat --proc-pid-statm --proc-pid-status --ps --route --rpm-qi --rsync --rsync-s --semver --sfdisk --shadow --ss --sshd-conf --stat --stat-s --sysctl --syslog --syslog-s --syslog-bsd --syslog-bsd-s --systemctl --systemctl-lj --systemctl-ls --systemctl-luf --systeminfo --time --timedatectl --timestamp --top --top-s --tracepath --traceroute --udevadm --ufw --ufw-appinfo --uname --update-alt-gs --update-alt-q --upower --uptime --url --vmstat --vmstat-s --w --wc --who --x509-cert --xml --xrandr --yaml --zipinfo)
jc_parsers=(--acpi --airport --airport-s --arp --asciitable --asciitable-m --blkid --cef --cef-s --chage --cksum --clf --clf-s --crontab --crontab-u --csv --csv-s --date --datetime-iso --df --dig --dir --dmidecode --dpkg-l --du --email-address --env --file --findmnt --finger --free --fstab --git-log --git-log-s --git-ls-remote --gpg --group --gshadow --hash --hashsum --hciconfig --history --hosts --id --ifconfig --ini --iostat --iostat-s --ip-address --iptables --iw-scan --jar-manifest --jobs --jwt --kv --last --ls --ls-s --lsblk --lsmod --lsof --lspci --lsusb --m3u --mdadm --mount --mpstat --mpstat-s --netstat --nmcli --ntpq --os-prober --passwd --pci-ids --pidstat --pidstat-s --ping --ping-s --pip-list --pip-show --plist --postconf --proc --proc-buddyinfo --proc-consoles --proc-cpuinfo --proc-crypto --proc-devices --proc-diskstats --proc-filesystems --proc-interrupts --proc-iomem --proc-ioports --proc-loadavg --proc-locks --proc-meminfo --proc-modules --proc-mtrr --proc-pagetypeinfo --proc-partitions --proc-slabinfo --proc-softirqs --proc-stat --proc-swaps --proc-uptime --proc-version --proc-vmallocinfo --proc-vmstat --proc-zoneinfo --proc-driver-rtc --proc-net-arp --proc-net-dev --proc-net-dev-mcast --proc-net-if-inet6 --proc-net-igmp --proc-net-igmp6 --proc-net-ipv6-route --proc-net-netlink --proc-net-netstat --proc-net-packet --proc-net-protocols --proc-net-route --proc-net-unix --proc-pid-fdinfo --proc-pid-io --proc-pid-maps --proc-pid-mountinfo --proc-pid-numa-maps --proc-pid-smaps --proc-pid-stat --proc-pid-statm --proc-pid-status --ps --route --rpm-qi --rsync --rsync-s --semver --sfdisk --shadow --ss --sshd-conf --stat --stat-s --sysctl --syslog --syslog-s --syslog-bsd --syslog-bsd-s --systemctl --systemctl-lj --systemctl-ls --systemctl-luf --systeminfo --time --timedatectl --timestamp --top --top-s --tracepath --traceroute --udevadm --ufw --ufw-appinfo --uname --update-alt-gs --update-alt-q --upower --uptime --url --vmstat --vmstat-s --w --wc --who --x509-cert --xml --xrandr --yaml --zipinfo)
jc_options=(--force-color -C --debug -d --monochrome -m --meta-out -M --pretty -p --quiet -q --raw -r --unbuffer -u --yaml-out -y)
jc_about_options=(--about -a)
jc_about_mod_options=(--pretty -p --yaml-out -y --monochrome -m --force-color -C)

View File

@ -100,7 +100,7 @@ _jc() {
'xrandr:run "xrandr" command with magic syntax.'
'zipinfo:run "zipinfo" command with magic syntax.'
)
jc_parsers=(--acpi --airport --airport-s --arp --asciitable --asciitable-m --blkid --cef --cef-s --chage --cksum --clf --crontab --crontab-u --csv --csv-s --date --datetime-iso --df --dig --dir --dmidecode --dpkg-l --du --email-address --env --file --findmnt --finger --free --fstab --git-log --git-log-s --git-ls-remote --gpg --group --gshadow --hash --hashsum --hciconfig --history --hosts --id --ifconfig --ini --iostat --iostat-s --ip-address --iptables --iw-scan --jar-manifest --jobs --jwt --kv --last --ls --ls-s --lsblk --lsmod --lsof --lspci --lsusb --m3u --mdadm --mount --mpstat --mpstat-s --netstat --nmcli --ntpq --os-prober --passwd --pci-ids --pidstat --pidstat-s --ping --ping-s --pip-list --pip-show --plist --postconf --proc --proc-buddyinfo --proc-consoles --proc-cpuinfo --proc-crypto --proc-devices --proc-diskstats --proc-filesystems --proc-interrupts --proc-iomem --proc-ioports --proc-loadavg --proc-locks --proc-meminfo --proc-modules --proc-mtrr --proc-pagetypeinfo --proc-partitions --proc-slabinfo --proc-softirqs --proc-stat --proc-swaps --proc-uptime --proc-version --proc-vmallocinfo --proc-vmstat --proc-zoneinfo --proc-driver-rtc --proc-net-arp --proc-net-dev --proc-net-dev-mcast --proc-net-if-inet6 --proc-net-igmp --proc-net-igmp6 --proc-net-ipv6-route --proc-net-netlink --proc-net-netstat --proc-net-packet --proc-net-protocols --proc-net-route --proc-net-unix --proc-pid-fdinfo --proc-pid-io --proc-pid-maps --proc-pid-mountinfo --proc-pid-numa-maps --proc-pid-smaps --proc-pid-stat --proc-pid-statm --proc-pid-status --ps --route --rpm-qi --rsync --rsync-s --semver --sfdisk --shadow --ss --sshd-conf --stat --stat-s --sysctl --syslog --syslog-s --syslog-bsd --syslog-bsd-s --systemctl --systemctl-lj --systemctl-ls --systemctl-luf --systeminfo --time --timedatectl --timestamp --top --top-s --tracepath --traceroute --udevadm --ufw --ufw-appinfo --uname --update-alt-gs --update-alt-q --upower --uptime --url --vmstat --vmstat-s --w --wc --who --x509-cert --xml --xrandr --yaml --zipinfo)
jc_parsers=(--acpi --airport --airport-s --arp --asciitable --asciitable-m --blkid --cef --cef-s --chage --cksum --clf --clf-s --crontab --crontab-u --csv --csv-s --date --datetime-iso --df --dig --dir --dmidecode --dpkg-l --du --email-address --env --file --findmnt --finger --free --fstab --git-log --git-log-s --git-ls-remote --gpg --group --gshadow --hash --hashsum --hciconfig --history --hosts --id --ifconfig --ini --iostat --iostat-s --ip-address --iptables --iw-scan --jar-manifest --jobs --jwt --kv --last --ls --ls-s --lsblk --lsmod --lsof --lspci --lsusb --m3u --mdadm --mount --mpstat --mpstat-s --netstat --nmcli --ntpq --os-prober --passwd --pci-ids --pidstat --pidstat-s --ping --ping-s --pip-list --pip-show --plist --postconf --proc --proc-buddyinfo --proc-consoles --proc-cpuinfo --proc-crypto --proc-devices --proc-diskstats --proc-filesystems --proc-interrupts --proc-iomem --proc-ioports --proc-loadavg --proc-locks --proc-meminfo --proc-modules --proc-mtrr --proc-pagetypeinfo --proc-partitions --proc-slabinfo --proc-softirqs --proc-stat --proc-swaps --proc-uptime --proc-version --proc-vmallocinfo --proc-vmstat --proc-zoneinfo --proc-driver-rtc --proc-net-arp --proc-net-dev --proc-net-dev-mcast --proc-net-if-inet6 --proc-net-igmp --proc-net-igmp6 --proc-net-ipv6-route --proc-net-netlink --proc-net-netstat --proc-net-packet --proc-net-protocols --proc-net-route --proc-net-unix --proc-pid-fdinfo --proc-pid-io --proc-pid-maps --proc-pid-mountinfo --proc-pid-numa-maps --proc-pid-smaps --proc-pid-stat --proc-pid-statm --proc-pid-status --ps --route --rpm-qi --rsync --rsync-s --semver --sfdisk --shadow --ss --sshd-conf --stat --stat-s --sysctl --syslog --syslog-s --syslog-bsd --syslog-bsd-s --systemctl --systemctl-lj --systemctl-ls --systemctl-luf --systeminfo --time --timedatectl --timestamp --top --top-s --tracepath --traceroute --udevadm --ufw --ufw-appinfo --uname --update-alt-gs --update-alt-q --upower --uptime --url --vmstat --vmstat-s --w --wc --who --x509-cert --xml --xrandr --yaml --zipinfo)
jc_parsers_describe=(
'--acpi:`acpi` command parser'
'--airport:`airport -I` command parser'
@ -114,6 +114,7 @@ _jc() {
'--chage:`chage --list` command parser'
'--cksum:`cksum` and `sum` command parser'
'--clf:Common and Combined Log Format file parser'
'--clf-s:Common and Combined Log Format file streaming parser'
'--crontab:`crontab` command and file parser'
'--crontab-u:`crontab` file parser with user support'
'--csv:CSV file parser'

View File

@ -70,10 +70,106 @@ Empty strings and `-` values are converted to `null`/`None`.
Examples:
$ cat file.log | jc --clf -p
[]
[
{
"host": "127.0.0.1",
"ident": "user-identifier",
"authuser": "frank",
"date": "10/Oct/2000:13:55:36 -0700",
"day": 10,
"month": "Oct",
"year": 2000,
"hour": 13,
"minute": 55,
"second": 36,
"tz": "-0700",
"request": "GET /apache_pb.gif HTTPS/1.0",
"status": 200,
"bytes": 2326,
"referer": null,
"user_agent": null,
"extra": null,
"request_method": "GET",
"request_url": "/apache_pb.gif",
"request_version": "HTTPS/1.0",
"epoch": 971211336,
"epoch_utc": null
},
{
"host": "1.1.1.2",
"ident": null,
"authuser": null,
"date": "11/Nov/2016:03:04:55 +0100",
"day": 11,
"month": "Nov",
"year": 2016,
"hour": 3,
"minute": 4,
"second": 55,
"tz": "+0100",
"request": "GET /",
"status": 200,
"bytes": 83,
"referer": null,
"user_agent": null,
"extra": "- 9221 1.1.1.1",
"request_method": "GET",
"request_url": "/",
"request_version": null,
"epoch": 1478862295,
"epoch_utc": null
},
...
]
$ cat file.log | jc --clf -p -r
[]
[
{
"host": "127.0.0.1",
"ident": "user-identifier",
"authuser": "frank",
"date": "10/Oct/2000:13:55:36 -0700",
"day": "10",
"month": "Oct",
"year": "2000",
"hour": "13",
"minute": "55",
"second": "36",
"tz": "-0700",
"request": "GET /apache_pb.gif HTTPS/1.0",
"status": "200",
"bytes": "2326",
"referer": null,
"user_agent": null,
"extra": "",
"request_method": "GET",
"request_url": "/apache_pb.gif",
"request_version": "HTTPS/1.0"
},
{
"host": "1.1.1.2",
"ident": "-",
"authuser": "-",
"date": "11/Nov/2016:03:04:55 +0100",
"day": "11",
"month": "Nov",
"year": "2016",
"hour": "03",
"minute": "04",
"second": "55",
"tz": "+0100",
"request": "GET /",
"status": "200",
"bytes": "83",
"referer": "-",
"user_agent": "-",
"extra": "- 9221 1.1.1.1",
"request_method": "GET",
"request_url": "/",
"request_version": null
},
...
]
<a id="jc.parsers.clf.parse"></a>

View File

@ -24,6 +24,7 @@ parsers: List[str] = [
'chage',
'cksum',
'clf',
'clf-s',
'crontab',
'crontab-u',
'csv',

View File

@ -65,10 +65,106 @@ Empty strings and `-` values are converted to `null`/`None`.
Examples:
$ cat file.log | jc --clf -p
[]
[
{
"host": "127.0.0.1",
"ident": "user-identifier",
"authuser": "frank",
"date": "10/Oct/2000:13:55:36 -0700",
"day": 10,
"month": "Oct",
"year": 2000,
"hour": 13,
"minute": 55,
"second": 36,
"tz": "-0700",
"request": "GET /apache_pb.gif HTTPS/1.0",
"status": 200,
"bytes": 2326,
"referer": null,
"user_agent": null,
"extra": null,
"request_method": "GET",
"request_url": "/apache_pb.gif",
"request_version": "HTTPS/1.0",
"epoch": 971211336,
"epoch_utc": null
},
{
"host": "1.1.1.2",
"ident": null,
"authuser": null,
"date": "11/Nov/2016:03:04:55 +0100",
"day": 11,
"month": "Nov",
"year": 2016,
"hour": 3,
"minute": 4,
"second": 55,
"tz": "+0100",
"request": "GET /",
"status": 200,
"bytes": 83,
"referer": null,
"user_agent": null,
"extra": "- 9221 1.1.1.1",
"request_method": "GET",
"request_url": "/",
"request_version": null,
"epoch": 1478862295,
"epoch_utc": null
},
...
]
$ cat file.log | jc --clf -p -r
[]
[
{
"host": "127.0.0.1",
"ident": "user-identifier",
"authuser": "frank",
"date": "10/Oct/2000:13:55:36 -0700",
"day": "10",
"month": "Oct",
"year": "2000",
"hour": "13",
"minute": "55",
"second": "36",
"tz": "-0700",
"request": "GET /apache_pb.gif HTTPS/1.0",
"status": "200",
"bytes": "2326",
"referer": null,
"user_agent": null,
"extra": "",
"request_method": "GET",
"request_url": "/apache_pb.gif",
"request_version": "HTTPS/1.0"
},
{
"host": "1.1.1.2",
"ident": "-",
"authuser": "-",
"date": "11/Nov/2016:03:04:55 +0100",
"day": "11",
"month": "Nov",
"year": "2016",
"hour": "03",
"minute": "04",
"second": "55",
"tz": "+0100",
"request": "GET /",
"status": "200",
"bytes": "83",
"referer": "-",
"user_agent": "-",
"extra": "- 9221 1.1.1.1",
"request_method": "GET",
"request_url": "/",
"request_version": null
},
...
]
"""
import re
from typing import List, Dict

223
jc/parsers/clf_s.py Normal file
View File

@ -0,0 +1,223 @@
"""jc - JSON Convert Common Log Format file streaming parser
> This streaming parser outputs JSON Lines (cli) or returns an Iterable of
> Dictionaries (module)
This parser will handle the Common Log Format standard as specified at
https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format.
Combined Log Format is also supported. (Referer and User Agent fields added)
Extra fields may be present and will be enclosed in the `extra` field as
a single string.
If a log line cannot be parsed, an object with an `unparsable` field will
be present with a value of the original line.
The `epoch` calculated timestamp field is naive. (i.e. based on the
local time of the system the parser is run on)
The `epoch_utc` calculated timestamp field is timezone-aware and is
only available if the timezone field is UTC.
Usage (cli):
$ cat file.log | jc --clf-s
Usage (module):
import jc
result = jc.parse('clf_s', common_log_file_output.splitlines())
for item in result:
# do something
Schema:
Empty strings and `-` values are converted to `null`/`None`.
{
"host": string,
"ident": string,
"authuser": string,
"date": string,
"day": integer,
"month": string,
"year": integer,
"hour": integer,
"minute": integer,
"second": integer,
"tz": string,
"request": string,
"request_method": string,
"request_url": string,
"request_version": string,
"status": integer,
"bytes": integer,
"referer": string,
"user_agent": string,
"extra": string,
"epoch": integer, # [0]
"epoch_utc": integer, # [1]
"unparsable": string # [2]
}
[0] naive timestamp
[1] timezone-aware timestamp. Only available if timezone field is UTC
[2] exists if the line was not able to be parsed
Examples:
$ cat file.log | jc --clf-s
{"host":"127.0.0.1","ident":"user-identifier","authuser":"frank","...}
{"host":"1.1.1.2","ident":null,"authuser":null,"date":"11/Nov/2016...}
...
$ cat file.log | jc --clf-s -r
{"host":"127.0.0.1","ident":"user-identifier","authuser":"frank","...}
{"host":"1.1.1.2","ident":"-","authuser":"-","date":"11/Nov/2016:0...}
...
"""
import re
from typing import Dict, Iterable
import jc.utils
from jc.streaming import (
add_jc_meta, streaming_input_type_check, streaming_line_input_type_check, raise_or_yield
)
from jc.jc_types import JSONDictType, StreamingOutputType
from jc.exceptions import ParseError
class info():
"""Provides parser metadata (version, author, etc.)"""
version = '1.0'
description = 'Common and Combined Log Format file streaming parser'
author = 'Kelly Brazil'
author_email = 'kellyjonbrazil@gmail.com'
compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
streaming = True
__version__ = info.version
def _process(proc_data: JSONDictType) -> JSONDictType:
"""
Final processing to conform to the schema.
Parameters:
proc_data: (Dictionary) raw structured data to process
Returns:
Dictionary. Structured data to conform to the schema.
"""
int_list = {'day', 'year', 'hour', 'minute', 'second', 'status', 'bytes'}
for key, val in proc_data.items():
# integer conversions
if key in int_list:
proc_data[key] = jc.utils.convert_to_int(val)
# convert `-` and blank values to None
if val == '-' or val == '':
proc_data[key] = None
# add unix timestamps
if 'date' in proc_data:
ts = jc.utils.timestamp(proc_data['date'], format_hint=(1800,)) # type: ignore
proc_data['epoch'] = ts.naive
proc_data['epoch_utc'] = ts.utc
return proc_data
@add_jc_meta
def parse(
data: Iterable[str],
raw: bool = False,
quiet: bool = False,
ignore_exceptions: bool = False
) -> StreamingOutputType:
"""
Main text parsing generator function. Returns an iterable object.
Parameters:
data: (iterable) line-based text data to parse
(e.g. sys.stdin or str.splitlines())
raw: (boolean) unprocessed output if True
quiet: (boolean) suppress warning messages if True
ignore_exceptions: (boolean) ignore parsing exceptions if True
Returns:
Iterable of Dictionaries
"""
jc.utils.compatibility(__name__, info.compatible, quiet)
streaming_input_type_check(data)
clf_pattern = re.compile(r'''
^(?P<host>-|\S+)\s
(?P<ident>-|\S+)\s
(?P<authuser>-|\S+)\s
\[
(?P<date>
(?P<day>\d+)/
(?P<month>\S\S\S)/
(?P<year>\d\d\d\d):
(?P<hour>\d\d):
(?P<minute>\d\d):
(?P<second>\d\d)\s
(?P<tz>\S+)
)
\]\s
\"(?P<request>.*?)\"\s
(?P<status>-|\d\d\d)\s
(?P<bytes>-|\d+)\s?
(?:\"(?P<referer>.*?)\"\s?)?
(?:\"(?P<user_agent>.*?)\"\s?)?
(?P<extra>.*)
''', re.VERBOSE
)
request_pattern = re.compile(r'''
(?P<request_method>\S+)\s
(?P<request_url>.*?(?=\sHTTPS?/|$))\s? # positive lookahead for HTTP(S)/ or end of string
(?P<request_version>HTTPS?/[\d\.]+)?
''', re.VERBOSE
)
for line in data:
try:
streaming_line_input_type_check(line)
output_line: Dict = {}
if line == '' or line == '\n':
continue
clf_match = re.match(clf_pattern, line)
if clf_match:
output_line = clf_match.groupdict()
if clf_match.groupdict().get('request', None):
request_string = clf_match.groupdict()['request']
request_match = re.match(request_pattern, request_string)
if request_match:
output_line.update(request_match.groupdict())
else:
output_line = {"unparsable": line.strip()}
if output_line:
yield output_line if raw else _process(output_line)
else:
raise ParseError('Not clf data')
except Exception as e:
yield raise_or_yield(ignore_exceptions, e, line)

View File

@ -90,6 +90,11 @@ CEF string streaming parser
\fB--clf\fP
Common and Combined Log Format file parser
.TP
.B
\fB--clf-s\fP
Common and Combined Log Format file streaming parser
.TP
.B
\fB--crontab\fP