From 9f4327f51776718b51da5b6dd47ebbd4a210155e Mon Sep 17 00:00:00 2001 From: Kelly Brazil Date: Mon, 21 Nov 2022 16:54:13 -0800 Subject: [PATCH] add clf-s streaming parser --- README.md | 1 + completions/jc_bash_completion.sh | 2 +- completions/jc_zsh_completion.sh | 3 +- docs/parsers/clf.md | 100 +++++++++++++- jc/lib.py | 1 + jc/parsers/clf.py | 100 +++++++++++++- jc/parsers/clf_s.py | 223 ++++++++++++++++++++++++++++++ man/jc.1 | 5 + 8 files changed, 429 insertions(+), 6 deletions(-) create mode 100644 jc/parsers/clf_s.py diff --git a/README.md b/README.md index 8a92285d..06558814 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,7 @@ option. | ` --chage` | `chage --list` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage) | | ` --cksum` | `cksum` and `sum` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cksum) | | ` --clf` | Common and Combined Log Format file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf) | +| ` --clf-s` | Common and Combined Log Format file streaming parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf_s) | | ` --crontab` | `crontab` command and file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab) | | ` --crontab-u` | `crontab` file parser with user support | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab_u) | | ` --csv` | CSV file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/csv) | diff --git a/completions/jc_bash_completion.sh b/completions/jc_bash_completion.sh index 83be3486..78c3e6a6 100644 --- a/completions/jc_bash_completion.sh +++ b/completions/jc_bash_completion.sh @@ -4,7 +4,7 @@ _jc() jc_about_options jc_about_mod_options jc_help_options jc_special_options jc_commands=(acpi airport arp blkid chage cksum crontab date df dig dmidecode dpkg du env file findmnt finger free git gpg hciconfig id ifconfig iostat iptables iw jobs last lastb ls lsblk lsmod lsof 
lspci lsusb md5 md5sum mdadm mount mpstat netstat nmcli ntpq os-prober pidstat ping ping6 pip pip3 postconf printenv ps route rpm rsync sfdisk sha1sum sha224sum sha256sum sha384sum sha512sum shasum ss sshd stat sum sysctl systemctl systeminfo timedatectl top tracepath tracepath6 traceroute traceroute6 udevadm ufw uname update-alternatives upower uptime vdir vmstat w wc who xrandr zipinfo) - jc_parsers=(--acpi --airport --airport-s --arp --asciitable --asciitable-m --blkid --cef --cef-s --chage --cksum --clf --crontab --crontab-u --csv --csv-s --date --datetime-iso --df --dig --dir --dmidecode --dpkg-l --du --email-address --env --file --findmnt --finger --free --fstab --git-log --git-log-s --git-ls-remote --gpg --group --gshadow --hash --hashsum --hciconfig --history --hosts --id --ifconfig --ini --iostat --iostat-s --ip-address --iptables --iw-scan --jar-manifest --jobs --jwt --kv --last --ls --ls-s --lsblk --lsmod --lsof --lspci --lsusb --m3u --mdadm --mount --mpstat --mpstat-s --netstat --nmcli --ntpq --os-prober --passwd --pci-ids --pidstat --pidstat-s --ping --ping-s --pip-list --pip-show --plist --postconf --proc --proc-buddyinfo --proc-consoles --proc-cpuinfo --proc-crypto --proc-devices --proc-diskstats --proc-filesystems --proc-interrupts --proc-iomem --proc-ioports --proc-loadavg --proc-locks --proc-meminfo --proc-modules --proc-mtrr --proc-pagetypeinfo --proc-partitions --proc-slabinfo --proc-softirqs --proc-stat --proc-swaps --proc-uptime --proc-version --proc-vmallocinfo --proc-vmstat --proc-zoneinfo --proc-driver-rtc --proc-net-arp --proc-net-dev --proc-net-dev-mcast --proc-net-if-inet6 --proc-net-igmp --proc-net-igmp6 --proc-net-ipv6-route --proc-net-netlink --proc-net-netstat --proc-net-packet --proc-net-protocols --proc-net-route --proc-net-unix --proc-pid-fdinfo --proc-pid-io --proc-pid-maps --proc-pid-mountinfo --proc-pid-numa-maps --proc-pid-smaps --proc-pid-stat --proc-pid-statm --proc-pid-status --ps --route --rpm-qi --rsync --rsync-s --semver 
--sfdisk --shadow --ss --sshd-conf --stat --stat-s --sysctl --syslog --syslog-s --syslog-bsd --syslog-bsd-s --systemctl --systemctl-lj --systemctl-ls --systemctl-luf --systeminfo --time --timedatectl --timestamp --top --top-s --tracepath --traceroute --udevadm --ufw --ufw-appinfo --uname --update-alt-gs --update-alt-q --upower --uptime --url --vmstat --vmstat-s --w --wc --who --x509-cert --xml --xrandr --yaml --zipinfo) + jc_parsers=(--acpi --airport --airport-s --arp --asciitable --asciitable-m --blkid --cef --cef-s --chage --cksum --clf --clf-s --crontab --crontab-u --csv --csv-s --date --datetime-iso --df --dig --dir --dmidecode --dpkg-l --du --email-address --env --file --findmnt --finger --free --fstab --git-log --git-log-s --git-ls-remote --gpg --group --gshadow --hash --hashsum --hciconfig --history --hosts --id --ifconfig --ini --iostat --iostat-s --ip-address --iptables --iw-scan --jar-manifest --jobs --jwt --kv --last --ls --ls-s --lsblk --lsmod --lsof --lspci --lsusb --m3u --mdadm --mount --mpstat --mpstat-s --netstat --nmcli --ntpq --os-prober --passwd --pci-ids --pidstat --pidstat-s --ping --ping-s --pip-list --pip-show --plist --postconf --proc --proc-buddyinfo --proc-consoles --proc-cpuinfo --proc-crypto --proc-devices --proc-diskstats --proc-filesystems --proc-interrupts --proc-iomem --proc-ioports --proc-loadavg --proc-locks --proc-meminfo --proc-modules --proc-mtrr --proc-pagetypeinfo --proc-partitions --proc-slabinfo --proc-softirqs --proc-stat --proc-swaps --proc-uptime --proc-version --proc-vmallocinfo --proc-vmstat --proc-zoneinfo --proc-driver-rtc --proc-net-arp --proc-net-dev --proc-net-dev-mcast --proc-net-if-inet6 --proc-net-igmp --proc-net-igmp6 --proc-net-ipv6-route --proc-net-netlink --proc-net-netstat --proc-net-packet --proc-net-protocols --proc-net-route --proc-net-unix --proc-pid-fdinfo --proc-pid-io --proc-pid-maps --proc-pid-mountinfo --proc-pid-numa-maps --proc-pid-smaps --proc-pid-stat --proc-pid-statm --proc-pid-status --ps 
--route --rpm-qi --rsync --rsync-s --semver --sfdisk --shadow --ss --sshd-conf --stat --stat-s --sysctl --syslog --syslog-s --syslog-bsd --syslog-bsd-s --systemctl --systemctl-lj --systemctl-ls --systemctl-luf --systeminfo --time --timedatectl --timestamp --top --top-s --tracepath --traceroute --udevadm --ufw --ufw-appinfo --uname --update-alt-gs --update-alt-q --upower --uptime --url --vmstat --vmstat-s --w --wc --who --x509-cert --xml --xrandr --yaml --zipinfo) jc_options=(--force-color -C --debug -d --monochrome -m --meta-out -M --pretty -p --quiet -q --raw -r --unbuffer -u --yaml-out -y) jc_about_options=(--about -a) jc_about_mod_options=(--pretty -p --yaml-out -y --monochrome -m --force-color -C) diff --git a/completions/jc_zsh_completion.sh b/completions/jc_zsh_completion.sh index 94f3c48f..79bad8c8 100644 --- a/completions/jc_zsh_completion.sh +++ b/completions/jc_zsh_completion.sh @@ -100,7 +100,7 @@ _jc() { 'xrandr:run "xrandr" command with magic syntax.' 'zipinfo:run "zipinfo" command with magic syntax.' 
) - jc_parsers=(--acpi --airport --airport-s --arp --asciitable --asciitable-m --blkid --cef --cef-s --chage --cksum --clf --crontab --crontab-u --csv --csv-s --date --datetime-iso --df --dig --dir --dmidecode --dpkg-l --du --email-address --env --file --findmnt --finger --free --fstab --git-log --git-log-s --git-ls-remote --gpg --group --gshadow --hash --hashsum --hciconfig --history --hosts --id --ifconfig --ini --iostat --iostat-s --ip-address --iptables --iw-scan --jar-manifest --jobs --jwt --kv --last --ls --ls-s --lsblk --lsmod --lsof --lspci --lsusb --m3u --mdadm --mount --mpstat --mpstat-s --netstat --nmcli --ntpq --os-prober --passwd --pci-ids --pidstat --pidstat-s --ping --ping-s --pip-list --pip-show --plist --postconf --proc --proc-buddyinfo --proc-consoles --proc-cpuinfo --proc-crypto --proc-devices --proc-diskstats --proc-filesystems --proc-interrupts --proc-iomem --proc-ioports --proc-loadavg --proc-locks --proc-meminfo --proc-modules --proc-mtrr --proc-pagetypeinfo --proc-partitions --proc-slabinfo --proc-softirqs --proc-stat --proc-swaps --proc-uptime --proc-version --proc-vmallocinfo --proc-vmstat --proc-zoneinfo --proc-driver-rtc --proc-net-arp --proc-net-dev --proc-net-dev-mcast --proc-net-if-inet6 --proc-net-igmp --proc-net-igmp6 --proc-net-ipv6-route --proc-net-netlink --proc-net-netstat --proc-net-packet --proc-net-protocols --proc-net-route --proc-net-unix --proc-pid-fdinfo --proc-pid-io --proc-pid-maps --proc-pid-mountinfo --proc-pid-numa-maps --proc-pid-smaps --proc-pid-stat --proc-pid-statm --proc-pid-status --ps --route --rpm-qi --rsync --rsync-s --semver --sfdisk --shadow --ss --sshd-conf --stat --stat-s --sysctl --syslog --syslog-s --syslog-bsd --syslog-bsd-s --systemctl --systemctl-lj --systemctl-ls --systemctl-luf --systeminfo --time --timedatectl --timestamp --top --top-s --tracepath --traceroute --udevadm --ufw --ufw-appinfo --uname --update-alt-gs --update-alt-q --upower --uptime --url --vmstat --vmstat-s --w --wc --who 
--x509-cert --xml --xrandr --yaml --zipinfo) + jc_parsers=(--acpi --airport --airport-s --arp --asciitable --asciitable-m --blkid --cef --cef-s --chage --cksum --clf --clf-s --crontab --crontab-u --csv --csv-s --date --datetime-iso --df --dig --dir --dmidecode --dpkg-l --du --email-address --env --file --findmnt --finger --free --fstab --git-log --git-log-s --git-ls-remote --gpg --group --gshadow --hash --hashsum --hciconfig --history --hosts --id --ifconfig --ini --iostat --iostat-s --ip-address --iptables --iw-scan --jar-manifest --jobs --jwt --kv --last --ls --ls-s --lsblk --lsmod --lsof --lspci --lsusb --m3u --mdadm --mount --mpstat --mpstat-s --netstat --nmcli --ntpq --os-prober --passwd --pci-ids --pidstat --pidstat-s --ping --ping-s --pip-list --pip-show --plist --postconf --proc --proc-buddyinfo --proc-consoles --proc-cpuinfo --proc-crypto --proc-devices --proc-diskstats --proc-filesystems --proc-interrupts --proc-iomem --proc-ioports --proc-loadavg --proc-locks --proc-meminfo --proc-modules --proc-mtrr --proc-pagetypeinfo --proc-partitions --proc-slabinfo --proc-softirqs --proc-stat --proc-swaps --proc-uptime --proc-version --proc-vmallocinfo --proc-vmstat --proc-zoneinfo --proc-driver-rtc --proc-net-arp --proc-net-dev --proc-net-dev-mcast --proc-net-if-inet6 --proc-net-igmp --proc-net-igmp6 --proc-net-ipv6-route --proc-net-netlink --proc-net-netstat --proc-net-packet --proc-net-protocols --proc-net-route --proc-net-unix --proc-pid-fdinfo --proc-pid-io --proc-pid-maps --proc-pid-mountinfo --proc-pid-numa-maps --proc-pid-smaps --proc-pid-stat --proc-pid-statm --proc-pid-status --ps --route --rpm-qi --rsync --rsync-s --semver --sfdisk --shadow --ss --sshd-conf --stat --stat-s --sysctl --syslog --syslog-s --syslog-bsd --syslog-bsd-s --systemctl --systemctl-lj --systemctl-ls --systemctl-luf --systeminfo --time --timedatectl --timestamp --top --top-s --tracepath --traceroute --udevadm --ufw --ufw-appinfo --uname --update-alt-gs --update-alt-q --upower --uptime 
--url --vmstat --vmstat-s --w --wc --who --x509-cert --xml --xrandr --yaml --zipinfo) jc_parsers_describe=( '--acpi:`acpi` command parser' '--airport:`airport -I` command parser' @@ -114,6 +114,7 @@ _jc() { '--chage:`chage --list` command parser' '--cksum:`cksum` and `sum` command parser' '--clf:Common and Combined Log Format file parser' + '--clf-s:Common and Combined Log Format file streaming parser' '--crontab:`crontab` command and file parser' '--crontab-u:`crontab` file parser with user support' '--csv:CSV file parser' diff --git a/docs/parsers/clf.md b/docs/parsers/clf.md index af132eae..891479d9 100644 --- a/docs/parsers/clf.md +++ b/docs/parsers/clf.md @@ -70,10 +70,106 @@ Empty strings and `-` values are converted to `null`/`None`. Examples: $ cat file.log | jc --clf -p - [] + [ + { + "host": "127.0.0.1", + "ident": "user-identifier", + "authuser": "frank", + "date": "10/Oct/2000:13:55:36 -0700", + "day": 10, + "month": "Oct", + "year": 2000, + "hour": 13, + "minute": 55, + "second": 36, + "tz": "-0700", + "request": "GET /apache_pb.gif HTTPS/1.0", + "status": 200, + "bytes": 2326, + "referer": null, + "user_agent": null, + "extra": null, + "request_method": "GET", + "request_url": "/apache_pb.gif", + "request_version": "HTTPS/1.0", + "epoch": 971211336, + "epoch_utc": null + }, + { + "host": "1.1.1.2", + "ident": null, + "authuser": null, + "date": "11/Nov/2016:03:04:55 +0100", + "day": 11, + "month": "Nov", + "year": 2016, + "hour": 3, + "minute": 4, + "second": 55, + "tz": "+0100", + "request": "GET /", + "status": 200, + "bytes": 83, + "referer": null, + "user_agent": null, + "extra": "- 9221 1.1.1.1", + "request_method": "GET", + "request_url": "/", + "request_version": null, + "epoch": 1478862295, + "epoch_utc": null + }, + ... 
+ ] $ cat file.log | jc --clf -p -r - [] + [ + { + "host": "127.0.0.1", + "ident": "user-identifier", + "authuser": "frank", + "date": "10/Oct/2000:13:55:36 -0700", + "day": "10", + "month": "Oct", + "year": "2000", + "hour": "13", + "minute": "55", + "second": "36", + "tz": "-0700", + "request": "GET /apache_pb.gif HTTPS/1.0", + "status": "200", + "bytes": "2326", + "referer": null, + "user_agent": null, + "extra": "", + "request_method": "GET", + "request_url": "/apache_pb.gif", + "request_version": "HTTPS/1.0" + }, + { + "host": "1.1.1.2", + "ident": "-", + "authuser": "-", + "date": "11/Nov/2016:03:04:55 +0100", + "day": "11", + "month": "Nov", + "year": "2016", + "hour": "03", + "minute": "04", + "second": "55", + "tz": "+0100", + "request": "GET /", + "status": "200", + "bytes": "83", + "referer": "-", + "user_agent": "-", + "extra": "- 9221 1.1.1.1", + "request_method": "GET", + "request_url": "/", + "request_version": null + }, + ... + ] diff --git a/jc/lib.py b/jc/lib.py index 39065997..4ed763ce 100644 --- a/jc/lib.py +++ b/jc/lib.py @@ -24,6 +24,7 @@ parsers: List[str] = [ 'chage', 'cksum', 'clf', + 'clf-s', 'crontab', 'crontab-u', 'csv', diff --git a/jc/parsers/clf.py b/jc/parsers/clf.py index 3e269ea1..4273445c 100644 --- a/jc/parsers/clf.py +++ b/jc/parsers/clf.py @@ -65,10 +65,106 @@ Empty strings and `-` values are converted to `null`/`None`. 
Examples: $ cat file.log | jc --clf -p - [] + [ + { + "host": "127.0.0.1", + "ident": "user-identifier", + "authuser": "frank", + "date": "10/Oct/2000:13:55:36 -0700", + "day": 10, + "month": "Oct", + "year": 2000, + "hour": 13, + "minute": 55, + "second": 36, + "tz": "-0700", + "request": "GET /apache_pb.gif HTTPS/1.0", + "status": 200, + "bytes": 2326, + "referer": null, + "user_agent": null, + "extra": null, + "request_method": "GET", + "request_url": "/apache_pb.gif", + "request_version": "HTTPS/1.0", + "epoch": 971211336, + "epoch_utc": null + }, + { + "host": "1.1.1.2", + "ident": null, + "authuser": null, + "date": "11/Nov/2016:03:04:55 +0100", + "day": 11, + "month": "Nov", + "year": 2016, + "hour": 3, + "minute": 4, + "second": 55, + "tz": "+0100", + "request": "GET /", + "status": 200, + "bytes": 83, + "referer": null, + "user_agent": null, + "extra": "- 9221 1.1.1.1", + "request_method": "GET", + "request_url": "/", + "request_version": null, + "epoch": 1478862295, + "epoch_utc": null + }, + ... + ] $ cat file.log | jc --clf -p -r - [] + [ + { + "host": "127.0.0.1", + "ident": "user-identifier", + "authuser": "frank", + "date": "10/Oct/2000:13:55:36 -0700", + "day": "10", + "month": "Oct", + "year": "2000", + "hour": "13", + "minute": "55", + "second": "36", + "tz": "-0700", + "request": "GET /apache_pb.gif HTTPS/1.0", + "status": "200", + "bytes": "2326", + "referer": null, + "user_agent": null, + "extra": "", + "request_method": "GET", + "request_url": "/apache_pb.gif", + "request_version": "HTTPS/1.0" + }, + { + "host": "1.1.1.2", + "ident": "-", + "authuser": "-", + "date": "11/Nov/2016:03:04:55 +0100", + "day": "11", + "month": "Nov", + "year": "2016", + "hour": "03", + "minute": "04", + "second": "55", + "tz": "+0100", + "request": "GET /", + "status": "200", + "bytes": "83", + "referer": "-", + "user_agent": "-", + "extra": "- 9221 1.1.1.1", + "request_method": "GET", + "request_url": "/", + "request_version": null + }, + ... 
+ ] """ import re from typing import List, Dict diff --git a/jc/parsers/clf_s.py b/jc/parsers/clf_s.py new file mode 100644 index 00000000..5a5b2a55 --- /dev/null +++ b/jc/parsers/clf_s.py @@ -0,0 +1,223 @@ +"""jc - JSON Convert Common Log Format file streaming parser + +> This streaming parser outputs JSON Lines (cli) or returns an Iterable of +> Dictionaries (module) + +This parser will handle the Common Log Format standard as specified at +https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format. + +Combined Log Format is also supported. (Referer and User Agent fields added) + +Extra fields may be present and will be enclosed in the `extra` field as +a single string. + +If a log line cannot be parsed, an object with an `unparsable` field will +be present with a value of the original line. + +The `epoch` calculated timestamp field is naive. (i.e. based on the +local time of the system the parser is run on) + +The `epoch_utc` calculated timestamp field is timezone-aware and is +only available if the timezone field is UTC. + +Usage (cli): + + $ cat file.log | jc --clf-s + +Usage (module): + + import jc + + result = jc.parse('clf_s', common_log_file_output.splitlines()) + for item in result: + # do something + +Schema: + + Empty strings and `-` values are converted to `null`/`None`. + + { + "host": string, + "ident": string, + "authuser": string, + "date": string, + "day": integer, + "month": string, + "year": integer, + "hour": integer, + "minute": integer, + "second": integer, + "tz": string, + "request": string, + "request_method": string, + "request_url": string, + "request_version": string, + "status": integer, + "bytes": integer, + "referer": string, + "user_agent": string, + "extra": string, + "epoch": integer, # [0] + "epoch_utc": integer, # [1] + "unparsable": string # [2] + } + + [0] naive timestamp + [1] timezone-aware timestamp. 
Only available if timezone field is UTC + [2] exists if the line was not able to be parsed + +Examples: + + $ cat file.log | jc --clf-s + {"host":"127.0.0.1","ident":"user-identifier","authuser":"frank","...} + {"host":"1.1.1.2","ident":null,"authuser":null,"date":"11/Nov/2016...} + ... + + $ cat file.log | jc --clf-s -r + {"host":"127.0.0.1","ident":"user-identifier","authuser":"frank","...} + {"host":"1.1.1.2","ident":"-","authuser":"-","date":"11/Nov/2016:0...} + ... +""" +import re +from typing import Dict, Iterable +import jc.utils +from jc.streaming import ( + add_jc_meta, streaming_input_type_check, streaming_line_input_type_check, raise_or_yield +) +from jc.jc_types import JSONDictType, StreamingOutputType +from jc.exceptions import ParseError + + +class info(): + """Provides parser metadata (version, author, etc.)""" + version = '1.0' + description = 'Common and Combined Log Format file streaming parser' + author = 'Kelly Brazil' + author_email = 'kellyjonbrazil@gmail.com' + compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd'] + streaming = True + + +__version__ = info.version + + +def _process(proc_data: JSONDictType) -> JSONDictType: + """ + Final processing to conform to the schema. + + Parameters: + + proc_data: (Dictionary) raw structured data to process + + Returns: + + Dictionary. Structured data to conform to the schema. 
+ """ + int_list = {'day', 'year', 'hour', 'minute', 'second', 'status', 'bytes'} + + for key, val in proc_data.items(): + + # integer conversions + if key in int_list: + proc_data[key] = jc.utils.convert_to_int(val) + + # convert `-` and blank values to None + if val == '-' or val == '': + proc_data[key] = None + + # add unix timestamps + if 'date' in proc_data: + ts = jc.utils.timestamp(proc_data['date'], format_hint=(1800,)) # type: ignore + proc_data['epoch'] = ts.naive + proc_data['epoch_utc'] = ts.utc + + return proc_data + + +@add_jc_meta +def parse( + data: Iterable[str], + raw: bool = False, + quiet: bool = False, + ignore_exceptions: bool = False +) -> StreamingOutputType: + """ + Main text parsing generator function. Returns an iterable object. + + Parameters: + + data: (iterable) line-based text data to parse + (e.g. sys.stdin or str.splitlines()) + + raw: (boolean) unprocessed output if True + quiet: (boolean) suppress warning messages if True + ignore_exceptions: (boolean) ignore parsing exceptions if True + + + Returns: + + Iterable of Dictionaries + """ + jc.utils.compatibility(__name__, info.compatible, quiet) + streaming_input_type_check(data) + + clf_pattern = re.compile(r''' + ^(?P<host>-|\S+)\s + (?P<ident>-|\S+)\s + (?P<authuser>-|\S+)\s + \[ + (?P<date> + (?P<day>\d+)/ + (?P<month>\S\S\S)/ + (?P<year>\d\d\d\d): + (?P<hour>\d\d): + (?P<minute>\d\d): + (?P<second>\d\d)\s + (?P<tz>\S+) + ) + \]\s + \"(?P<request>.*?)\"\s + (?P<status>-|\d\d\d)\s + (?P<bytes>-|\d+)\s? + (?:\"(?P<referer>.*?)\"\s?)? + (?:\"(?P<user_agent>.*?)\"\s?)? + (?P<extra>.*) + ''', re.VERBOSE + ) + + request_pattern = re.compile(r''' + (?P<request_method>\S+)\s + (?P<request_url>.*?(?=\sHTTPS?/|$))\s? # positive lookahead for HTTP(S)/ or end of string + (?P<request_version>HTTPS?/[\d\.]+)? 
+ ''', re.VERBOSE + ) + + for line in data: + try: + streaming_line_input_type_check(line) + output_line: Dict = {} + + if line == '' or line == '\n': + continue + + clf_match = re.match(clf_pattern, line) + + if clf_match: + output_line = clf_match.groupdict() + + if clf_match.groupdict().get('request', None): + request_string = clf_match.groupdict()['request'] + request_match = re.match(request_pattern, request_string) + if request_match: + output_line.update(request_match.groupdict()) + + else: + output_line = {"unparsable": line.strip()} + + if output_line: + yield output_line if raw else _process(output_line) + else: + raise ParseError('Not clf data') + + except Exception as e: + yield raise_or_yield(ignore_exceptions, e, line) diff --git a/man/jc.1 b/man/jc.1 index 0f67cb4c..f56f9f33 100644 --- a/man/jc.1 +++ b/man/jc.1 @@ -90,6 +90,11 @@ CEF string streaming parser \fB--clf\fP Common and Combined Log Format file parser +.TP +.B +\fB--clf-s\fP +Common and Combined Log Format file streaming parser + .TP .B \fB--crontab\fP