1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-07-15 01:24:29 +02:00

fix clf request string parsing and add tests

This commit is contained in:
Kelly Brazil
2022-11-21 11:00:58 -08:00
parent 5ab2ebe45a
commit 60f1e79b2f
9 changed files with 106 additions and 12 deletions

View File

@ -1,7 +1,7 @@
jc changelog jc changelog
20221118 v1.22.3 20221118 v1.22.3
- Add Common Log Format file parser - Add Common Log Format and Combined Log Format file parser
- Fix `git-log` and `git-log-s` parsers for failure on empty author name - Fix `git-log` and `git-log-s` parsers for failure on empty author name
- Update `os-prober` parser with split EFI partition fields - Update `os-prober` parser with split EFI partition fields
- Fix several documentation typos - Fix several documentation typos

View File

@ -165,7 +165,7 @@ option.
| ` --cef-s` | CEF string streaming parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef_s) | | ` --cef-s` | CEF string streaming parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef_s) |
| ` --chage` | `chage --list` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage) | | ` --chage` | `chage --list` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage) |
| ` --cksum` | `cksum` and `sum` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cksum) | | ` --cksum` | `cksum` and `sum` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cksum) |
| ` --clf` | Common Log Format file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf) | | ` --clf` | Common and Combined Log Format file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf) |
| ` --crontab` | `crontab` command and file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab) | | ` --crontab` | `crontab` command and file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab) |
| ` --crontab-u` | `crontab` file parser with user support | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab_u) | | ` --crontab-u` | `crontab` file parser with user support | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab_u) |
| ` --csv` | CSV file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/csv) | | ` --csv` | CSV file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/csv) |

View File

@ -113,7 +113,7 @@ _jc() {
'--cef-s:CEF string streaming parser' '--cef-s:CEF string streaming parser'
'--chage:`chage --list` command parser' '--chage:`chage --list` command parser'
'--cksum:`cksum` and `sum` command parser' '--cksum:`cksum` and `sum` command parser'
'--clf:Common Log Format file parser' '--clf:Common and Combined Log Format file parser'
'--crontab:`crontab` command and file parser' '--crontab:`crontab` command and file parser'
'--crontab-u:`crontab` file parser with user support' '--crontab-u:`crontab` file parser with user support'
'--csv:CSV file parser' '--csv:CSV file parser'

View File

@ -7,9 +7,18 @@ jc - JSON Convert Common Log Format file parser
This parser will handle the Common Log Format standard as specified at This parser will handle the Common Log Format standard as specified at
https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format. https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format.
Combined Log Format is also supported. (Referer and User Agent fields added)
Extra fields may be present and will be enclosed in the `extra` field as Extra fields may be present and will be enclosed in the `extra` field as
a single string. a single string.
The `epoch` calculated timestamp field is naive. (i.e. based on the
local time of the system the parser is run on)
The `epoch_utc` calculated timestamp field is timezone-aware and is
only available if the timezone field is UTC.
Usage (cli): Usage (cli):
$ cat file.log | jc --clf $ cat file.log | jc --clf
@ -21,6 +30,8 @@ Usage (module):
Schema: Schema:
Empty strings and `-` values are converted to `null`/`None`.
[ [
{ {
"host": string, "host": string,
@ -40,6 +51,8 @@ Schema:
"request_version": string, "request_version": string,
"status": integer, "status": integer,
"bytes": integer, "bytes": integer,
"referer": string,
"user_agent": string,
"extra": string, "extra": string,
"epoch": integer, # [0] "epoch": integer, # [0]
"epoch_utc": integer # [1] "epoch_utc": integer # [1]

View File

@ -139,6 +139,8 @@ def parse(
jc.utils.input_type_check(data) jc.utils.input_type_check(data)
raw_output: List[Dict] = [] raw_output: List[Dict] = []
output_line: Dict = {}
clf_pattern = re.compile(r''' clf_pattern = re.compile(r'''
^(?P<host>-|\S+)\s ^(?P<host>-|\S+)\s
(?P<ident>-|\S+)\s (?P<ident>-|\S+)\s
@ -154,23 +156,37 @@ def parse(
(?P<tz>\S+) (?P<tz>\S+)
) )
\]\s \]\s
\"(?P<request> \"(?P<request>.*?)\"\s
(?P<request_method>\S+)\s
(?P<request_url>.*?(?=\sHTTPS?/|\"))\s? # positive lookahead for HTTP or quote mark
(?P<request_version>HTTPS?/\d\.\d)?)\"\s
(?P<status>-|\d\d\d)\s (?P<status>-|\d\d\d)\s
(?P<bytes>-|\d+)\s? (?P<bytes>-|\d+)\s?
\"(?P<referer>.*?)\"\s? (?:\"(?P<referer>.*?)\"\s?)?
\"(?P<user_agent>.*?)\"\s? (?:\"(?P<user_agent>.*?)\"\s?)?
(?P<extra>.*) (?P<extra>.*)
''', re.VERBOSE ''', re.VERBOSE
) )
request_pattern = re.compile(r'''
(?P<request_method>\S+)\s
(?P<request_url>.*?(?=\sHTTPS?/|$))\s? # positive lookahead for HTTP(S)/ or end of string
(?P<request_version>HTTPS?/[\d\.]+)?
''', re.VERBOSE
)
if jc.utils.has_data(data): if jc.utils.has_data(data):
for line in filter(None, data.splitlines()): for line in filter(None, data.splitlines()):
output_line = {}
clf_match = re.match(clf_pattern, line) clf_match = re.match(clf_pattern, line)
if clf_match: if clf_match:
raw_output.append(clf_match.groupdict()) output_line = clf_match.groupdict()
if clf_match.groupdict().get('request', None):
request_string = clf_match.groupdict()['request']
request_match = re.match(request_pattern, request_string)
if request_match:
output_line.update(request_match.groupdict())
raw_output.append(output_line)
return raw_output if raw else _process(raw_output) return raw_output if raw else _process(raw_output)

View File

@ -1,4 +1,4 @@
.TH jc 1 2022-11-20 1.22.3 "JSON Convert" .TH jc 1 2022-11-21 1.22.3 "JSON Convert"
.SH NAME .SH NAME
\fBjc\fP \- JSON Convert JSONifies the output of many CLI tools, file-types, and strings \fBjc\fP \- JSON Convert JSONifies the output of many CLI tools, file-types, and strings
.SH SYNOPSIS .SH SYNOPSIS
@ -88,7 +88,7 @@ CEF string streaming parser
.TP .TP
.B .B
\fB--clf\fP \fB--clf\fP
Common Log Format file parser Common and Combined Log Format file parser
.TP .TP
.B .B

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,17 @@
127.0.0.1 user-identifier frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTPS/1.0" 200 2326
1.1.1.2 - - [11/Nov/2016:03:04:55 +0100] "GET /" 200 83 "-" "-" - 9221 1.1.1.1
127.0.0.1 - - [11/Nov/2016:14:24:21 +0100] "GET /uno dos" 404 298 "-" "-" - 400233 1.1.1.1
127.0.0.1 - - [11/Nov/2016:14:23:37 +0100] "GET /uno dos HTTP/1.0" 404 298 "-" "-" - 385111 1.1.1.1
1.1.1.1 - - [11/Nov/2016:00:00:11 +0100] "GET /icc HTTP/1.1" 302 - "-" "XXX XXX XXX" - 6160 11.1.1.1
1.1.1.1 - - [11/Nov/2016:00:00:11 +0100] "GET /icc/ HTTP/1.1" 302 - "-" "XXX XXX XXX" - 2981 1.1.1.1
tarpon.gulf.net - - [12/Jan/1996:20:37:55 +0000] "GET index.htm HTTP/1.0" 200 215
tarpon.gulf.net - - [12/Jan/1996:20:37:56 +0000] "POST products.htm HTTP/1.0" 200 215
tarpon.gulf.net - - [12/Jan/1996:20:37:57 +0000] "PUT sales.htm HTTP/1.0" 200 215
tarpon.gulf.net - - [12/Jan/1996:20:37:58 +0000] "GET /images/log.gif HTTP/1.0" 200 215
tarpon.gulf.net - - [12/Jan/1996:20:37:59 +0000] "GET /buttons/form.gif HTTP/1.0" 200 215
66.249.66.1 - - [01/Jan/2017:09:00:00 +0000] "GET /contact.html HTTP/1.1" 200 250
66.249.66.1 - - [01/Jan/2017:09:00:00 +0000] "GET /contact.html HTTP/1.1" 200 250 "http://www.example.com/" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
jay.bird.com - fred [25/Dec/1998:17:45:35 +0000] "GET /~sret1/ HTTP/1.0" 200 1243
127.0.0.1 - peter [9/Feb/2017:10:34:12 -0700] "GET /sample-image.png HTTP/2" 200 1479
10.1.2.3 - rehg [10/Nov/2021:19:22:12 -0000] "GET /sematext.png HTTP/1.1" 200 3423

47
tests/test_clf.py Normal file
View File

@ -0,0 +1,47 @@
import os
import unittest
import json
from typing import Dict
from jc.parsers.clf import parse
# Absolute directory containing this test module; fixture paths below are
# joined onto it so the tests do not depend on the current working directory.
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
class MyTests(unittest.TestCase):
    """Fixture-driven tests for the `clf` parser's `parse` function."""

    # Raw fixture text and the expected parsed JSON, both keyed by fixture name.
    f_in: Dict = {}
    f_json: Dict = {}

    @classmethod
    def setUpClass(cls):
        """Read every fixture's input text and expected JSON once per class."""
        fixtures = {
            'clf': (
                'fixtures/generic/common-log-format.log',
                'fixtures/generic/common-log-format.json')
        }

        for name, (input_rel, expected_rel) in fixtures.items():
            input_path = os.path.join(THIS_DIR, input_rel)
            expected_path = os.path.join(THIS_DIR, expected_rel)

            with open(input_path, 'r', encoding='utf-8') as raw_file, \
                 open(expected_path, 'r', encoding='utf-8') as json_file:
                cls.f_in[name] = raw_file.read()
                cls.f_json[name] = json.load(json_file)

    def test_clf_nodata(self):
        """
        Test 'clf' with no data
        """
        result = parse('', quiet=True)
        self.assertEqual(result, [])

    def test_clf(self):
        """
        Test 'clf' with various log lines
        """
        actual = parse(self.f_in['clf'], quiet=True)
        expected = self.f_json['clf']
        self.assertEqual(actual, expected)
# Run the tests through unittest's runner when this file is executed directly.
if __name__ == '__main__':
    unittest.main()