From 60f1e79b2fd274042bc382601a031923111b0ec3 Mon Sep 17 00:00:00 2001 From: Kelly Brazil Date: Mon, 21 Nov 2022 11:00:58 -0800 Subject: [PATCH] fix clf request string parsing and add tests --- CHANGELOG | 2 +- README.md | 2 +- completions/jc_zsh_completion.sh | 2 +- docs/parsers/clf.md | 13 +++++ jc/parsers/clf.py | 30 +++++++++--- man/jc.1 | 4 +- tests/fixtures/generic/common-log-format.json | 1 + tests/fixtures/generic/common-log-format.log | 17 +++++++ tests/test_clf.py | 47 +++++++++++++++++++ 9 files changed, 106 insertions(+), 12 deletions(-) create mode 100644 tests/fixtures/generic/common-log-format.json create mode 100644 tests/fixtures/generic/common-log-format.log create mode 100644 tests/test_clf.py diff --git a/CHANGELOG b/CHANGELOG index 5f646ef0..8fb77f94 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,7 @@ jc changelog 20221118 v1.22.3 -- Add Common Log Format file parser +- Add Common Log Format and Combined Log Format file parser - Fix `git-log` and `git-log-s` parsers for failure on empty author name - Update `os-prober` parser with split EFI partition fields - Fix several documentation typos diff --git a/README.md b/README.md index fcae2609..8a92285d 100644 --- a/README.md +++ b/README.md @@ -165,7 +165,7 @@ option. 
| ` --cef-s` | CEF string streaming parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef_s) | | ` --chage` | `chage --list` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage) | | ` --cksum` | `cksum` and `sum` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cksum) | -| ` --clf` | Common Log Format file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf) | +| ` --clf` | Common and Combined Log Format file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf) | | ` --crontab` | `crontab` command and file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab) | | ` --crontab-u` | `crontab` file parser with user support | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab_u) | | ` --csv` | CSV file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/csv) | diff --git a/completions/jc_zsh_completion.sh b/completions/jc_zsh_completion.sh index 93d541b4..94f3c48f 100644 --- a/completions/jc_zsh_completion.sh +++ b/completions/jc_zsh_completion.sh @@ -113,7 +113,7 @@ _jc() { '--cef-s:CEF string streaming parser' '--chage:`chage --list` command parser' '--cksum:`cksum` and `sum` command parser' - '--clf:Common Log Format file parser' + '--clf:Common and Combined Log Format file parser' '--crontab:`crontab` command and file parser' '--crontab-u:`crontab` file parser with user support' '--csv:CSV file parser' diff --git a/docs/parsers/clf.md b/docs/parsers/clf.md index 37797b3c..37fe88e8 100644 --- a/docs/parsers/clf.md +++ b/docs/parsers/clf.md @@ -7,9 +7,18 @@ jc - JSON Convert Common Log Format file parser This parser will handle the Common Log Format standard as specified at https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format. + +Combined Log Format is also supported. 
(Referer and User Agent fields added) + Extra fields may be present and will be enclosed in the `extra` field as a single string. +The `epoch` calculated timestamp field is naive. (i.e. based on the +local time of the system the parser is run on) + +The `epoch_utc` calculated timestamp field is timezone-aware and is +only available if the timezone field is UTC. + Usage (cli): $ cat file.log | jc --clf @@ -21,6 +30,8 @@ Usage (module): Schema: +Empty strings and `-` values are converted to `null`/`None`. + [ { "host": string, @@ -40,6 +51,8 @@ Schema: "request_version": string, "status": integer, "bytes": integer, + "referer": string, + "user_agent": string, "extra": string, "epoch": integer, # [0] "epoch_utc": integer # [1] diff --git a/jc/parsers/clf.py b/jc/parsers/clf.py index 9353e337..ef054142 100644 --- a/jc/parsers/clf.py +++ b/jc/parsers/clf.py @@ -139,6 +139,8 @@ def parse( jc.utils.input_type_check(data) raw_output: List[Dict] = [] + output_line: Dict = {} + clf_pattern = re.compile(r''' ^(?P<host>-|\S+)\s (?P<ident>-|\S+)\s @@ -154,23 +156,37 @@ def parse( (?P<tz>\S+) ) \]\s - \"(?P<request> - (?P<request_method>\S+)\s - (?P<request_url>.*?(?=\sHTTPS?/|\"))\s? # positive lookahead for HTTP or quote mark - (?P<request_version>HTTPS?/\d\.\d)?)\"\s + \"(?P<request>.*?)\"\s (?P<status>-|\d\d\d)\s (?P<bytes>-|\d+)\s? - \"(?P<referer>.*?)\"\s? - \"(?P<user_agent>.*?)\"\s? + (?:\"(?P<referer>.*?)\"\s?)? + (?:\"(?P<user_agent>.*?)\"\s?)? (?P<extra>.*) ''', re.VERBOSE ) + request_pattern = re.compile(r''' + (?P<request_method>\S+)\s + (?P<request_url>.*?(?=\sHTTPS?/|$))\s? # positive lookahead for HTTP(S)/ or end of string + (?P<request_version>HTTPS?/[\d\.]+)? 
+ ''', re.VERBOSE + ) + if jc.utils.has_data(data): for line in filter(None, data.splitlines()): + output_line = {} clf_match = re.match(clf_pattern, line) + if clf_match: - raw_output.append(clf_match.groupdict()) + output_line = clf_match.groupdict() + + if clf_match.groupdict().get('request', None): + request_string = clf_match.groupdict()['request'] + request_match = re.match(request_pattern, request_string) + if request_match: + output_line.update(request_match.groupdict()) + + raw_output.append(output_line) return raw_output if raw else _process(raw_output) diff --git a/man/jc.1 b/man/jc.1 index 071b3413..0f67cb4c 100644 --- a/man/jc.1 +++ b/man/jc.1 @@ -1,4 +1,4 @@ -.TH jc 1 2022-11-20 1.22.3 "JSON Convert" +.TH jc 1 2022-11-21 1.22.3 "JSON Convert" .SH NAME \fBjc\fP \- JSON Convert JSONifies the output of many CLI tools, file-types, and strings .SH SYNOPSIS @@ -88,7 +88,7 @@ CEF string streaming parser .TP .B \fB--clf\fP -Common Log Format file parser +Common and Combined Log Format file parser .TP .B diff --git a/tests/fixtures/generic/common-log-format.json b/tests/fixtures/generic/common-log-format.json new file mode 100644 index 00000000..397cd84a --- /dev/null +++ b/tests/fixtures/generic/common-log-format.json @@ -0,0 +1 @@ +[{"host":"127.0.0.1","ident":"user-identifier","authuser":"frank","date":"10/Oct/2000:13:55:36 -0700","day":10,"month":"Oct","year":2000,"hour":13,"minute":55,"second":36,"tz":"-0700","request":"GET /apache_pb.gif HTTPS/1.0","status":200,"bytes":2326,"referer":null,"user_agent":null,"extra":null,"request_method":"GET","request_url":"/apache_pb.gif","request_version":"HTTPS/1.0","epoch":971211336,"epoch_utc":null},{"host":"1.1.1.2","ident":null,"authuser":null,"date":"11/Nov/2016:03:04:55 +0100","day":11,"month":"Nov","year":2016,"hour":3,"minute":4,"second":55,"tz":"+0100","request":"GET /","status":200,"bytes":83,"referer":null,"user_agent":null,"extra":"- 9221 
1.1.1.1","request_method":"GET","request_url":"/","request_version":null,"epoch":1478862295,"epoch_utc":null},{"host":"127.0.0.1","ident":null,"authuser":null,"date":"11/Nov/2016:14:24:21 +0100","day":11,"month":"Nov","year":2016,"hour":14,"minute":24,"second":21,"tz":"+0100","request":"GET /uno dos","status":404,"bytes":298,"referer":null,"user_agent":null,"extra":"- 400233 1.1.1.1","request_method":"GET","request_url":"/uno dos","request_version":null,"epoch":1478903061,"epoch_utc":null},{"host":"127.0.0.1","ident":null,"authuser":null,"date":"11/Nov/2016:14:23:37 +0100","day":11,"month":"Nov","year":2016,"hour":14,"minute":23,"second":37,"tz":"+0100","request":"GET /uno dos HTTP/1.0","status":404,"bytes":298,"referer":null,"user_agent":null,"extra":"- 385111 1.1.1.1","request_method":"GET","request_url":"/uno dos","request_version":"HTTP/1.0","epoch":1478903017,"epoch_utc":null},{"host":"1.1.1.1","ident":null,"authuser":null,"date":"11/Nov/2016:00:00:11 +0100","day":11,"month":"Nov","year":2016,"hour":0,"minute":0,"second":11,"tz":"+0100","request":"GET /icc HTTP/1.1","status":302,"bytes":null,"referer":null,"user_agent":"XXX XXX XXX","extra":"- 6160 11.1.1.1","request_method":"GET","request_url":"/icc","request_version":"HTTP/1.1","epoch":1478851211,"epoch_utc":null},{"host":"1.1.1.1","ident":null,"authuser":null,"date":"11/Nov/2016:00:00:11 +0100","day":11,"month":"Nov","year":2016,"hour":0,"minute":0,"second":11,"tz":"+0100","request":"GET /icc/ HTTP/1.1","status":302,"bytes":null,"referer":null,"user_agent":"XXX XXX XXX","extra":"- 2981 1.1.1.1","request_method":"GET","request_url":"/icc/","request_version":"HTTP/1.1","epoch":1478851211,"epoch_utc":null},{"host":"tarpon.gulf.net","ident":null,"authuser":null,"date":"12/Jan/1996:20:37:55 +0000","day":12,"month":"Jan","year":1996,"hour":20,"minute":37,"second":55,"tz":"+0000","request":"GET index.htm 
HTTP/1.0","status":200,"bytes":215,"referer":null,"user_agent":null,"extra":null,"request_method":"GET","request_url":"index.htm","request_version":"HTTP/1.0","epoch":821507875,"epoch_utc":821479075},{"host":"tarpon.gulf.net","ident":null,"authuser":null,"date":"12/Jan/1996:20:37:56 +0000","day":12,"month":"Jan","year":1996,"hour":20,"minute":37,"second":56,"tz":"+0000","request":"POST products.htm HTTP/1.0","status":200,"bytes":215,"referer":null,"user_agent":null,"extra":null,"request_method":"POST","request_url":"products.htm","request_version":"HTTP/1.0","epoch":821507876,"epoch_utc":821479076},{"host":"tarpon.gulf.net","ident":null,"authuser":null,"date":"12/Jan/1996:20:37:57 +0000","day":12,"month":"Jan","year":1996,"hour":20,"minute":37,"second":57,"tz":"+0000","request":"PUT sales.htm HTTP/1.0","status":200,"bytes":215,"referer":null,"user_agent":null,"extra":null,"request_method":"PUT","request_url":"sales.htm","request_version":"HTTP/1.0","epoch":821507877,"epoch_utc":821479077},{"host":"tarpon.gulf.net","ident":null,"authuser":null,"date":"12/Jan/1996:20:37:58 +0000","day":12,"month":"Jan","year":1996,"hour":20,"minute":37,"second":58,"tz":"+0000","request":"GET /images/log.gif HTTP/1.0","status":200,"bytes":215,"referer":null,"user_agent":null,"extra":null,"request_method":"GET","request_url":"/images/log.gif","request_version":"HTTP/1.0","epoch":821507878,"epoch_utc":821479078},{"host":"tarpon.gulf.net","ident":null,"authuser":null,"date":"12/Jan/1996:20:37:59 +0000","day":12,"month":"Jan","year":1996,"hour":20,"minute":37,"second":59,"tz":"+0000","request":"GET /buttons/form.gif HTTP/1.0","status":200,"bytes":215,"referer":null,"user_agent":null,"extra":null,"request_method":"GET","request_url":"/buttons/form.gif","request_version":"HTTP/1.0","epoch":821507879,"epoch_utc":821479079},{"host":"66.249.66.1","ident":null,"authuser":null,"date":"01/Jan/2017:09:00:00 
+0000","day":1,"month":"Jan","year":2017,"hour":9,"minute":0,"second":0,"tz":"+0000","request":"GET /contact.html HTTP/1.1","status":200,"bytes":250,"referer":null,"user_agent":null,"extra":null,"request_method":"GET","request_url":"/contact.html","request_version":"HTTP/1.1","epoch":1483290000,"epoch_utc":1483261200},{"host":"66.249.66.1","ident":null,"authuser":null,"date":"01/Jan/2017:09:00:00 +0000","day":1,"month":"Jan","year":2017,"hour":9,"minute":0,"second":0,"tz":"+0000","request":"GET /contact.html HTTP/1.1","status":200,"bytes":250,"referer":"http://www.example.com/","user_agent":"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)","extra":null,"request_method":"GET","request_url":"/contact.html","request_version":"HTTP/1.1","epoch":1483290000,"epoch_utc":1483261200},{"host":"127.0.0.1","ident":null,"authuser":"frank","date":"10/Oct/2000:13:55:36 -0700","day":10,"month":"Oct","year":2000,"hour":13,"minute":55,"second":36,"tz":"-0700","request":"GET /apache_pb.gif HTTP/1.0","status":200,"bytes":2326,"referer":"http://www.example.com/start.html","user_agent":"Mozilla/4.08 [en] (Win98; I ;Nav)","extra":null,"request_method":"GET","request_url":"/apache_pb.gif","request_version":"HTTP/1.0","epoch":971211336,"epoch_utc":null},{"host":"jay.bird.com","ident":null,"authuser":"fred","date":"25/Dec/1998:17:45:35 +0000","day":25,"month":"Dec","year":1998,"hour":17,"minute":45,"second":35,"tz":"+0000","request":"GET /~sret1/ HTTP/1.0","status":200,"bytes":1243,"referer":null,"user_agent":null,"extra":null,"request_method":"GET","request_url":"/~sret1/","request_version":"HTTP/1.0","epoch":914636735,"epoch_utc":914607935},{"host":"127.0.0.1","ident":null,"authuser":"peter","date":"9/Feb/2017:10:34:12 -0700","day":9,"month":"Feb","year":2017,"hour":10,"minute":34,"second":12,"tz":"-0700","request":"GET /sample-image.png 
HTTP/2","status":200,"bytes":1479,"referer":null,"user_agent":null,"extra":null,"request_method":"GET","request_url":"/sample-image.png","request_version":"HTTP/2","epoch":1486665252,"epoch_utc":null},{"host":"10.1.2.3","ident":null,"authuser":"rehg","date":"10/Nov/2021:19:22:12 -0000","day":10,"month":"Nov","year":2021,"hour":19,"minute":22,"second":12,"tz":"-0000","request":"GET /sematext.png HTTP/1.1","status":200,"bytes":3423,"referer":null,"user_agent":null,"extra":null,"request_method":"GET","request_url":"/sematext.png","request_version":"HTTP/1.1","epoch":1636600932,"epoch_utc":1636572132}] diff --git a/tests/fixtures/generic/common-log-format.log b/tests/fixtures/generic/common-log-format.log new file mode 100644 index 00000000..8ede8a16 --- /dev/null +++ b/tests/fixtures/generic/common-log-format.log @@ -0,0 +1,17 @@ +127.0.0.1 user-identifier frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTPS/1.0" 200 2326 +1.1.1.2 - - [11/Nov/2016:03:04:55 +0100] "GET /" 200 83 "-" "-" - 9221 1.1.1.1 +127.0.0.1 - - [11/Nov/2016:14:24:21 +0100] "GET /uno dos" 404 298 "-" "-" - 400233 1.1.1.1 +127.0.0.1 - - [11/Nov/2016:14:23:37 +0100] "GET /uno dos HTTP/1.0" 404 298 "-" "-" - 385111 1.1.1.1 +1.1.1.1 - - [11/Nov/2016:00:00:11 +0100] "GET /icc HTTP/1.1" 302 - "-" "XXX XXX XXX" - 6160 11.1.1.1 +1.1.1.1 - - [11/Nov/2016:00:00:11 +0100] "GET /icc/ HTTP/1.1" 302 - "-" "XXX XXX XXX" - 2981 1.1.1.1 +tarpon.gulf.net - - [12/Jan/1996:20:37:55 +0000] "GET index.htm HTTP/1.0" 200 215 +tarpon.gulf.net - - [12/Jan/1996:20:37:56 +0000] "POST products.htm HTTP/1.0" 200 215 +tarpon.gulf.net - - [12/Jan/1996:20:37:57 +0000] "PUT sales.htm HTTP/1.0" 200 215 +tarpon.gulf.net - - [12/Jan/1996:20:37:58 +0000] "GET /images/log.gif HTTP/1.0" 200 215 +tarpon.gulf.net - - [12/Jan/1996:20:37:59 +0000] "GET /buttons/form.gif HTTP/1.0" 200 215 +66.249.66.1 - - [01/Jan/2017:09:00:00 +0000] "GET /contact.html HTTP/1.1" 200 250 +66.249.66.1 - - [01/Jan/2017:09:00:00 +0000] "GET /contact.html 
HTTP/1.1" 200 250 "http://www.example.com/" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" +127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)" +jay.bird.com - fred [25/Dec/1998:17:45:35 +0000] "GET /~sret1/ HTTP/1.0" 200 1243 +127.0.0.1 - peter [9/Feb/2017:10:34:12 -0700] "GET /sample-image.png HTTP/2" 200 1479 +10.1.2.3 - rehg [10/Nov/2021:19:22:12 -0000] "GET /sematext.png HTTP/1.1" 200 3423 diff --git a/tests/test_clf.py b/tests/test_clf.py new file mode 100644 index 00000000..f7dd43c1 --- /dev/null +++ b/tests/test_clf.py @@ -0,0 +1,47 @@ +import os +import unittest +import json +from typing import Dict +from jc.parsers.clf import parse + +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) + + +class MyTests(unittest.TestCase): + f_in: Dict = {} + f_json: Dict = {} + + @classmethod + def setUpClass(cls): + fixtures = { + 'clf': ( + 'fixtures/generic/common-log-format.log', + 'fixtures/generic/common-log-format.json') + } + + for file, filepaths in fixtures.items(): + with open(os.path.join(THIS_DIR, filepaths[0]), 'r', encoding='utf-8') as a, \ + open(os.path.join(THIS_DIR, filepaths[1]), 'r', encoding='utf-8') as b: + cls.f_in[file] = a.read() + cls.f_json[file] = json.loads(b.read()) + + + def test_clf_nodata(self): + """ + Test 'clf' with no data + """ + self.assertEqual(parse('', quiet=True), []) + + + def test_clf(self): + """ + Test 'clf' with various log lines + """ + self.assertEqual(parse( + self.f_in['clf'], quiet=True), + self.f_json['clf'] + ) + + +if __name__ == '__main__': + unittest.main()