mirror of
https://github.com/kellyjonbrazil/jc.git
synced 2025-07-15 01:24:29 +02:00
fix clf request string parsing and add tests
This commit is contained in:
@ -1,7 +1,7 @@
|
|||||||
jc changelog
|
jc changelog
|
||||||
|
|
||||||
20221118 v1.22.3
|
20221118 v1.22.3
|
||||||
- Add Common Log Format file parser
|
- Add Common Log Format and Combined Log Format file parser
|
||||||
- Fix `git-log` and `git-log-s` parsers for failure on empty author name
|
- Fix `git-log` and `git-log-s` parsers for failure on empty author name
|
||||||
- Update `os-prober` parser with split EFI partition fields
|
- Update `os-prober` parser with split EFI partition fields
|
||||||
- Fix several documentation typos
|
- Fix several documentation typos
|
||||||
|
@ -165,7 +165,7 @@ option.
|
|||||||
| ` --cef-s` | CEF string streaming parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef_s) |
|
| ` --cef-s` | CEF string streaming parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef_s) |
|
||||||
| ` --chage` | `chage --list` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage) |
|
| ` --chage` | `chage --list` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage) |
|
||||||
| ` --cksum` | `cksum` and `sum` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cksum) |
|
| ` --cksum` | `cksum` and `sum` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cksum) |
|
||||||
| ` --clf` | Common Log Format file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf) |
|
| ` --clf` | Common and Combined Log Format file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf) |
|
||||||
| ` --crontab` | `crontab` command and file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab) |
|
| ` --crontab` | `crontab` command and file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab) |
|
||||||
| ` --crontab-u` | `crontab` file parser with user support | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab_u) |
|
| ` --crontab-u` | `crontab` file parser with user support | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab_u) |
|
||||||
| ` --csv` | CSV file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/csv) |
|
| ` --csv` | CSV file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/csv) |
|
||||||
|
@ -113,7 +113,7 @@ _jc() {
|
|||||||
'--cef-s:CEF string streaming parser'
|
'--cef-s:CEF string streaming parser'
|
||||||
'--chage:`chage --list` command parser'
|
'--chage:`chage --list` command parser'
|
||||||
'--cksum:`cksum` and `sum` command parser'
|
'--cksum:`cksum` and `sum` command parser'
|
||||||
'--clf:Common Log Format file parser'
|
'--clf:Common and Combined Log Format file parser'
|
||||||
'--crontab:`crontab` command and file parser'
|
'--crontab:`crontab` command and file parser'
|
||||||
'--crontab-u:`crontab` file parser with user support'
|
'--crontab-u:`crontab` file parser with user support'
|
||||||
'--csv:CSV file parser'
|
'--csv:CSV file parser'
|
||||||
|
@ -7,9 +7,18 @@ jc - JSON Convert Common Log Format file parser
|
|||||||
|
|
||||||
This parser will handle the Common Log Format standard as specified at
|
This parser will handle the Common Log Format standard as specified at
|
||||||
https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format.
|
https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format.
|
||||||
|
|
||||||
|
Combined Log Format is also supported. (Referer and User Agent fields added)
|
||||||
|
|
||||||
Extra fields may be present and will be enclosed in the `extra` field as
|
Extra fields may be present and will be enclosed in the `extra` field as
|
||||||
a single string.
|
a single string.
|
||||||
|
|
||||||
|
The `epoch` calculated timestamp field is naive. (i.e. based on the
|
||||||
|
local time of the system the parser is run on)
|
||||||
|
|
||||||
|
The `epoch_utc` calculated timestamp field is timezone-aware and is
|
||||||
|
only available if the timezone field is UTC.
|
||||||
|
|
||||||
Usage (cli):
|
Usage (cli):
|
||||||
|
|
||||||
$ cat file.log | jc --clf
|
$ cat file.log | jc --clf
|
||||||
@ -21,6 +30,8 @@ Usage (module):
|
|||||||
|
|
||||||
Schema:
|
Schema:
|
||||||
|
|
||||||
|
Empty strings and `-` values are converted to `null`/`None`.
|
||||||
|
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"host": string,
|
"host": string,
|
||||||
@ -40,6 +51,8 @@ Schema:
|
|||||||
"request_version": string,
|
"request_version": string,
|
||||||
"status": integer,
|
"status": integer,
|
||||||
"bytes": integer,
|
"bytes": integer,
|
||||||
|
"referer": string,
|
||||||
|
"user_agent": string,
|
||||||
"extra": string,
|
"extra": string,
|
||||||
"epoch": integer, # [0]
|
"epoch": integer, # [0]
|
||||||
"epoch_utc": integer # [1]
|
"epoch_utc": integer # [1]
|
||||||
|
@ -139,6 +139,8 @@ def parse(
|
|||||||
jc.utils.input_type_check(data)
|
jc.utils.input_type_check(data)
|
||||||
|
|
||||||
raw_output: List[Dict] = []
|
raw_output: List[Dict] = []
|
||||||
|
output_line: Dict = {}
|
||||||
|
|
||||||
clf_pattern = re.compile(r'''
|
clf_pattern = re.compile(r'''
|
||||||
^(?P<host>-|\S+)\s
|
^(?P<host>-|\S+)\s
|
||||||
(?P<ident>-|\S+)\s
|
(?P<ident>-|\S+)\s
|
||||||
@ -154,23 +156,37 @@ def parse(
|
|||||||
(?P<tz>\S+)
|
(?P<tz>\S+)
|
||||||
)
|
)
|
||||||
\]\s
|
\]\s
|
||||||
\"(?P<request>
|
\"(?P<request>.*?)\"\s
|
||||||
(?P<request_method>\S+)\s
|
|
||||||
(?P<request_url>.*?(?=\sHTTPS?/|\"))\s? # positive lookahead for HTTP or quote mark
|
|
||||||
(?P<request_version>HTTPS?/\d\.\d)?)\"\s
|
|
||||||
(?P<status>-|\d\d\d)\s
|
(?P<status>-|\d\d\d)\s
|
||||||
(?P<bytes>-|\d+)\s?
|
(?P<bytes>-|\d+)\s?
|
||||||
\"(?P<referer>.*?)\"\s?
|
(?:\"(?P<referer>.*?)\"\s?)?
|
||||||
\"(?P<user_agent>.*?)\"\s?
|
(?:\"(?P<user_agent>.*?)\"\s?)?
|
||||||
(?P<extra>.*)
|
(?P<extra>.*)
|
||||||
''', re.VERBOSE
|
''', re.VERBOSE
|
||||||
)
|
)
|
||||||
|
|
||||||
|
request_pattern = re.compile(r'''
|
||||||
|
(?P<request_method>\S+)\s
|
||||||
|
(?P<request_url>.*?(?=\sHTTPS?/|$))\s? # positive lookahead for HTTP(S)/ or end of string
|
||||||
|
(?P<request_version>HTTPS?/[\d\.]+)?
|
||||||
|
''', re.VERBOSE
|
||||||
|
)
|
||||||
|
|
||||||
if jc.utils.has_data(data):
|
if jc.utils.has_data(data):
|
||||||
|
|
||||||
for line in filter(None, data.splitlines()):
|
for line in filter(None, data.splitlines()):
|
||||||
|
output_line = {}
|
||||||
clf_match = re.match(clf_pattern, line)
|
clf_match = re.match(clf_pattern, line)
|
||||||
|
|
||||||
if clf_match:
|
if clf_match:
|
||||||
raw_output.append(clf_match.groupdict())
|
output_line = clf_match.groupdict()
|
||||||
|
|
||||||
|
if clf_match.groupdict().get('request', None):
|
||||||
|
request_string = clf_match.groupdict()['request']
|
||||||
|
request_match = re.match(request_pattern, request_string)
|
||||||
|
if request_match:
|
||||||
|
output_line.update(request_match.groupdict())
|
||||||
|
|
||||||
|
raw_output.append(output_line)
|
||||||
|
|
||||||
return raw_output if raw else _process(raw_output)
|
return raw_output if raw else _process(raw_output)
|
||||||
|
4
man/jc.1
4
man/jc.1
@ -1,4 +1,4 @@
|
|||||||
.TH jc 1 2022-11-20 1.22.3 "JSON Convert"
|
.TH jc 1 2022-11-21 1.22.3 "JSON Convert"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
\fBjc\fP \- JSON Convert JSONifies the output of many CLI tools, file-types, and strings
|
\fBjc\fP \- JSON Convert JSONifies the output of many CLI tools, file-types, and strings
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
@ -88,7 +88,7 @@ CEF string streaming parser
|
|||||||
.TP
|
.TP
|
||||||
.B
|
.B
|
||||||
\fB--clf\fP
|
\fB--clf\fP
|
||||||
Common Log Format file parser
|
Common and Combined Log Format file parser
|
||||||
|
|
||||||
.TP
|
.TP
|
||||||
.B
|
.B
|
||||||
|
1
tests/fixtures/generic/common-log-format.json
vendored
Normal file
1
tests/fixtures/generic/common-log-format.json
vendored
Normal file
File diff suppressed because one or more lines are too long
17
tests/fixtures/generic/common-log-format.log
vendored
Normal file
17
tests/fixtures/generic/common-log-format.log
vendored
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
127.0.0.1 user-identifier frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTPS/1.0" 200 2326
|
||||||
|
1.1.1.2 - - [11/Nov/2016:03:04:55 +0100] "GET /" 200 83 "-" "-" - 9221 1.1.1.1
|
||||||
|
127.0.0.1 - - [11/Nov/2016:14:24:21 +0100] "GET /uno dos" 404 298 "-" "-" - 400233 1.1.1.1
|
||||||
|
127.0.0.1 - - [11/Nov/2016:14:23:37 +0100] "GET /uno dos HTTP/1.0" 404 298 "-" "-" - 385111 1.1.1.1
|
||||||
|
1.1.1.1 - - [11/Nov/2016:00:00:11 +0100] "GET /icc HTTP/1.1" 302 - "-" "XXX XXX XXX" - 6160 11.1.1.1
|
||||||
|
1.1.1.1 - - [11/Nov/2016:00:00:11 +0100] "GET /icc/ HTTP/1.1" 302 - "-" "XXX XXX XXX" - 2981 1.1.1.1
|
||||||
|
tarpon.gulf.net - - [12/Jan/1996:20:37:55 +0000] "GET index.htm HTTP/1.0" 200 215
|
||||||
|
tarpon.gulf.net - - [12/Jan/1996:20:37:56 +0000] "POST products.htm HTTP/1.0" 200 215
|
||||||
|
tarpon.gulf.net - - [12/Jan/1996:20:37:57 +0000] "PUT sales.htm HTTP/1.0" 200 215
|
||||||
|
tarpon.gulf.net - - [12/Jan/1996:20:37:58 +0000] "GET /images/log.gif HTTP/1.0" 200 215
|
||||||
|
tarpon.gulf.net - - [12/Jan/1996:20:37:59 +0000] "GET /buttons/form.gif HTTP/1.0" 200 215
|
||||||
|
66.249.66.1 - - [01/Jan/2017:09:00:00 +0000] "GET /contact.html HTTP/1.1" 200 250
|
||||||
|
66.249.66.1 - - [01/Jan/2017:09:00:00 +0000] "GET /contact.html HTTP/1.1" 200 250 "http://www.example.com/" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
||||||
|
127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
|
||||||
|
jay.bird.com - fred [25/Dec/1998:17:45:35 +0000] "GET /~sret1/ HTTP/1.0" 200 1243
|
||||||
|
127.0.0.1 - peter [9/Feb/2017:10:34:12 -0700] "GET /sample-image.png HTTP/2" 200 1479
|
||||||
|
10.1.2.3 - rehg [10/Nov/2021:19:22:12 -0000] "GET /sematext.png HTTP/1.1" 200 3423
|
47
tests/test_clf.py
Normal file
47
tests/test_clf.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
import json
|
||||||
|
from typing import Dict
|
||||||
|
from jc.parsers.clf import parse
|
||||||
|
|
||||||
|
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
|
|
||||||
|
class MyTests(unittest.TestCase):
|
||||||
|
f_in: Dict = {}
|
||||||
|
f_json: Dict = {}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setUpClass(cls):
|
||||||
|
fixtures = {
|
||||||
|
'clf': (
|
||||||
|
'fixtures/generic/common-log-format.log',
|
||||||
|
'fixtures/generic/common-log-format.json')
|
||||||
|
}
|
||||||
|
|
||||||
|
for file, filepaths in fixtures.items():
|
||||||
|
with open(os.path.join(THIS_DIR, filepaths[0]), 'r', encoding='utf-8') as a, \
|
||||||
|
open(os.path.join(THIS_DIR, filepaths[1]), 'r', encoding='utf-8') as b:
|
||||||
|
cls.f_in[file] = a.read()
|
||||||
|
cls.f_json[file] = json.loads(b.read())
|
||||||
|
|
||||||
|
|
||||||
|
def test_clf_nodata(self):
|
||||||
|
"""
|
||||||
|
Test 'clf' with no data
|
||||||
|
"""
|
||||||
|
self.assertEqual(parse('', quiet=True), [])
|
||||||
|
|
||||||
|
|
||||||
|
def test_clf(self):
|
||||||
|
"""
|
||||||
|
Test 'clf' with various log lines
|
||||||
|
"""
|
||||||
|
self.assertEqual(parse(
|
||||||
|
self.f_in['clf'], quiet=True),
|
||||||
|
self.f_json['clf']
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
Reference in New Issue
Block a user