1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-07-15 01:24:29 +02:00

fix clf request string parsing and add tests

This commit is contained in:
Kelly Brazil
2022-11-21 11:00:58 -08:00
parent 5ab2ebe45a
commit 60f1e79b2f
9 changed files with 106 additions and 12 deletions

View File

@ -1,7 +1,7 @@
jc changelog
20221118 v1.22.3
- Add Common Log Format file parser
- Add Common Log Format and Combined Log Format file parser
- Fix `git-log` and `git-log-s` parsers for failure on empty author name
- Update `os-prober` parser with split EFI partition fields
- Fix several documentation typos

View File

@ -165,7 +165,7 @@ option.
| ` --cef-s` | CEF string streaming parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cef_s) |
| ` --chage` | `chage --list` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/chage) |
| ` --cksum` | `cksum` and `sum` command parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/cksum) |
| ` --clf` | Common Log Format file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf) |
| ` --clf` | Common and Combined Log Format file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/clf) |
| ` --crontab` | `crontab` command and file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab) |
| ` --crontab-u` | `crontab` file parser with user support | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/crontab_u) |
| ` --csv` | CSV file parser | [details](https://kellyjonbrazil.github.io/jc/docs/parsers/csv) |

View File

@ -113,7 +113,7 @@ _jc() {
'--cef-s:CEF string streaming parser'
'--chage:`chage --list` command parser'
'--cksum:`cksum` and `sum` command parser'
'--clf:Common Log Format file parser'
'--clf:Common and Combined Log Format file parser'
'--crontab:`crontab` command and file parser'
'--crontab-u:`crontab` file parser with user support'
'--csv:CSV file parser'

View File

@ -7,9 +7,18 @@ jc - JSON Convert Common Log Format file parser
This parser will handle the Common Log Format standard as specified at
https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format.
Combined Log Format is also supported. (Referer and User Agent fields added)
Extra fields may be present and will be enclosed in the `extra` field as
a single string.
The `epoch` calculated timestamp field is naive. (i.e. based on the
local time of the system the parser is run on)
The `epoch_utc` calculated timestamp field is timezone-aware and is
only available if the timezone field is UTC.
Usage (cli):
$ cat file.log | jc --clf
@ -21,6 +30,8 @@ Usage (module):
Schema:
Empty strings and `-` values are converted to `null`/`None`.
[
{
"host": string,
@ -40,6 +51,8 @@ Schema:
"request_version": string,
"status": integer,
"bytes": integer,
"referer": string,
"user_agent": string,
"extra": string,
"epoch": integer, # [0]
"epoch_utc": integer # [1]

View File

@ -139,6 +139,8 @@ def parse(
jc.utils.input_type_check(data)
raw_output: List[Dict] = []
output_line: Dict = {}
clf_pattern = re.compile(r'''
^(?P<host>-|\S+)\s
(?P<ident>-|\S+)\s
@ -154,23 +156,37 @@ def parse(
(?P<tz>\S+)
)
\]\s
\"(?P<request>
(?P<request_method>\S+)\s
(?P<request_url>.*?(?=\sHTTPS?/|\"))\s? # positive lookahead for HTTP or quote mark
(?P<request_version>HTTPS?/\d\.\d)?)\"\s
\"(?P<request>.*?)\"\s
(?P<status>-|\d\d\d)\s
(?P<bytes>-|\d+)\s?
\"(?P<referer>.*?)\"\s?
\"(?P<user_agent>.*?)\"\s?
(?:\"(?P<referer>.*?)\"\s?)?
(?:\"(?P<user_agent>.*?)\"\s?)?
(?P<extra>.*)
''', re.VERBOSE
)
request_pattern = re.compile(r'''
(?P<request_method>\S+)\s
(?P<request_url>.*?(?=\sHTTPS?/|$))\s? # positive lookahead for HTTP(S)/ or end of string
(?P<request_version>HTTPS?/[\d\.]+)?
''', re.VERBOSE
)
if jc.utils.has_data(data):
for line in filter(None, data.splitlines()):
output_line = {}
clf_match = re.match(clf_pattern, line)
if clf_match:
raw_output.append(clf_match.groupdict())
output_line = clf_match.groupdict()
if clf_match.groupdict().get('request', None):
request_string = clf_match.groupdict()['request']
request_match = re.match(request_pattern, request_string)
if request_match:
output_line.update(request_match.groupdict())
raw_output.append(output_line)
return raw_output if raw else _process(raw_output)

View File

@ -1,4 +1,4 @@
.TH jc 1 2022-11-20 1.22.3 "JSON Convert"
.TH jc 1 2022-11-21 1.22.3 "JSON Convert"
.SH NAME
\fBjc\fP \- JSON Convert JSONifies the output of many CLI tools, file-types, and strings
.SH SYNOPSIS
@ -88,7 +88,7 @@ CEF string streaming parser
.TP
.B
\fB--clf\fP
Common Log Format file parser
Common and Combined Log Format file parser
.TP
.B

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,17 @@
127.0.0.1 user-identifier frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTPS/1.0" 200 2326
1.1.1.2 - - [11/Nov/2016:03:04:55 +0100] "GET /" 200 83 "-" "-" - 9221 1.1.1.1
127.0.0.1 - - [11/Nov/2016:14:24:21 +0100] "GET /uno dos" 404 298 "-" "-" - 400233 1.1.1.1
127.0.0.1 - - [11/Nov/2016:14:23:37 +0100] "GET /uno dos HTTP/1.0" 404 298 "-" "-" - 385111 1.1.1.1
1.1.1.1 - - [11/Nov/2016:00:00:11 +0100] "GET /icc HTTP/1.1" 302 - "-" "XXX XXX XXX" - 6160 11.1.1.1
1.1.1.1 - - [11/Nov/2016:00:00:11 +0100] "GET /icc/ HTTP/1.1" 302 - "-" "XXX XXX XXX" - 2981 1.1.1.1
tarpon.gulf.net - - [12/Jan/1996:20:37:55 +0000] "GET index.htm HTTP/1.0" 200 215
tarpon.gulf.net - - [12/Jan/1996:20:37:56 +0000] "POST products.htm HTTP/1.0" 200 215
tarpon.gulf.net - - [12/Jan/1996:20:37:57 +0000] "PUT sales.htm HTTP/1.0" 200 215
tarpon.gulf.net - - [12/Jan/1996:20:37:58 +0000] "GET /images/log.gif HTTP/1.0" 200 215
tarpon.gulf.net - - [12/Jan/1996:20:37:59 +0000] "GET /buttons/form.gif HTTP/1.0" 200 215
66.249.66.1 - - [01/Jan/2017:09:00:00 +0000] "GET /contact.html HTTP/1.1" 200 250
66.249.66.1 - - [01/Jan/2017:09:00:00 +0000] "GET /contact.html HTTP/1.1" 200 250 "http://www.example.com/" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)"
jay.bird.com - fred [25/Dec/1998:17:45:35 +0000] "GET /~sret1/ HTTP/1.0" 200 1243
127.0.0.1 - peter [9/Feb/2017:10:34:12 -0700] "GET /sample-image.png HTTP/2" 200 1479
10.1.2.3 - rehg [10/Nov/2021:19:22:12 -0000] "GET /sematext.png HTTP/1.1" 200 3423

47
tests/test_clf.py Normal file
View File

@ -0,0 +1,47 @@
import os
import unittest
import json
from typing import Dict
from jc.parsers.clf import parse
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
class MyTests(unittest.TestCase):
f_in: Dict = {}
f_json: Dict = {}
@classmethod
def setUpClass(cls):
fixtures = {
'clf': (
'fixtures/generic/common-log-format.log',
'fixtures/generic/common-log-format.json')
}
for file, filepaths in fixtures.items():
with open(os.path.join(THIS_DIR, filepaths[0]), 'r', encoding='utf-8') as a, \
open(os.path.join(THIS_DIR, filepaths[1]), 'r', encoding='utf-8') as b:
cls.f_in[file] = a.read()
cls.f_json[file] = json.loads(b.read())
def test_clf_nodata(self):
"""
Test 'clf' with no data
"""
self.assertEqual(parse('', quiet=True), [])
def test_clf(self):
"""
Test 'clf' with various log lines
"""
self.assertEqual(parse(
self.f_in['clf'], quiet=True),
self.f_json['clf']
)
if __name__ == '__main__':
unittest.main()