From 39f43aad634c512eb7d8f66e2af32c994561defb Mon Sep 17 00:00:00 2001 From: Kelly Brazil Date: Sun, 4 Feb 2024 11:17:01 -0800 Subject: [PATCH] add http_headers and curl_head parsers --- jc/lib.py | 2 + jc/parsers/curl_head.py | 285 ++++++++++++++++++++++++ jc/parsers/http_headers.py | 444 +++++++++++++++++++++++++++++++++++++ 3 files changed, 731 insertions(+) create mode 100644 jc/parsers/curl_head.py create mode 100644 jc/parsers/http_headers.py diff --git a/jc/lib.py b/jc/lib.py index 2ff060ea..a584a634 100644 --- a/jc/lib.py +++ b/jc/lib.py @@ -33,6 +33,7 @@ parsers: List[str] = [ 'crontab-u', 'csv', 'csv-s', + 'curl-head', 'date', 'datetime-iso', 'debconf-show', @@ -62,6 +63,7 @@ parsers: List[str] = [ 'history', 'host', 'hosts', + 'http-headers', 'id', 'ifconfig', 'ini', diff --git a/jc/parsers/curl_head.py b/jc/parsers/curl_head.py new file mode 100644 index 00000000..05be6966 --- /dev/null +++ b/jc/parsers/curl_head.py @@ -0,0 +1,285 @@ +"""jc - JSON Convert `curl --head` command output parser + +This parser converts standard and verbose `curl --head` output. + +When converting verbose output from `curl` (to gather request headers, for +example) you will need to redirect STDERR to STDOUT with `2>&1`. The magic +syntax will not work in this case. + +Usage (cli): + + $ curl --head www.example.com | jc --curl-head + $ curl -Ivs www.example.com 2>&1 | jc --curl-head + +or + + $ jc curl --head www.example.com + +Usage (module): + + import jc + result = jc.parse('curl_head', curl_head_command_output) + +Schema: + + [ + { + "
": string, + "accept": [ + string + ], + "accept-ch": [ + string + ], + "accept-ch-lifetime": integer, + "accept-encoding": [ + string + ], + "accept-language": [ + string + ], + "accept-patch": [ + string + ], + "accept-post": [ + string + ], + "accept-ranges": [ + string + ], + "access-control-allow-headers": [ + string + ], + "access-control-allow-methods": [ + string + ], + "access-control-expose-headers": [ + string + ], + "access-control-max-age": integer, + "access-control-request-headers": [ + string + ], + "age": integer, + "allow": [ + string + ], + "alt-svc": [ + string + ], + "cache-control": [ + string + ], + "clear-site-data": [ + string + ], + "connection": [ + string + ], + "content-encoding": [ + string + ], + "content-dpr": integer, + "content-language": [ + string + ], + "content-length": integer, + "content-security-policy": [ + string + ], + "content-security-policy-report-only": [ + string + ], + "cookie": [ + string + ], + "critical-ch": [ + string + ], + "date": string, + "date_epoch_utc": integer, + "expect-ct": [ + string + ], + "expires": string, + "expires_epoch_utc": integer, + "device-memory": integer, + "downlink": integer, + "dpr": integer, + "forwarded": [ + string + ], + "if-match": [ + string + ], + "if-modified-since": string, + "if-modified-since_epoch_utc": integer, + "if-none-match": [ + string + ], + "if-range": string, + "if-range_epoch_utc": integer, + "if-unmodified-since": string, + "if-unmodified-since_epoch_utc": integer, + "im": [ + string + ], + "keep-alive": [ + string + ], + "large-allocation": integer, + "last-modified": string, + "last-modified_epoch_utc": integer, + "link": [ + string + ], + "max-forwards": integer, + "memento-datetime": string, + "memento-datetime_epoch_utc": integer, + "permissions-policy": [ + string + ], + "pragma": [ + string + ], + "proxy-authenticate": [ + string + ], + "retry-after": string, + "retry-after_epoch_utc": integer, + "rtt": integer, + "sec-ch-ua": [ + string + ], + "sec-ch-ua-full-version-list": [ + string + ], + "server": [ + string + ], + "server-timing": [ + string + ], + "set-cookie": [ + string + ], + "timing-allow-origin": [ + string + ], + "trailer": [ + string + ], + "transfer-encoding": [ + string + ], + "upgrade": [ + string + ], + "upgrade-insecure-requests": integer, + "vary": [ + string + ], + "via": [ + string + ], + "warning": [ + string + ], + "www-authenticate": [ + string + ], + "x-cache-hits": [ + integer + ], + "x-content-duration": float + } + ] + +Examples: + + $ curl-head | jc --curl-head -p + [] + + $ curl-head | jc --curl-head -p -r + [] +""" +from typing import List, Dict +from jc.jc_types import JSONDictType +import jc.utils +import jc.parsers.http_headers as headers_parser + + +class info(): + """Provides parser metadata (version, author, etc.)""" + version = '1.0' + description = '`curl --head` command parser' + author = 'Kelly Brazil' + author_email = 'kellyjonbrazil@gmail.com' + details = 'Using the http-headers parser.' + compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd'] + tags = ['command', 'standard'] + magic_commands = ['curl'] + + +__version__ = info.version + + +def _remove_extra_chars(data: str, verbose: bool) -> str: + if data.startswith('> ') or data.startswith('< '): + return data[2:] + elif data.startswith('* '): + return '' + elif verbose: + return '' + else: + return data + + +def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]: + """ + Final processing to conform to the schema. + + Parameters: + + proc_data: (List of Dictionaries) raw structured data to process + + Returns: + + List of Dictionaries. Structured to conform to the schema. + """ + return proc_data + + +def parse( + data: str, + raw: bool = False, + quiet: bool = False +) -> List[JSONDictType]: + """ + Main text parsing function + + Parameters: + + data: (string) text data to parse + raw: (boolean) unprocessed output if True + quiet: (boolean) suppress warning messages if True + + Returns: + + List of Dictionaries. Raw or processed structured data. + """ + jc.utils.compatibility(__name__, info.compatible, quiet) + jc.utils.input_type_check(data) + + raw_output: List[Dict] = [] + curl_verbose = False + + if jc.utils.has_data(data): + data_list = data.splitlines() + if data_list[0].startswith('* '): + curl_verbose = True + data_list = [_remove_extra_chars(x, verbose=curl_verbose) for x in data_list] + data_str = '\n'.join(data_list) + raw_output = headers_parser.parse(data_str, raw, quiet) + + return raw_output if raw else _process(raw_output) diff --git a/jc/parsers/http_headers.py b/jc/parsers/http_headers.py new file mode 100644 index 00000000..ea1a5f0c --- /dev/null +++ b/jc/parsers/http_headers.py @@ -0,0 +1,444 @@ +"""jc - JSON Convert HTTP headers parser + +Converts HTTP request and response headers into a list of dictionaries. +Well-known headers are processed to allow multiple instances which are +aggregated into an array along with any comma-separated values. Integer, +float, and datetimes are converted where defined in the specifications. + +https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers +https://datatracker.ietf.org/doc/html/rfc2616 +https://datatracker.ietf.org/doc/html/rfc3229 +https://datatracker.ietf.org/doc/html/rfc7089 +https://datatracker.ietf.org/doc/html/rfc7231 +https://www.rfc-editor.org/rfc/rfc5789 + +If you are converting HTTP headers from `curl` verbose output, use the +`curl-head` parser which will strip the `>` and `<` characters and remove +non-header lines that begin with `*`. + +Usage (cli): + + $ cat headers.txt | jc --http-headers + +Usage (module): + + import jc + result = jc.parse('http_headers', http_headers_output) + +Schema: + + [ + { + "
": string, + "accept": [ + string + ], + "accept-ch": [ + string + ], + "accept-ch-lifetime": integer, + "accept-encoding": [ + string + ], + "accept-language": [ + string + ], + "accept-patch": [ + string + ], + "accept-post": [ + string + ], + "accept-ranges": [ + string + ], + "access-control-allow-headers": [ + string + ], + "access-control-allow-methods": [ + string + ], + "access-control-expose-headers": [ + string + ], + "access-control-max-age": integer, + "access-control-request-headers": [ + string + ], + "age": integer, + "allow": [ + string + ], + "alt-svc": [ + string + ], + "cache-control": [ + string + ], + "clear-site-data": [ + string + ], + "connection": [ + string + ], + "content-encoding": [ + string + ], + "content-dpr": integer, + "content-language": [ + string + ], + "content-length": integer, + "content-security-policy": [ + string + ], + "content-security-policy-report-only": [ + string + ], + "cookie": [ + string + ], + "critical-ch": [ + string + ], + "date": string, + "date_epoch_utc": integer, + "expect-ct": [ + string + ], + "expires": string, + "expires_epoch_utc": integer, + "device-memory": integer, + "downlink": integer, + "dpr": integer, + "forwarded": [ + string + ], + "if-match": [ + string + ], + "if-modified-since": string, + "if-modified-since_epoch_utc": integer, + "if-none-match": [ + string + ], + "if-range": string, + "if-range_epoch_utc": integer, + "if-unmodified-since": string, + "if-unmodified-since_epoch_utc": integer, + "im": [ + string + ], + "keep-alive": [ + string + ], + "large-allocation": integer, + "last-modified": string, + "last-modified_epoch_utc": integer, + "link": [ + string + ], + "max-forwards": integer, + "memento-datetime": string, + "memento-datetime_epoch_utc": integer, + "permissions-policy": [ + string + ], + "pragma": [ + string + ], + "proxy-authenticate": [ + string + ], + "retry-after": string, + "retry-after_epoch_utc": integer, + "rtt": integer, + "sec-ch-ua": [ + string + ], + "sec-ch-ua-full-version-list": [ + string + ], + "server": [ + string + ], + "server-timing": [ + string + ], + "set-cookie": [ + string + ], + "timing-allow-origin": [ + string + ], + "trailer": [ + string + ], + "transfer-encoding": [ + string + ], + "upgrade": [ + string + ], + "upgrade-insecure-requests": integer, + "vary": [ + string + ], + "via": [ + string + ], + "warning": [ + string + ], + "www-authenticate": [ + string + ], + "x-cache-hits": [ + integer + ], + "x-content-duration": float + } + ] + +Examples: + + $ curl-head | jc --curl-head -p + [] + + $ curl-head | jc --curl-head -p -r + [] +""" +from typing import List, Dict +from jc.jc_types import JSONDictType +import jc.utils + + +class info(): + """Provides parser metadata (version, author, etc.)""" + version = '1.0' + description = 'HTTP headers parser' + author = 'Kelly Brazil' + author_email = 'kellyjonbrazil@gmail.com' + compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd'] + tags = ['standard', 'file'] + + +__version__ = info.version + +METHODS = {'connect', 'delete', 'get', 'head', 'options', 'patch', 'post', 'put', 'trace'} + +INT_HEADERS = { + 'accept-ch-lifetime', + 'access-control-max-age', + 'age', + 'content-dpr', + 'content-length', + 'device-memory', + 'downlink', + 'dpr', + 'large-allocation', + 'max-forwards', + 'rtt', + 'upgrade-insecure-requests' +} + +FLOAT_HEADERS = { + 'x-content-duration' +} + +DT_HEADERS = { + 'date', + 'if-modified-since', + 'if-unmodified-since', + 'last-modified', + 'memento-datetime' +} + +DT_OR_INT_HEADERS = { + 'expires', + 'retry-after' +} + +DT_OR_STR_HEADERS = { + 'if-range' +} + +MULTI_HEADERS = { + 'content-security-policy', + 'content-security-policy-report-only', + 'cookie', + 'set-cookie' +} + +SPLIT_AND_MULTI_HEADERS = { + 'accept', + 'accept-ch', + 'accept-encoding', + 'accept-language', + 'accept-patch', + 'accept-post', + 'accept-ranges', + 'access-control-allow-headers', + 'access-control-allow-methods', + 'access-control-expose-headers', + 'access-control-request-headers', + 'allow', + 'alt-svc', + 'cache-control', + 'clear-site-data', + 'connection', + 'content-encoding', + 'content-language', + 'critical-ch', + 'expect-ct', + 'forwarded', + 'if-match', + 'if-none-match', + 'im', + 'keep-alive', + 'link', + 'permissions-policy', + 'pragma', + 'proxy-authenticate', + 'sec-ch-ua', + 'sec-ch-ua-full-version-list', + 'server', + 'server-timing', + 'timing-allow-origin', + 'trailer', + 'transfer-encoding', + 'upgrade', + 'vary', + 'via', + 'warning', + 'www-authenticate', + 'x-cache-hits' +} + +def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]: + """ + Final processing to conform to the schema. + + Parameters: + + proc_data: (List of Dictionaries) raw structured data to process + + Returns: + + List of Dictionaries. Structured to conform to the schema. + """ + for item in proc_data: + + for key in item.copy(): + if key in INT_HEADERS: + item[key] = jc.utils.convert_to_int(item[key]) + + if key in FLOAT_HEADERS: + item[key] = jc.utils.convert_to_float(item[key]) + + if key in DT_HEADERS or key in DT_OR_STR_HEADERS: + item[key + '_epoch_utc'] = jc.utils.timestamp(item[key], format_hint=(3500,)).utc + + if key in DT_OR_INT_HEADERS: + timestamp = jc.utils.timestamp(item[key], format_hint=(3500,)).utc + int_val = jc.utils.convert_to_int(item[key]) + if timestamp: + item[key + '_epoch_utc'] = timestamp + if int_val is not None: + item[key] = int_val + + # special handling + if 'x-cache-hits' in item: + item['x-cache-hits'] = [jc.utils.convert_to_int(val) for val in item['x-cache-hits']] + + return proc_data + + +def parse( + data: str, + raw: bool = False, + quiet: bool = False +) -> List[JSONDictType]: + """ + Main text parsing function + + Parameters: + + data: (string) text data to parse + raw: (boolean) unprocessed output if True + quiet: (boolean) suppress warning messages if True + + Returns: + + List of Dictionaries. Raw or processed structured data. + """ + jc.utils.compatibility(__name__, info.compatible, quiet) + jc.utils.input_type_check(data) + + raw_output: List[Dict] = [] + output_object: Dict = {} + + if jc.utils.has_data(data): + + for line in filter(None, data.splitlines()): + + first_word = line.split(maxsplit=1)[0] + first_word = first_word.rstrip(':') + first_word = first_word.lower() + + if first_word in METHODS: + if output_object: + raw_output.append(output_object) + + method, uri, version = line.split(maxsplit=2) + output_object = {} + output_object['type'] = 'request' + output_object['request_method'] = method + output_object['request_uri'] = uri + output_object['request_version'] = version + continue + + if first_word.startswith('http/'): + if output_object: + raw_output.append(output_object) + + reason = None + version, status, *reason = line.split(maxsplit=2) + output_object = {} + output_object['type'] = 'response' + output_object['response_version'] = version + output_object['response_status'] = int(status) + output_object['response_reason'] = reason or None + continue + + if first_word in SPLIT_AND_MULTI_HEADERS: + key, value = line.split(': ', maxsplit=1) + key = key.lower() + value_list = value.split(',') + value_list = [x.strip() for x in value_list] + if key in output_object: + output_object[key].extend(value_list) + else: + output_object[key] = [] + output_object[key].extend(value_list) + continue + + if first_word in MULTI_HEADERS: + key, value = line.split(': ', maxsplit=1) + key = key.lower() + if key in output_object: + output_object[key].append(value) + else: + output_object[key] = [] + output_object[key].append(value) + continue + + # All other headers + key, value = line.split(': ', maxsplit=1) + key = key.lower() + output_object[key] = value + + if output_object: + raw_output.append(output_object) + + return raw_output if raw else _process(raw_output)