add http_headers and curl_head parsers

2025-08-06 22:32:54 +02:00 · 2024-02-04 11:17:01 -08:00
parent 29689f7603
commit 39f43aad63
3 changed files with 731 additions and 0 deletions
--- a/jc/lib.py
+++ b/jc/lib.py
@ -33,6 +33,7 @@ parsers: List[str] = [
    'crontab-u',
    'csv',
    'csv-s',
    'curl-head',
    'date',
    'datetime-iso',
    'debconf-show',
@ -62,6 +63,7 @@ parsers: List[str] = [
    'history',
    'host',
    'hosts',
    'http-headers',
    'id',
    'ifconfig',
    'ini',
--- a/jc/parsers/curl_head.py
+++ b/jc/parsers/curl_head.py
@ -0,0 +1,285 @@
 """jc - JSON Convert `curl --head` command output parser
 This parser converts standard and verbose `curl --head` output.
 When converting verbose output from `curl` (to gather request headers, for
 example) you will need to redirect STDERR to STDOUT with `2>&1`. The magic
 syntax will not work in this case.
 Usage (cli):
    $ curl --head www.example.com | jc --curl-head
    $ curl -Ivs www.example.com 2>&1 | jc --curl-head
 or
    $ jc curl --head www.example.com
 Usage (module):
    import jc
    result = jc.parse('curl_head', curl_head_command_output)
 Schema:
    [
      {
        "<header>":                             string,
        "accept": [
                                                string
        ],
        "accept-ch": [
                                                string
        ],
        "accept-ch-lifetime":                   integer,
        "accept-encoding": [
                                                string
        ],
        "accept-language": [
                                                string
        ],
        "accept-patch": [
                                                string
        ],
        "accept-post": [
                                                string
        ],
        "accept-ranges": [
                                                string
        ],
        "access-control-allow-headers": [
                                                string
        ],
        "access-control-allow-methods": [
                                                string
        ],
        "access-control-expose-headers": [
                                                string
        ],
        "access-control-max-age":               integer,
        "access-control-request-headers": [
                                                string
        ],
        "age":                                  integer,
        "allow": [
                                                string
        ],
        "alt-svc": [
                                                string
        ],
        "cache-control": [
                                                string
        ],
        "clear-site-data": [
                                                string
        ],
        "connection": [
                                                string
        ],
        "content-encoding": [
                                                string
        ],
        "content-dpr":                          integer,
        "content-language": [
                                                string
        ],
        "content-length":                       integer,
        "content-security-policy": [
                                                string
        ],
        "content-security-policy-report-only": [
                                                string
        ],
        "cookie": [
                                                string
        ],
        "critical-ch": [
                                                string
        ],
        "date":                                 string,
        "date_epoch_utc":                       integer,
        "expect-ct": [
                                                string
        ],
        "expires":                              string,
        "expires_epoch_utc":                    integer,
        "device-memory":                        integer,
        "downlink":                             integer,
        "dpr":                                  integer,
        "forwarded": [
                                                string
        ],
        "if-match": [
                                                string
        ],
        "if-modified-since":                    string,
        "if-modified-since_epoch_utc":          integer,
        "if-none-match": [
                                                string
        ],
        "if-range":                             string,
        "if-range_epoch_utc":                   integer,
        "if-unmodified-since":                  string,
        "if-unmodified-since_epoch_utc":        integer,
        "im": [
                                                string
        ],
        "keep-alive": [
                                                string
        ],
        "large-allocation":                     integer,
        "last-modified":                        string,
        "last-modified_epoch_utc":              integer,
        "link": [
                                                string
        ],
        "max-forwards":                         integer,
        "memento-datetime":                     string,
        "memento-datetime_epoch_utc":           integer,
        "permissions-policy": [
                                                string
        ],
        "pragma": [
                                                string
        ],
        "proxy-authenticate": [
                                                string
        ],
        "retry-after":                          string,
        "retry-after_epoch_utc":                integer,
        "rtt":                                  integer,
        "sec-ch-ua": [
                                                string
        ],
        "sec-ch-ua-full-version-list": [
                                                string
        ],
        "server": [
                                                string
        ],
        "server-timing": [
                                                string
        ],
        "set-cookie": [
                                                string
        ],
        "timing-allow-origin": [
                                                string
        ],
        "trailer": [
                                                string
        ],
        "transfer-encoding": [
                                                string
        ],
        "upgrade": [
                                                string
        ],
        "upgrade-insecure-requests":            integer,
        "vary": [
                                                string
        ],
        "via": [
                                                string
        ],
        "warning": [
                                                string
        ],
        "www-authenticate": [
                                                string
        ],
        "x-cache-hits": [
                                                integer
        ],
        "x-content-duration":                   float
      }
    ]
 Examples:
    $ curl --head www.example.com | jc --curl-head -p
    []
    $ curl --head www.example.com | jc --curl-head -p -r
    []
 """
 from typing import List, Dict
 from jc.jc_types import JSONDictType
 import jc.utils
 import jc.parsers.http_headers as headers_parser
 class info:
    """Static metadata describing the `curl --head` parser."""
    # Parser release and one-line summary.
    version = '1.0'
    description = '`curl --head` command parser'

    # Maintainer contact details.
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'
    details = 'Using the http-headers parser.'

    # Platform identifiers and classification tags declared for this parser.
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
    tags = ['command', 'standard']

    # Command name(s) associated with this parser (see the `jc curl ...`
    # usage form in the module docstring).
    magic_commands = ['curl']


 # Module version mirrors the parser metadata above.
 __version__ = info.version
 def _remove_extra_chars(data: str, verbose: bool) -> str:
    """
    Reduce one line of `curl` output to bare header text.

    Lines starting with `> ` or `< ` lose that two-character marker. Lines
    starting with `* ` become an empty string. Any other line becomes an
    empty string when `verbose` is True and is returned unchanged otherwise.

    Parameters:

        data:        (string)  a single line of `curl` output
        verbose:     (boolean) True when the output is verbose `curl` output

    Returns:

        String. The cleaned line, possibly empty.
    """
    # Header lines: drop the direction marker and keep the rest.
    if data.startswith(('> ', '< ')):
        return data[2:]

    # `* ` lines are always blanked; in verbose output every remaining
    # unmarked line is blanked as well.
    if verbose or data.startswith('* '):
        return ''

    return data
 def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]:
    """
    Post-processing hook that hands the data back unchanged.

    Parameters:

        proc_data:   (List of Dictionaries) structured data to process

    Returns:

        List of Dictionaries. The same list object that was passed in.
    """
    # No conversions are applied in this module.
    processed = proc_data
    return processed
 def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
 ) -> List[JSONDictType]:
    """
    Main text parsing function

    Cleans each line of `curl --head` output with `_remove_extra_chars` and
    passes the re-joined text to the http-headers parser.

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    parsed: List[Dict] = []

    if jc.utils.has_data(data):
        lines = data.splitlines()

        # Output is treated as verbose when its first line starts with `* `.
        is_verbose = lines[0].startswith('* ')

        cleaned_text = '\n'.join(
            [_remove_extra_chars(line, verbose=is_verbose) for line in lines]
        )
        parsed = headers_parser.parse(cleaned_text, raw, quiet)

    if raw:
        return parsed

    return _process(parsed)
--- a/jc/parsers/http_headers.py
+++ b/jc/parsers/http_headers.py
@ -0,0 +1,444 @@
 """jc - JSON Convert HTTP headers parser
 Converts HTTP request and response headers into a list of dictionaries.
 Well-known headers are processed to allow multiple instances which are
 aggregated into an array along with any comma-separated values. Integer,
 float, and datetimes are converted where defined in the specifications.
 https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers
 https://datatracker.ietf.org/doc/html/rfc2616
 https://datatracker.ietf.org/doc/html/rfc3229
 https://datatracker.ietf.org/doc/html/rfc7089
 https://datatracker.ietf.org/doc/html/rfc7231
 https://www.rfc-editor.org/rfc/rfc5789
 If you are converting HTTP headers from `curl` verbose output, use the
 `curl-head` parser which will strip the `>` and `<` characters and remove
 non-header lines that begin with `*`.
 Usage (cli):
    $ cat headers.txt | jc --http-headers
 Usage (module):
    import jc
    result = jc.parse('http_headers', http_headers_output)
 Schema:
    [
      {
        "<header>":                             string,
        "accept": [
                                                string
        ],
        "accept-ch": [
                                                string
        ],
        "accept-ch-lifetime":                   integer,
        "accept-encoding": [
                                                string
        ],
        "accept-language": [
                                                string
        ],
        "accept-patch": [
                                                string
        ],
        "accept-post": [
                                                string
        ],
        "accept-ranges": [
                                                string
        ],
        "access-control-allow-headers": [
                                                string
        ],
        "access-control-allow-methods": [
                                                string
        ],
        "access-control-expose-headers": [
                                                string
        ],
        "access-control-max-age":               integer,
        "access-control-request-headers": [
                                                string
        ],
        "age":                                  integer,
        "allow": [
                                                string
        ],
        "alt-svc": [
                                                string
        ],
        "cache-control": [
                                                string
        ],
        "clear-site-data": [
                                                string
        ],
        "connection": [
                                                string
        ],
        "content-encoding": [
                                                string
        ],
        "content-dpr":                          integer,
        "content-language": [
                                                string
        ],
        "content-length":                       integer,
        "content-security-policy": [
                                                string
        ],
        "content-security-policy-report-only": [
                                                string
        ],
        "cookie": [
                                                string
        ],
        "critical-ch": [
                                                string
        ],
        "date":                                 string,
        "date_epoch_utc":                       integer,
        "expect-ct": [
                                                string
        ],
        "expires":                              string,
        "expires_epoch_utc":                    integer,
        "device-memory":                        integer,
        "downlink":                             integer,
        "dpr":                                  integer,
        "forwarded": [
                                                string
        ],
        "if-match": [
                                                string
        ],
        "if-modified-since":                    string,
        "if-modified-since_epoch_utc":          integer,
        "if-none-match": [
                                                string
        ],
        "if-range":                             string,
        "if-range_epoch_utc":                   integer,
        "if-unmodified-since":                  string,
        "if-unmodified-since_epoch_utc":        integer,
        "im": [
                                                string
        ],
        "keep-alive": [
                                                string
        ],
        "large-allocation":                     integer,
        "last-modified":                        string,
        "last-modified_epoch_utc":              integer,
        "link": [
                                                string
        ],
        "max-forwards":                         integer,
        "memento-datetime":                     string,
        "memento-datetime_epoch_utc":           integer,
        "permissions-policy": [
                                                string
        ],
        "pragma": [
                                                string
        ],
        "proxy-authenticate": [
                                                string
        ],
        "retry-after":                          string,
        "retry-after_epoch_utc":                integer,
        "rtt":                                  integer,
        "sec-ch-ua": [
                                                string
        ],
        "sec-ch-ua-full-version-list": [
                                                string
        ],
        "server": [
                                                string
        ],
        "server-timing": [
                                                string
        ],
        "set-cookie": [
                                                string
        ],
        "timing-allow-origin": [
                                                string
        ],
        "trailer": [
                                                string
        ],
        "transfer-encoding": [
                                                string
        ],
        "upgrade": [
                                                string
        ],
        "upgrade-insecure-requests":            integer,
        "vary": [
                                                string
        ],
        "via": [
                                                string
        ],
        "warning": [
                                                string
        ],
        "www-authenticate": [
                                                string
        ],
        "x-cache-hits": [
                                                integer
        ],
        "x-content-duration":                   float
      }
    ]
 Examples:
    $ cat headers.txt | jc --http-headers -p
    []
    $ cat headers.txt | jc --http-headers -p -r
    []
 """
 from typing import List, Dict
 from jc.jc_types import JSONDictType
 import jc.utils
 class info:
    """Static metadata describing the HTTP headers parser."""
    # Parser release and one-line summary.
    version = '1.0'
    description = 'HTTP headers parser'

    # Maintainer contact details.
    author = 'Kelly Brazil'
    author_email = 'kellyjonbrazil@gmail.com'

    # Platform identifiers and classification tags declared for this parser.
    compatible = ['linux', 'darwin', 'cygwin', 'win32', 'aix', 'freebsd']
    tags = ['standard', 'file']


 # Module version mirrors the parser metadata above.
 __version__ = info.version
 # Lowercased first words that mark a line as an HTTP request line.
 METHODS = {
    'connect',
    'delete',
    'get',
    'head',
    'options',
    'patch',
    'post',
    'put',
    'trace',
 }

 # Headers whose value is converted to an integer.
 INT_HEADERS = {
    'accept-ch-lifetime',
    'access-control-max-age',
    'age',
    'content-dpr',
    'content-length',
    'device-memory',
    'downlink',
    'dpr',
    'large-allocation',
    'max-forwards',
    'rtt',
    'upgrade-insecure-requests',
 }

 # Headers whose value is converted to a float.
 FLOAT_HEADERS = {
    'x-content-duration',
 }

 # Headers that get a companion `<name>_epoch_utc` timestamp field.
 DT_HEADERS = {
    'date',
    'if-modified-since',
    'if-unmodified-since',
    'last-modified',
    'memento-datetime',
 }

 # Headers that are tried both as a timestamp and as an integer.
 DT_OR_INT_HEADERS = {
    'expires',
    'retry-after',
 }

 # Headers processed the same way as DT_HEADERS during final processing.
 DT_OR_STR_HEADERS = {
    'if-range',
 }

 # Headers collected into a list, one entry per header line.
 MULTI_HEADERS = {
    'content-security-policy',
    'content-security-policy-report-only',
    'cookie',
    'set-cookie',
 }

 # Headers collected into a list after splitting each value on commas.
 SPLIT_AND_MULTI_HEADERS = {
    'accept',
    'accept-ch',
    'accept-encoding',
    'accept-language',
    'accept-patch',
    'accept-post',
    'accept-ranges',
    'access-control-allow-headers',
    'access-control-allow-methods',
    'access-control-expose-headers',
    'access-control-request-headers',
    'allow',
    'alt-svc',
    'cache-control',
    'clear-site-data',
    'connection',
    'content-encoding',
    'content-language',
    'critical-ch',
    'expect-ct',
    'forwarded',
    'if-match',
    'if-none-match',
    'im',
    'keep-alive',
    'link',
    'permissions-policy',
    'pragma',
    'proxy-authenticate',
    'sec-ch-ua',
    'sec-ch-ua-full-version-list',
    'server',
    'server-timing',
    'timing-allow-origin',
    'trailer',
    'transfer-encoding',
    'upgrade',
    'vary',
    'via',
    'warning',
    'www-authenticate',
    'x-cache-hits',
 }
 def _process(proc_data: List[JSONDictType]) -> List[JSONDictType]:
    """
    Final processing to conform to the schema.

    Converts integer and float headers, adds `<header>_epoch_utc` fields for
    date headers, and converts every `x-cache-hits` entry to an integer.

    Parameters:

        proc_data:   (List of Dictionaries) raw structured data to process

    Returns:

        List of Dictionaries. Structured to conform to the schema. The
        dictionaries are modified in place.
    """
    for item in proc_data:
        # Iterate over a snapshot: `_epoch_utc` keys are added to `item`
        # inside the loop.
        for key, value in item.copy().items():
            if key in INT_HEADERS:
                item[key] = jc.utils.convert_to_int(value)

            elif key in FLOAT_HEADERS:
                item[key] = jc.utils.convert_to_float(value)

            elif key in DT_HEADERS or key in DT_OR_STR_HEADERS:
                item[key + '_epoch_utc'] = jc.utils.timestamp(value, format_hint=(3500,)).utc

            elif key in DT_OR_INT_HEADERS:
                epoch = jc.utils.timestamp(value, format_hint=(3500,)).utc

                if epoch:
                    # The value is a date: keep the original string and only
                    # add the epoch field.
                    item[key + '_epoch_utc'] = epoch
                else:
                    # Only attempt the integer conversion when the value did
                    # not parse as a date, so a date string is never replaced.
                    # NOTE(review): jc.utils.convert_to_int is understood to
                    # discard non-numeric characters before converting, which
                    # would turn a date into a meaningless number - confirm.
                    int_val = jc.utils.convert_to_int(value)
                    if int_val is not None:
                        item[key] = int_val

        # special handling: this header is stored as a list of values
        if 'x-cache-hits' in item:
            item['x-cache-hits'] = [jc.utils.convert_to_int(val) for val in item['x-cache-hits']]

    return proc_data
 def _split_header_line(line: str) -> tuple:
    """
    Split one header line into a lowercased name and its value.

    The separator is the first colon found after the first character, so a
    name that itself begins with a colon is kept whole. A single space
    directly after the separator is dropped; the rest of the value is
    returned untouched. The space is optional, so `Name:value` and `Name:`
    are accepted.

    Parameters:

        line:        (string)  a non-blank header line

    Returns:

        Tuple of (name, value) strings.

    Raises:

        ValueError when no colon separates a name from a value.
    """
    colon_idx = line.find(':', 1)
    if colon_idx == -1:
        raise ValueError(f'Unable to parse HTTP header line: {line!r}')

    value = line[colon_idx + 1:]
    if value.startswith(' '):
        value = value[1:]

    return line[:colon_idx].lower(), value


 def parse(
    data: str,
    raw: bool = False,
    quiet: bool = False
 ) -> List[JSONDictType]:
    """
    Main text parsing function

    Each request line or response status line starts a new output object.
    The header lines that follow are added to that object until the next
    request or status line is found.

    Parameters:

        data:        (string)  text data to parse
        raw:         (boolean) unprocessed output if True
        quiet:       (boolean) suppress warning messages if True

    Returns:

        List of Dictionaries. Raw or processed structured data.

    Raises:

        ValueError for a request or status line with too few fields, a
        non-numeric status code, or a header line without a colon.
    """
    jc.utils.compatibility(__name__, info.compatible, quiet)
    jc.utils.input_type_check(data)

    raw_output: List[Dict] = []
    output_object: Dict = {}

    if jc.utils.has_data(data):
        for line in data.splitlines():
            # split() yields nothing for empty and whitespace-only lines,
            # so both are skipped here.
            words = line.split(maxsplit=1)
            if not words:
                continue

            # The first word is compared as-is (no colon stripping) so that a
            # header named like a method, e.g. `Get: ...`, is not mistaken
            # for a request line.
            first_word = words[0].lower()

            if first_word in METHODS:
                if output_object:
                    raw_output.append(output_object)

                method, uri, version = line.split(maxsplit=2)
                output_object = {
                    'type': 'request',
                    'request_method': method,
                    'request_uri': uri,
                    'request_version': version
                }
                continue

            if first_word.startswith('http/'):
                if output_object:
                    raw_output.append(output_object)

                # `reason` is a list: empty when no reason phrase is present,
                # otherwise a single-element list holding the phrase.
                version, status, *reason = line.split(maxsplit=2)
                output_object = {
                    'type': 'response',
                    'response_version': version,
                    'response_status': int(status),
                    'response_reason': reason or None
                }
                continue

            key, value = _split_header_line(line)

            if key in SPLIT_AND_MULTI_HEADERS:
                # NOTE(review): a plain comma split also splits commas that
                # sit inside quoted strings in the value.
                value_list = [x.strip() for x in value.split(',')]
                output_object.setdefault(key, []).extend(value_list)

            elif key in MULTI_HEADERS:
                output_object.setdefault(key, []).append(value)

            else:
                # All other headers
                output_object[key] = value

    if output_object:
        raw_output.append(output_object)

    return raw_output if raw else _process(raw_output)