1
0
mirror of https://github.com/kellyjonbrazil/jc.git synced 2025-07-17 01:32:37 +02:00

schema update for normalized, encoded, and decoded fields

This commit is contained in:
Kelly Brazil
2022-07-22 09:28:18 -07:00
parent 163a76dd10
commit 8d42f747a7
2 changed files with 278 additions and 97 deletions

View File

@ -12,10 +12,12 @@ This parser will work with naked and wrapped URL strings:
- `<scheme://host/path>`
- `<URL:scheme://host/path>`
Normalized encoded and decoded versions of the original URL and URL parts
Normalized, Encoded, and Decoded versions of the original URL and URL parts
are included in the output.
> Note: Do not use the encoded fields for a URL that is already encoded.
> Note: Do not use the encoded fields for a URL that has already been
> encoded. Similarly, do not use the decoded fields for a URL that has
> already been decoded.
Usage (cli):
@ -30,33 +32,53 @@ Schema:
{
"url": string,
'url_encoded": string,
"scheme": string,
"scheme_encoded": string,
"netloc": string,
"netloc_encoded": string,
"path": string or null,
"path_encoded": string or null,
"path_list": [ array or null
string
],
"query": string or Null,
"query_encoded": string or Null,
"query": string or null,
"query_obj": { object or null
<query-key>: [ array or null
<query-value> string # [0]
]
},
"fragment": string or null,
"fragment_encoded": string or null,
"username": string or null,
"username_encoded": string or null,
"password": string or null,
"password_encoded": string or null,
"hostname": string or null,
"hostname_encoded": string or null,
"port": integer or null,
"port_encoded": string or null
"encoded": {
"url": string,
"scheme": string,
"netloc": string,
"path": string or null,
"path_list": [ array or null
string
],
"query": string or null,
"fragment": string or null,
"username": string or null,
"password": string or null,
"hostname": string or null,
"port": string or null
},
"decoded": {
"url": string,
"scheme": string,
"netloc": string,
"path": string or null,
"path_list": [ array or null
string
],
"query": string or null,
"fragment": string or null,
"username": string or null,
"password": string or null,
"hostname": string or null,
"port": string or null
}
}
[0] Duplicate query-keys will have their values consolidated into the
@ -68,19 +90,14 @@ Examples:
| jc --url -p
{
"url": "http://example.com/test/path?q1=foo&q1=bar&q2=baz#frag",
"url_encoded": "http://example.com/test/path?q1%3Dfoo%26q1%3Dbar%26q2%3Dbaz#frag",
"scheme": "http",
"scheme_encoded": "http",
"netloc": "example.com",
"netloc_encoded": "example.com",
"path": "/test/path",
"path_encoded": "/test/path",
"path_list": [
"test",
"path"
],
"query": "q1=foo&q1=bar&q2=baz",
"query_encoded": "q1%3Dfoo%26q1%3Dbar%26q2%3Dbaz",
"query_obj": {
"q1": [
"foo",
@ -91,43 +108,90 @@ Examples:
]
},
"fragment": "frag",
"fragment_encoded": "frag",
"username": null,
"username_encoded": null,
"password": null,
"password_encoded": null,
"hostname": "example.com",
"hostname_encoded": "example.com",
"port": null,
"port_encoded": null
"encoded": {
"url": "http://example.com/test/path?q1=foo&q1=bar&q2=baz#frag",
"scheme": "http",
"netloc": "example.com",
"path": "/test/path",
"path_list": [
"test",
"path"
],
"query": "q1=foo&q1=bar&q2=baz",
"fragment": "frag",
"username": null,
"password": null,
"hostname": "example.com",
"port": null
},
"decoded": {
"url": "http://example.com/test/path?q1=foo&q1=bar&q2=baz#frag",
"scheme": "http",
"netloc": "example.com",
"path": "/test/path",
"path_list": [
"test",
"path"
],
"query": "q1=foo&q1=bar&q2=baz",
"fragment": "frag",
"username": null,
"password": null,
"hostname": "example.com",
"port": null
}
}
$ echo "ftp://localhost/filepath" | jc --url -p
{
"url": "ftp://localhost/filepath",
"url_encoded": "ftp://localhost/filepath",
"scheme": "ftp",
"scheme_encoded": "ftp",
"netloc": "localhost",
"netloc_encoded": "localhost",
"path": "/filepath",
"path_encoded": "/filepath",
"path_list": [
"filepath"
],
"query": null,
"query_encoded": null,
"query_obj": null,
"fragment": null,
"fragment_encoded": null,
"username": null,
"username_encoded": null,
"password": null,
"password_encoded": null,
"hostname": "localhost",
"hostname_encoded": "localhost",
"port": null,
"port_encoded": null
"encoded": {
"url": "ftp://localhost/filepath",
"scheme": "ftp",
"netloc": "localhost",
"path": "/filepath",
"path_list": [
"filepath"
],
"query": null,
"fragment": null,
"username": null,
"password": null,
"hostname": "localhost",
"port": null
},
"decoded": {
"url": "ftp://localhost/filepath",
"scheme": "ftp",
"netloc": "localhost",
"path": "/filepath",
"path_list": [
"filepath"
],
"query": null,
"fragment": null,
"username": null,
"password": null,
"hostname": "localhost",
"port": null
}
}
<a id="jc.parsers.url.parse"></a>

View File

@ -7,10 +7,12 @@ This parser will work with naked and wrapped URL strings:
- `<scheme://host/path>`
- `<URL:scheme://host/path>`
Normalized encoded and decoded versions of the original URL and URL parts
Normalized, Encoded, and Decoded versions of the original URL and URL parts
are included in the output.
> Note: Do not use the encoded fields for a URL that is already encoded.
> Note: Do not use the encoded fields for a URL that has already been
> encoded. Similarly, do not use the decoded fields for a URL that has
> already been decoded.
Usage (cli):
@ -25,33 +27,53 @@ Schema:
{
"url": string,
'url_encoded": string,
"scheme": string,
"scheme_encoded": string,
"netloc": string,
"netloc_encoded": string,
"path": string or null,
"path_encoded": string or null,
"path_list": [ array or null
string
],
"query": string or Null,
"query_encoded": string or Null,
"query": string or null,
"query_obj": { object or null
<query-key>: [ array or null
<query-value> string # [0]
]
},
"fragment": string or null,
"fragment_encoded": string or null,
"username": string or null,
"username_encoded": string or null,
"password": string or null,
"password_encoded": string or null,
"hostname": string or null,
"hostname_encoded": string or null,
"port": integer or null,
"port_encoded": string or null
"encoded": {
"url": string,
"scheme": string,
"netloc": string,
"path": string or null,
"path_list": [ array or null
string
],
"query": string or null,
"fragment": string or null,
"username": string or null,
"password": string or null,
"hostname": string or null,
"port": string or null
},
"decoded": {
"url": string,
"scheme": string,
"netloc": string,
"path": string or null,
"path_list": [ array or null
string
],
"query": string or null,
"fragment": string or null,
"username": string or null,
"password": string or null,
"hostname": string or null,
"port": string or null
}
}
[0] Duplicate query-keys will have their values consolidated into the
@ -63,19 +85,14 @@ Examples:
| jc --url -p
{
"url": "http://example.com/test/path?q1=foo&q1=bar&q2=baz#frag",
"url_encoded": "http://example.com/test/path?q1%3Dfoo%26q1%3Dbar%26q2%3Dbaz#frag",
"scheme": "http",
"scheme_encoded": "http",
"netloc": "example.com",
"netloc_encoded": "example.com",
"path": "/test/path",
"path_encoded": "/test/path",
"path_list": [
"test",
"path"
],
"query": "q1=foo&q1=bar&q2=baz",
"query_encoded": "q1%3Dfoo%26q1%3Dbar%26q2%3Dbaz",
"query_obj": {
"q1": [
"foo",
@ -86,43 +103,90 @@ Examples:
]
},
"fragment": "frag",
"fragment_encoded": "frag",
"username": null,
"username_encoded": null,
"password": null,
"password_encoded": null,
"hostname": "example.com",
"hostname_encoded": "example.com",
"port": null,
"port_encoded": null
"encoded": {
"url": "http://example.com/test/path?q1=foo&q1=bar&q2=baz#frag",
"scheme": "http",
"netloc": "example.com",
"path": "/test/path",
"path_list": [
"test",
"path"
],
"query": "q1=foo&q1=bar&q2=baz",
"fragment": "frag",
"username": null,
"password": null,
"hostname": "example.com",
"port": null
},
"decoded": {
"url": "http://example.com/test/path?q1=foo&q1=bar&q2=baz#frag",
"scheme": "http",
"netloc": "example.com",
"path": "/test/path",
"path_list": [
"test",
"path"
],
"query": "q1=foo&q1=bar&q2=baz",
"fragment": "frag",
"username": null,
"password": null,
"hostname": "example.com",
"port": null
}
}
$ echo "ftp://localhost/filepath" | jc --url -p
{
"url": "ftp://localhost/filepath",
"url_encoded": "ftp://localhost/filepath",
"scheme": "ftp",
"scheme_encoded": "ftp",
"netloc": "localhost",
"netloc_encoded": "localhost",
"path": "/filepath",
"path_encoded": "/filepath",
"path_list": [
"filepath"
],
"query": null,
"query_encoded": null,
"query_obj": null,
"fragment": null,
"fragment_encoded": null,
"username": null,
"username_encoded": null,
"password": null,
"password_encoded": null,
"hostname": "localhost",
"hostname_encoded": "localhost",
"port": null,
"port_encoded": null
"encoded": {
"url": "ftp://localhost/filepath",
"scheme": "ftp",
"netloc": "localhost",
"path": "/filepath",
"path_list": [
"filepath"
],
"query": null,
"fragment": null,
"username": null,
"password": null,
"hostname": "localhost",
"port": null
},
"decoded": {
"url": "ftp://localhost/filepath",
"scheme": "ftp",
"netloc": "localhost",
"path": "/filepath",
"path_list": [
"filepath"
],
"query": null,
"fragment": null,
"username": null,
"password": null,
"hostname": "localhost",
"port": null
}
}
"""
import re
@ -226,62 +290,115 @@ def parse(
my_path = None
encoded_path = None
decoded_path = None
path_list = None
encoded_path_list = None
decoded_path_list = None
query_obj = None
encoded_username = None
decoded_username = None
encoded_password = None
decoded_password = None
encoded_hostname = None
decoded_hostname = None
normalized_port = None
encoded_port = None
decoded_port = None
if unquoted_parts.path:
if normalized.path:
# normalize the path by removing any duplicate `/` chars
my_path = re.sub(r'/+', '/', unquoted_parts.path)
my_path = re.sub(r'/+', '/', normalized.path)
encoded_path = re.sub(r'/+', '/', quoted_parts.path)
decoded_path = re.sub(r'/+', '/', unquoted_parts.path)
# remove first '/' and split
path_list = my_path.replace('/', '', 1).split('/')
encoded_path_list = encoded_path.replace('/', '', 1).split('/')
decoded_path_list = decoded_path.replace('/', '', 1).split('/')
if path_list == ['']:
path_list = None
if unquoted_parts.query:
query_obj = parse_qs(unquoted_parts.query)
if encoded_path_list == ['']:
encoded_path_list = None
if unquoted_parts.username:
encoded_username = quote(unquoted_parts.username, safe=NETLOC_SAFE)
if decoded_path_list == ['']:
decoded_path_list = None
if unquoted_parts.password:
encoded_password = quote(unquoted_parts.password, safe=NETLOC_SAFE)
if normalized.query:
query_obj = parse_qs(normalized.query)
if unquoted_parts.hostname:
encoded_hostname = quote(unquoted_parts.hostname, safe=NETLOC_SAFE)
if normalized.username:
encoded_username = quote(normalized.username, safe=NETLOC_SAFE)
decoded_username = unquote(normalized.username)
if unquoted_parts.port:
encoded_port = quote(str(unquoted_parts.port), safe=NETLOC_SAFE)
if normalized.password:
encoded_password = quote(normalized.password, safe=NETLOC_SAFE)
decoded_password = unquote(normalized.password)
if normalized.hostname:
encoded_hostname = quote(normalized.hostname, safe=NETLOC_SAFE)
decoded_hostname = unquote(normalized.hostname)
# handle port differently since an encoded port can cause a ValueError
try:
if normalized.port:
normalized_port = normalized.port
encoded_port = quote(str(normalized.port), safe=NETLOC_SAFE)
decoded_port = unquote(str(normalized.port))
except ValueError:
# Non-integer decoded port values can also cause a ValueError
try:
if unquoted_parts.port:
normalized_port = unquote(str(unquoted_parts.port)) # type: ignore
encoded_port = quote(str(unquoted_parts.port), safe=NETLOC_SAFE)
decoded_port = unquote(str(unquoted_parts.port))
except ValueError:
normalized_port = None
encoded_port = None
decoded_port = None
raw_output = {
'url': unquoted or None,
'url_encoded': quoted or None,
'scheme': unquoted_parts.scheme or None,
'scheme_encoded': quoted_parts.scheme or None,
'netloc': unquoted_parts.netloc or None,
'netloc_encoded': quoted_parts.netloc or None,
'url': normalized.geturl() or None,
'scheme': normalized.scheme or None,
'netloc': normalized.netloc or None,
'path': my_path or None,
'path_encoded': encoded_path or None,
'path_list': path_list or None,
'query': unquoted_parts.query or None,
'query_encoded': quoted_parts.query or None,
'query': normalized.query or None,
'query_obj': query_obj or None,
'fragment': unquoted_parts.fragment or None,
'fragment_encoded': quoted_parts.fragment or None,
'username': unquoted_parts.username or None,
'username_encoded': encoded_username or None,
'password': unquoted_parts.password or None,
'password_encoded': encoded_password or None,
'hostname': unquoted_parts.hostname or None,
'hostname_encoded': encoded_hostname or None,
'port': unquoted_parts.port or None,
'port_encoded': encoded_port or None
'fragment': normalized.fragment or None,
'username': normalized.username or None,
'password': normalized.password or None,
'hostname': normalized.hostname or None,
'port': normalized_port or None,
'encoded': {
'url': quoted or None,
'scheme': quoted_parts.scheme or None,
'netloc': quoted_parts.netloc or None,
'path': encoded_path or None,
'path_list': encoded_path_list or None,
'query': quoted_parts.query or None,
'fragment': quoted_parts.fragment or None,
'username': encoded_username or None,
'password': encoded_password or None,
'hostname': encoded_hostname or None,
'port': encoded_port or None,
},
'decoded': {
'url': unquoted or None,
'scheme': unquoted_parts.scheme or None,
'netloc': unquoted_parts.netloc or None,
'path': decoded_path or None,
'path_list': decoded_path_list or None,
'query': unquoted_parts.query or None,
'fragment': unquoted_parts.fragment or None,
'username': decoded_username or None,
'password': decoded_password or None,
'hostname': decoded_hostname or None,
'port': decoded_port or None,
}
}
return raw_output if raw else _process(raw_output)