add normalized fields

2025-08-06 22:32:54 +02:00 · 2022-07-20 15:43:08 -07:00
parent 720c6b5d42
commit d5a6c22cc6
1 changed files with 55 additions and 14 deletions
--- a/jc/parsers/url.py
+++ b/jc/parsers/url.py
@ -1,5 +1,12 @@
 """jc - JSON Convert URL string parser
 This parser will work with naked and wrapped URL strings:
 - `scheme://host/path`
 - `URL:scheme://host/path`
 - `<scheme://host/path>`
 - `<URL:scheme://host/path>`
 Usage (cli):
    $ echo "http://example.com/test/path?q1=foo&q2=bar#frag" | jc --url
@ -12,12 +19,22 @@ Usage (module):
 Schema:
    {
      "quoted":               string,
      "unquoted":             string,
      "scheme":               string,
      "netloc":               string,
      "path":                 string or null,
-      "query": {              object or null,
+      "path_list": [          array or null
-        <query-key>:          string
+                              string
      ],
      "query": {              object or null
        <query-key>: [        array or null
                              string
        ]
      },
      "query_list": [         array or null
        [                     array
          <query-key>,        string
          <query-value>       string
        ]
      ],
      "fragment":             string or null,
      "username":             string or null,
      "password":             string or null,
@ -48,6 +65,7 @@ Examples:
      "scheme": "ftp",
      "netloc": "localhost",
      "path": "/filepath",
      "path_list": ["filepath"],
      "query": null,
      "fragment": null,
      "username": null,
@ -56,7 +74,11 @@ Examples:
      "port": null
    }
 """
-from urllib.parse import urlparse
+import re
 from urllib.parse import (
    urlsplit, unwrap, parse_qs, parse_qsl, urlunsplit, quote, quote_plus,
    unquote, unquote_plus
 )
 from typing import Dict
 import jc.utils
@ -112,23 +134,42 @@ def parse(
    raw_output: Dict = {}
    if jc.utils.has_data(data):
-        parts = urlparse(data)
+        parts = urlsplit(unwrap(data))
-        query = {}
+        normalized = urlsplit(urlunsplit(parts))
        query_list = []
-        if parts.query:
+        quoted = normalized._replace(path=quote(normalized.path),
-            query_list = parts.query.split('&')
+                                     query=quote_plus(normalized.query)).geturl()
-        if query_list:
+        unquoted = normalized._replace(path=unquote(normalized.path),
-            for q in query_list:
+                                       query=unquote_plus(normalized.query)).geturl()
-                k, v = q.split('=')
+
-                query.update({k: v})
+        unquoted_parts = urlsplit(unwrap(unquoted))
        my_path = None
        path_list = None
        if unquoted_parts.path:
            # normalize the path by removing any duplicate `/` chars
            my_path = re.sub(r'/+', '/', unquoted_parts.path)
            # remove first '/' and split
            path_list = my_path.replace('/', '', 1).split('/')
            if path_list == ['']:
                path_list = None
        if unquoted_parts.query:
            query_obj = parse_qs(unquoted_parts.query)
            query_list = parse_qsl(unquoted_parts.query)
        raw_output = {
            'quoted': quoted or None,
            'unquoted': unquoted or None,
            'scheme': parts.scheme or None,
            'netloc': parts.netloc or None,
-            'path': parts.path or None,
+            'path': my_path or None,
-            'query': query or None,
+            'path_list': path_list or None,
            'query': query_obj or None,
            'query_list': query_list or None,
            'fragment': parts.fragment or None,
            'username': parts.username,
            'password': parts.password,