From 0a462978b731770d1290187a502706c2322f5f91 Mon Sep 17 00:00:00 2001
From: Kelly Brazil <kellyjonbrazil@gmail.com>
Date: Wed, 23 Mar 2022 15:08:33 -0700
Subject: [PATCH] fix for special characters in headers

---
 jc/parsers/asciitable.py   | 25 ++++++++++++++++++++--
 jc/parsers/asciitable_m.py |  8 ++++++-
 tests/test_asciitable.py   | 43 ++++++++++++++++++++++++++++++++++++++
 tests/test_asciitable_m.py | 28 +++++++++++++++++++++++++
 4 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/jc/parsers/asciitable.py b/jc/parsers/asciitable.py
index 3c21fa60..9346278c 100644
--- a/jc/parsers/asciitable.py
+++ b/jc/parsers/asciitable.py
@@ -211,7 +211,11 @@ def _is_separator(line: str) -> bool:
 
 
 def _snake_case(line: str) -> str:
-    """replace spaces between words with an underscore and set to lowercase"""
+    """
+    replace every character that is not an ASCII letter, digit, or space,
+    and each single space between words, with an underscore; then lowercase
+    """
+    line = re.sub(r'[^a-zA-Z0-9 ]', '_', line)
     return re.sub(r'\b \b', '_', line).lower()
 
 
@@ -246,6 +250,22 @@ def _normalize_rows(table: str) -> List[str]:
     return result
 
 
+def _fixup_headers(table: List[Dict]) -> List[Dict]:
+    """
+    return a copy of the table with consecutive underscores in each key
+    collapsed to one and any trailing underscores removed
+    """
+    new_table = []
+    for row in table:
+        new_row = {}
+        for k, v in row.items():
+            # collapse runs of underscores, then strip trailing ones
+            k_new = re.sub(r'__+', '_', k).rstrip('_')
+            new_row[k_new] = v
+        new_table.append(new_row)
+    return new_table
+
+
 def parse(
     data: str,
     raw: bool = False,
@@ -273,6 +293,7 @@ def parse(
         data = _remove_ansi(data)
         data = _strip(data)
         data_list = _normalize_rows(data)
-        raw_output = sparse_table_parse(data_list)
+        raw_table = sparse_table_parse(data_list)
+        raw_output = _fixup_headers(raw_table)
 
     return raw_output if raw else _process(raw_output)
diff --git a/jc/parsers/asciitable_m.py b/jc/parsers/asciitable_m.py
index 5f51b819..6ad45640 100644
--- a/jc/parsers/asciitable_m.py
+++ b/jc/parsers/asciitable_m.py
@@ -222,7 +222,12 @@ def _is_separator(line: str) -> bool:
 
 
 def _snake_case(line: str) -> str:
-    """replace spaces between words with an underscore and set to lowercase"""
+    """
+    replace special characters (not ASCII alphanumeric, space, or column
+    separator) and spaces between words with an underscore and lowercase
+    """
+    # all column separator characters must be listed here so they are kept
+    line = re.sub(r'[^a-zA-Z0-9 |│┃┆┇┊┋╎╏║]', '_', line)
     return re.sub(r'\b \b', '_', line).lower()
 
 
@@ -360,6 +365,7 @@ def _collapse_headers(table: List[List[str]]) -> List[str]:
         for i, header in enumerate(line):
             if header:
                 new_header = result[i] + '_' + header
+                # remove consecutive underscores
                 new_header = re.sub(r'__+', '_', new_header)
                 new_line.append(new_header)
             else:
diff --git a/tests/test_asciitable.py b/tests/test_asciitable.py
index a8cdc14f..e8e9967a 100644
--- a/tests/test_asciitable.py
+++ b/tests/test_asciitable.py
@@ -301,6 +301,49 @@ class MyTests(unittest.TestCase):
 
         self.assertEqual(jc.parsers.asciitable.parse(input, quiet=True), expected)
 
+    def test_asciitable_special_chars_in_header(self):
+        """
+        Test 'asciitable' with a pure ASCII table that has special
+        characters in the header, e.g. 'Age (min)'. These should be
+        converted to underscores, and no trailing or consecutive
+        underscores should end up in the resulting key names ('age_min').
+        """
+        input = '''
+Protocol  Address     Age (min)  Hardware Addr   Type   Interface
+Internet  10.12.13.1        98   0950.5785.5cd1  ARPA   FastEthernet2.13
+Internet  10.12.13.3       131   0150.7685.14d5  ARPA   GigabitEthernet2.13
+Internet  10.12.13.4       198   0950.5C8A.5c41  ARPA   GigabitEthernet2.17
+        '''
+
+        expected = [
+            {
+                "protocol": "Internet",
+                "address": "10.12.13.1",
+                "age_min": "98",
+                "hardware_addr": "0950.5785.5cd1",
+                "type": "ARPA",
+                "interface": "FastEthernet2.13"
+            },
+            {
+                "protocol": "Internet",
+                "address": "10.12.13.3",
+                "age_min": "131",
+                "hardware_addr": "0150.7685.14d5",
+                "type": "ARPA",
+                "interface": "GigabitEthernet2.13"
+            },
+            {
+                "protocol": "Internet",
+                "address": "10.12.13.4",
+                "age_min": "198",
+                "hardware_addr": "0950.5C8A.5c41",
+                "type": "ARPA",
+                "interface": "GigabitEthernet2.17"
+            }
+        ]
+
+        self.assertEqual(jc.parsers.asciitable.parse(input, quiet=True), expected)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_asciitable_m.py b/tests/test_asciitable_m.py
index d2c6f483..0b8b0aa9 100644
--- a/tests/test_asciitable_m.py
+++ b/tests/test_asciitable_m.py
@@ -242,6 +242,34 @@ class MyTests(unittest.TestCase):
 
         self.assertEqual(jc.parsers.asciitable_m.parse(input, quiet=True), expected)
 
+    def test_asciitable_m_special_chars_in_header(self):
+        """
+        Test 'asciitable_m' with a pure ASCII table that has special
+        characters in a multiline header ('Age (min)' over 'of int').
+        These should be converted to underscores, and no trailing or
+        consecutive underscores should end up in the key ('age_min_of_int').
+        """
+        input = '''
++----------+------------+-----------+----------------+-------+--------------------+
+| Protocol | Address    | Age (min) | Hardware Addr  | Type  | Interface          |
+|          |            | of int    |                |       |                    |
++----------+------------+-----------+----------------+-------+--------------------+
+| Internet | 10.12.13.1 |       98  | 0950.5785.5cd1 | ARPA  | FastEthernet2.13   |
++----------+------------+-----------+----------------+-------+--------------------+
+        '''
+        expected = [
+            {
+                "protocol": "Internet",
+                "address": "10.12.13.1",
+                "age_min_of_int": "98",
+                "hardware_addr": "0950.5785.5cd1",
+                "type": "ARPA",
+                "interface": "FastEthernet2.13"
+            }
+        ]
+
+        self.assertEqual(jc.parsers.asciitable_m.parse(input, quiet=True), expected)
+
     def test_asciitable_m_markdown(self):
         """
         Test 'asciitable_m' with a markdown table. Should raise a ParseError