add line_slice function

2026-06-19 22:28:17 +02:00 · 2024-01-05 15:05:59 -08:00
parent 3a95407161
commit c97a8fb48f
3 changed files with 106 additions and 1 deletions
@@ -11,6 +11,7 @@
  * [convert\_to\_bool](#jc.utils.convert_to_bool)
  * [convert\_size\_to\_int](#jc.utils.convert_size_to_int)
  * [input\_type\_check](#jc.utils.input_type_check)
+  * [line\_slice](#jc.utils.line_slice)
  * [timestamp](#jc.utils.timestamp)
    * [\_\_init\_\_](#jc.utils.timestamp.__init__)

@@ -231,6 +232,35 @@ def input_type_check(data: object) -> None

 Ensure input data is a string. Raises `TypeError` if not.

+<a id="jc.utils.line_slice"></a>
+
+### line\_slice
+
+```python
+def line_slice(data: Union[str, Iterable],
+               slice_start: Optional[int] = None,
+               slice_end: Optional[int] = None) -> Union[str, Iterable]
+```
+
+Slice input data by lines - lazily, if possible.
+
+Accepts a string (for normal parsers) or an iterable (for streaming
+parsers). Uses normal start/stop slicing values, but will always slice
+on lines instead of characters. Positive slices will use less memory as
+the function will attempt to lazily iterate over the input. A negative
+slice parameter will force the function to read in all of the data and
+then slice, which will use more memory.
+
+Parameters:
+
+    data:              (string or iterable) - input to slice by lines
+    slice_start:       (int) - starting line
+    slice_end:         (int) - ending line
+
+Returns:
+    string if input is a string.
+    iterable of strings if input is an iterable (for streaming parsers)
+
 <a id="jc.utils.timestamp"></a>

 ### timestamp Objects
@@ -3,6 +3,7 @@ import sys
 import re
 import locale
 import shutil
+from itertools import islice
 from collections import namedtuple
 from numbers import Number
 from datetime import datetime, timezone
@@ -393,6 +394,80 @@ def input_type_check(data: object) -> None:
        raise TypeError("Input data must be a 'str' object.")


+def _lazy_splitlines(text: str) -> Iterable[str]:
+    NEWLINES_PATTERN: str = r'(\r\n|\r|\n)'
+    NEWLINES_RE = re.compile(NEWLINES_PATTERN)
+    start = 0
+    for m in NEWLINES_RE.finditer(text):
+        begin, end = m.span()
+        if begin != start:
+            yield text[start:begin]
+        start = end
+
+    if text[start:]:
+        yield text[start:]
+
+
+def line_slice(
+        data: Union[str, Iterable],
+        slice_start: Optional[int] = None,
+        slice_end: Optional[int] = None
+) -> Union[str, Iterable]:
+    """
+    Slice input data by lines - lazily, if possible.
+
+    Accepts a string (for normal parsers) or an iterable (for streaming
+    parsers). Uses normal start/stop slicing values, but will always slice
+    on lines instead of characters. Positive slices will use less memory as
+    the function will attempt to lazily iterate over the input. A negative
+    slice parameter will force the function to read in all of the data and
+    then slice, which will use more memory.
+
+    Parameters:
+
+        data:              (string or iterable) - input to slice by lines
+        slice_start:       (int) - starting line
+        slice_end:         (int) - ending line
+
+    Returns:
+        string if input is a string.
+        iterable of strings if input is an iterable (for streaming parsers)
+    """
+    if not slice_start is None or not slice_end is None:
+        # standard parsers UTF-8 input
+        if isinstance(data, str):
+            data_iter = _lazy_splitlines(data)
+
+            # positive slices
+            if (slice_start is None or slice_start >= 0) \
+                and (slice_end is None or slice_end >= 0):
+
+                return '\n'.join(islice(data_iter, slice_start, slice_end))
+
+            # negative slices found (non-lazy, uses more memory)
+            else:
+                return '\n'.join(list(data_iter)[slice_start:slice_end])
+
+        # standard parsers bytes input
+        elif isinstance(data, bytes):
+            raise ValueError('Cannot slice bytes data.')
+
+        # streaming parsers UTF-8 input
+        else:
+            # positive slices
+            if (slice_start is None or slice_start >= 0) \
+                and (slice_end is None or slice_end >= 0) \
+                and data:
+
+                return islice(data, slice_start, slice_end)
+
+            # negative slices found (non-lazy, uses more memory)
+            elif data:
+                return list(data)[slice_start:slice_end]
+
+    return data
+
+
 class timestamp:
    __slots__ = ('string', 'format', 'naive', 'utc', 'iso')

@@ -1,4 +1,4 @@
-.TH jc 1 2024-01-04 1.24.1 "JSON Convert"
+.TH jc 1 2024-01-05 1.24.1 "JSON Convert"
 .SH NAME
 \fBjc\fP \- JSON Convert JSONifies the output of many CLI tools, file-types,
 and strings