add convert_size_to_int function

2025-08-06 22:32:54 +02:00 · 2023-12-09 11:41:06 -08:00
parent 356857f5d6
commit 347097a294
2 changed files with 154 additions and 0 deletions
--- a/docs/utils.md
+++ b/docs/utils.md
@ -9,6 +9,7 @@
  * [convert\_to\_int](#jc.utils.convert_to_int)
  * [convert\_to\_float](#jc.utils.convert_to_float)
  * [convert\_to\_bool](#jc.utils.convert_to_bool)
+  * [convert\_size\_to\_int](#jc.utils.convert_size_to_int)
  * [input\_type\_check](#jc.utils.input_type_check)
  * [timestamp](#jc.utils.timestamp)
    * [\_\_init\_\_](#jc.utils.timestamp.__init__)
@ -178,6 +179,47 @@ Returns:
    True/False      False unless a 'truthy' number or string is found
                    ('y', 'yes', 'true', '1', 1, -1, etc.)

+<a id="jc.utils.convert_size_to_int"></a>
+
+### convert\_size\_to\_int
+
+```python
+def convert_size_to_int(size: str, binary: bool = False) -> Optional[int]
+```
+
+Parse a human readable data size and return the number of bytes.
+
+Parameters:
+
+    size:           (string) The human readable file size to parse.
+    binary:         (boolean) `True` to use binary multiples of bytes
+                    (base-2) for ambiguous unit symbols and names,
+                    `False` to use decimal multiples of bytes (base-10).
+Returns:
+    integer/None    Integer if successful conversion, otherwise None
+
+This function knows how to parse sizes in bytes, kilobytes, megabytes,
+gigabytes, terabytes and petabytes. Some examples:
+
+>>> convert_size_to_int('42')
+42
+>>> convert_size_to_int('13b')
+13
+>>> convert_size_to_int('5 bytes')
+5
+>>> convert_size_to_int('1 KB')
+1000
+>>> convert_size_to_int('1 kilobyte')
+1000
+>>> convert_size_to_int('1 KiB')
+1024
+>>> convert_size_to_int('1 KB', binary=True)
+1024
+>>> convert_size_to_int('1.5 GB')
+1500000000
+>>> convert_size_to_int('1.5 GB', binary=True)
+1610612736
+
 <a id="jc.utils.input_type_check"></a>

 ### input\_type\_check
--- a/jc/utils.py
+++ b/jc/utils.py
@ -3,6 +3,8 @@ import sys
 import re
 import locale
 import shutil
+from collections import namedtuple
+from numbers import Number
 from datetime import datetime, timezone
 from textwrap import TextWrapper
 from functools import lru_cache
@ -274,6 +276,116 @@ def convert_to_bool(value: object) -> bool:
    return False


+# convert_size_to_int from https://github.com/xolox/python-humanfriendly
+
+# Copyright (c) 2021 Peter Odding
+
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+def convert_size_to_int(size: str, binary: bool = False) -> Optional[int]:
+    """
+    Parse a human readable data size and return the number of bytes.
+
+    Parameters:
+
+        size:           (string) The human readable file size to parse.
+        binary:         (boolean) `True` to use binary multiples of bytes
+                        (base-2) for ambiguous unit symbols and names,
+                        `False` to use decimal multiples of bytes (base-10).
+    Returns:
+        integer/None    Integer if successful conversion, otherwise None
+
+    This function knows how to parse sizes in bytes, kilobytes, megabytes,
+    gigabytes, terabytes and petabytes. Some examples:
+
+    >>> convert_size_to_int('42')
+    42
+    >>> convert_size_to_int('13b')
+    13
+    >>> convert_size_to_int('5 bytes')
+    5
+    >>> convert_size_to_int('1 KB')
+    1000
+    >>> convert_size_to_int('1 kilobyte')
+    1000
+    >>> convert_size_to_int('1 KiB')
+    1024
+    >>> convert_size_to_int('1 KB', binary=True)
+    1024
+    >>> convert_size_to_int('1.5 GB')
+    1500000000
+    >>> convert_size_to_int('1.5 GB', binary=True)
+    1610612736
+    """
+    def tokenize(text: str) -> List[str]:
+        tokenized_input: List = []
+        for token in re.split(r'(\d+(?:\.\d+)?)', text):
+            token = token.strip()
+            if re.match(r'\d+\.\d+', token):
+                tokenized_input.append(float(token))
+            elif token.isdigit():
+                tokenized_input.append(int(token))
+            elif token:
+                tokenized_input.append(token)
+        return tokenized_input
+
+    SizeUnit = namedtuple('SizeUnit', 'divider, symbol, name')
+    CombinedUnit = namedtuple('CombinedUnit', 'decimal, binary')
+    disk_size_units = (
+        CombinedUnit(SizeUnit(1000**1, 'KB', 'kilobyte'), SizeUnit(1024**1, 'KiB', 'kibibyte')),
+        CombinedUnit(SizeUnit(1000**2, 'MB', 'megabyte'), SizeUnit(1024**2, 'MiB', 'mebibyte')),
+        CombinedUnit(SizeUnit(1000**3, 'GB', 'gigabyte'), SizeUnit(1024**3, 'GiB', 'gibibyte')),
+        CombinedUnit(SizeUnit(1000**4, 'TB', 'terabyte'), SizeUnit(1024**4, 'TiB', 'tebibyte')),
+        CombinedUnit(SizeUnit(1000**5, 'PB', 'petabyte'), SizeUnit(1024**5, 'PiB', 'pebibyte')),
+        CombinedUnit(SizeUnit(1000**6, 'EB', 'exabyte'), SizeUnit(1024**6, 'EiB', 'exbibyte')),
+        CombinedUnit(SizeUnit(1000**7, 'ZB', 'zettabyte'), SizeUnit(1024**7, 'ZiB', 'zebibyte')),
+        CombinedUnit(SizeUnit(1000**8, 'YB', 'yottabyte'), SizeUnit(1024**8, 'YiB', 'yobibyte')),
+    )
+    tokens = tokenize(size)
+    if tokens and isinstance(tokens[0], Number):
+        # Get the normalized unit (if any) from the tokenized input.
+        normalized_unit = tokens[1].lower() if len(tokens) == 2 and isinstance(tokens[1], str) else ''
+        # If the input contains only a number, it's assumed to be the number of
+        # bytes. The second token can also explicitly reference the unit bytes.
+        if len(tokens) == 1 or normalized_unit.startswith('b'):
+            return int(tokens[0])
+        # Otherwise we expect two tokens: A number and a unit.
+        if normalized_unit:
+            # Convert plural units to singular units, for details:
+            # https://github.com/xolox/python-humanfriendly/issues/26
+            normalized_unit = normalized_unit.rstrip('s')
+            for unit in disk_size_units:
+                # First we check for unambiguous symbols (KiB, MiB, GiB, etc)
+                # and names (kibibyte, mebibyte, gibibyte, etc) because their
+                # handling is always the same.
+                if normalized_unit in (unit.binary.symbol.lower(), unit.binary.name.lower()):
+                    return int(tokens[0] * unit.binary.divider)
+                # Now we will deal with ambiguous prefixes (K, M, G, etc),
+                # symbols (KB, MB, GB, etc) and names (kilobyte, megabyte,
+                # gigabyte, etc) according to the caller's preference.
+                if (normalized_unit in (unit.decimal.symbol.lower(), unit.decimal.name.lower()) or
+                        normalized_unit.startswith(unit.decimal.symbol[0].lower())):
+                    return int(tokens[0] * (unit.binary.divider if binary else unit.decimal.divider))
+    # We failed to parse the size specification.
+    return None
+
+
 def input_type_check(data: object) -> None:
    """Ensure input data is a string. Raises `TypeError` if not."""
    if not isinstance(data, str):