diff --git a/docs/utils.md b/docs/utils.md
index 61fe19ff..b3abc43e 100644
--- a/docs/utils.md
+++ b/docs/utils.md
@@ -9,6 +9,7 @@
* [convert\_to\_int](#jc.utils.convert_to_int)
* [convert\_to\_float](#jc.utils.convert_to_float)
* [convert\_to\_bool](#jc.utils.convert_to_bool)
+ * [convert\_size\_to\_int](#jc.utils.convert_size_to_int)
* [input\_type\_check](#jc.utils.input_type_check)
* [timestamp](#jc.utils.timestamp)
* [\_\_init\_\_](#jc.utils.timestamp.__init__)
@@ -178,6 +179,47 @@ Returns:
True/False False unless a 'truthy' number or string is found
('y', 'yes', 'true', '1', 1, -1, etc.)
+
+
+### convert\_size\_to\_int
+
+```python
+def convert_size_to_int(size: str, binary: bool = False) -> Optional[int]
+```
+
+Parse a human readable data size and return the number of bytes.
+
+Parameters:
+
+ size: (string) The human readable file size to parse.
+ binary: (boolean) `True` to use binary multiples of bytes
+ (base-2) for ambiguous unit symbols and names,
+ `False` to use decimal multiples of bytes (base-10).
+Returns:
+ integer/None Integer if successful conversion, otherwise None
+
+This function knows how to parse sizes in bytes and in kilo- through
+yottabytes (decimal or binary, e.g. KB/KiB). Some examples:
+
+>>> convert_size_to_int('42')
+42
+>>> convert_size_to_int('13b')
+13
+>>> convert_size_to_int('5 bytes')
+5
+>>> convert_size_to_int('1 KB')
+1000
+>>> convert_size_to_int('1 kilobyte')
+1000
+>>> convert_size_to_int('1 KiB')
+1024
+>>> convert_size_to_int('1 KB', binary=True)
+1024
+>>> convert_size_to_int('1.5 GB')
+1500000000
+>>> convert_size_to_int('1.5 GB', binary=True)
+1610612736
+
### input\_type\_check
diff --git a/jc/utils.py b/jc/utils.py
index 70faf137..224b123a 100644
--- a/jc/utils.py
+++ b/jc/utils.py
@@ -3,6 +3,8 @@ import sys
import re
import locale
import shutil
+from collections import namedtuple
+from numbers import Number
from datetime import datetime, timezone
from textwrap import TextWrapper
from functools import lru_cache
@@ -274,6 +276,116 @@ def convert_to_bool(value: object) -> bool:
return False
+# convert_size_to_int from https://github.com/xolox/python-humanfriendly
+
+# Copyright (c) 2021 Peter Odding
+
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
def convert_size_to_int(size: str, binary: bool = False) -> Optional[int]:
    """
    Parse a human readable data size and return the number of bytes.

    Parameters:

        size:           (string) The human readable file size to parse.
        binary:         (boolean) `True` to use binary multiples of bytes
                        (base-2) for ambiguous unit symbols and names,
                        `False` to use decimal multiples of bytes (base-10).
    Returns:
        integer/None    Integer if successful conversion, otherwise None

    This function knows how to parse sizes in bytes and in kilo- through
    yottabytes. The input must be a single non-negative number, optionally
    followed by a unit; anything else (including non-string input) returns
    None. Fractional bytes are truncated. The arithmetic is done on
    integers, so decimal input such as '1.005 KB' is converted exactly.

    Unit matching is case-insensitive and ignores a trailing plural 's'.
    'KiB'/'kibibyte' style units are always binary. Any other unit is
    matched by its first letter only (so 'K', 'KB' and 'kilobyte' are
    equivalent) and follows the `binary` argument. A unit starting with
    'b' means bytes.

    Some examples:

    >>> convert_size_to_int('42')
    42
    >>> convert_size_to_int('13b')
    13
    >>> convert_size_to_int('5 bytes')
    5
    >>> convert_size_to_int('1 KB')
    1000
    >>> convert_size_to_int('1 kilobyte')
    1000
    >>> convert_size_to_int('1 KiB')
    1024
    >>> convert_size_to_int('1 KB', binary=True)
    1024
    >>> convert_size_to_int('1.5 GB')
    1500000000
    >>> convert_size_to_int('1.5 GB', binary=True)
    1610612736
    """
    if not isinstance(size, str):
        return None

    # With a single capturing group, re.split() returns the text around the
    # numbers at even indexes and the matched numbers at odd indexes. Exactly
    # three pieces therefore means "text, one number, text".
    pieces = re.split(r'(\d+(?:\.\d+)?)', size)

    # Reject input with no number, more than one number, or anything other
    # than whitespace in front of the number (e.g. a minus sign).
    if len(pieces) != 3 or pieces[0].strip():
        return None

    # Keep the digits as strings so the conversion below stays exact.
    whole, _, fraction = pieces[1].partition('.')
    unit = pieces[2].strip().lower()

    # A bare number is a byte count; the unit may also name bytes explicitly.
    if not unit or unit.startswith('b'):
        return int(whole)

    # Convert plural units to singular units, for details:
    # https://github.com/xolox/python-humanfriendly/issues/26
    unit = unit.rstrip('s')

    # first letter of the unit -> (power of 1000/1024, unambiguous binary name)
    prefixes = {
        'k': (1, 'kibibyte'),
        'm': (2, 'mebibyte'),
        'g': (3, 'gibibyte'),
        't': (4, 'tebibyte'),
        'p': (5, 'pebibyte'),
        'e': (6, 'exbibyte'),
        'z': (7, 'zebibyte'),
        'y': (8, 'yobibyte'),
    }

    # unit[:1] is '' when the unit consisted solely of 's' characters.
    entry = prefixes.get(unit[:1])
    if entry is None:
        # We failed to parse the size specification.
        return None

    power, binary_name = entry

    # Unambiguous binary symbols (KiB, MiB, ...) and names (kibibyte, ...)
    # are always base-2. Everything else sharing the prefix letter (K, KB,
    # kilobyte, ...) is ambiguous and follows the caller's preference.
    is_binary = binary or unit in (unit[0] + 'ib', binary_name)
    multiplier = (1024 if is_binary else 1000) ** power

    # Exact integer arithmetic: whole * m + floor(0.fraction * m). Going
    # through float would truncate e.g. 1.005 * 1000 to 1004.
    total = int(whole) * multiplier
    if fraction:
        total += int(fraction) * multiplier // 10 ** len(fraction)

    return total
+
+
def input_type_check(data: object) -> None:
"""Ensure input data is a string. Raises `TypeError` if not."""
if not isinstance(data, str):