Optimize encoding detection (#1243)

* Optimize encoding detection
* Use a threshold-based system
2026-06-20 11:32:56 +02:00 · 2021-12-23 22:05:58 +03:00
parent 5a83a9ebc4
commit e09401b81a
6 changed files with 71 additions and 10 deletions
@@ -178,9 +178,8 @@ for pretty in ['all', 'none']:
        f'`http --pretty={pretty} pie.dev/stream/1000`',
        [
            '--print=HBhb',
-            '--stream',
            f'--pretty={pretty}',
-            'httpbin.org/stream/100'
+            'httpbin.org/stream/1000'
        ]
    )
 DownloadRunner('download', '`http --download :/big_file.txt` (3GB)', '3G')
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Union, Tuple

 from charset_normalizer import from_bytes
 from charset_normalizer.constant import TOO_SMALL_SEQUENCE
@@ -29,7 +29,7 @@ def detect_encoding(content: ContentBytes) -> str:
    return encoding


-def smart_decode(content: ContentBytes, encoding: str) -> str:
+def smart_decode(content: ContentBytes, encoding: str) -> Tuple[str, str]:
    """Decode `content` using the given `encoding`.
    If no `encoding` is provided, the best effort is to guess it from `content`.

@@ -38,7 +38,7 @@ def smart_decode(content: ContentBytes, encoding: str) -> str:
    """
    if not encoding:
        encoding = detect_encoding(content)
-    return content.decode(encoding, 'replace')
+    return content.decode(encoding, 'replace'), encoding


 def smart_encode(content: str, encoding: str) -> bytes:
@@ -1,6 +1,6 @@
 from abc import ABCMeta, abstractmethod
 from itertools import chain
-from typing import Callable, Iterable, Union
+from typing import Callable, Iterable, Optional, Union

 from .processing import Conversion, Formatting
 from ..context import Environment
@@ -89,6 +89,9 @@ class RawStream(BaseStream):
        return self.msg.iter_body(self.chunk_size)


+ENCODING_GUESS_THRESHOLD = 3
+
+
 class EncodedStream(BaseStream):
    """Encoded HTTP message stream.

@@ -111,7 +114,8 @@ class EncodedStream(BaseStream):
            self.mime = mime_overwrite
        else:
            self.mime, _ = parse_content_type_header(self.msg.content_type)
-        self.encoding = encoding_overwrite or self.msg.encoding
+        self._encoding = encoding_overwrite or self.msg.encoding
+        self._encoding_guesses = []
        if env.stdout_isatty:
            # Use the encoding supported by the terminal.
            output_encoding = env.stdout_encoding
@@ -125,9 +129,33 @@ class EncodedStream(BaseStream):
        for line, lf in self.msg.iter_lines(self.CHUNK_SIZE):
            if b'\0' in line:
                raise BinarySuppressedError()
-            line = smart_decode(line, self.encoding)
+            line = self.decode_chunk(line)
            yield smart_encode(line, self.output_encoding) + lf

+    def decode_chunk(self, raw_chunk: str) -> str:
+        chunk, guessed_encoding = smart_decode(raw_chunk, self.encoding)
+        self._encoding_guesses.append(guessed_encoding)
+        return chunk
+
+    @property
+    def encoding(self) -> Optional[str]:
+        if self._encoding:
+            return self._encoding
+
+        # If we find a reliable (used consecutively) encoding, then
+        # use it for the next iterations.
+        if len(self._encoding_guesses) < ENCODING_GUESS_THRESHOLD:
+            return None
+
+        guess_1, guess_2 = self._encoding_guesses[-2:]
+        if guess_1 == guess_2:
+            self._encoding = guess_1
+            return guess_1
+
+    @encoding.setter
+    def encoding(self, value) -> None:
+        self._encoding = value
+

 class PrettyStream(EncodedStream):
    """In addition to :class:`EncodedStream` behaviour, this stream applies
@@ -178,7 +206,7 @@ class PrettyStream(EncodedStream):
        if not isinstance(chunk, str):
            # Text when a converter has been used,
            # otherwise it will always be bytes.
-            chunk = smart_decode(chunk, self.encoding)
+            chunk = self.decode_chunk(chunk)
        chunk = self.formatting.format_body(content=chunk, mime=self.mime)
        return smart_encode(chunk, self.output_encoding)

@@ -32,6 +32,8 @@ JSON_FILE_PATH_ARG = patharg(JSON_FILE_PATH)
 # line would be escaped).
 FILE_CONTENT = FILE_PATH.read_text(encoding=UTF8).strip()

+ASCII_FILE_CONTENT = "random text" * 10
+

 JSON_FILE_CONTENT = JSON_FILE_PATH.read_text(encoding=UTF8)
 BIN_FILE_CONTENT = BIN_FILE_PATH.read_bytes()
@@ -11,7 +11,12 @@ from httpie.plugins import ConverterPlugin
 from httpie.plugins.registry import plugin_manager

 from .utils import StdinBytesIO, http, MockEnvironment, DUMMY_URL
-from .fixtures import BIN_FILE_CONTENT, BIN_FILE_PATH
+from .fixtures import (
+    ASCII_FILE_CONTENT,
+    BIN_FILE_CONTENT,
+    BIN_FILE_PATH,
+    FILE_CONTENT as UNICODE_FILE_CONTENT
+)

 PRETTY_OPTIONS = list(PRETTY_MAP.keys())

@@ -133,3 +138,9 @@ def test_auto_streaming(http_server, extras, expected):
        for call_arg in env.stdout.write.call_args_list
        if b'test' in call_arg[0][0]
    ]) == expected
+
+
+def test_streaming_encoding_detection(http_server):
+    r = http('--stream', http_server + '/stream/encoding/random')
+    assert ASCII_FILE_CONTENT in r
+    assert UNICODE_FILE_CONTENT in r
@@ -52,6 +52,27 @@ def chunked_drip(handler):
    handler.wfile.write('0\r\n\r\n'.encode('utf-8'))


+@TestHandler.handler('GET', '/stream/encoding/random')
+def random_encoding(handler):
+    from tests.fixtures import ASCII_FILE_CONTENT, FILE_CONTENT as UNICODE_FILE_CONTENT
+
+    handler.send_response(200)
+    handler.send_header('Transfer-Encoding', 'chunked')
+    handler.end_headers()
+
+    for body in [
+        ASCII_FILE_CONTENT,
+        ASCII_FILE_CONTENT,
+        UNICODE_FILE_CONTENT,
+        UNICODE_FILE_CONTENT,
+        UNICODE_FILE_CONTENT,
+    ]:
+        body += "\n"
+        handler.wfile.write(f'{len(body.encode()):X}\r\n{body}\r\n'.encode())
+
+    handler.wfile.write('0\r\n\r\n'.encode('utf-8'))
+
+
@pytest.fixture(scope="function")
 def http_server():
    """A custom HTTP server implementation for our tests, that is