ripgrep: replace decoder with encoding_rs_io

This commit mostly moves the transcoder implementation to its own crate:
https://github.com/BurntSushi/encoding_rs_io

The new crate adds clear documentation and cleans up the implementation so that it fully implements the contract of io::Read.
2025-08-04 21:52:54 +02:00 · 2018-07-21 20:36:32 -04:00
parent 090216cf00
commit 209a125ea2
5 changed files with 18 additions and 460 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -69,6 +69,14 @@ dependencies = [
 "simd 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

+[[package]]
+name = "encoding_rs_io"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
 [[package]]
 name = "fnv"
 version = "1.0.6"
@ -234,6 +242,7 @@ dependencies = [
 "bytecount 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "encoding_rs_io 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "globset 0.4.0",
 "grep 0.1.8",
 "ignore 0.4.2",
@ -385,6 +394,7 @@ dependencies = [
 "checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e"
 "checksum crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "24ce9782d4d5c53674646a6a4c1863a21a8fc0cb649b3c94dfc16e45071dea19"
 "checksum encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88a1b66a0d28af4b03a8c8278c6dcb90e6e600d89c14500a9e7a02e64b9ee3ac"
+"checksum encoding_rs_io 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7bcd05bae9dfcb6d689427192bdf740d92daf53ff8e4d11ae46aad626353e48a"
 "checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
 "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
 "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -38,6 +38,7 @@ members = ["grep", "globset", "ignore"]
 atty = "0.2.10"
 bytecount = "0.3.1"
 encoding_rs = "0.8"
+encoding_rs_io = "0.1"
 globset = { version = "0.4.0", path = "globset" }
 grep = { version = "0.1.8", path = "grep" }
 ignore = { version = "0.4.0", path = "ignore" }
--- a/src/decoder.rs
+++ b/src/decoder.rs
@ -1,456 +0,0 @@
-use std::cmp;
-use std::io::{self, Read};
-
-use encoding_rs::{Decoder, Encoding, UTF_8};
-
-/// A BOM is at least 2 bytes and at most 3 bytes.
-///
-/// If fewer than 2 bytes are available to be read at the beginning of a
-/// reader, then a BOM is `None`.
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-struct Bom {
-    bytes: [u8; 3],
-    len: usize,
-}
-
-impl Bom {
-    fn as_slice(&self) -> &[u8] {
-        &self.bytes[0..self.len]
-    }
-
-    fn decoder(&self) -> Option<Decoder> {
-        let bom = self.as_slice();
-        if bom.len() < 3 {
-            return None;
-        }
-        if let Some((enc, _)) = Encoding::for_bom(bom) {
-            if enc != UTF_8 {
-                return Some(enc.new_decoder_with_bom_removal());
-            }
-        }
-        None
-    }
-}
-
-/// `BomPeeker` wraps `R` and satisfies the `io::Read` interface while also
-/// providing a peek at the BOM if one exists. Peeking at the BOM does not
-/// advance the reader.
-struct BomPeeker<R> {
-    rdr: R,
-    bom: Option<Bom>,
-    nread: usize,
-}
-
-impl<R: io::Read> BomPeeker<R> {
-    /// Create a new BomPeeker.
-    ///
-    /// The first three bytes can be read using the `peek_bom` method, but
-    /// will not advance the reader.
-    fn new(rdr: R) -> BomPeeker<R> {
-        BomPeeker { rdr: rdr, bom: None, nread: 0 }
-    }
-
-    /// Peek at the first three bytes of the underlying reader.
-    ///
-    /// This does not advance the reader provided by `BomPeeker`.
-    ///
-    /// If the underlying reader does not have at least two bytes available,
-    /// then `None` is returned.
-    fn peek_bom(&mut self) -> io::Result<Bom> {
-        if let Some(bom) = self.bom {
-            return Ok(bom);
-        }
-        self.bom = Some(Bom { bytes: [0; 3], len: 0 });
-        let mut buf = [0u8; 3];
-        let bom_len = read_full(&mut self.rdr, &mut buf)?;
-        self.bom = Some(Bom { bytes: buf, len: bom_len });
-        Ok(self.bom.unwrap())
-    }
-}
-
-impl<R: io::Read> io::Read for BomPeeker<R> {
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        if self.nread < 3 {
-            let bom = self.peek_bom()?;
-            let bom = bom.as_slice();
-            if self.nread < bom.len() {
-                let rest = &bom[self.nread..];
-                let len = cmp::min(buf.len(), rest.len());
-                buf[..len].copy_from_slice(&rest[..len]);
-                self.nread += len;
-                return Ok(len);
-            }
-        }
-        let nread = self.rdr.read(buf)?;
-        self.nread += nread;
-        Ok(nread)
-    }
-}
-
-/// Like `io::Read::read_exact`, except it never returns `UnexpectedEof` and
-/// instead returns the number of bytes read if EOF is seen before filling
-/// `buf`.
-fn read_full<R: io::Read>(
-    mut rdr: R,
-    mut buf: &mut [u8],
-) -> io::Result<usize> {
-    let mut nread = 0;
-    while !buf.is_empty() {
-        match rdr.read(buf) {
-            Ok(0) => break,
-            Ok(n) => {
-                nread += n;
-                let tmp = buf;
-                buf = &mut tmp[n..];
-            }
-            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
-            Err(e) => return Err(e),
-        }
-    }
-    Ok(nread)
-}
-
-/// A reader that transcodes to UTF-8. The source encoding is determined by
-/// inspecting the BOM from the stream read from `R`, if one exists. If a
-/// UTF-16 BOM exists, then the source stream is transcoded to UTF-8 with
-/// invalid UTF-16 sequences translated to the Unicode replacement character.
-/// In all other cases, the underlying reader is passed through unchanged.
-///
-/// `R` is the type of the underlying reader and `B` is the type of an internal
-/// buffer used to store the results of transcoding.
-///
-/// Note that not all methods on `io::Read` work with this implementation.
-/// For example, the `bytes` adapter method attempts to read a single byte at
-/// a time, but this implementation requires a buffer of size at least `4`. If
-/// a buffer of size less than 4 is given, then an error is returned.
-pub struct DecodeReader<R, B> {
-    /// The underlying reader, wrapped in a peeker for reading a BOM if one
-    /// exists.
-    rdr: BomPeeker<R>,
-    /// The internal buffer to store transcoded bytes before they are read by
-    /// callers.
-    buf: B,
-    /// The current position in `buf`. Subsequent reads start here.
-    pos: usize,
-    /// The number of transcoded bytes in `buf`. Subsequent reads end here.
-    buflen: usize,
-    /// Whether this is the first read or not (in which we inspect the BOM).
-    first: bool,
-    /// Whether a "last" read has occurred. After this point, EOF will always
-    /// be returned.
-    last: bool,
-    /// The underlying text decoder derived from the BOM, if one exists.
-    decoder: Option<Decoder>,
-}
-
-impl<R: io::Read, B: AsMut<[u8]>> DecodeReader<R, B> {
-    /// Create a new transcoder that converts a source stream to valid UTF-8.
-    ///
-    /// If an encoding is specified, then it is used to transcode `rdr` to
-    /// UTF-8. Otherwise, if no encoding is specified, and if a UTF-16 BOM is
-    /// found, then the corresponding UTF-16 encoding is used to transcode
-    /// `rdr` to UTF-8. In all other cases, `rdr` is assumed to be at least
-    /// ASCII-compatible and passed through untouched.
-    ///
-    /// Errors in the encoding of `rdr` are handled with the Unicode
-    /// replacement character. If no encoding of `rdr` is specified, then
-    /// errors are not handled.
-    pub fn new(
-        rdr: R,
-        buf: B,
-        enc: Option<&'static Encoding>,
-    ) -> DecodeReader<R, B> {
-        DecodeReader {
-            rdr: BomPeeker::new(rdr),
-            buf: buf,
-            buflen: 0,
-            pos: 0,
-            first: enc.is_none(),
-            last: false,
-            decoder: enc.map(|enc| enc.new_decoder_with_bom_removal()),
-        }
-    }
-
-    /// Fill the internal buffer from the underlying reader.
-    ///
-    /// If there are unread bytes in the internal buffer, then we move them
-    /// to the beginning of the internal buffer and fill the remainder.
-    ///
-    /// If the internal buffer is too small to read additional bytes, then an
-    /// error is returned.
-    #[inline(always)] // massive perf benefit (???)
-    fn fill(&mut self) -> io::Result<()> {
-        if self.pos < self.buflen {
-            if self.buflen >= self.buf.as_mut().len() {
-                return Err(io::Error::new(
-                    io::ErrorKind::Other,
-                    "DecodeReader: internal buffer exhausted"));
-            }
-            let newlen = self.buflen - self.pos;
-            let mut tmp = Vec::with_capacity(newlen);
-            tmp.extend_from_slice(&self.buf.as_mut()[self.pos..self.buflen]);
-            self.buf.as_mut()[..newlen].copy_from_slice(&tmp);
-            self.buflen = newlen;
-        } else {
-            self.buflen = 0;
-        }
-        self.pos = 0;
-        self.buflen +=
-            self.rdr.read(&mut self.buf.as_mut()[self.buflen..])?;
-        Ok(())
-    }
-
-    /// Transcode the inner stream to UTF-8 in `buf`. This assumes that there
-    /// is a decoder capable of transcoding the inner stream to UTF-8. This
-    /// returns the number of bytes written to `buf`.
-    ///
-    /// When this function returns, exactly one of the following things will
-    /// be true:
-    ///
-    /// 1. A non-zero number of bytes were written to `buf`.
-    /// 2. The underlying reader reached EOF.
-    /// 3. An error is returned: the internal buffer ran out of room.
-    /// 4. An I/O error occurred.
-    ///
-    /// Note that `buf` must have at least 4 bytes of space.
-    fn transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        assert!(buf.len() >= 4);
-        if self.last {
-            return Ok(0);
-        }
-        if self.pos >= self.buflen {
-            self.fill()?;
-        }
-        let mut nwrite = 0;
-        loop {
-            let (_, nin, nout, _) =
-                self.decoder.as_mut().unwrap().decode_to_utf8(
-                    &self.buf.as_mut()[self.pos..self.buflen], buf, false);
-            self.pos += nin;
-            nwrite += nout;
-            // If we've written at least one byte to the caller-provided
-            // buffer, then our mission is complete.
-            if nwrite > 0 {
-                break;
-            }
-            // Otherwise, we know that our internal buffer has insufficient
-            // data to transcode at least one char, so we attempt to refill it.
-            self.fill()?;
-            // Quit on EOF.
-            if self.buflen == 0 {
-                self.pos = 0;
-                self.last = true;
-                let (_, _, nout, _) =
-                    self.decoder.as_mut().unwrap().decode_to_utf8(
-                        &[], buf, true);
-                return Ok(nout);
-            }
-        }
-        Ok(nwrite)
-    }
-
-    #[inline(never)] // impacts perf...
-    fn detect(&mut self) -> io::Result<()> {
-        let bom = self.rdr.peek_bom()?;
-        self.decoder = bom.decoder();
-        Ok(())
-    }
-}
-
-impl<R: io::Read, B: AsMut<[u8]>> io::Read for DecodeReader<R, B> {
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        if self.first {
-            self.first = false;
-            self.detect()?;
-        }
-        if self.decoder.is_none() {
-            return self.rdr.read(buf);
-        }
-        // When decoding UTF-8, we need at least 4 bytes of space to guarantee
-        // that we can decode at least one codepoint. If we don't have it, we
-        // can either return `0` for the number of bytes read or return an
-        // error. Since `0` would be interpreted as a possibly premature EOF,
-        // we opt for an error.
-        if buf.len() < 4 {
-            return Err(io::Error::new(
-                io::ErrorKind::Other,
-                "DecodeReader: byte buffer must have length at least 4"));
-        }
-        self.transcode(buf)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::io::Read;
-
-    use encoding_rs::Encoding;
-
-    use super::{Bom, BomPeeker, DecodeReader};
-
-    fn read_to_string<R: Read>(mut rdr: R) -> String {
-        let mut s = String::new();
-        rdr.read_to_string(&mut s).unwrap();
-        s
-    }
-
-    #[test]
-    fn peeker_empty() {
-        let buf = [];
-        let mut peeker = BomPeeker::new(&buf[..]);
-        assert_eq!(Bom { bytes: [0; 3], len: 0}, peeker.peek_bom().unwrap());
-
-        let mut tmp = [0; 100];
-        assert_eq!(0, peeker.read(&mut tmp).unwrap());
-    }
-
-    #[test]
-    fn peeker_one() {
-        let buf = [1];
-        let mut peeker = BomPeeker::new(&buf[..]);
-        assert_eq!(
-            Bom { bytes: [1, 0, 0], len: 1},
-            peeker.peek_bom().unwrap());
-
-        let mut tmp = [0; 100];
-        assert_eq!(1, peeker.read(&mut tmp).unwrap());
-        assert_eq!(1, tmp[0]);
-        assert_eq!(0, peeker.read(&mut tmp).unwrap());
-    }
-
-    #[test]
-    fn peeker_two() {
-        let buf = [1, 2];
-        let mut peeker = BomPeeker::new(&buf[..]);
-        assert_eq!(
-            Bom { bytes: [1, 2, 0], len: 2},
-            peeker.peek_bom().unwrap());
-
-        let mut tmp = [0; 100];
-        assert_eq!(2, peeker.read(&mut tmp).unwrap());
-        assert_eq!(1, tmp[0]);
-        assert_eq!(2, tmp[1]);
-        assert_eq!(0, peeker.read(&mut tmp).unwrap());
-    }
-
-    #[test]
-    fn peeker_three() {
-        let buf = [1, 2, 3];
-        let mut peeker = BomPeeker::new(&buf[..]);
-        assert_eq!(
-            Bom { bytes: [1, 2, 3], len: 3},
-            peeker.peek_bom().unwrap());
-
-        let mut tmp = [0; 100];
-        assert_eq!(3, peeker.read(&mut tmp).unwrap());
-        assert_eq!(1, tmp[0]);
-        assert_eq!(2, tmp[1]);
-        assert_eq!(3, tmp[2]);
-        assert_eq!(0, peeker.read(&mut tmp).unwrap());
-    }
-
-    #[test]
-    fn peeker_four() {
-        let buf = [1, 2, 3, 4];
-        let mut peeker = BomPeeker::new(&buf[..]);
-        assert_eq!(
-            Bom { bytes: [1, 2, 3], len: 3},
-            peeker.peek_bom().unwrap());
-
-        let mut tmp = [0; 100];
-        assert_eq!(3, peeker.read(&mut tmp).unwrap());
-        assert_eq!(1, tmp[0]);
-        assert_eq!(2, tmp[1]);
-        assert_eq!(3, tmp[2]);
-        assert_eq!(1, peeker.read(&mut tmp).unwrap());
-        assert_eq!(4, tmp[0]);
-        assert_eq!(0, peeker.read(&mut tmp).unwrap());
-    }
-
-    #[test]
-    fn peeker_one_at_a_time() {
-        let buf = [1, 2, 3, 4];
-        let mut peeker = BomPeeker::new(&buf[..]);
-
-        let mut tmp = [0; 1];
-        assert_eq!(0, peeker.read(&mut tmp[..0]).unwrap());
-        assert_eq!(0, tmp[0]);
-        assert_eq!(1, peeker.read(&mut tmp).unwrap());
-        assert_eq!(1, tmp[0]);
-        assert_eq!(1, peeker.read(&mut tmp).unwrap());
-        assert_eq!(2, tmp[0]);
-        assert_eq!(1, peeker.read(&mut tmp).unwrap());
-        assert_eq!(3, tmp[0]);
-        assert_eq!(1, peeker.read(&mut tmp).unwrap());
-        assert_eq!(4, tmp[0]);
-    }
-
-    // In cases where all we have is a bom, we expect the bytes to be
-    // passed through unchanged.
-    #[test]
-    fn trans_utf16_bom() {
-        let srcbuf = vec![0xFF, 0xFE];
-        let mut dstbuf = vec![0; 8 * (1<<10)];
-        let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
-        let n = rdr.read(&mut dstbuf).unwrap();
-        assert_eq!(&*srcbuf, &dstbuf[..n]);
-
-        let srcbuf = vec![0xFE, 0xFF];
-        let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
-        let n = rdr.read(&mut dstbuf).unwrap();
-        assert_eq!(&*srcbuf, &dstbuf[..n]);
-
-        let srcbuf = vec![0xEF, 0xBB, 0xBF];
-        let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
-        let n = rdr.read(&mut dstbuf).unwrap();
-        assert_eq!(&*srcbuf, &dstbuf[..n]);
-    }
-
-    // Test basic UTF-16 decoding.
-    #[test]
-    fn trans_utf16_basic() {
-        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
-        let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
-        assert_eq!("a", read_to_string(&mut rdr));
-
-        let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61];
-        let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
-        assert_eq!("a", read_to_string(&mut rdr));
-    }
-
-    // Test incomplete UTF-16 decoding. This ensures we see a replacement char
-    // if the stream ends with an unpaired code unit.
-    #[test]
-    fn trans_utf16_incomplete() {
-        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x00];
-        let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
-        assert_eq!("a\u{FFFD}", read_to_string(&mut rdr));
-    }
-
-    macro_rules! test_trans_simple {
-        ($name:ident, $enc:expr, $srcbytes:expr, $dst:expr) => {
-            #[test]
-            fn $name() {
-                let srcbuf = &$srcbytes[..];
-                let enc = Encoding::for_label($enc.as_bytes());
-                let mut rdr = DecodeReader::new(
-                    &*srcbuf, vec![0; 8 * (1<<10)], enc);
-                assert_eq!($dst, read_to_string(&mut rdr));
-            }
-        }
-    }
-
-    // This isn't exhaustive obviously, but it lets us test base level support.
-    test_trans_simple!(trans_simple_auto, "does not exist", b"\xD0\x96", "Ж");
-    test_trans_simple!(trans_simple_utf8, "utf-8", b"\xD0\x96", "Ж");
-    test_trans_simple!(trans_simple_utf16le, "utf-16le", b"\x16\x04", "Ж");
-    test_trans_simple!(trans_simple_utf16be, "utf-16be", b"\x04\x16", "Ж");
-    test_trans_simple!(trans_simple_chinese, "chinese", b"\xA7\xA8", "Ж");
-    test_trans_simple!(trans_simple_korean, "korean", b"\xAC\xA8", "Ж");
-    test_trans_simple!(
-        trans_simple_big5_hkscs, "big5-hkscs", b"\xC7\xFA", "Ж");
-    test_trans_simple!(trans_simple_gbk, "gbk", b"\xA7\xA8", "Ж");
-    test_trans_simple!(trans_simple_sjis, "sjis", b"\x84\x47", "Ж");
-    test_trans_simple!(trans_simple_eucjp, "euc-jp", b"\xA7\xA8", "Ж");
-    test_trans_simple!(trans_simple_latin1, "latin1", b"\xA9", "©");
-}
--- a/src/main.rs
+++ b/src/main.rs
@ -3,6 +3,7 @@ extern crate bytecount;
 #[macro_use]
 extern crate clap;
 extern crate encoding_rs;
+extern crate encoding_rs_io;
 extern crate globset;
 extern crate grep;
 extern crate ignore;
@ -41,7 +42,6 @@ macro_rules! errored {
 mod app;
 mod args;
 mod config;
-mod decoder;
 mod decompressor;
 mod preprocessor;
 mod logger;
--- a/src/worker.rs
+++ b/src/worker.rs
@ -8,7 +8,8 @@ use ignore::DirEntry;
 use memmap::Mmap;
 use termcolor::WriteColor;

-use decoder::DecodeReader;
+// use decoder::DecodeReader;
+use encoding_rs_io::DecodeReaderBytesBuilder;
 use decompressor::{self, DecompressionReader};
 use preprocessor::PreprocessorReader;
 use pathutil::strip_prefix;
@ -319,8 +320,10 @@ impl Worker {
        path: &Path,
        rdr: R,
    ) -> Result<u64> {
-        let rdr = DecodeReader::new(
-            rdr, &mut self.decodebuf, self.opts.encoding);
+        let rdr = DecodeReaderBytesBuilder::new()
+            .encoding(self.opts.encoding)
+            .utf8_passthru(true)
+            .build_with_buffer(rdr, &mut self.decodebuf)?;
        let searcher = Searcher::new(
            &mut self.inpbuf, printer, &self.grep, path, rdr);
        searcher