From 209a125ea25615e8fa605c844f8cbcca8c672208 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 21 Jul 2018 20:36:32 -0400 Subject: [PATCH] ripgrep: replace decoder with encoding_rs_io This commit mostly moves the transcoder implementation to its own crate: https://github.com/BurntSushi/encoding_rs_io The new crate adds clear documentation and cleans up the implementation to fully implement the contract of io::Read. --- Cargo.lock | 10 ++ Cargo.toml | 1 + src/decoder.rs | 456 ------------------------------------------------- src/main.rs | 2 +- src/worker.rs | 9 +- 5 files changed, 18 insertions(+), 460 deletions(-) delete mode 100644 src/decoder.rs diff --git a/Cargo.lock b/Cargo.lock index 7aa50c3f..2b47012b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -69,6 +69,14 @@ dependencies = [ "simd 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "encoding_rs_io" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "fnv" version = "1.0.6" @@ -234,6 +242,7 @@ dependencies = [ "bytecount 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs_io 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "globset 0.4.0", "grep 0.1.8", "ignore 0.4.2", @@ -385,6 +394,7 @@ dependencies = [ "checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" "checksum crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "24ce9782d4d5c53674646a6a4c1863a21a8fc0cb649b3c94dfc16e45071dea19" "checksum encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88a1b66a0d28af4b03a8c8278c6dcb90e6e600d89c14500a9e7a02e64b9ee3ac" +"checksum encoding_rs_io 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7bcd05bae9dfcb6d689427192bdf740d92daf53ff8e4d11ae46aad626353e48a" "checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" diff --git a/Cargo.toml b/Cargo.toml index c9b02d4c..8686d515 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,7 @@ members = ["grep", "globset", "ignore"] atty = "0.2.10" bytecount = "0.3.1" encoding_rs = "0.8" +encoding_rs_io = "0.1" globset = { version = "0.4.0", path = "globset" } grep = { version = "0.1.8", path = "grep" } ignore = { version = "0.4.0", path = "ignore" } diff --git a/src/decoder.rs b/src/decoder.rs deleted file mode 100644 index 0842fb5c..00000000 --- a/src/decoder.rs +++ /dev/null @@ -1,456 +0,0 @@ -use std::cmp; -use std::io::{self, Read}; - -use encoding_rs::{Decoder, Encoding, UTF_8}; - -/// A BOM is at least 2 bytes and at most 3 bytes. -/// -/// If fewer than 2 bytes are available to be read at the beginning of a -/// reader, then a BOM is `None`. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -struct Bom { - bytes: [u8; 3], - len: usize, -} - -impl Bom { - fn as_slice(&self) -> &[u8] { - &self.bytes[0..self.len] - } - - fn decoder(&self) -> Option { - let bom = self.as_slice(); - if bom.len() < 3 { - return None; - } - if let Some((enc, _)) = Encoding::for_bom(bom) { - if enc != UTF_8 { - return Some(enc.new_decoder_with_bom_removal()); - } - } - None - } -} - -/// `BomPeeker` wraps `R` and satisfies the `io::Read` interface while also -/// providing a peek at the BOM if one exists. Peeking at the BOM does not -/// advance the reader. -struct BomPeeker { - rdr: R, - bom: Option, - nread: usize, -} - -impl BomPeeker { - /// Create a new BomPeeker. - /// - /// The first three bytes can be read using the `peek_bom` method, but - /// will not advance the reader. - fn new(rdr: R) -> BomPeeker { - BomPeeker { rdr: rdr, bom: None, nread: 0 } - } - - /// Peek at the first three bytes of the underlying reader. - /// - /// This does not advance the reader provided by `BomPeeker`. - /// - /// If the underlying reader does not have at least two bytes available, - /// then `None` is returned. - fn peek_bom(&mut self) -> io::Result { - if let Some(bom) = self.bom { - return Ok(bom); - } - self.bom = Some(Bom { bytes: [0; 3], len: 0 }); - let mut buf = [0u8; 3]; - let bom_len = read_full(&mut self.rdr, &mut buf)?; - self.bom = Some(Bom { bytes: buf, len: bom_len }); - Ok(self.bom.unwrap()) - } -} - -impl io::Read for BomPeeker { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - if self.nread < 3 { - let bom = self.peek_bom()?; - let bom = bom.as_slice(); - if self.nread < bom.len() { - let rest = &bom[self.nread..]; - let len = cmp::min(buf.len(), rest.len()); - buf[..len].copy_from_slice(&rest[..len]); - self.nread += len; - return Ok(len); - } - } - let nread = self.rdr.read(buf)?; - self.nread += nread; - Ok(nread) - } -} - -/// Like `io::Read::read_exact`, except it never returns `UnexpectedEof` and -/// instead returns the number of bytes read if EOF is seen before filling -/// `buf`. -fn read_full( - mut rdr: R, - mut buf: &mut [u8], -) -> io::Result { - let mut nread = 0; - while !buf.is_empty() { - match rdr.read(buf) { - Ok(0) => break, - Ok(n) => { - nread += n; - let tmp = buf; - buf = &mut tmp[n..]; - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), - } - } - Ok(nread) -} - -/// A reader that transcodes to UTF-8. The source encoding is determined by -/// inspecting the BOM from the stream read from `R`, if one exists. If a -/// UTF-16 BOM exists, then the source stream is transcoded to UTF-8 with -/// invalid UTF-16 sequences translated to the Unicode replacement character. -/// In all other cases, the underlying reader is passed through unchanged. -/// -/// `R` is the type of the underlying reader and `B` is the type of an internal -/// buffer used to store the results of transcoding. -/// -/// Note that not all methods on `io::Read` work with this implementation. -/// For example, the `bytes` adapter method attempts to read a single byte at -/// a time, but this implementation requires a buffer of size at least `4`. If -/// a buffer of size less than 4 is given, then an error is returned. -pub struct DecodeReader { - /// The underlying reader, wrapped in a peeker for reading a BOM if one - /// exists. - rdr: BomPeeker, - /// The internal buffer to store transcoded bytes before they are read by - /// callers. - buf: B, - /// The current position in `buf`. Subsequent reads start here. - pos: usize, - /// The number of transcoded bytes in `buf`. Subsequent reads end here. - buflen: usize, - /// Whether this is the first read or not (in which we inspect the BOM). - first: bool, - /// Whether a "last" read has occurred. After this point, EOF will always - /// be returned. - last: bool, - /// The underlying text decoder derived from the BOM, if one exists. - decoder: Option, -} - -impl> DecodeReader { - /// Create a new transcoder that converts a source stream to valid UTF-8. - /// - /// If an encoding is specified, then it is used to transcode `rdr` to - /// UTF-8. Otherwise, if no encoding is specified, and if a UTF-16 BOM is - /// found, then the corresponding UTF-16 encoding is used to transcode - /// `rdr` to UTF-8. In all other cases, `rdr` is assumed to be at least - /// ASCII-compatible and passed through untouched. - /// - /// Errors in the encoding of `rdr` are handled with the Unicode - /// replacement character. If no encoding of `rdr` is specified, then - /// errors are not handled. - pub fn new( - rdr: R, - buf: B, - enc: Option<&'static Encoding>, - ) -> DecodeReader { - DecodeReader { - rdr: BomPeeker::new(rdr), - buf: buf, - buflen: 0, - pos: 0, - first: enc.is_none(), - last: false, - decoder: enc.map(|enc| enc.new_decoder_with_bom_removal()), - } - } - - /// Fill the internal buffer from the underlying reader. - /// - /// If there are unread bytes in the internal buffer, then we move them - /// to the beginning of the internal buffer and fill the remainder. - /// - /// If the internal buffer is too small to read additional bytes, then an - /// error is returned. - #[inline(always)] // massive perf benefit (???) - fn fill(&mut self) -> io::Result<()> { - if self.pos < self.buflen { - if self.buflen >= self.buf.as_mut().len() { - return Err(io::Error::new( - io::ErrorKind::Other, - "DecodeReader: internal buffer exhausted")); - } - let newlen = self.buflen - self.pos; - let mut tmp = Vec::with_capacity(newlen); - tmp.extend_from_slice(&self.buf.as_mut()[self.pos..self.buflen]); - self.buf.as_mut()[..newlen].copy_from_slice(&tmp); - self.buflen = newlen; - } else { - self.buflen = 0; - } - self.pos = 0; - self.buflen += - self.rdr.read(&mut self.buf.as_mut()[self.buflen..])?; - Ok(()) - } - - /// Transcode the inner stream to UTF-8 in `buf`. This assumes that there - /// is a decoder capable of transcoding the inner stream to UTF-8. This - /// returns the number of bytes written to `buf`. - /// - /// When this function returns, exactly one of the following things will - /// be true: - /// - /// 1. A non-zero number of bytes were written to `buf`. - /// 2. The underlying reader reached EOF. - /// 3. An error is returned: the internal buffer ran out of room. - /// 4. An I/O error occurred. - /// - /// Note that `buf` must have at least 4 bytes of space. - fn transcode(&mut self, buf: &mut [u8]) -> io::Result { - assert!(buf.len() >= 4); - if self.last { - return Ok(0); - } - if self.pos >= self.buflen { - self.fill()?; - } - let mut nwrite = 0; - loop { - let (_, nin, nout, _) = - self.decoder.as_mut().unwrap().decode_to_utf8( - &self.buf.as_mut()[self.pos..self.buflen], buf, false); - self.pos += nin; - nwrite += nout; - // If we've written at least one byte to the caller-provided - // buffer, then our mission is complete. - if nwrite > 0 { - break; - } - // Otherwise, we know that our internal buffer has insufficient - // data to transcode at least one char, so we attempt to refill it. - self.fill()?; - // Quit on EOF. - if self.buflen == 0 { - self.pos = 0; - self.last = true; - let (_, _, nout, _) = - self.decoder.as_mut().unwrap().decode_to_utf8( - &[], buf, true); - return Ok(nout); - } - } - Ok(nwrite) - } - - #[inline(never)] // impacts perf... - fn detect(&mut self) -> io::Result<()> { - let bom = self.rdr.peek_bom()?; - self.decoder = bom.decoder(); - Ok(()) - } -} - -impl> io::Read for DecodeReader { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - if self.first { - self.first = false; - self.detect()?; - } - if self.decoder.is_none() { - return self.rdr.read(buf); - } - // When decoding UTF-8, we need at least 4 bytes of space to guarantee - // that we can decode at least one codepoint. If we don't have it, we - // can either return `0` for the number of bytes read or return an - // error. Since `0` would be interpreted as a possibly premature EOF, - // we opt for an error. - if buf.len() < 4 { - return Err(io::Error::new( - io::ErrorKind::Other, - "DecodeReader: byte buffer must have length at least 4")); - } - self.transcode(buf) - } -} - -#[cfg(test)] -mod tests { - use std::io::Read; - - use encoding_rs::Encoding; - - use super::{Bom, BomPeeker, DecodeReader}; - - fn read_to_string(mut rdr: R) -> String { - let mut s = String::new(); - rdr.read_to_string(&mut s).unwrap(); - s - } - - #[test] - fn peeker_empty() { - let buf = []; - let mut peeker = BomPeeker::new(&buf[..]); - assert_eq!(Bom { bytes: [0; 3], len: 0}, peeker.peek_bom().unwrap()); - - let mut tmp = [0; 100]; - assert_eq!(0, peeker.read(&mut tmp).unwrap()); - } - - #[test] - fn peeker_one() { - let buf = [1]; - let mut peeker = BomPeeker::new(&buf[..]); - assert_eq!( - Bom { bytes: [1, 0, 0], len: 1}, - peeker.peek_bom().unwrap()); - - let mut tmp = [0; 100]; - assert_eq!(1, peeker.read(&mut tmp).unwrap()); - assert_eq!(1, tmp[0]); - assert_eq!(0, peeker.read(&mut tmp).unwrap()); - } - - #[test] - fn peeker_two() { - let buf = [1, 2]; - let mut peeker = BomPeeker::new(&buf[..]); - assert_eq!( - Bom { bytes: [1, 2, 0], len: 2}, - peeker.peek_bom().unwrap()); - - let mut tmp = [0; 100]; - assert_eq!(2, peeker.read(&mut tmp).unwrap()); - assert_eq!(1, tmp[0]); - assert_eq!(2, tmp[1]); - assert_eq!(0, peeker.read(&mut tmp).unwrap()); - } - - #[test] - fn peeker_three() { - let buf = [1, 2, 3]; - let mut peeker = BomPeeker::new(&buf[..]); - assert_eq!( - Bom { bytes: [1, 2, 3], len: 3}, - peeker.peek_bom().unwrap()); - - let mut tmp = [0; 100]; - assert_eq!(3, peeker.read(&mut tmp).unwrap()); - assert_eq!(1, tmp[0]); - assert_eq!(2, tmp[1]); - assert_eq!(3, tmp[2]); - assert_eq!(0, peeker.read(&mut tmp).unwrap()); - } - - #[test] - fn peeker_four() { - let buf = [1, 2, 3, 4]; - let mut peeker = BomPeeker::new(&buf[..]); - assert_eq!( - Bom { bytes: [1, 2, 3], len: 3}, - peeker.peek_bom().unwrap()); - - let mut tmp = [0; 100]; - assert_eq!(3, peeker.read(&mut tmp).unwrap()); - assert_eq!(1, tmp[0]); - assert_eq!(2, tmp[1]); - assert_eq!(3, tmp[2]); - assert_eq!(1, peeker.read(&mut tmp).unwrap()); - assert_eq!(4, tmp[0]); - assert_eq!(0, peeker.read(&mut tmp).unwrap()); - } - - #[test] - fn peeker_one_at_a_time() { - let buf = [1, 2, 3, 4]; - let mut peeker = BomPeeker::new(&buf[..]); - - let mut tmp = [0; 1]; - assert_eq!(0, peeker.read(&mut tmp[..0]).unwrap()); - assert_eq!(0, tmp[0]); - assert_eq!(1, peeker.read(&mut tmp).unwrap()); - assert_eq!(1, tmp[0]); - assert_eq!(1, peeker.read(&mut tmp).unwrap()); - assert_eq!(2, tmp[0]); - assert_eq!(1, peeker.read(&mut tmp).unwrap()); - assert_eq!(3, tmp[0]); - assert_eq!(1, peeker.read(&mut tmp).unwrap()); - assert_eq!(4, tmp[0]); - } - - // In cases where all we have is a bom, we expect the bytes to be - // passed through unchanged. - #[test] - fn trans_utf16_bom() { - let srcbuf = vec![0xFF, 0xFE]; - let mut dstbuf = vec![0; 8 * (1<<10)]; - let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); - let n = rdr.read(&mut dstbuf).unwrap(); - assert_eq!(&*srcbuf, &dstbuf[..n]); - - let srcbuf = vec![0xFE, 0xFF]; - let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); - let n = rdr.read(&mut dstbuf).unwrap(); - assert_eq!(&*srcbuf, &dstbuf[..n]); - - let srcbuf = vec![0xEF, 0xBB, 0xBF]; - let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); - let n = rdr.read(&mut dstbuf).unwrap(); - assert_eq!(&*srcbuf, &dstbuf[..n]); - } - - // Test basic UTF-16 decoding. - #[test] - fn trans_utf16_basic() { - let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00]; - let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); - assert_eq!("a", read_to_string(&mut rdr)); - - let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61]; - let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); - assert_eq!("a", read_to_string(&mut rdr)); - } - - // Test incomplete UTF-16 decoding. This ensures we see a replacement char - // if the stream ends with an unpaired code unit. - #[test] - fn trans_utf16_incomplete() { - let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x00]; - let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); - assert_eq!("a\u{FFFD}", read_to_string(&mut rdr)); - } - - macro_rules! test_trans_simple { - ($name:ident, $enc:expr, $srcbytes:expr, $dst:expr) => { - #[test] - fn $name() { - let srcbuf = &$srcbytes[..]; - let enc = Encoding::for_label($enc.as_bytes()); - let mut rdr = DecodeReader::new( - &*srcbuf, vec![0; 8 * (1<<10)], enc); - assert_eq!($dst, read_to_string(&mut rdr)); - } - } - } - - // This isn't exhaustive obviously, but it lets us test base level support. - test_trans_simple!(trans_simple_auto, "does not exist", b"\xD0\x96", "Ж"); - test_trans_simple!(trans_simple_utf8, "utf-8", b"\xD0\x96", "Ж"); - test_trans_simple!(trans_simple_utf16le, "utf-16le", b"\x16\x04", "Ж"); - test_trans_simple!(trans_simple_utf16be, "utf-16be", b"\x04\x16", "Ж"); - test_trans_simple!(trans_simple_chinese, "chinese", b"\xA7\xA8", "Ж"); - test_trans_simple!(trans_simple_korean, "korean", b"\xAC\xA8", "Ж"); - test_trans_simple!( - trans_simple_big5_hkscs, "big5-hkscs", b"\xC7\xFA", "Ж"); - test_trans_simple!(trans_simple_gbk, "gbk", b"\xA7\xA8", "Ж"); - test_trans_simple!(trans_simple_sjis, "sjis", b"\x84\x47", "Ж"); - test_trans_simple!(trans_simple_eucjp, "euc-jp", b"\xA7\xA8", "Ж"); - test_trans_simple!(trans_simple_latin1, "latin1", b"\xA9", "©"); -} diff --git a/src/main.rs b/src/main.rs index ab0e4118..73bc8c23 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,6 +3,7 @@ extern crate bytecount; #[macro_use] extern crate clap; extern crate encoding_rs; +extern crate encoding_rs_io; extern crate globset; extern crate grep; extern crate ignore; @@ -41,7 +42,6 @@ macro_rules! errored { mod app; mod args; mod config; -mod decoder; mod decompressor; mod preprocessor; mod logger; diff --git a/src/worker.rs b/src/worker.rs index 5b7ef0a4..8e840400 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -8,7 +8,8 @@ use ignore::DirEntry; use memmap::Mmap; use termcolor::WriteColor; -use decoder::DecodeReader; +// use decoder::DecodeReader; +use encoding_rs_io::DecodeReaderBytesBuilder; use decompressor::{self, DecompressionReader}; use preprocessor::PreprocessorReader; use pathutil::strip_prefix; @@ -319,8 +320,10 @@ impl Worker { path: &Path, rdr: R, ) -> Result { - let rdr = DecodeReader::new( - rdr, &mut self.decodebuf, self.opts.encoding); + let rdr = DecodeReaderBytesBuilder::new() + .encoding(self.opts.encoding) + .utf8_passthru(true) + .build_with_buffer(rdr, &mut self.decodebuf)?; let searcher = Searcher::new( &mut self.inpbuf, printer, &self.grep, path, rdr); searcher