diff --git a/Cargo.lock b/Cargo.lock index 16ae133b..6901b526 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,6 +5,7 @@ dependencies = [ "atty 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "bytecount 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.20.5 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", "grep 0.1.5", "ignore 0.1.7", @@ -55,6 +56,11 @@ dependencies = [ "simd 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "cfg-if" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "clap" version = "2.20.5" @@ -75,6 +81,14 @@ name = "crossbeam" version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "encoding_rs" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "env_logger" version = "0.4.2" @@ -323,8 +337,10 @@ dependencies = [ "checksum atty 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d912da0db7fa85514874458ca3651fe2cddace8d0b0505571dbdcd41ab490159" "checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d" "checksum bytecount 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "1e8f09fbc8c6726a4b616dcfbd4f54491068d6bb1b93ac03c78ac18ff9a5924a" +"checksum cfg-if 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "de1e760d7b6535af4241fca8bd8adf68e2e7edacc6b29f5d399050c5e48cf88c" "checksum clap 2.20.5 (registry+https://github.com/rust-lang/crates.io-index)" = "7db281b0520e97fbd15cd615dcd8f8bcad0c26f5f7d5effe705f090f39e9a758" "checksum crossbeam 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)" = "0c5ea215664ca264da8a9d9c3be80d2eaf30923c259d03e870388eb927508f97" +"checksum encoding_rs 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a1cca0a26f904955d80d70b9bff1019e4f4cbc06f2fcbccf8bd3d889cc1c9b7" "checksum env_logger 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e3856f1697098606fc6cb97a93de88ca3f3bc35bb878c725920e6e82ecf05e83" "checksum fnv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6cc484842f1e2884faf56f529f960cc12ad8c71ce96cc7abba0a067c98fee344" "checksum fs2 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "34edaee07555859dc13ca387e6ae05686bb4d0364c95d649b6dab959511f4baf" diff --git a/Cargo.toml b/Cargo.toml index 2a079292..e0f7cd41 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ path = "tests/tests.rs" atty = "0.2.2" bytecount = "0.1.4" clap = "2.20.5" +encoding_rs = "0.5.0" env_logger = { version = "0.4", default-features = false } grep = { version = "0.1.5", path = "grep" } ignore = { version = "0.1.7", path = "ignore" } diff --git a/README.md b/README.md index 6827e703..cc5923ea 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,10 @@ increases the times to `3.081s` for ripgrep and `11.403s` for GNU grep. of search results, searching multiple patterns, highlighting matches with color and full Unicode support. Unlike GNU grep, `ripgrep` stays fast while supporting Unicode (which is always on). 
+* `ripgrep` supports searching files in text encodings other than UTF-8, such + as UTF-16, latin-1, GBK, EUC-JP, Shift_JIS and more. (Some support for + automatically detecting UTF-16 is provided. Other text encodings must be + specifically specified with the `-E/--encoding` flag.) In other words, use `ripgrep` if you like speed, filtering by default, fewer bugs and Unicode support. @@ -101,18 +105,12 @@ give you a glimpse at some important downsides or missing features of support for Unicode categories (e.g., `\p{Sc}` to match currency symbols or `\p{Lu}` to match any uppercase letter). (Fancier regexes will never be supported.) -* If you need to search files with text encodings other than UTF-8 (like - UTF-16), then `ripgrep` won't work. `ripgrep` will still work on ASCII - compatible encodings like latin1 or otherwise partially valid UTF-8. - `ripgrep` *can* search for arbitrary bytes though, which might work in - a pinch. (Likely to be supported in the future.) * `ripgrep` doesn't yet support searching compressed files. (Likely to be supported in the future.) * `ripgrep` doesn't have multiline search. (Unlikely to ever be supported.) -In other words, if you like fancy regexes, non-UTF-8 character encodings, -searching compressed files or multiline search, then `ripgrep` may not quite -meet your needs (yet). +In other words, if you like fancy regexes, searching compressed files or +multiline search, then `ripgrep` may not quite meet your needs (yet). ### Is it really faster than everything else? diff --git a/doc/rg.1.md b/doc/rg.1.md index 830a3bb4..7c49eb72 100644 --- a/doc/rg.1.md +++ b/doc/rg.1.md @@ -136,6 +136,13 @@ Project home page: https://github.com/BurntSushi/ripgrep --debug : Show debug messages. +-E, --encoding *ENCODING* +: Specify the text encoding that ripgrep will use on all files + searched. The default value is 'auto', which will cause ripgrep to do + a best effort automatic detection of encoding on a per-file basis. + Other supported values can be found in the list of labels here: + https://encoding.spec.whatwg.org/#concept-encoding-get + -f, --file FILE ... : Search for patterns from the given file, with one pattern per line. When this flag is used or multiple times or in combination with the -e/--regexp flag, diff --git a/src/app.rs b/src/app.rs index e2f4b010..c285ab03 100644 --- a/src/app.rs +++ b/src/app.rs @@ -96,6 +96,8 @@ fn app<F>(next_line_help: bool, doc: F) -> App<'static, 'static> .possible_values(&["never", "auto", "always", "ansi"])) .arg(flag("colors").value_name("SPEC") .takes_value(true).multiple(true).number_of_values(1)) + .arg(flag("encoding").short("E").value_name("ENCODING") + .takes_value(true).number_of_values(1)) .arg(flag("fixed-strings").short("F")) .arg(flag("glob").short("g") .takes_value(true).multiple(true).number_of_values(1) @@ -251,6 +253,14 @@ lazy_static! { change the match color to magenta and the background color for \ line numbers to yellow:\n\n\ rg --colors 'match:fg:magenta' --colors 'line:bg:yellow' foo."); + doc!(h, "encoding", + "Specify the text encoding of files to search.", + "Specify the text encoding that ripgrep will use on all files \ + searched. The default value is 'auto', which will cause ripgrep \ + to do a best effort automatic detection of encoding on a \ + per-file basis. 
Other supported values can be found in the list \
+              of labels here: \
+              https://encoding.spec.whatwg.org/#concept-encoding-get");
         doc!(h, "fixed-strings",
             "Treat the pattern as a literal string.",
             "Treat the pattern as a literal string instead of a regular \
              expression.
@@ -335,9 +345,9 @@ lazy_static! {
              provided are searched. Empty pattern lines will match all input \
              lines, and the newline is not counted as part of the pattern.");
         doc!(h, "files-with-matches",
-            "Only show the path of each file with at least one match.");
+            "Only show the paths with at least one match.");
         doc!(h, "files-without-match",
-            "Only show the path of each file that contains zero matches.");
+            "Only show the paths that contain zero matches.");
         doc!(h, "with-filename",
             "Show file name for each match.",
             "Prefix each match with the file name that contains it. This is \
diff --git a/src/args.rs b/src/args.rs
index 8cf7cd69..7e60cd7d 100644
--- a/src/args.rs
+++ b/src/args.rs
@@ -10,6 +10,7 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};

 use clap;
+use encoding_rs::Encoding;
 use env_logger;
 use grep::{Grep, GrepBuilder};
 use log;
@@ -41,6 +42,7 @@ pub struct Args {
     column: bool,
     context_separator: Vec<u8>,
     count: bool,
+    encoding: Option<&'static Encoding>,
     files_with_matches: bool,
     files_without_matches: bool,
     eol: u8,
@@ -224,6 +226,7 @@ impl Args {
             .after_context(self.after_context)
             .before_context(self.before_context)
             .count(self.count)
+            .encoding(self.encoding)
             .files_with_matches(self.files_with_matches)
             .files_without_matches(self.files_without_matches)
             .eol(self.eol)
@@ -330,6 +333,7 @@ impl<'a> ArgMatches<'a> {
             column: self.column(),
             context_separator: self.context_separator(),
             count: self.is_present("count"),
+            encoding: try!(self.encoding()),
             files_with_matches: self.is_present("files-with-matches"),
             files_without_matches: self.is_present("files-without-match"),
             eol: b'\n',
@@ -569,6 +573,7 @@ impl<'a> ArgMatches<'a> {
     /// will need to search.
     fn mmap(&self, paths: &[PathBuf]) -> Result<bool> {
         let (before, after) = try!(self.contexts());
+        let enc = try!(self.encoding());
         Ok(if before > 0 || after > 0 || self.is_present("no-mmap") {
             false
         } else if self.is_present("mmap") {
@@ -576,6 +581,10 @@
         } else if cfg!(target_os = "macos") {
             // On Mac, memory maps appear to suck. Neat.
             false
+        } else if enc.is_some() {
+            // There's no practical way to transcode a memory map that isn't
+            // isomorphic to searching over io::Read.
+            false
         } else {
             // If we're only searching a few paths and all of them are
             // files, then memory maps are probably faster.
@@ -721,6 +730,29 @@ impl<'a> ArgMatches<'a> {
         Ok(ColorSpecs::new(&specs))
     }

+    /// Return the text encoding specified.
+    ///
+    /// If the label given by the caller doesn't correspond to a valid
+    /// supported encoding (and isn't `auto`), then return an error.
+    ///
+    /// A `None` encoding implies that the encoding should be automatically
+    /// detected on a per-file basis.
+    fn encoding(&self) -> Result<Option<&'static Encoding>> {
+        match self.0.value_of_lossy("encoding") {
+            None => Ok(None),
+            Some(label) => {
+                if label == "auto" {
+                    return Ok(None);
+                }
+                match Encoding::for_label(label.as_bytes()) {
+                    Some(enc) => Ok(Some(enc)),
+                    None => Err(From::from(
+                        format!("unsupported encoding: {}", label))),
+                }
+            }
+        }
+    }
+
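For reference, `Encoding::for_label` resolves labels according to the WHATWG Encoding Standard, so several aliases collapse onto one canonical encoding. A minimal standalone sketch of how the values accepted by `-E/--encoding` behave (not part of the patch; it assumes encoding_rs's usual `SHIFT_JIS` and `WINDOWS_1252` statics):

extern crate encoding_rs;

use encoding_rs::{Encoding, SHIFT_JIS, WINDOWS_1252};

fn main() {
    // "sjis" is one of several labels for Shift_JIS.
    assert!(Encoding::for_label(b"sjis") == Some(SHIFT_JIS));
    // In the Encoding Standard, "latin1" is a label for windows-1252,
    // not for ISO-8859-1 proper.
    assert!(Encoding::for_label(b"latin1") == Some(WINDOWS_1252));
    // Unknown labels yield None, which `encoding()` above reports as an
    // "unsupported encoding" error. (The special value "auto" never reaches
    // `for_label`; it is handled before the lookup.)
    assert!(Encoding::for_label(b"not-a-real-encoding").is_none());
}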
     /// Returns the approximate number of threads that ripgrep should use.
     fn threads(&self) -> Result<usize> {
         if self.is_present("sort-files") {
diff --git a/src/decoder.rs b/src/decoder.rs
new file mode 100644
index 00000000..d43cbdbb
--- /dev/null
+++ b/src/decoder.rs
@@ -0,0 +1,460 @@
+#![allow(dead_code)]
+
+use std::cmp;
+use std::io::{self, Read};
+
+use encoding_rs::{Decoder, Encoding, UTF_8};
+
+/// A BOM is at least 2 bytes and at most 3 bytes.
+///
+/// If fewer than 2 bytes are available to be read at the beginning of a
+/// reader, then a BOM is `None`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+struct Bom {
+    bytes: [u8; 3],
+    len: usize,
+}
+
+impl Bom {
+    fn as_slice(&self) -> &[u8] {
+        &self.bytes[0..self.len]
+    }
+
+    fn decoder(&self) -> Option<Decoder> {
+        let bom = self.as_slice();
+        if bom.len() < 3 {
+            return None;
+        }
+        if let Some((enc, _)) = Encoding::for_bom(bom) {
+            if enc != UTF_8 {
+                return Some(enc.new_decoder_with_bom_removal());
+            }
+        }
+        None
+    }
+}
+
+/// BomPeeker wraps `R` and satisfies the `io::Read` interface while also
+/// providing a peek at the BOM if one exists. Peeking at the BOM does not
+/// advance the reader.
+struct BomPeeker<R> {
+    rdr: R,
+    bom: Option<Bom>,
+    nread: usize,
+}
+
+impl<R: io::Read> BomPeeker<R> {
+    /// Create a new BomPeeker.
+    ///
+    /// The first three bytes can be read using the `peek_bom` method, but
+    /// will not advance the reader.
+    fn new(rdr: R) -> BomPeeker<R> {
+        BomPeeker { rdr: rdr, bom: None, nread: 0 }
+    }
+
+    /// Peek at the first three bytes of the underlying reader.
+    ///
+    /// This does not advance the reader provided by `BomPeeker`.
+    ///
+    /// If the underlying reader does not have at least two bytes available,
+    /// then `None` is returned.
+    fn peek_bom(&mut self) -> io::Result<Bom> {
+        if let Some(bom) = self.bom {
+            return Ok(bom);
+        }
+        self.bom = Some(Bom { bytes: [0; 3], len: 0 });
+        let mut buf = [0u8; 3];
+        let bom_len = try!(read_full(&mut self.rdr, &mut buf));
+        self.bom = Some(Bom { bytes: buf, len: bom_len });
+        Ok(self.bom.unwrap())
+    }
+}
+
+impl<R: io::Read> io::Read for BomPeeker<R> {
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        if self.nread < 3 {
+            let bom = try!(self.peek_bom());
+            let bom = bom.as_slice();
+            if self.nread < bom.len() {
+                let rest = &bom[self.nread..];
+                let len = cmp::min(buf.len(), rest.len());
+                buf[..len].copy_from_slice(&rest[..len]);
+                self.nread += len;
+                return Ok(len);
+            }
+        }
+        let nread = try!(self.rdr.read(buf));
+        self.nread += nread;
+        Ok(nread)
+    }
+}
+
+/// Like io::Read::read_exact, except it never returns UnexpectedEof and
+/// instead returns the number of bytes read if EOF is seen before filling
+/// `buf`.
+fn read_full<R: io::Read>(
+    mut rdr: R,
+    mut buf: &mut [u8],
+) -> io::Result<usize> {
+    let mut nread = 0;
+    while !buf.is_empty() {
+        match rdr.read(buf) {
+            Ok(0) => break,
+            Ok(n) => {
+                nread += n;
+                let tmp = buf;
+                buf = &mut tmp[n..];
+            }
+            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
+            Err(e) => return Err(e),
+        }
+    }
+    Ok(nread)
+}
+
+/// A reader that transcodes to UTF-8. The source encoding is determined by
+/// inspecting the BOM from the stream read from `R`, if one exists. If a
+/// UTF-16 BOM exists, then the source stream is transcoded to UTF-8 with
+/// invalid UTF-16 sequences translated to the Unicode replacement character.
+/// In all other cases, the underlying reader is passed through unchanged.
+///
+/// `R` is the type of the underlying reader and `B` is the type of an internal
+/// buffer used to store the results of transcoding.
+/// +/// Note that not all methods on `io::Read` work with this implementation. +/// For example, the `bytes` adapter method attempts to read a single byte at +/// a time, but this implementation requires a buffer of size at least `4`. If +/// a buffer of size less than 4 is given, then an error is returned. +pub struct DecodeReader<R, B> { + /// The underlying reader, wrapped in a peeker for reading a BOM if one + /// exists. + rdr: BomPeeker<R>, + /// The internal buffer to store transcoded bytes before they are read by + /// callers. + buf: B, + /// The current position in `buf`. Subsequent reads start here. + pos: usize, + /// The number of transcoded bytes in `buf`. Subsequent reads end here. + buflen: usize, + /// Whether this is the first read or not (in which we inspect the BOM). + first: bool, + /// Whether a "last" read has occurred. After this point, EOF will always + /// be returned. + last: bool, + /// The underlying text decoder derived from the BOM, if one exists. + decoder: Option<Decoder>, +} + +impl<R: io::Read, B: AsMut<[u8]>> DecodeReader<R, B> { + /// Create a new transcoder that converts a source stream to valid UTF-8. + /// + /// If an encoding is specified, then it is used to transcode `rdr` to + /// UTF-8. Otherwise, if no encoding is specified, and if a UTF-16 BOM is + /// found, then the corresponding UTF-16 encoding is used to transcode + /// `rdr` to UTF-8. In all other cases, `rdr` is assumed to be at least + /// ASCII-compatible and passed through untouched. + /// + /// Errors in the encoding of `rdr` are handled with the Unicode + /// replacement character. If no encoding of `rdr` is specified, then + /// errors are not handled. + pub fn new( + rdr: R, + buf: B, + enc: Option<&'static Encoding>, + ) -> DecodeReader<R, B> { + DecodeReader { + rdr: BomPeeker::new(rdr), + buf: buf, + buflen: 0, + pos: 0, + first: enc.is_none(), + last: false, + decoder: enc.map(|enc| enc.new_decoder_with_bom_removal()), + } + } + + /// Fill the internal buffer from the underlying reader. + /// + /// If there are unread bytes in the internal buffer, then we move them + /// to the beginning of the internal buffer and fill the remainder. + /// + /// If the internal buffer is too small to read additional bytes, then an + /// error is returned. + #[inline(always)] // massive perf benefit (???) + fn fill(&mut self) -> io::Result<()> { + if self.pos < self.buflen { + if self.buflen >= self.buf.as_mut().len() { + return Err(io::Error::new( + io::ErrorKind::Other, + "DecodeReader: internal buffer exhausted")); + } + let newlen = self.buflen - self.pos; + let mut tmp = Vec::with_capacity(newlen); + tmp.extend_from_slice(&self.buf.as_mut()[self.pos..self.buflen]); + self.buf.as_mut()[..newlen].copy_from_slice(&tmp); + self.buflen = newlen; + } else { + self.buflen = 0; + } + self.pos = 0; + self.buflen += + try!(self.rdr.read(&mut self.buf.as_mut()[self.buflen..])); + Ok(()) + } + + /// Transcode the inner stream to UTF-8 in `buf`. This assumes that there + /// is a decoder capable of transcoding the inner stream to UTF-8. This + /// returns the number of bytes written to `buf`. + /// + /// When this function returns, exactly one of the following things will + /// be true: + /// + /// 1. A non-zero number of bytes were written to `buf`. + /// 2. The underlying reader reached EOF. + /// 3. An error is returned: the internal buffer ran out of room. + /// 4. An I/O error occurred. + /// + /// Note that `buf` must have at least 4 bytes of space. 
+ fn transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> { + assert!(buf.len() >= 4); + if self.last { + return Ok(0); + } + if self.pos >= self.buflen { + try!(self.fill()); + } + let mut nwrite = 0; + loop { + let (_, nin, nout, _) = + self.decoder.as_mut().unwrap().decode_to_utf8( + &self.buf.as_mut()[self.pos..self.buflen], buf, false); + self.pos += nin; + nwrite += nout; + // If we've written at least one byte to the caller-provided + // buffer, then our mission is complete. + if nwrite > 0 { + break; + } + // Otherwise, we know that our internal buffer has insufficient + // data to transcode at least one char, so we attempt to refill it. + try!(self.fill()); + // Quit on EOF. + if self.buflen == 0 { + self.pos = 0; + self.last = true; + let (_, _, nout, _) = + self.decoder.as_mut().unwrap().decode_to_utf8( + &[], buf, true); + return Ok(nout); + } + } + Ok(nwrite) + } + + fn detect(&mut self) -> io::Result<()> { + let bom = try!(self.rdr.peek_bom()); + self.decoder = bom.decoder(); + Ok(()) + } +} + +impl<R: io::Read, B: AsMut<[u8]>> io::Read for DecodeReader<R, B> { + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + if self.first { + self.first = false; + try!(self.detect()); + } + if self.decoder.is_none() { + return self.rdr.read(buf); + } + // When decoding UTF-8, we need at least 4 bytes of space to guarantee + // that we can decode at least one codepoint. If we don't have it, we + // can either return `0` for the number of bytes read or return an + // error. Since `0` would be interpreted as a possibly premature EOF, + // we opt for an error. + if buf.len() < 4 { + return Err(io::Error::new( + io::ErrorKind::Other, + "DecodeReader: byte buffer must have length at least 4")); + } + self.transcode(buf) + } +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use encoding_rs::Encoding; + + use super::{Bom, BomPeeker, DecodeReader}; + + fn utf8(bytes: &[u8]) -> &str { + ::std::str::from_utf8(bytes).unwrap() + } + + fn read_to_string<R: Read>(mut rdr: R) -> String { + let mut s = String::new(); + rdr.read_to_string(&mut s).unwrap(); + s + } + + #[test] + fn peeker_empty() { + let buf = []; + let mut peeker = BomPeeker::new(&buf[..]); + assert_eq!(Bom { bytes: [0; 3], len: 0}, peeker.peek_bom().unwrap()); + + let mut tmp = [0; 100]; + assert_eq!(0, peeker.read(&mut tmp).unwrap()); + } + + #[test] + fn peeker_one() { + let buf = [1]; + let mut peeker = BomPeeker::new(&buf[..]); + assert_eq!( + Bom { bytes: [1, 0, 0], len: 1}, + peeker.peek_bom().unwrap()); + + let mut tmp = [0; 100]; + assert_eq!(1, peeker.read(&mut tmp).unwrap()); + assert_eq!(1, tmp[0]); + assert_eq!(0, peeker.read(&mut tmp).unwrap()); + } + + #[test] + fn peeker_two() { + let buf = [1, 2]; + let mut peeker = BomPeeker::new(&buf[..]); + assert_eq!( + Bom { bytes: [1, 2, 0], len: 2}, + peeker.peek_bom().unwrap()); + + let mut tmp = [0; 100]; + assert_eq!(2, peeker.read(&mut tmp).unwrap()); + assert_eq!(1, tmp[0]); + assert_eq!(2, tmp[1]); + assert_eq!(0, peeker.read(&mut tmp).unwrap()); + } + + #[test] + fn peeker_three() { + let buf = [1, 2, 3]; + let mut peeker = BomPeeker::new(&buf[..]); + assert_eq!( + Bom { bytes: [1, 2, 3], len: 3}, + peeker.peek_bom().unwrap()); + + let mut tmp = [0; 100]; + assert_eq!(3, peeker.read(&mut tmp).unwrap()); + assert_eq!(1, tmp[0]); + assert_eq!(2, tmp[1]); + assert_eq!(3, tmp[2]); + assert_eq!(0, peeker.read(&mut tmp).unwrap()); + } + + #[test] + fn peeker_four() { + let buf = [1, 2, 3, 4]; + let mut peeker = BomPeeker::new(&buf[..]); + assert_eq!( + Bom { 
bytes: [1, 2, 3], len: 3}, + peeker.peek_bom().unwrap()); + + let mut tmp = [0; 100]; + assert_eq!(3, peeker.read(&mut tmp).unwrap()); + assert_eq!(1, tmp[0]); + assert_eq!(2, tmp[1]); + assert_eq!(3, tmp[2]); + assert_eq!(1, peeker.read(&mut tmp).unwrap()); + assert_eq!(4, tmp[0]); + assert_eq!(0, peeker.read(&mut tmp).unwrap()); + } + + #[test] + fn peeker_one_at_a_time() { + let buf = [1, 2, 3, 4]; + let mut peeker = BomPeeker::new(&buf[..]); + + let mut tmp = [0; 1]; + assert_eq!(0, peeker.read(&mut tmp[..0]).unwrap()); + assert_eq!(0, tmp[0]); + assert_eq!(1, peeker.read(&mut tmp).unwrap()); + assert_eq!(1, tmp[0]); + assert_eq!(1, peeker.read(&mut tmp).unwrap()); + assert_eq!(2, tmp[0]); + assert_eq!(1, peeker.read(&mut tmp).unwrap()); + assert_eq!(3, tmp[0]); + assert_eq!(1, peeker.read(&mut tmp).unwrap()); + assert_eq!(4, tmp[0]); + } + + // In cases where all we have is a bom, we expect the bytes to be + // passed through unchanged. + #[test] + fn trans_utf16_bom() { + let srcbuf = vec![0xFF, 0xFE]; + let mut dstbuf = vec![0; 8 * (1<<10)]; + let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); + let n = rdr.read(&mut dstbuf).unwrap(); + assert_eq!(&*srcbuf, &dstbuf[..n]); + + let srcbuf = vec![0xFE, 0xFF]; + let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); + let n = rdr.read(&mut dstbuf).unwrap(); + assert_eq!(&*srcbuf, &dstbuf[..n]); + + let srcbuf = vec![0xEF, 0xBB, 0xBF]; + let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); + let n = rdr.read(&mut dstbuf).unwrap(); + assert_eq!(&*srcbuf, &dstbuf[..n]); + } + + // Test basic UTF-16 decoding. + #[test] + fn trans_utf16_basic() { + let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00]; + let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); + assert_eq!("a", read_to_string(&mut rdr)); + + let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61]; + let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); + assert_eq!("a", read_to_string(&mut rdr)); + } + + // Test incomplete UTF-16 decoding. This ensures we see a replacement char + // if the stream ends with an unpaired code unit. + #[test] + fn trans_utf16_incomplete() { + let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x00]; + let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None); + assert_eq!("a\u{FFFD}", read_to_string(&mut rdr)); + } + + macro_rules! test_trans_simple { + ($name:ident, $enc:expr, $srcbytes:expr, $dst:expr) => { + #[test] + fn $name() { + let srcbuf = &$srcbytes[..]; + let enc = Encoding::for_label($enc.as_bytes()); + let mut rdr = DecodeReader::new( + &*srcbuf, vec![0; 8 * (1<<10)], enc); + assert_eq!($dst, read_to_string(&mut rdr)); + } + } + } + + // This isn't exhaustive obviously, but it lets us test base level support. 
+ test_trans_simple!(trans_simple_auto, "does not exist", b"\xD0\x96", "Ж"); + test_trans_simple!(trans_simple_utf8, "utf-8", b"\xD0\x96", "Ж"); + test_trans_simple!(trans_simple_utf16le, "utf-16le", b"\x16\x04", "Ж"); + test_trans_simple!(trans_simple_utf16be, "utf-16be", b"\x04\x16", "Ж"); + test_trans_simple!(trans_simple_chinese, "chinese", b"\xA7\xA8", "Ж"); + test_trans_simple!(trans_simple_korean, "korean", b"\xAC\xA8", "Ж"); + test_trans_simple!(trans_simple_big5_hkscs, "big5-hkscs", b"\xC7\xFA", "Ж"); + test_trans_simple!(trans_simple_gbk, "gbk", b"\xA7\xA8", "Ж"); + test_trans_simple!(trans_simple_sjis, "sjis", b"\x84\x47", "Ж"); + test_trans_simple!(trans_simple_eucjp, "euc-jp", b"\xA7\xA8", "Ж"); + test_trans_simple!(trans_simple_latin1, "latin1", b"\xA9", "©"); +} diff --git a/src/main.rs b/src/main.rs index 47ce5b91..501f2ed7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ extern crate atty; extern crate bytecount; #[macro_use] extern crate clap; +extern crate encoding_rs; extern crate env_logger; extern crate grep; extern crate ignore; @@ -43,6 +44,7 @@ macro_rules! eprintln { mod app; mod args; +mod decoder; mod pathutil; mod printer; mod search_buffer; diff --git a/src/worker.rs b/src/worker.rs index 60dde722..51b7f64c 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -2,11 +2,13 @@ use std::fs::File; use std::io; use std::path::Path; +use encoding_rs::Encoding; use grep::Grep; use ignore::DirEntry; use memmap::{Mmap, Protection}; use termcolor::WriteColor; +use decoder::DecodeReader; use pathutil::strip_prefix; use printer::Printer; use search_buffer::BufferSearcher; @@ -27,6 +29,7 @@ pub struct WorkerBuilder { #[derive(Clone, Debug)] struct Options { mmap: bool, + encoding: Option<&'static Encoding>, after_context: usize, before_context: usize, count: bool, @@ -45,6 +48,7 @@ impl Default for Options { fn default() -> Options { Options { mmap: false, + encoding: None, after_context: 0, before_context: 0, count: false, @@ -80,6 +84,7 @@ impl WorkerBuilder { Worker { grep: self.grep, inpbuf: inpbuf, + decodebuf: vec![0; 8 * (1<<10)], opts: self.opts, } } @@ -106,6 +111,15 @@ impl WorkerBuilder { self } + /// Set the encoding to use to read each file. + /// + /// If the encoding is `None` (the default), then the encoding is + /// automatically detected on a best-effort per-file basis. + pub fn encoding(mut self, enc: Option<&'static Encoding>) -> Self { + self.opts.encoding = enc; + self + } + /// If enabled, searching will print the path instead of each match. /// /// Disabled by default. @@ -181,8 +195,9 @@ impl WorkerBuilder { /// Worker is responsible for executing searches on file paths, while choosing /// streaming search or memory map search as appropriate. pub struct Worker { - inpbuf: InputBuffer, grep: Grep, + inpbuf: InputBuffer, + decodebuf: Vec<u8>, opts: Options, } @@ -241,6 +256,8 @@ impl Worker { path: &Path, rdr: R, ) -> Result<u64> { + let rdr = DecodeReader::new( + rdr, &mut self.decodebuf, self.opts.encoding); let searcher = Searcher::new( &mut self.inpbuf, printer, &self.grep, path, rdr); searcher @@ -274,8 +291,13 @@ impl Worker { return self.search(printer, path, file); } let mmap = try!(Mmap::open(file, Protection::Read)); - let searcher = BufferSearcher::new( - printer, &self.grep, path, unsafe { mmap.as_slice() }); + let buf = unsafe { mmap.as_slice() }; + if buf.len() >= 3 && Encoding::for_bom(buf).is_some() { + // If we have a UTF-16 bom in our memory map, then we need to fall + // back to the stream reader, which will do transcoding. 
+ return self.search(printer, path, file); + } + let searcher = BufferSearcher::new(printer, &self.grep, path, buf); Ok(searcher .count(self.opts.count) .files_with_matches(self.opts.files_with_matches) diff --git a/tests/tests.rs b/tests/tests.rs index 29848aa1..9e216b52 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -1057,6 +1057,52 @@ clean!(regression_405, "test", ".", |wd: WorkDir, mut cmd: Command| { assert_eq!(lines, format!("{}:test\n", path("bar/foo/file2.txt"))); }); +// See: https://github.com/BurntSushi/ripgrep/issues/1 +clean!(feature_1_sjis, "Шерлок Холмс", ".", |wd: WorkDir, mut cmd: Command| { + let sherlock = + b"\x84Y\x84u\x84\x82\x84|\x84\x80\x84{ \x84V\x84\x80\x84|\x84}\x84\x83"; + wd.create_bytes("foo", &sherlock[..]); + cmd.arg("-Esjis"); + + let lines: String = wd.stdout(&mut cmd); + assert_eq!(lines, "foo:Шерлок Холмс\n"); +}); + +// See: https://github.com/BurntSushi/ripgrep/issues/1 +clean!(feature_1_utf16_auto, "Шерлок Холмс", ".", +|wd: WorkDir, mut cmd: Command| { + let sherlock = + b"\xff\xfe(\x045\x04@\x04;\x04>\x04:\x04 \x00%\x04>\x04;\x04<\x04A\x04"; + wd.create_bytes("foo", &sherlock[..]); + + let lines: String = wd.stdout(&mut cmd); + assert_eq!(lines, "foo:Шерлок Холмс\n"); +}); + +// See: https://github.com/BurntSushi/ripgrep/issues/1 +clean!(feature_1_utf16_explicit, "Шерлок Холмс", ".", +|wd: WorkDir, mut cmd: Command| { + let sherlock = + b"\xff\xfe(\x045\x04@\x04;\x04>\x04:\x04 \x00%\x04>\x04;\x04<\x04A\x04"; + wd.create_bytes("foo", &sherlock[..]); + cmd.arg("-Eutf-16le"); + + let lines: String = wd.stdout(&mut cmd); + assert_eq!(lines, "foo:Шерлок Холмс\n"); +}); + +// See: https://github.com/BurntSushi/ripgrep/issues/1 +clean!(feature_1_eucjp, "Шерлок Холмс", ".", +|wd: WorkDir, mut cmd: Command| { + let sherlock = + b"\xa7\xba\xa7\xd6\xa7\xe2\xa7\xdd\xa7\xe0\xa7\xdc \xa7\xb7\xa7\xe0\xa7\xdd\xa7\xde\xa7\xe3"; + wd.create_bytes("foo", &sherlock[..]); + cmd.arg("-Eeuc-jp"); + + let lines: String = wd.stdout(&mut cmd); + assert_eq!(lines, "foo:Шерлок Холмс\n"); +}); + // See: https://github.com/BurntSushi/ripgrep/issues/7 sherlock!(feature_7, "-fpat", "sherlock", |wd: WorkDir, mut cmd: Command| { wd.create("pat", "Sherlock\nHolmes");
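For readers puzzling over the UTF-16 fixtures above: the escape strings are simply "Шерлок Холмс" encoded as UTF-16LE with a leading BOM. A small standalone sketch that builds the same kind of bytes using only the standard library (not part of the patch; `utf16le_with_bom` is a hypothetical helper name):

/// Hypothetical helper: encode `s` as UTF-16LE and prefix a BOM, mirroring
/// the byte strings used in the feature_1_utf16_* tests above.
fn utf16le_with_bom(s: &str) -> Vec<u8> {
    let mut out = vec![0xFF, 0xFE]; // little endian BOM
    for unit in s.encode_utf16() {
        out.push((unit & 0xFF) as u8); // low byte first
        out.push((unit >> 8) as u8);
    }
    out
}

fn main() {
    let bytes = utf16le_with_bom("Шерлок Холмс");
    // 'Ш' is U+0428, so the first code unit after the BOM is 0x28 0x04,
    // which is why the fixture starts with b"\xff\xfe(\x04...".
    assert_eq!(&bytes[..4], &[0xFF, 0xFE, 0x28, 0x04][..]);
}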