From b00cd69a40329b14a304bb75567d13964856d733 Mon Sep 17 00:00:00 2001 From: lesnyrumcajs Date: Mon, 4 Mar 2019 17:18:45 +0100 Subject: [PATCH] searcher: add option to disable BOM sniffing This commit adds a new encoding feature where the -E/--encoding flag will now accept a value of 'none'. When given this value, all encoding related machinery is disabled and ripgrep will search the raw bytes of the file, including the BOM if it's present. Closes #1207, Closes #1208 --- Cargo.lock | 6 +-- GUIDE.md | 32 +++++++++++--- complete/_rg | 2 +- grep-regex/src/matcher.rs | 2 +- grep-searcher/Cargo.toml | 2 +- grep-searcher/src/searcher/mod.rs | 43 +++++++++++++++---- src/app.rs | 4 +- src/args.rs | 69 +++++++++++++++++++++++++------ tests/feature.rs | 32 ++++++++++++++ 9 files changed, 158 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 095ed414..a8420de4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -114,7 +114,7 @@ dependencies = [ [[package]] name = "encoding_rs_io" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)", @@ -226,7 +226,7 @@ dependencies = [ "bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", "encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)", - "encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", "grep-matcher 0.1.1", "grep-regex 0.1.2", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", @@ -704,7 +704,7 @@ dependencies = [ "checksum crossbeam-channel 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "0f0ed1a4de2235cabda8558ff5840bffb97fcb64c97827f354a451307df5f72b" "checksum crossbeam-utils 0.6.5 
(registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c" "checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed" -"checksum encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f94ef2bcdb2f5d58e982ef565baa1ecfd04b7cb653d0bf1b49af1dd472faa8d8" +"checksum encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9619ee7a2bf4e777e020b95c1439abaf008f8ea8041b78a0552c4f1bcf4df32c" "checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" "checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" diff --git a/GUIDE.md b/GUIDE.md index 0094a7b4..8022f292 100644 --- a/GUIDE.md +++ b/GUIDE.md @@ -603,7 +603,7 @@ topic, but we can try to summarize its relevancy to ripgrep: * Files are generally just a bundle of bytes. There is no reliable way to know their encoding. * Either the encoding of the pattern must match the encoding of the files being - searched, or a form of transcoding must be performed converts either the + searched, or a form of transcoding must be performed that converts either the pattern or the file to the same encoding as the other. * ripgrep tends to work best on plain text files, and among plain text files, the most popular encodings likely consist of ASCII, latin1 or UTF-8. As @@ -626,12 +626,15 @@ given, which is the default: they correspond to a UTF-16 BOM, then ripgrep will transcode the contents of the file from UTF-16 to UTF-8, and then execute the search on the transcoded version of the file. 
(This incurs a performance penalty since transcoding - is slower than regex searching.) + is slower than regex searching.) If the file contains invalid UTF-16, then + the Unicode replacement codepoint is substituted in place of invalid code + units. * To handle other cases, ripgrep provides a `-E/--encoding` flag, which permits you to specify an encoding from the [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get). - ripgrep will assume *all* files searched are the encoding specified and - will perform a transcoding step just like in the UTF-16 case described above. + ripgrep will assume *all* files searched are the encoding specified (unless + the file has a BOM) and will perform a transcoding step just like in the + UTF-16 case described above. By default, ripgrep will not require its input be valid UTF-8. That is, ripgrep can and will search arbitrary bytes. The key here is that if you're searching @@ -641,9 +644,26 @@ pattern won't find anything. With all that said, this mode of operation is important, because it lets you find ASCII or UTF-8 *within* files that are otherwise arbitrary bytes. +As a special case, the `-E/--encoding` flag supports the value `none`, which +will completely disable all encoding related logic, including BOM sniffing. +When `-E/--encoding` is set to `none`, ripgrep will search the raw bytes of +the underlying file with no transcoding step. For example, here's how you might +search the raw UTF-16 encoding of the string `Шерлок`: + +``` +$ rg '(?-u)\(\x045\x04@\x04;\x04>\x04:\x04' -E none -a some-utf16-file +``` + +Of course, that's just an example meant to show how one can drop down into +raw bytes. Namely, the simpler command works as you might expect automatically: + +``` +$ rg 'Шерлок' some-utf16-file +``` + Finally, it is possible to disable ripgrep's Unicode support from within the -pattern regular expression. For example, let's say you wanted `.` to match any -byte rather than any Unicode codepoint. 
(You might want this while searching a +regular expression. For example, let's say you wanted `.` to match any byte +rather than any Unicode codepoint. (You might want this while searching a binary file, since `.` by default will not match invalid UTF-8.) You could do this by disabling Unicode via a regular expression flag: diff --git a/complete/_rg b/complete/_rg index 2e5c1937..c4a983ac 100644 --- a/complete/_rg +++ b/complete/_rg @@ -378,7 +378,7 @@ _rg_encodings() { shift{-,_}jis csshiftjis {,x-}sjis ms_kanji ms932 utf{,-}8 utf-16{,be,le} unicode-1-1-utf-8 windows-{31j,874,949,125{0..8}} dos-874 tis-620 ansi_x3.4-1968 - x-user-defined auto + x-user-defined auto none ) _wanted encodings expl encoding compadd -a "$@" - _encodings diff --git a/grep-regex/src/matcher.rs b/grep-regex/src/matcher.rs index 391439d9..d71f5777 100644 --- a/grep-regex/src/matcher.rs +++ b/grep-regex/src/matcher.rs @@ -52,7 +52,7 @@ impl RegexMatcherBuilder { } let matcher = RegexMatcherImpl::new(&chir)?; - trace!("final regex: {:?}", matcher.regex()); + trace!("final regex: {:?}", matcher.regex().to_string()); Ok(RegexMatcher { config: self.config.clone(), matcher: matcher, diff --git a/grep-searcher/Cargo.toml b/grep-searcher/Cargo.toml index f4875d9f..f3120a80 100644 --- a/grep-searcher/Cargo.toml +++ b/grep-searcher/Cargo.toml @@ -16,7 +16,7 @@ license = "Unlicense/MIT" bstr = { version = "0.1.2", default-features = false, features = ["std"] } bytecount = "0.5" encoding_rs = "0.8.14" -encoding_rs_io = "0.1.4" +encoding_rs_io = "0.1.6" grep-matcher = { version = "0.1.1", path = "../grep-matcher" } log = "0.4.5" memmap = "0.7" diff --git a/grep-searcher/src/searcher/mod.rs b/grep-searcher/src/searcher/mod.rs index c70b3a0e..729b491b 100644 --- a/grep-searcher/src/searcher/mod.rs +++ b/grep-searcher/src/searcher/mod.rs @@ -155,6 +155,8 @@ pub struct Config { /// An encoding that, when present, causes the searcher to transcode all /// input from the encoding to UTF-8. 
encoding: Option<Encoding>, + /// Whether to do automatic transcoding based on a BOM or not. + bom_sniffing: bool, } impl Default for Config { @@ -171,6 +173,7 @@ impl Default for Config { binary: BinaryDetection::default(), multi_line: false, encoding: None, + bom_sniffing: true, } } } @@ -303,12 +306,15 @@ impl SearcherBuilder { config.before_context = 0; config.after_context = 0; } + let mut decode_builder = DecodeReaderBytesBuilder::new(); decode_builder .encoding(self.config.encoding.as_ref().map(|e| e.0)) .utf8_passthru(true) - .strip_bom(true) - .bom_override(true); + .strip_bom(self.config.bom_sniffing) + .bom_override(true) + .bom_sniffing(self.config.bom_sniffing); + Searcher { config: config, decode_builder: decode_builder, @@ -506,12 +512,13 @@ impl SearcherBuilder { /// transcoding process encounters an error, then bytes are replaced with /// the Unicode replacement codepoint. /// - /// When no encoding is specified (the default), then BOM sniffing is used - /// to determine whether the source data is UTF-8 or UTF-16, and - /// transcoding will be performed automatically. If no BOM could be found, - /// then the source data is searched _as if_ it were UTF-8. However, so - /// long as the source data is at least ASCII compatible, then it is - /// possible for a search to produce useful results. + /// When no encoding is specified (the default), then BOM sniffing is + /// used (if it's enabled, which it is, by default) to determine whether + /// the source data is UTF-8 or UTF-16, and transcoding will be performed + /// automatically. If no BOM could be found, then the source data is + /// searched _as if_ it were UTF-8. However, so long as the source data is + /// at least ASCII compatible, then it is possible for a search to produce + /// useful results. pub fn encoding( &mut self, encoding: Option<Encoding>, @@ -519,6 +526,23 @@ impl SearcherBuilder { self.config.encoding = encoding; self } + + /// Enable automatic transcoding based on BOM sniffing. 
+ /// + /// When this is enabled and an explicit encoding is not set, then this + /// searcher will try to detect the encoding of the bytes being searched + /// by sniffing its byte-order mark (BOM). In particular, when this is + /// enabled, UTF-16 encoded files will be searched seamlessly. + /// + /// When this is disabled and an explicit encoding is not set, then + /// the bytes from the source stream will be passed through unchanged, + /// including its BOM, if one is present. + /// + /// This is enabled by default. + pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder { + self.config.bom_sniffing = yes; + self + } } /// A searcher executes searches over a haystack and writes results to a caller @@ -738,7 +762,8 @@ impl Searcher { /// Returns true if and only if the given slice needs to be transcoded. fn slice_needs_transcoding(&self, slice: &[u8]) -> bool { - self.config.encoding.is_some() || slice_has_utf16_bom(slice) + self.config.encoding.is_some() + || (self.config.bom_sniffing && slice_has_utf16_bom(slice)) } } diff --git a/src/app.rs b/src/app.rs index b4c81a7c..66eaedb4 100644 --- a/src/app.rs +++ b/src/app.rs @@ -984,7 +984,9 @@ Specify the text encoding that ripgrep will use on all files searched. The default value is 'auto', which will cause ripgrep to do a best effort automatic detection of encoding on a per-file basis. Automatic detection in this case only applies to files that begin with a UTF-8 or UTF-16 byte-order mark (BOM). -No other automatic detection is performend. +No other automatic detection is performed. One can also specify 'none' which +will then completely disable BOM sniffing and always result in searching the +raw bytes, including a BOM if it's present, regardless of its encoding. 
Other supported values can be found in the list of labels here: https://encoding.spec.whatwg.org/#concept-encoding-get diff --git a/src/args.rs b/src/args.rs index c9f2405b..166bc126 100644 --- a/src/args.rs +++ b/src/args.rs @@ -483,6 +483,37 @@ impl SortByKind { } } +/// Encoding mode the searcher will use. +#[derive(Clone, Debug)] +enum EncodingMode { + /// Use an explicit encoding forcefully, but let BOM sniffing override it. + Some(Encoding), + /// Use only BOM sniffing to auto-detect an encoding. + Auto, + /// Use no explicit encoding and disable all BOM sniffing. This will + /// always result in searching the raw bytes, regardless of their + /// true encoding. + Disabled, +} + +impl EncodingMode { + /// Checks if an explicit encoding has been set. Returns false for + /// automatic BOM sniffing and no sniffing. + /// + /// This is only used to determine whether PCRE2 needs to have its own + /// UTF-8 checking enabled. If we have an explicit encoding set, then + /// we're always guaranteed to get UTF-8, so we can disable PCRE2's check. + /// Otherwise, we have no such guarantee, and must enable PCRE2's UTF-8 + /// check. + #[cfg(feature = "pcre2")] + fn has_explicit_encoding(&self) -> bool { + match self { + EncodingMode::Some(_) => true, + _ => false + } + } +} + impl ArgMatches { /// Create an ArgMatches from clap's parse result. fn new(clap_matches: clap::ArgMatches<'static>) -> ArgMatches { @@ -650,7 +681,7 @@ impl ArgMatches { } if self.pcre2_unicode() { builder.utf(true).ucp(true); - if self.encoding()?.is_some() { + if self.encoding()?.has_explicit_encoding() { // SAFETY: If an encoding was specified, then we're guaranteed // to get valid UTF-8, so we can disable PCRE2's UTF checking. // (Feeding invalid UTF-8 to PCRE2 is undefined behavior.) 
@@ -766,8 +797,16 @@ impl ArgMatches { .after_context(ctx_after) .passthru(self.is_present("passthru")) .memory_map(self.mmap_choice(paths)) - .binary_detection(self.binary_detection()) - .encoding(self.encoding()?); + .binary_detection(self.binary_detection()); + match self.encoding()? { + EncodingMode::Some(enc) => { + builder.encoding(Some(enc)); + } + EncodingMode::Auto => {} // default for the searcher + EncodingMode::Disabled => { + builder.bom_sniffing(false); + } + } Ok(builder.build()) } @@ -952,24 +991,30 @@ impl ArgMatches { u64_to_usize("dfa-size-limit", r) } - /// Returns the type of encoding to use. + /// Returns the encoding mode to use. /// - /// This only returns an encoding if one is explicitly specified. When no - /// encoding is present, the Searcher will still do BOM sniffing for UTF-16 - /// and transcode seamlessly. - fn encoding(&self) -> Result<Option<Encoding>> { + /// This only returns an encoding if one is explicitly specified. Otherwise + /// if set to automatic, the Searcher will do BOM sniffing for UTF-16 + /// and transcode seamlessly. If disabled, neither BOM sniffing nor + /// transcoding will occur. + fn encoding(&self) -> Result<EncodingMode> { if self.is_present("no-encoding") { - return Ok(None); + return Ok(EncodingMode::Auto); } + let label = match self.value_of_lossy("encoding") { None if self.pcre2_unicode() => "utf-8".to_string(), - None => return Ok(None), + None => return Ok(EncodingMode::Auto), Some(label) => label, }; + if label == "auto" { - return Ok(None); + return Ok(EncodingMode::Auto); + } else if label == "none" { + return Ok(EncodingMode::Disabled); } - Ok(Some(Encoding::new(&label)?)) + + Ok(EncodingMode::Some(Encoding::new(&label)?)) } /// Return the file separator to use based on the CLI configuration. 
diff --git a/tests/feature.rs b/tests/feature.rs index 1e7ecc48..d7b343f1 100644 --- a/tests/feature.rs +++ b/tests/feature.rs @@ -645,3 +645,35 @@ rgtest!(f1138_no_ignore_dot, |dir: Dir, mut cmd: TestCommand| { eqnice!("bar\nquux\n", cmd.arg("--no-ignore-dot").stdout()); eqnice!("bar\n", cmd.arg("--ignore-file").arg(".fzf-ignore").stdout()); }); + + +// See: https://github.com/BurntSushi/ripgrep/issues/1207 +// +// Tests that, without the 'none' encoding flag, null bytes are consumed by +// automatic encoding detection. +rgtest!(f1207_auto_encoding, |dir: Dir, mut cmd: TestCommand| { + dir.create_bytes( + "foo", + b"\xFF\xFE\x00\x62" + ); + cmd.arg("-a").arg("\\x00").arg("foo"); + cmd.assert_exit_code(1); +}); + +// See: https://github.com/BurntSushi/ripgrep/issues/1207 +// +// Tests that the 'none' encoding flag treats the file as raw bytes. +rgtest!(f1207_ignore_encoding, |dir: Dir, mut cmd: TestCommand| { + // PCRE2 chokes on this test because it can't search invalid UTF-8 + // and the point of this test is to search raw UTF-16. + if dir.is_pcre2() { + return; + } + + dir.create_bytes( + "foo", + b"\xFF\xFE\x00\x62" + ); + cmd.arg("--encoding").arg("none").arg("-a").arg("\\x00").arg("foo"); + eqnice!("\u{FFFD}\u{FFFD}\x00b\n", cmd.stdout()); +});