1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-04-24 17:12:16 +02:00

searcher: add option to disable BOM sniffing

This commit adds a new encoding feature where the -E/--encoding flag
will now accept a value of 'none'. When given this value, all encoding
related machinery is disabled and ripgrep will search the raw bytes of
the file, including the BOM if it's present.

Closes #1207, Closes #1208
This commit is contained in:
lesnyrumcajs 2019-03-04 17:18:45 +01:00 committed by Andrew Gallant
parent 1604a18db3
commit 5962abc465
9 changed files with 158 additions and 34 deletions

6
Cargo.lock generated
View File

@ -114,7 +114,7 @@ dependencies = [
[[package]] [[package]]
name = "encoding_rs_io" name = "encoding_rs_io"
version = "0.1.5" version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)", "encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
@ -226,7 +226,7 @@ dependencies = [
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", "bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)", "encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", "encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"grep-matcher 0.1.1", "grep-matcher 0.1.1",
"grep-regex 0.1.2", "grep-regex 0.1.2",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
@ -704,7 +704,7 @@ dependencies = [
"checksum crossbeam-channel 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "0f0ed1a4de2235cabda8558ff5840bffb97fcb64c97827f354a451307df5f72b" "checksum crossbeam-channel 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "0f0ed1a4de2235cabda8558ff5840bffb97fcb64c97827f354a451307df5f72b"
"checksum crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c" "checksum crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c"
"checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed" "checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed"
"checksum encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f94ef2bcdb2f5d58e982ef565baa1ecfd04b7cb653d0bf1b49af1dd472faa8d8" "checksum encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9619ee7a2bf4e777e020b95c1439abaf008f8ea8041b78a0552c4f1bcf4df32c"
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" "checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
"checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" "checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"

View File

@ -603,7 +603,7 @@ topic, but we can try to summarize its relevancy to ripgrep:
* Files are generally just a bundle of bytes. There is no reliable way to know * Files are generally just a bundle of bytes. There is no reliable way to know
their encoding. their encoding.
* Either the encoding of the pattern must match the encoding of the files being * Either the encoding of the pattern must match the encoding of the files being
searched, or a form of transcoding must be performed converts either the searched, or a form of transcoding must be performed that converts either the
pattern or the file to the same encoding as the other. pattern or the file to the same encoding as the other.
* ripgrep tends to work best on plain text files, and among plain text files, * ripgrep tends to work best on plain text files, and among plain text files,
the most popular encodings likely consist of ASCII, latin1 or UTF-8. As the most popular encodings likely consist of ASCII, latin1 or UTF-8. As
@ -626,12 +626,15 @@ given, which is the default:
they correspond to a UTF-16 BOM, then ripgrep will transcode the contents of they correspond to a UTF-16 BOM, then ripgrep will transcode the contents of
the file from UTF-16 to UTF-8, and then execute the search on the transcoded the file from UTF-16 to UTF-8, and then execute the search on the transcoded
version of the file. (This incurs a performance penalty since transcoding version of the file. (This incurs a performance penalty since transcoding
is slower than regex searching.) is slower than regex searching.) If the file contains invalid UTF-16, then
the Unicode replacement codepoint is substituted in place of invalid code
units.
* To handle other cases, ripgrep provides a `-E/--encoding` flag, which permits * To handle other cases, ripgrep provides a `-E/--encoding` flag, which permits
you to specify an encoding from the you to specify an encoding from the
[Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get). [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
ripgrep will assume *all* files searched are the encoding specified and ripgrep will assume *all* files searched are the encoding specified (unless
will perform a transcoding step just like in the UTF-16 case described above. the file has a BOM) and will perform a transcoding step just like in the
UTF-16 case described above.
By default, ripgrep will not require its input be valid UTF-8. That is, ripgrep By default, ripgrep will not require its input be valid UTF-8. That is, ripgrep
can and will search arbitrary bytes. The key here is that if you're searching can and will search arbitrary bytes. The key here is that if you're searching
@ -641,9 +644,26 @@ pattern won't find anything. With all that said, this mode of operation is
important, because it lets you find ASCII or UTF-8 *within* files that are important, because it lets you find ASCII or UTF-8 *within* files that are
otherwise arbitrary bytes. otherwise arbitrary bytes.
As a special case, the `-E/--encoding` flag supports the value `none`, which
will completely disable all encoding related logic, including BOM sniffing.
When `-E/--encoding` is set to `none`, ripgrep will search the raw bytes of
the underlying file with no transcoding step. For example, here's how you might
search the raw UTF-16 encoding of the string `Шерлок`:
```
$ rg '(?-u)\(\x045\x04@\x04;\x04>\x04:\x04' -E none -a some-utf16-file
```
Of course, that's just an example meant to show how one can drop down into
raw bytes. Namely, the simpler command works as you might expect automatically:
```
$ rg 'Шерлок' some-utf16-file
```
Finally, it is possible to disable ripgrep's Unicode support from within the Finally, it is possible to disable ripgrep's Unicode support from within the
pattern regular expression. For example, let's say you wanted `.` to match any regular expression. For example, let's say you wanted `.` to match any byte
byte rather than any Unicode codepoint. (You might want this while searching a rather than any Unicode codepoint. (You might want this while searching a
binary file, since `.` by default will not match invalid UTF-8.) You could do binary file, since `.` by default will not match invalid UTF-8.) You could do
this by disabling Unicode via a regular expression flag: this by disabling Unicode via a regular expression flag:

View File

@ -378,7 +378,7 @@ _rg_encodings() {
shift{-,_}jis csshiftjis {,x-}sjis ms_kanji ms932 shift{-,_}jis csshiftjis {,x-}sjis ms_kanji ms932
utf{,-}8 utf-16{,be,le} unicode-1-1-utf-8 utf{,-}8 utf-16{,be,le} unicode-1-1-utf-8
windows-{31j,874,949,125{0..8}} dos-874 tis-620 ansi_x3.4-1968 windows-{31j,874,949,125{0..8}} dos-874 tis-620 ansi_x3.4-1968
x-user-defined auto x-user-defined auto none
) )
_wanted encodings expl encoding compadd -a "$@" - _encodings _wanted encodings expl encoding compadd -a "$@" - _encodings

View File

@ -52,7 +52,7 @@ impl RegexMatcherBuilder {
} }
let matcher = RegexMatcherImpl::new(&chir)?; let matcher = RegexMatcherImpl::new(&chir)?;
trace!("final regex: {:?}", matcher.regex()); trace!("final regex: {:?}", matcher.regex().to_string());
Ok(RegexMatcher { Ok(RegexMatcher {
config: self.config.clone(), config: self.config.clone(),
matcher: matcher, matcher: matcher,

View File

@ -16,7 +16,7 @@ license = "Unlicense/MIT"
bstr = { version = "0.1.2", default-features = false, features = ["std"] } bstr = { version = "0.1.2", default-features = false, features = ["std"] }
bytecount = "0.5" bytecount = "0.5"
encoding_rs = "0.8.14" encoding_rs = "0.8.14"
encoding_rs_io = "0.1.4" encoding_rs_io = "0.1.6"
grep-matcher = { version = "0.1.1", path = "../grep-matcher" } grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
log = "0.4.5" log = "0.4.5"
memmap = "0.7" memmap = "0.7"

View File

@ -155,6 +155,8 @@ pub struct Config {
/// An encoding that, when present, causes the searcher to transcode all /// An encoding that, when present, causes the searcher to transcode all
/// input from the encoding to UTF-8. /// input from the encoding to UTF-8.
encoding: Option<Encoding>, encoding: Option<Encoding>,
/// Whether to do automatic transcoding based on a BOM or not.
bom_sniffing: bool,
} }
impl Default for Config { impl Default for Config {
@ -171,6 +173,7 @@ impl Default for Config {
binary: BinaryDetection::default(), binary: BinaryDetection::default(),
multi_line: false, multi_line: false,
encoding: None, encoding: None,
bom_sniffing: true,
} }
} }
} }
@ -303,12 +306,15 @@ impl SearcherBuilder {
config.before_context = 0; config.before_context = 0;
config.after_context = 0; config.after_context = 0;
} }
let mut decode_builder = DecodeReaderBytesBuilder::new(); let mut decode_builder = DecodeReaderBytesBuilder::new();
decode_builder decode_builder
.encoding(self.config.encoding.as_ref().map(|e| e.0)) .encoding(self.config.encoding.as_ref().map(|e| e.0))
.utf8_passthru(true) .utf8_passthru(true)
.strip_bom(true) .strip_bom(self.config.bom_sniffing)
.bom_override(true); .bom_override(true)
.bom_sniffing(self.config.bom_sniffing);
Searcher { Searcher {
config: config, config: config,
decode_builder: decode_builder, decode_builder: decode_builder,
@ -506,12 +512,13 @@ impl SearcherBuilder {
/// transcoding process encounters an error, then bytes are replaced with /// transcoding process encounters an error, then bytes are replaced with
/// the Unicode replacement codepoint. /// the Unicode replacement codepoint.
/// ///
/// When no encoding is specified (the default), then BOM sniffing is used /// When no encoding is specified (the default), then BOM sniffing is
/// to determine whether the source data is UTF-8 or UTF-16, and /// used (if it's enabled, which it is, by default) to determine whether
/// transcoding will be performed automatically. If no BOM could be found, /// the source data is UTF-8 or UTF-16, and transcoding will be performed
/// then the source data is searched _as if_ it were UTF-8. However, so /// automatically. If no BOM could be found, then the source data is
/// long as the source data is at least ASCII compatible, then it is /// searched _as if_ it were UTF-8. However, so long as the source data is
/// possible for a search to produce useful results. /// at least ASCII compatible, then it is possible for a search to produce
/// useful results.
pub fn encoding( pub fn encoding(
&mut self, &mut self,
encoding: Option<Encoding>, encoding: Option<Encoding>,
@ -519,6 +526,23 @@ impl SearcherBuilder {
self.config.encoding = encoding; self.config.encoding = encoding;
self self
} }
/// Enable or disable automatic transcoding based on BOM sniffing.
///
/// When this is enabled and an explicit encoding is not set, the searcher
/// tries to detect the encoding of the bytes being searched by sniffing
/// their byte-order mark (BOM). In particular, when this is enabled,
/// UTF-16 encoded files will be searched seamlessly.
///
/// When this is disabled and an explicit encoding is not set, the bytes
/// from the source stream are passed through unchanged, including the
/// BOM if one is present.
///
/// This is enabled by default.
///
/// Returns this builder so that configuration calls can be chained.
pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
self.config.bom_sniffing = yes;
self
}
} }
/// A searcher executes searches over a haystack and writes results to a caller /// A searcher executes searches over a haystack and writes results to a caller
@ -738,7 +762,8 @@ impl Searcher {
/// Returns true if and only if the given slice needs to be transcoded. /// Returns true if and only if the given slice needs to be transcoded.
fn slice_needs_transcoding(&self, slice: &[u8]) -> bool { fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
self.config.encoding.is_some() || slice_has_utf16_bom(slice) self.config.encoding.is_some()
|| (self.config.bom_sniffing && slice_has_utf16_bom(slice))
} }
} }

View File

@ -984,7 +984,9 @@ Specify the text encoding that ripgrep will use on all files searched. The
default value is 'auto', which will cause ripgrep to do a best effort automatic default value is 'auto', which will cause ripgrep to do a best effort automatic
detection of encoding on a per-file basis. Automatic detection in this case detection of encoding on a per-file basis. Automatic detection in this case
only applies to files that begin with a UTF-8 or UTF-16 byte-order mark (BOM). only applies to files that begin with a UTF-8 or UTF-16 byte-order mark (BOM).
No other automatic detection is performend. No other automatic detection is performed. One can also specify 'none' which
will then completely disable BOM sniffing and always result in searching the
raw bytes, including a BOM if it's present, regardless of its encoding.
Other supported values can be found in the list of labels here: Other supported values can be found in the list of labels here:
https://encoding.spec.whatwg.org/#concept-encoding-get https://encoding.spec.whatwg.org/#concept-encoding-get

View File

@ -483,6 +483,37 @@ impl SortByKind {
} }
} }
/// The encoding mode the searcher will use, as selected by the
/// `-E/--encoding` flag.
#[derive(Clone, Debug)]
enum EncodingMode {
/// Use an explicit encoding forcefully, but let BOM sniffing override it.
/// The wrapped encoding is handed to the searcher builder as its explicit
/// encoding.
Some(Encoding),
/// Use only BOM sniffing to auto-detect an encoding. This is what the
/// `auto` label (or no encoding flag at all) resolves to, and it matches
/// the searcher's default configuration.
Auto,
/// Use no explicit encoding and disable all BOM sniffing. This will
/// always result in searching the raw bytes, regardless of their
/// true encoding. This is what the `none` label resolves to.
Disabled,
}
impl EncodingMode {
    /// Returns true if and only if an explicit encoding has been set.
    /// Returns false both for automatic BOM sniffing and for disabled
    /// sniffing.
    ///
    /// This is only used to determine whether PCRE2 needs to have its own
    /// UTF-8 checking enabled. If we have an explicit encoding set, then
    /// we're always guaranteed to get UTF-8, so we can disable PCRE2's check.
    /// Otherwise, we have no such guarantee, and must enable PCRE2's UTF-8
    /// check.
    #[cfg(feature = "pcre2")]
    fn has_explicit_encoding(&self) -> bool {
        // Every variant is listed on purpose (no `_` arm): if a new mode is
        // ever added, the compiler forces a decision here about whether it
        // guarantees valid UTF-8, instead of silently defaulting.
        match self {
            EncodingMode::Some(_) => true,
            EncodingMode::Auto | EncodingMode::Disabled => false,
        }
    }
}
impl ArgMatches { impl ArgMatches {
/// Create an ArgMatches from clap's parse result. /// Create an ArgMatches from clap's parse result.
fn new(clap_matches: clap::ArgMatches<'static>) -> ArgMatches { fn new(clap_matches: clap::ArgMatches<'static>) -> ArgMatches {
@ -650,7 +681,7 @@ impl ArgMatches {
} }
if self.pcre2_unicode() { if self.pcre2_unicode() {
builder.utf(true).ucp(true); builder.utf(true).ucp(true);
if self.encoding()?.is_some() { if self.encoding()?.has_explicit_encoding() {
// SAFETY: If an encoding was specified, then we're guaranteed // SAFETY: If an encoding was specified, then we're guaranteed
// to get valid UTF-8, so we can disable PCRE2's UTF checking. // to get valid UTF-8, so we can disable PCRE2's UTF checking.
// (Feeding invalid UTF-8 to PCRE2 is undefined behavior.) // (Feeding invalid UTF-8 to PCRE2 is undefined behavior.)
@ -766,8 +797,16 @@ impl ArgMatches {
.after_context(ctx_after) .after_context(ctx_after)
.passthru(self.is_present("passthru")) .passthru(self.is_present("passthru"))
.memory_map(self.mmap_choice(paths)) .memory_map(self.mmap_choice(paths))
.binary_detection(self.binary_detection()) .binary_detection(self.binary_detection());
.encoding(self.encoding()?); match self.encoding()? {
EncodingMode::Some(enc) => {
builder.encoding(Some(enc));
}
EncodingMode::Auto => {} // default for the searcher
EncodingMode::Disabled => {
builder.bom_sniffing(false);
}
}
Ok(builder.build()) Ok(builder.build())
} }
@ -952,24 +991,30 @@ impl ArgMatches {
u64_to_usize("dfa-size-limit", r) u64_to_usize("dfa-size-limit", r)
} }
/// Returns the type of encoding to use. /// Returns the encoding mode to use.
/// ///
/// This only returns an encoding if one is explicitly specified. When no /// This only returns an encoding if one is explicitly specified. Otherwise
/// encoding is present, the Searcher will still do BOM sniffing for UTF-16 /// if set to automatic, the Searcher will do BOM sniffing for UTF-16
/// and transcode seamlessly. /// and transcode seamlessly. If disabled, no BOM sniffing nor transcoding
fn encoding(&self) -> Result<Option<Encoding>> { /// will occur.
fn encoding(&self) -> Result<EncodingMode> {
if self.is_present("no-encoding") { if self.is_present("no-encoding") {
return Ok(None); return Ok(EncodingMode::Auto);
} }
let label = match self.value_of_lossy("encoding") { let label = match self.value_of_lossy("encoding") {
None if self.pcre2_unicode() => "utf-8".to_string(), None if self.pcre2_unicode() => "utf-8".to_string(),
None => return Ok(None), None => return Ok(EncodingMode::Auto),
Some(label) => label, Some(label) => label,
}; };
if label == "auto" { if label == "auto" {
return Ok(None); return Ok(EncodingMode::Auto);
} else if label == "none" {
return Ok(EncodingMode::Disabled);
} }
Ok(Some(Encoding::new(&label)?))
Ok(EncodingMode::Some(Encoding::new(&label)?))
} }
/// Return the file separator to use based on the CLI configuration. /// Return the file separator to use based on the CLI configuration.

View File

@ -645,3 +645,35 @@ rgtest!(f1138_no_ignore_dot, |dir: Dir, mut cmd: TestCommand| {
eqnice!("bar\nquux\n", cmd.arg("--no-ignore-dot").stdout()); eqnice!("bar\nquux\n", cmd.arg("--no-ignore-dot").stdout());
eqnice!("bar\n", cmd.arg("--ignore-file").arg(".fzf-ignore").stdout()); eqnice!("bar\n", cmd.arg("--ignore-file").arg(".fzf-ignore").stdout());
}); });
// See: https://github.com/BurntSushi/ripgrep/issues/1207
//
// Tests that, when the 'none' encoding is NOT given, automatic encoding
// detection still applies: the file begins with the bytes of a UTF-16 BOM,
// so the NUL byte is consumed by transcoding and a search for it is expected
// to exit with code 1.
rgtest!(f1207_auto_encoding, |dir: Dir, mut cmd: TestCommand| {
dir.create_bytes(
"foo",
b"\xFF\xFE\x00\x62"
);
cmd.arg("-a").arg("\\x00").arg("foo");
cmd.assert_exit_code(1);
});
// See: https://github.com/BurntSushi/ripgrep/issues/1207
//
// Tests that the 'none' encoding causes the file to be searched as raw
// bytes, with no BOM sniffing and no transcoding.
rgtest!(f1207_ignore_encoding, |dir: Dir, mut cmd: TestCommand| {
// PCRE2 chokes on this test because it can't search data that is not
// valid UTF-8, and the point of this test is to search raw UTF-16.
if dir.is_pcre2() {
return;
}
dir.create_bytes(
"foo",
b"\xFF\xFE\x00\x62"
);
cmd.arg("--encoding").arg("none").arg("-a").arg("\\x00").arg("foo");
// The NUL byte must now match. The two BOM bytes are not valid UTF-8, so
// the expected text has U+FFFD in their place (presumably from a lossy
// conversion of stdout to a string -- confirm against the test harness).
eqnice!("\u{FFFD}\u{FFFD}\x00b\n", cmd.stdout());
});