mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2024-12-07 11:13:17 +02:00
searcher: add option to disable BOM sniffing
This commit adds a new encoding feature where the -E/--encoding flag will now accept a value of 'none'. When given this value, all encoding related machinery is disabled and ripgrep will search the raw bytes of the file, including the BOM if it's present. Closes #1207, Closes #1208
This commit is contained in:
parent
77439f99a4
commit
b00cd69a40
6
Cargo.lock
generated
6
Cargo.lock
generated
@ -114,7 +114,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs_io"
|
||||
version = "0.1.5"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -226,7 +226,7 @@ dependencies = [
|
||||
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"grep-matcher 0.1.1",
|
||||
"grep-regex 0.1.2",
|
||||
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -704,7 +704,7 @@ dependencies = [
|
||||
"checksum crossbeam-channel 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "0f0ed1a4de2235cabda8558ff5840bffb97fcb64c97827f354a451307df5f72b"
|
||||
"checksum crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c"
|
||||
"checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed"
|
||||
"checksum encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f94ef2bcdb2f5d58e982ef565baa1ecfd04b7cb653d0bf1b49af1dd472faa8d8"
|
||||
"checksum encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9619ee7a2bf4e777e020b95c1439abaf008f8ea8041b78a0552c4f1bcf4df32c"
|
||||
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
|
||||
"checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
|
||||
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
|
||||
|
32
GUIDE.md
32
GUIDE.md
@ -603,7 +603,7 @@ topic, but we can try to summarize its relevancy to ripgrep:
|
||||
* Files are generally just a bundle of bytes. There is no reliable way to know
|
||||
their encoding.
|
||||
* Either the encoding of the pattern must match the encoding of the files being
|
||||
searched, or a form of transcoding must be performed converts either the
|
||||
searched, or a form of transcoding must be performed that converts either the
|
||||
pattern or the file to the same encoding as the other.
|
||||
* ripgrep tends to work best on plain text files, and among plain text files,
|
||||
the most popular encodings likely consist of ASCII, latin1 or UTF-8. As
|
||||
@ -626,12 +626,15 @@ given, which is the default:
|
||||
they correspond to a UTF-16 BOM, then ripgrep will transcode the contents of
|
||||
the file from UTF-16 to UTF-8, and then execute the search on the transcoded
|
||||
version of the file. (This incurs a performance penalty since transcoding
|
||||
is slower than regex searching.)
|
||||
is slower than regex searching.) If the file contains invalid UTF-16, then
|
||||
the Unicode replacement codepoint is substituted in place of invalid code
|
||||
units.
|
||||
* To handle other cases, ripgrep provides a `-E/--encoding` flag, which permits
|
||||
you to specify an encoding from the
|
||||
[Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
|
||||
ripgrep will assume *all* files searched are the encoding specified and
|
||||
will perform a transcoding step just like in the UTF-16 case described above.
|
||||
ripgrep will assume *all* files searched are the encoding specified (unless
|
||||
the file has a BOM) and will perform a transcoding step just like in the
|
||||
UTF-16 case described above.
|
||||
|
||||
By default, ripgrep will not require its input be valid UTF-8. That is, ripgrep
|
||||
can and will search arbitrary bytes. The key here is that if you're searching
|
||||
@ -641,9 +644,26 @@ pattern won't find anything. With all that said, this mode of operation is
|
||||
important, because it lets you find ASCII or UTF-8 *within* files that are
|
||||
otherwise arbitrary bytes.
|
||||
|
||||
As a special case, the `-E/--encoding` flag supports the value `none`, which
|
||||
will completely disable all encoding related logic, including BOM sniffing.
|
||||
When `-E/--encoding` is set to `none`, ripgrep will search the raw bytes of
|
||||
the underlying file with no transcoding step. For example, here's how you might
|
||||
search the raw UTF-16 encoding of the string `Шерлок`:
|
||||
|
||||
```
|
||||
$ rg '(?-u)\(\x045\x04@\x04;\x04>\x04:\x04' -E none -a some-utf16-file
|
||||
```
|
||||
|
||||
Of course, that's just an example meant to show how one can drop down into
|
||||
raw bytes. Namely, the simpler command works as you might expect automatically:
|
||||
|
||||
```
|
||||
$ rg 'Шерлок' some-utf16-file
|
||||
```
|
||||
|
||||
Finally, it is possible to disable ripgrep's Unicode support from within the
|
||||
pattern regular expression. For example, let's say you wanted `.` to match any
|
||||
byte rather than any Unicode codepoint. (You might want this while searching a
|
||||
regular expression. For example, let's say you wanted `.` to match any byte
|
||||
rather than any Unicode codepoint. (You might want this while searching a
|
||||
binary file, since `.` by default will not match invalid UTF-8.) You could do
|
||||
this by disabling Unicode via a regular expression flag:
|
||||
|
||||
|
@ -378,7 +378,7 @@ _rg_encodings() {
|
||||
shift{-,_}jis csshiftjis {,x-}sjis ms_kanji ms932
|
||||
utf{,-}8 utf-16{,be,le} unicode-1-1-utf-8
|
||||
windows-{31j,874,949,125{0..8}} dos-874 tis-620 ansi_x3.4-1968
|
||||
x-user-defined auto
|
||||
x-user-defined auto none
|
||||
)
|
||||
|
||||
_wanted encodings expl encoding compadd -a "$@" - _encodings
|
||||
|
@ -52,7 +52,7 @@ impl RegexMatcherBuilder {
|
||||
}
|
||||
|
||||
let matcher = RegexMatcherImpl::new(&chir)?;
|
||||
trace!("final regex: {:?}", matcher.regex());
|
||||
trace!("final regex: {:?}", matcher.regex().to_string());
|
||||
Ok(RegexMatcher {
|
||||
config: self.config.clone(),
|
||||
matcher: matcher,
|
||||
|
@ -16,7 +16,7 @@ license = "Unlicense/MIT"
|
||||
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
|
||||
bytecount = "0.5"
|
||||
encoding_rs = "0.8.14"
|
||||
encoding_rs_io = "0.1.4"
|
||||
encoding_rs_io = "0.1.6"
|
||||
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
|
||||
log = "0.4.5"
|
||||
memmap = "0.7"
|
||||
|
@ -155,6 +155,8 @@ pub struct Config {
|
||||
/// An encoding that, when present, causes the searcher to transcode all
|
||||
/// input from the encoding to UTF-8.
|
||||
encoding: Option<Encoding>,
|
||||
/// Whether to do automatic transcoding based on a BOM or not.
|
||||
bom_sniffing: bool,
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
@ -171,6 +173,7 @@ impl Default for Config {
|
||||
binary: BinaryDetection::default(),
|
||||
multi_line: false,
|
||||
encoding: None,
|
||||
bom_sniffing: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -303,12 +306,15 @@ impl SearcherBuilder {
|
||||
config.before_context = 0;
|
||||
config.after_context = 0;
|
||||
}
|
||||
|
||||
let mut decode_builder = DecodeReaderBytesBuilder::new();
|
||||
decode_builder
|
||||
.encoding(self.config.encoding.as_ref().map(|e| e.0))
|
||||
.utf8_passthru(true)
|
||||
.strip_bom(true)
|
||||
.bom_override(true);
|
||||
.strip_bom(self.config.bom_sniffing)
|
||||
.bom_override(true)
|
||||
.bom_sniffing(self.config.bom_sniffing);
|
||||
|
||||
Searcher {
|
||||
config: config,
|
||||
decode_builder: decode_builder,
|
||||
@ -506,12 +512,13 @@ impl SearcherBuilder {
|
||||
/// transcoding process encounters an error, then bytes are replaced with
|
||||
/// the Unicode replacement codepoint.
|
||||
///
|
||||
/// When no encoding is specified (the default), then BOM sniffing is used
|
||||
/// to determine whether the source data is UTF-8 or UTF-16, and
|
||||
/// transcoding will be performed automatically. If no BOM could be found,
|
||||
/// then the source data is searched _as if_ it were UTF-8. However, so
|
||||
/// long as the source data is at least ASCII compatible, then it is
|
||||
/// possible for a search to produce useful results.
|
||||
/// When no encoding is specified (the default), then BOM sniffing is
|
||||
/// used (if it's enabled, which it is, by default) to determine whether
|
||||
/// the source data is UTF-8 or UTF-16, and transcoding will be performed
|
||||
/// automatically. If no BOM could be found, then the source data is
|
||||
/// searched _as if_ it were UTF-8. However, so long as the source data is
|
||||
/// at least ASCII compatible, then it is possible for a search to produce
|
||||
/// useful results.
|
||||
pub fn encoding(
|
||||
&mut self,
|
||||
encoding: Option<Encoding>,
|
||||
@ -519,6 +526,23 @@ impl SearcherBuilder {
|
||||
self.config.encoding = encoding;
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable automatic transcoding based on BOM sniffing.
|
||||
///
|
||||
/// When this is enabled and an explicit encoding is not set, then this
|
||||
/// searcher will try to detect the encoding of the bytes being searched
|
||||
/// by sniffing its byte-order mark (BOM). In particular, when this is
|
||||
/// enabled, UTF-16 encoded files will be searched seamlessly.
|
||||
///
|
||||
/// When this is disabled and an explicit encoding is not set, then
|
||||
/// the bytes from the source stream will be passed through unchanged,
|
||||
/// including its BOM, if one is present.
|
||||
///
|
||||
/// This is enabled by default.
|
||||
pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
|
||||
self.config.bom_sniffing = yes;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// A searcher executes searches over a haystack and writes results to a caller
|
||||
@ -738,7 +762,8 @@ impl Searcher {
|
||||
|
||||
/// Returns true if and only if the given slice needs to be transcoded.
|
||||
fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
|
||||
self.config.encoding.is_some() || slice_has_utf16_bom(slice)
|
||||
self.config.encoding.is_some()
|
||||
|| (self.config.bom_sniffing && slice_has_utf16_bom(slice))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -984,7 +984,9 @@ Specify the text encoding that ripgrep will use on all files searched. The
|
||||
default value is 'auto', which will cause ripgrep to do a best effort automatic
|
||||
detection of encoding on a per-file basis. Automatic detection in this case
|
||||
only applies to files that begin with a UTF-8 or UTF-16 byte-order mark (BOM).
|
||||
No other automatic detection is performend.
|
||||
No other automatic detection is performed. One can also specify 'none' which
|
||||
will then completely disable BOM sniffing and always result in searching the
|
||||
raw bytes, including a BOM if it's present, regardless of its encoding.
|
||||
|
||||
Other supported values can be found in the list of labels here:
|
||||
https://encoding.spec.whatwg.org/#concept-encoding-get
|
||||
|
69
src/args.rs
69
src/args.rs
@ -483,6 +483,37 @@ impl SortByKind {
|
||||
}
|
||||
}
|
||||
|
||||
/// Encoding mode the searcher will use.
|
||||
#[derive(Clone, Debug)]
|
||||
enum EncodingMode {
|
||||
/// Use an explicit encoding forcefully, but let BOM sniffing override it.
|
||||
Some(Encoding),
|
||||
/// Use only BOM sniffing to auto-detect an encoding.
|
||||
Auto,
|
||||
/// Use no explicit encoding and disable all BOM sniffing. This will
|
||||
/// always result in searching the raw bytes, regardless of their
|
||||
/// true encoding.
|
||||
Disabled,
|
||||
}
|
||||
|
||||
impl EncodingMode {
|
||||
/// Checks if an explicit encoding has been set. Returns false for
|
||||
/// automatic BOM sniffing and no sniffing.
|
||||
///
|
||||
/// This is only used to determine whether PCRE2 needs to have its own
|
||||
/// UTF-8 checking enabled. If we have an explicit encoding set, then
|
||||
/// we're always guaranteed to get UTF-8, so we can disable PCRE2's check.
|
||||
/// Otherwise, we have no such guarantee, and must enable PCRE2's UTF-8
|
||||
/// check.
|
||||
#[cfg(feature = "pcre2")]
|
||||
fn has_explicit_encoding(&self) -> bool {
|
||||
match self {
|
||||
EncodingMode::Some(_) => true,
|
||||
_ => false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ArgMatches {
|
||||
/// Create an ArgMatches from clap's parse result.
|
||||
fn new(clap_matches: clap::ArgMatches<'static>) -> ArgMatches {
|
||||
@ -650,7 +681,7 @@ impl ArgMatches {
|
||||
}
|
||||
if self.pcre2_unicode() {
|
||||
builder.utf(true).ucp(true);
|
||||
if self.encoding()?.is_some() {
|
||||
if self.encoding()?.has_explicit_encoding() {
|
||||
// SAFETY: If an encoding was specified, then we're guaranteed
|
||||
// to get valid UTF-8, so we can disable PCRE2's UTF checking.
|
||||
// (Feeding invalid UTF-8 to PCRE2 is undefined behavior.)
|
||||
@ -766,8 +797,16 @@ impl ArgMatches {
|
||||
.after_context(ctx_after)
|
||||
.passthru(self.is_present("passthru"))
|
||||
.memory_map(self.mmap_choice(paths))
|
||||
.binary_detection(self.binary_detection())
|
||||
.encoding(self.encoding()?);
|
||||
.binary_detection(self.binary_detection());
|
||||
match self.encoding()? {
|
||||
EncodingMode::Some(enc) => {
|
||||
builder.encoding(Some(enc));
|
||||
}
|
||||
EncodingMode::Auto => {} // default for the searcher
|
||||
EncodingMode::Disabled => {
|
||||
builder.bom_sniffing(false);
|
||||
}
|
||||
}
|
||||
Ok(builder.build())
|
||||
}
|
||||
|
||||
@ -952,24 +991,30 @@ impl ArgMatches {
|
||||
u64_to_usize("dfa-size-limit", r)
|
||||
}
|
||||
|
||||
/// Returns the type of encoding to use.
|
||||
/// Returns the encoding mode to use.
|
||||
///
|
||||
/// This only returns an encoding if one is explicitly specified. When no
|
||||
/// encoding is present, the Searcher will still do BOM sniffing for UTF-16
|
||||
/// and transcode seamlessly.
|
||||
fn encoding(&self) -> Result<Option<Encoding>> {
|
||||
/// This only returns an encoding if one is explicitly specified. Otherwise,
|
||||
/// if set to automatic, the Searcher will do BOM sniffing for UTF-16
|
||||
/// and transcode seamlessly. If disabled, neither BOM sniffing nor
|
||||
/// transcoding will occur.
|
||||
fn encoding(&self) -> Result<EncodingMode> {
|
||||
if self.is_present("no-encoding") {
|
||||
return Ok(None);
|
||||
return Ok(EncodingMode::Auto);
|
||||
}
|
||||
|
||||
let label = match self.value_of_lossy("encoding") {
|
||||
None if self.pcre2_unicode() => "utf-8".to_string(),
|
||||
None => return Ok(None),
|
||||
None => return Ok(EncodingMode::Auto),
|
||||
Some(label) => label,
|
||||
};
|
||||
|
||||
if label == "auto" {
|
||||
return Ok(None);
|
||||
return Ok(EncodingMode::Auto);
|
||||
} else if label == "none" {
|
||||
return Ok(EncodingMode::Disabled);
|
||||
}
|
||||
Ok(Some(Encoding::new(&label)?))
|
||||
|
||||
Ok(EncodingMode::Some(Encoding::new(&label)?))
|
||||
}
|
||||
|
||||
/// Return the file separator to use based on the CLI configuration.
|
||||
|
@ -645,3 +645,35 @@ rgtest!(f1138_no_ignore_dot, |dir: Dir, mut cmd: TestCommand| {
|
||||
eqnice!("bar\nquux\n", cmd.arg("--no-ignore-dot").stdout());
|
||||
eqnice!("bar\n", cmd.arg("--ignore-file").arg(".fzf-ignore").stdout());
|
||||
});
|
||||
|
||||
|
||||
// See: https://github.com/BurntSushi/ripgrep/issues/1207
|
||||
//
|
||||
// Tests that, without the '--encoding none' flag, null bytes are consumed by
|
||||
// automatic encoding detection.
|
||||
rgtest!(f1207_auto_encoding, |dir: Dir, mut cmd: TestCommand| {
|
||||
dir.create_bytes(
|
||||
"foo",
|
||||
b"\xFF\xFE\x00\x62"
|
||||
);
|
||||
cmd.arg("-a").arg("\\x00").arg("foo");
|
||||
cmd.assert_exit_code(1);
|
||||
});
|
||||
|
||||
// See: https://github.com/BurntSushi/ripgrep/issues/1207
|
||||
//
|
||||
// Tests that the '--encoding none' flag causes the file to be searched as raw bytes.
|
||||
rgtest!(f1207_ignore_encoding, |dir: Dir, mut cmd: TestCommand| {
|
||||
// PCRE2 chokes on this test because it can't search invalid UTF-8
|
||||
// and the point of this test is to search raw UTF-16.
|
||||
if dir.is_pcre2() {
|
||||
return;
|
||||
}
|
||||
|
||||
dir.create_bytes(
|
||||
"foo",
|
||||
b"\xFF\xFE\x00\x62"
|
||||
);
|
||||
cmd.arg("--encoding").arg("none").arg("-a").arg("\\x00").arg("foo");
|
||||
eqnice!("\u{FFFD}\u{FFFD}\x00b\n", cmd.stdout());
|
||||
});
|
||||
|
Loading…
Reference in New Issue
Block a user