mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-04-24 17:12:16 +02:00
searcher: add option to disable BOM sniffing
This commit adds a new encoding feature where the -E/--encoding flag will now accept a value of 'none'. When given this value, all encoding related machinery is disabled and ripgrep will search the raw bytes of the file, including the BOM if it's present. Closes #1207, Closes #1208
This commit is contained in:
parent
1604a18db3
commit
5962abc465
6
Cargo.lock
generated
6
Cargo.lock
generated
@ -114,7 +114,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "encoding_rs_io"
|
name = "encoding_rs_io"
|
||||||
version = "0.1.5"
|
version = "0.1.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -226,7 +226,7 @@ dependencies = [
|
|||||||
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"grep-matcher 0.1.1",
|
"grep-matcher 0.1.1",
|
||||||
"grep-regex 0.1.2",
|
"grep-regex 0.1.2",
|
||||||
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -704,7 +704,7 @@ dependencies = [
|
|||||||
"checksum crossbeam-channel 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "0f0ed1a4de2235cabda8558ff5840bffb97fcb64c97827f354a451307df5f72b"
|
"checksum crossbeam-channel 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "0f0ed1a4de2235cabda8558ff5840bffb97fcb64c97827f354a451307df5f72b"
|
||||||
"checksum crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c"
|
"checksum crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c"
|
||||||
"checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed"
|
"checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed"
|
||||||
"checksum encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f94ef2bcdb2f5d58e982ef565baa1ecfd04b7cb653d0bf1b49af1dd472faa8d8"
|
"checksum encoding_rs_io 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9619ee7a2bf4e777e020b95c1439abaf008f8ea8041b78a0552c4f1bcf4df32c"
|
||||||
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
|
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
|
||||||
"checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
|
"checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
|
||||||
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
|
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
|
||||||
|
32
GUIDE.md
32
GUIDE.md
@ -603,7 +603,7 @@ topic, but we can try to summarize its relevancy to ripgrep:
|
|||||||
* Files are generally just a bundle of bytes. There is no reliable way to know
|
* Files are generally just a bundle of bytes. There is no reliable way to know
|
||||||
their encoding.
|
their encoding.
|
||||||
* Either the encoding of the pattern must match the encoding of the files being
|
* Either the encoding of the pattern must match the encoding of the files being
|
||||||
searched, or a form of transcoding must be performed converts either the
|
searched, or a form of transcoding must be performed that converts either the
|
||||||
pattern or the file to the same encoding as the other.
|
pattern or the file to the same encoding as the other.
|
||||||
* ripgrep tends to work best on plain text files, and among plain text files,
|
* ripgrep tends to work best on plain text files, and among plain text files,
|
||||||
the most popular encodings likely consist of ASCII, latin1 or UTF-8. As
|
the most popular encodings likely consist of ASCII, latin1 or UTF-8. As
|
||||||
@ -626,12 +626,15 @@ given, which is the default:
|
|||||||
they correspond to a UTF-16 BOM, then ripgrep will transcode the contents of
|
they correspond to a UTF-16 BOM, then ripgrep will transcode the contents of
|
||||||
the file from UTF-16 to UTF-8, and then execute the search on the transcoded
|
the file from UTF-16 to UTF-8, and then execute the search on the transcoded
|
||||||
version of the file. (This incurs a performance penalty since transcoding
|
version of the file. (This incurs a performance penalty since transcoding
|
||||||
is slower than regex searching.)
|
is slower than regex searching.) If the file contains invalid UTF-16, then
|
||||||
|
the Unicode replacement codepoint is substituted in place of invalid code
|
||||||
|
units.
|
||||||
* To handle other cases, ripgrep provides a `-E/--encoding` flag, which permits
|
* To handle other cases, ripgrep provides a `-E/--encoding` flag, which permits
|
||||||
you to specify an encoding from the
|
you to specify an encoding from the
|
||||||
[Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
|
[Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
|
||||||
ripgrep will assume *all* files searched are the encoding specified and
|
ripgrep will assume *all* files searched are the encoding specified (unless
|
||||||
will perform a transcoding step just like in the UTF-16 case described above.
|
the file has a BOM) and will perform a transcoding step just like in the
|
||||||
|
UTF-16 case described above.
|
||||||
|
|
||||||
By default, ripgrep will not require its input be valid UTF-8. That is, ripgrep
|
By default, ripgrep will not require its input be valid UTF-8. That is, ripgrep
|
||||||
can and will search arbitrary bytes. The key here is that if you're searching
|
can and will search arbitrary bytes. The key here is that if you're searching
|
||||||
@ -641,9 +644,26 @@ pattern won't find anything. With all that said, this mode of operation is
|
|||||||
important, because it lets you find ASCII or UTF-8 *within* files that are
|
important, because it lets you find ASCII or UTF-8 *within* files that are
|
||||||
otherwise arbitrary bytes.
|
otherwise arbitrary bytes.
|
||||||
|
|
||||||
|
As a special case, the `-E/--encoding` flag supports the value `none`, which
|
||||||
|
will completely disable all encoding related logic, including BOM sniffing.
|
||||||
|
When `-E/--encoding` is set to `none`, ripgrep will search the raw bytes of
|
||||||
|
the underlying file with no transcoding step. For example, here's how you might
|
||||||
|
search the raw UTF-16 encoding of the string `Шерлок`:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ rg '(?-u)\(\x045\x04@\x04;\x04>\x04:\x04' -E none -a some-utf16-file
|
||||||
|
```
|
||||||
|
|
||||||
|
Of course, that's just an example meant to show how one can drop down into
|
||||||
|
raw bytes. Namely, the simpler command works as you might expect automatically:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ rg 'Шерлок' some-utf16-file
|
||||||
|
```
|
||||||
|
|
||||||
Finally, it is possible to disable ripgrep's Unicode support from within the
|
Finally, it is possible to disable ripgrep's Unicode support from within the
|
||||||
pattern regular expression. For example, let's say you wanted `.` to match any
|
regular expression. For example, let's say you wanted `.` to match any byte
|
||||||
byte rather than any Unicode codepoint. (You might want this while searching a
|
rather than any Unicode codepoint. (You might want this while searching a
|
||||||
binary file, since `.` by default will not match invalid UTF-8.) You could do
|
binary file, since `.` by default will not match invalid UTF-8.) You could do
|
||||||
this by disabling Unicode via a regular expression flag:
|
this by disabling Unicode via a regular expression flag:
|
||||||
|
|
||||||
|
@ -378,7 +378,7 @@ _rg_encodings() {
|
|||||||
shift{-,_}jis csshiftjis {,x-}sjis ms_kanji ms932
|
shift{-,_}jis csshiftjis {,x-}sjis ms_kanji ms932
|
||||||
utf{,-}8 utf-16{,be,le} unicode-1-1-utf-8
|
utf{,-}8 utf-16{,be,le} unicode-1-1-utf-8
|
||||||
windows-{31j,874,949,125{0..8}} dos-874 tis-620 ansi_x3.4-1968
|
windows-{31j,874,949,125{0..8}} dos-874 tis-620 ansi_x3.4-1968
|
||||||
x-user-defined auto
|
x-user-defined auto none
|
||||||
)
|
)
|
||||||
|
|
||||||
_wanted encodings expl encoding compadd -a "$@" - _encodings
|
_wanted encodings expl encoding compadd -a "$@" - _encodings
|
||||||
|
@ -52,7 +52,7 @@ impl RegexMatcherBuilder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let matcher = RegexMatcherImpl::new(&chir)?;
|
let matcher = RegexMatcherImpl::new(&chir)?;
|
||||||
trace!("final regex: {:?}", matcher.regex());
|
trace!("final regex: {:?}", matcher.regex().to_string());
|
||||||
Ok(RegexMatcher {
|
Ok(RegexMatcher {
|
||||||
config: self.config.clone(),
|
config: self.config.clone(),
|
||||||
matcher: matcher,
|
matcher: matcher,
|
||||||
|
@ -16,7 +16,7 @@ license = "Unlicense/MIT"
|
|||||||
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
|
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
|
||||||
bytecount = "0.5"
|
bytecount = "0.5"
|
||||||
encoding_rs = "0.8.14"
|
encoding_rs = "0.8.14"
|
||||||
encoding_rs_io = "0.1.4"
|
encoding_rs_io = "0.1.6"
|
||||||
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
|
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
|
||||||
log = "0.4.5"
|
log = "0.4.5"
|
||||||
memmap = "0.7"
|
memmap = "0.7"
|
||||||
|
@ -155,6 +155,8 @@ pub struct Config {
|
|||||||
/// An encoding that, when present, causes the searcher to transcode all
|
/// An encoding that, when present, causes the searcher to transcode all
|
||||||
/// input from the encoding to UTF-8.
|
/// input from the encoding to UTF-8.
|
||||||
encoding: Option<Encoding>,
|
encoding: Option<Encoding>,
|
||||||
|
/// Whether to do automatic transcoding based on a BOM or not.
|
||||||
|
bom_sniffing: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for Config {
|
impl Default for Config {
|
||||||
@ -171,6 +173,7 @@ impl Default for Config {
|
|||||||
binary: BinaryDetection::default(),
|
binary: BinaryDetection::default(),
|
||||||
multi_line: false,
|
multi_line: false,
|
||||||
encoding: None,
|
encoding: None,
|
||||||
|
bom_sniffing: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -303,12 +306,15 @@ impl SearcherBuilder {
|
|||||||
config.before_context = 0;
|
config.before_context = 0;
|
||||||
config.after_context = 0;
|
config.after_context = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut decode_builder = DecodeReaderBytesBuilder::new();
|
let mut decode_builder = DecodeReaderBytesBuilder::new();
|
||||||
decode_builder
|
decode_builder
|
||||||
.encoding(self.config.encoding.as_ref().map(|e| e.0))
|
.encoding(self.config.encoding.as_ref().map(|e| e.0))
|
||||||
.utf8_passthru(true)
|
.utf8_passthru(true)
|
||||||
.strip_bom(true)
|
.strip_bom(self.config.bom_sniffing)
|
||||||
.bom_override(true);
|
.bom_override(true)
|
||||||
|
.bom_sniffing(self.config.bom_sniffing);
|
||||||
|
|
||||||
Searcher {
|
Searcher {
|
||||||
config: config,
|
config: config,
|
||||||
decode_builder: decode_builder,
|
decode_builder: decode_builder,
|
||||||
@ -506,12 +512,13 @@ impl SearcherBuilder {
|
|||||||
/// transcoding process encounters an error, then bytes are replaced with
|
/// transcoding process encounters an error, then bytes are replaced with
|
||||||
/// the Unicode replacement codepoint.
|
/// the Unicode replacement codepoint.
|
||||||
///
|
///
|
||||||
/// When no encoding is specified (the default), then BOM sniffing is used
|
/// When no encoding is specified (the default), then BOM sniffing is
|
||||||
/// to determine whether the source data is UTF-8 or UTF-16, and
|
/// used (if it's enabled, which it is, by default) to determine whether
|
||||||
/// transcoding will be performed automatically. If no BOM could be found,
|
/// the source data is UTF-8 or UTF-16, and transcoding will be performed
|
||||||
/// then the source data is searched _as if_ it were UTF-8. However, so
|
/// automatically. If no BOM could be found, then the source data is
|
||||||
/// long as the source data is at least ASCII compatible, then it is
|
/// searched _as if_ it were UTF-8. However, so long as the source data is
|
||||||
/// possible for a search to produce useful results.
|
/// at least ASCII compatible, then it is possible for a search to produce
|
||||||
|
/// useful results.
|
||||||
pub fn encoding(
|
pub fn encoding(
|
||||||
&mut self,
|
&mut self,
|
||||||
encoding: Option<Encoding>,
|
encoding: Option<Encoding>,
|
||||||
@ -519,6 +526,23 @@ impl SearcherBuilder {
|
|||||||
self.config.encoding = encoding;
|
self.config.encoding = encoding;
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Enable automatic transcoding based on BOM sniffing.
|
||||||
|
///
|
||||||
|
/// When this is enabled and an explicit encoding is not set, then this
|
||||||
|
/// searcher will try to detect the encoding of the bytes being searched
|
||||||
|
/// by sniffing its byte-order mark (BOM). In particular, when this is
|
||||||
|
/// enabled, UTF-16 encoded files will be searched seamlessly.
|
||||||
|
///
|
||||||
|
/// When this is disabled and an explicit encoding is not set, then
|
||||||
|
/// the bytes from the source stream will be passed through unchanged,
|
||||||
|
/// including its BOM, if one is present.
|
||||||
|
///
|
||||||
|
/// This is enabled by default.
|
||||||
|
pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
|
||||||
|
self.config.bom_sniffing = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A searcher executes searches over a haystack and writes results to a caller
|
/// A searcher executes searches over a haystack and writes results to a caller
|
||||||
@ -738,7 +762,8 @@ impl Searcher {
|
|||||||
|
|
||||||
/// Returns true if and only if the given slice needs to be transcoded.
|
/// Returns true if and only if the given slice needs to be transcoded.
|
||||||
fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
|
fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
|
||||||
self.config.encoding.is_some() || slice_has_utf16_bom(slice)
|
self.config.encoding.is_some()
|
||||||
|
|| (self.config.bom_sniffing && slice_has_utf16_bom(slice))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -984,7 +984,9 @@ Specify the text encoding that ripgrep will use on all files searched. The
|
|||||||
default value is 'auto', which will cause ripgrep to do a best effort automatic
|
default value is 'auto', which will cause ripgrep to do a best effort automatic
|
||||||
detection of encoding on a per-file basis. Automatic detection in this case
|
detection of encoding on a per-file basis. Automatic detection in this case
|
||||||
only applies to files that begin with a UTF-8 or UTF-16 byte-order mark (BOM).
|
only applies to files that begin with a UTF-8 or UTF-16 byte-order mark (BOM).
|
||||||
No other automatic detection is performend.
|
No other automatic detection is performed. One can also specify 'none' which
|
||||||
|
will then completely disable BOM sniffing and always result in searching the
|
||||||
|
raw bytes, including a BOM if it's present, regardless of its encoding.
|
||||||
|
|
||||||
Other supported values can be found in the list of labels here:
|
Other supported values can be found in the list of labels here:
|
||||||
https://encoding.spec.whatwg.org/#concept-encoding-get
|
https://encoding.spec.whatwg.org/#concept-encoding-get
|
||||||
|
69
src/args.rs
69
src/args.rs
@ -483,6 +483,37 @@ impl SortByKind {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Encoding mode the searcher will use.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
enum EncodingMode {
|
||||||
|
/// Use an explicit encoding forcefully, but let BOM sniffing override it.
|
||||||
|
Some(Encoding),
|
||||||
|
/// Use only BOM sniffing to auto-detect an encoding.
|
||||||
|
Auto,
|
||||||
|
/// Use no explicit encoding and disable all BOM sniffing. This will
|
||||||
|
/// always result in searching the raw bytes, regardless of their
|
||||||
|
/// true encoding.
|
||||||
|
Disabled,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EncodingMode {
|
||||||
|
/// Checks if an explicit encoding has been set. Returns false for
|
||||||
|
/// automatic BOM sniffing and no sniffing.
|
||||||
|
///
|
||||||
|
/// This is only used to determine whether PCRE2 needs to have its own
|
||||||
|
/// UTF-8 checking enabled. If we have an explicit encoding set, then
|
||||||
|
/// we're always guaranteed to get UTF-8, so we can disable PCRE2's check.
|
||||||
|
/// Otherwise, we have no such guarantee, and must enable PCRE2's UTF-8
|
||||||
|
/// check.
|
||||||
|
#[cfg(feature = "pcre2")]
|
||||||
|
fn has_explicit_encoding(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
EncodingMode::Some(_) => true,
|
||||||
|
_ => false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl ArgMatches {
|
impl ArgMatches {
|
||||||
/// Create an ArgMatches from clap's parse result.
|
/// Create an ArgMatches from clap's parse result.
|
||||||
fn new(clap_matches: clap::ArgMatches<'static>) -> ArgMatches {
|
fn new(clap_matches: clap::ArgMatches<'static>) -> ArgMatches {
|
||||||
@ -650,7 +681,7 @@ impl ArgMatches {
|
|||||||
}
|
}
|
||||||
if self.pcre2_unicode() {
|
if self.pcre2_unicode() {
|
||||||
builder.utf(true).ucp(true);
|
builder.utf(true).ucp(true);
|
||||||
if self.encoding()?.is_some() {
|
if self.encoding()?.has_explicit_encoding() {
|
||||||
// SAFETY: If an encoding was specified, then we're guaranteed
|
// SAFETY: If an encoding was specified, then we're guaranteed
|
||||||
// to get valid UTF-8, so we can disable PCRE2's UTF checking.
|
// to get valid UTF-8, so we can disable PCRE2's UTF checking.
|
||||||
// (Feeding invalid UTF-8 to PCRE2 is undefined behavior.)
|
// (Feeding invalid UTF-8 to PCRE2 is undefined behavior.)
|
||||||
@ -766,8 +797,16 @@ impl ArgMatches {
|
|||||||
.after_context(ctx_after)
|
.after_context(ctx_after)
|
||||||
.passthru(self.is_present("passthru"))
|
.passthru(self.is_present("passthru"))
|
||||||
.memory_map(self.mmap_choice(paths))
|
.memory_map(self.mmap_choice(paths))
|
||||||
.binary_detection(self.binary_detection())
|
.binary_detection(self.binary_detection());
|
||||||
.encoding(self.encoding()?);
|
match self.encoding()? {
|
||||||
|
EncodingMode::Some(enc) => {
|
||||||
|
builder.encoding(Some(enc));
|
||||||
|
}
|
||||||
|
EncodingMode::Auto => {} // default for the searcher
|
||||||
|
EncodingMode::Disabled => {
|
||||||
|
builder.bom_sniffing(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
Ok(builder.build())
|
Ok(builder.build())
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -952,24 +991,30 @@ impl ArgMatches {
|
|||||||
u64_to_usize("dfa-size-limit", r)
|
u64_to_usize("dfa-size-limit", r)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the type of encoding to use.
|
/// Returns the encoding mode to use.
|
||||||
///
|
///
|
||||||
/// This only returns an encoding if one is explicitly specified. When no
|
/// This only returns an encoding if one is explicitly specified. Otherwise
|
||||||
/// encoding is present, the Searcher will still do BOM sniffing for UTF-16
|
/// if set to automatic, the Searcher will do BOM sniffing for UTF-16
|
||||||
/// and transcode seamlessly.
|
/// and transcode seamlessly. If disabled, no BOM sniffing nor transcoding
|
||||||
fn encoding(&self) -> Result<Option<Encoding>> {
|
/// will occur.
|
||||||
|
fn encoding(&self) -> Result<EncodingMode> {
|
||||||
if self.is_present("no-encoding") {
|
if self.is_present("no-encoding") {
|
||||||
return Ok(None);
|
return Ok(EncodingMode::Auto);
|
||||||
}
|
}
|
||||||
|
|
||||||
let label = match self.value_of_lossy("encoding") {
|
let label = match self.value_of_lossy("encoding") {
|
||||||
None if self.pcre2_unicode() => "utf-8".to_string(),
|
None if self.pcre2_unicode() => "utf-8".to_string(),
|
||||||
None => return Ok(None),
|
None => return Ok(EncodingMode::Auto),
|
||||||
Some(label) => label,
|
Some(label) => label,
|
||||||
};
|
};
|
||||||
|
|
||||||
if label == "auto" {
|
if label == "auto" {
|
||||||
return Ok(None);
|
return Ok(EncodingMode::Auto);
|
||||||
|
} else if label == "none" {
|
||||||
|
return Ok(EncodingMode::Disabled);
|
||||||
}
|
}
|
||||||
Ok(Some(Encoding::new(&label)?))
|
|
||||||
|
Ok(EncodingMode::Some(Encoding::new(&label)?))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the file separator to use based on the CLI configuration.
|
/// Return the file separator to use based on the CLI configuration.
|
||||||
|
@ -645,3 +645,35 @@ rgtest!(f1138_no_ignore_dot, |dir: Dir, mut cmd: TestCommand| {
|
|||||||
eqnice!("bar\nquux\n", cmd.arg("--no-ignore-dot").stdout());
|
eqnice!("bar\nquux\n", cmd.arg("--no-ignore-dot").stdout());
|
||||||
eqnice!("bar\n", cmd.arg("--ignore-file").arg(".fzf-ignore").stdout());
|
eqnice!("bar\n", cmd.arg("--ignore-file").arg(".fzf-ignore").stdout());
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
// See: https://github.com/BurntSushi/ripgrep/issues/1207
|
||||||
|
//
|
||||||
|
// Tests that, without the encoding 'none' flag, null bytes are consumed by automatic
|
||||||
|
// encoding detection.
|
||||||
|
rgtest!(f1207_auto_encoding, |dir: Dir, mut cmd: TestCommand| {
|
||||||
|
dir.create_bytes(
|
||||||
|
"foo",
|
||||||
|
b"\xFF\xFE\x00\x62"
|
||||||
|
);
|
||||||
|
cmd.arg("-a").arg("\\x00").arg("foo");
|
||||||
|
cmd.assert_exit_code(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
// See: https://github.com/BurntSushi/ripgrep/issues/1207
|
||||||
|
//
|
||||||
|
// Tests that the encoding 'none' flag causes the file to be treated as raw bytes.
|
||||||
|
rgtest!(f1207_ignore_encoding, |dir: Dir, mut cmd: TestCommand| {
|
||||||
|
// PCRE2 chokes on this test because it can't search invalid UTF-8
|
||||||
|
// and the point of this test is to search raw UTF-16.
|
||||||
|
if dir.is_pcre2() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
dir.create_bytes(
|
||||||
|
"foo",
|
||||||
|
b"\xFF\xFE\x00\x62"
|
||||||
|
);
|
||||||
|
cmd.arg("--encoding").arg("none").arg("-a").arg("\\x00").arg("foo");
|
||||||
|
eqnice!("\u{FFFD}\u{FFFD}\x00b\n", cmd.stdout());
|
||||||
|
});
|
||||||
|
Loading…
x
Reference in New Issue
Block a user