1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-06-14 22:15:13 +02:00

searcher: add option to disable BOM sniffing

This commit adds a new encoding feature: the -E/--encoding flag now
accepts a value of 'none'. When given this value, all encoding-related
machinery is disabled and ripgrep will search the raw bytes of the file,
including the BOM if it's present.

Closes #1207, Closes #1208
This commit is contained in:
lesnyrumcajs
2019-03-04 17:18:45 +01:00
committed by Andrew Gallant
parent 1604a18db3
commit 5962abc465
9 changed files with 158 additions and 34 deletions

View File

@ -984,7 +984,9 @@ Specify the text encoding that ripgrep will use on all files searched. The
default value is 'auto', which will cause ripgrep to do a best effort automatic
detection of encoding on a per-file basis. Automatic detection in this case
only applies to files that begin with a UTF-8 or UTF-16 byte-order mark (BOM).
No other automatic detection is performend.
No other automatic detection is performed. One can also specify 'none' which
will then completely disable BOM sniffing and always result in searching the
raw bytes, including a BOM if it's present, regardless of its encoding.
Other supported values can be found in the list of labels here:
https://encoding.spec.whatwg.org/#concept-encoding-get

View File

@ -483,6 +483,37 @@ impl SortByKind {
}
}
/// How the searcher should decide on a text encoding for each file.
#[derive(Clone, Debug)]
enum EncodingMode {
    /// An encoding was named explicitly and is applied forcefully, although
    /// BOM sniffing is still permitted to override it.
    Some(Encoding),
    /// No encoding was named; rely solely on BOM sniffing to auto-detect
    /// one.
    Auto,
    /// No encoding was named and BOM sniffing is switched off entirely.
    /// The raw bytes are always searched as-is, whatever their true
    /// encoding may be.
    Disabled,
}
impl EncodingMode {
    /// Checks if an explicit encoding has been set.
    ///
    /// Returns `true` only for `EncodingMode::Some`. Returns `false` for both
    /// automatic BOM sniffing (`Auto`) and no sniffing at all (`Disabled`).
    ///
    /// This is only used to determine whether PCRE2 needs to have its own
    /// UTF-8 checking enabled. If we have an explicit encoding set, then
    /// we're always guaranteed to get UTF-8, so we can disable PCRE2's check.
    /// Otherwise, we have no such guarantee, and must enable PCRE2's UTF-8
    /// check.
    #[cfg(feature = "pcre2")]
    fn has_explicit_encoding(&self) -> bool {
        // Every variant is listed on purpose instead of using a `_` arm: the
        // answer decides whether PCRE2's UTF-8 validation is skipped, so a
        // newly added variant must fail to compile here until someone
        // decides which side it belongs on.
        match *self {
            EncodingMode::Some(_) => true,
            EncodingMode::Auto | EncodingMode::Disabled => false,
        }
    }
}
impl ArgMatches {
/// Create an ArgMatches from clap's parse result.
fn new(clap_matches: clap::ArgMatches<'static>) -> ArgMatches {
@ -650,7 +681,7 @@ impl ArgMatches {
}
if self.pcre2_unicode() {
builder.utf(true).ucp(true);
if self.encoding()?.is_some() {
if self.encoding()?.has_explicit_encoding() {
// SAFETY: If an encoding was specified, then we're guaranteed
// to get valid UTF-8, so we can disable PCRE2's UTF checking.
// (Feeding invalid UTF-8 to PCRE2 is undefined behavior.)
@ -766,8 +797,16 @@ impl ArgMatches {
.after_context(ctx_after)
.passthru(self.is_present("passthru"))
.memory_map(self.mmap_choice(paths))
.binary_detection(self.binary_detection())
.encoding(self.encoding()?);
.binary_detection(self.binary_detection());
match self.encoding()? {
EncodingMode::Some(enc) => {
builder.encoding(Some(enc));
}
EncodingMode::Auto => {} // default for the searcher
EncodingMode::Disabled => {
builder.bom_sniffing(false);
}
}
Ok(builder.build())
}
@ -952,24 +991,30 @@ impl ArgMatches {
u64_to_usize("dfa-size-limit", r)
}
/// Returns the type of encoding to use.
/// Returns the encoding mode to use.
///
/// This only returns an encoding if one is explicitly specified. When no
/// encoding is present, the Searcher will still do BOM sniffing for UTF-16
/// and transcode seamlessly.
fn encoding(&self) -> Result<Option<Encoding>> {
/// This only returns an encoding if one is explicitly specified. Otherwise,
/// if set to automatic, the Searcher will do BOM sniffing for UTF-16
/// and transcode seamlessly. If disabled, neither BOM sniffing nor
/// transcoding will occur.
fn encoding(&self) -> Result<EncodingMode> {
if self.is_present("no-encoding") {
return Ok(None);
return Ok(EncodingMode::Auto);
}
let label = match self.value_of_lossy("encoding") {
None if self.pcre2_unicode() => "utf-8".to_string(),
None => return Ok(None),
None => return Ok(EncodingMode::Auto),
Some(label) => label,
};
if label == "auto" {
return Ok(None);
return Ok(EncodingMode::Auto);
} else if label == "none" {
return Ok(EncodingMode::Disabled);
}
Ok(Some(Encoding::new(&label)?))
Ok(EncodingMode::Some(Encoding::new(&label)?))
}
/// Return the file separator to use based on the CLI configuration.