1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-07-11 14:30:24 +02:00

searcher: add option to disable BOM sniffing

This commit adds a new encoding feature where the -E/--encoding flag
will now accept a value of 'none'. When given this value, all
encoding-related machinery is disabled and ripgrep will search the raw
bytes of the file, including the BOM if it's present.

Closes #1207, Closes #1208
This commit is contained in:
lesnyrumcajs
2019-03-04 17:18:45 +01:00
committed by Andrew Gallant
parent 1604a18db3
commit 5962abc465
9 changed files with 158 additions and 34 deletions

View File

@ -16,7 +16,7 @@ license = "Unlicense/MIT"
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
bytecount = "0.5"
encoding_rs = "0.8.14"
encoding_rs_io = "0.1.4"
encoding_rs_io = "0.1.6"
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
log = "0.4.5"
memmap = "0.7"

View File

@ -155,6 +155,8 @@ pub struct Config {
/// An encoding that, when present, causes the searcher to transcode all
/// input from the encoding to UTF-8.
encoding: Option<Encoding>,
/// Whether to do automatic transcoding based on a BOM or not.
bom_sniffing: bool,
}
impl Default for Config {
@ -171,6 +173,7 @@ impl Default for Config {
binary: BinaryDetection::default(),
multi_line: false,
encoding: None,
bom_sniffing: true,
}
}
}
@ -303,12 +306,15 @@ impl SearcherBuilder {
config.before_context = 0;
config.after_context = 0;
}
let mut decode_builder = DecodeReaderBytesBuilder::new();
decode_builder
.encoding(self.config.encoding.as_ref().map(|e| e.0))
.utf8_passthru(true)
.strip_bom(true)
.bom_override(true);
.strip_bom(self.config.bom_sniffing)
.bom_override(true)
.bom_sniffing(self.config.bom_sniffing);
Searcher {
config: config,
decode_builder: decode_builder,
@ -506,12 +512,13 @@ impl SearcherBuilder {
/// transcoding process encounters an error, then bytes are replaced with
/// the Unicode replacement codepoint.
///
/// When no encoding is specified (the default), then BOM sniffing is used
/// to determine whether the source data is UTF-8 or UTF-16, and
/// transcoding will be performed automatically. If no BOM could be found,
/// then the source data is searched _as if_ it were UTF-8. However, so
/// long as the source data is at least ASCII compatible, then it is
/// possible for a search to produce useful results.
/// When no encoding is specified (the default), then BOM sniffing is
/// used (if it's enabled, which it is, by default) to determine whether
/// the source data is UTF-8 or UTF-16, and transcoding will be performed
/// automatically. If no BOM could be found, then the source data is
/// searched _as if_ it were UTF-8. However, so long as the source data is
/// at least ASCII compatible, then it is possible for a search to produce
/// useful results.
pub fn encoding(
&mut self,
encoding: Option<Encoding>,
@ -519,6 +526,23 @@ impl SearcherBuilder {
self.config.encoding = encoding;
self
}
/// Enable or disable automatic transcoding based on BOM sniffing.
///
/// When this is enabled and an explicit encoding is not set, then this
/// searcher will try to detect the encoding of the bytes being searched
/// by sniffing their byte-order mark (BOM). In particular, when this is
/// enabled, UTF-16 encoded files will be searched seamlessly.
///
/// When this is disabled and an explicit encoding is not set, then the
/// bytes from the source stream will be passed through unchanged,
/// including their BOM, if one is present.
///
/// This is enabled by default.
///
/// Returns this builder so that further configuration calls can be
/// chained.
pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
self.config.bom_sniffing = yes;
self
}
}
/// A searcher executes searches over a haystack and writes results to a caller
@ -738,7 +762,8 @@ impl Searcher {
/// Returns true if and only if the given slice needs to be transcoded.
///
/// Transcoding is needed when an explicit encoding has been configured,
/// or when BOM sniffing is enabled and `slice_has_utf16_bom(slice)`
/// returns true.
fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
self.config.encoding.is_some() || slice_has_utf16_bom(slice)
self.config.encoding.is_some()
|| (self.config.bom_sniffing && slice_has_utf16_bom(slice))
}
}