mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-04-19 09:02:15 +02:00
searcher: do UTF-8 BOM sniffing like UTF-16
Previously, we only looked for the UTF-16 BOM when deciding whether to transcode. We should look for the UTF-8 BOM as well. Fixes #1638, Closes #1697
This commit is contained in:
parent
53c4855517
commit
2295061e80
@ -54,6 +54,8 @@ Bug fixes:
|
|||||||
Fix stdin detection when using PowerShell in UNIX environments.
|
Fix stdin detection when using PowerShell in UNIX environments.
|
||||||
* [BUG #1765](https://github.com/BurntSushi/ripgrep/issues/1765):
|
* [BUG #1765](https://github.com/BurntSushi/ripgrep/issues/1765):
|
||||||
Fix panic when `--crlf` is used in some cases.
|
Fix panic when `--crlf` is used in some cases.
|
||||||
|
* [BUG #1638](https://github.com/BurntSushi/ripgrep/issues/1638):
|
||||||
|
Correctly sniff UTF-8 and do transcoding, like we do for UTF-16.
|
||||||
* [BUG #1816](https://github.com/BurntSushi/ripgrep/issues/1816):
|
* [BUG #1816](https://github.com/BurntSushi/ripgrep/issues/1816):
|
||||||
Add documentation for glob alternate syntax, e.g., `{a,b,..}`.
|
Add documentation for glob alternate syntax, e.g., `{a,b,..}`.
|
||||||
* [BUG #1847](https://github.com/BurntSushi/ripgrep/issues/1847):
|
* [BUG #1847](https://github.com/BurntSushi/ripgrep/issues/1847):
|
||||||
|
@ -788,7 +788,7 @@ impl Searcher {
|
|||||||
/// Returns true if and only if the given slice needs to be transcoded.
|
/// Returns true if and only if the given slice needs to be transcoded.
|
||||||
fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
|
fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
|
||||||
self.config.encoding.is_some()
|
self.config.encoding.is_some()
|
||||||
|| (self.config.bom_sniffing && slice_has_utf16_bom(slice))
|
|| (self.config.bom_sniffing && slice_has_bom(slice))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -973,16 +973,18 @@ impl Searcher {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns true if and only if the given slice begins with a UTF-16 BOM.
|
/// Returns true if and only if the given slice begins with a UTF-8 or UTF-16
|
||||||
|
/// BOM.
|
||||||
///
|
///
|
||||||
/// This is used by the searcher to determine if a transcoder is necessary.
|
/// This is used by the searcher to determine if a transcoder is necessary.
|
||||||
/// Otherwise, it is advantageous to search the slice directly.
|
/// Otherwise, it is advantageous to search the slice directly.
|
||||||
fn slice_has_utf16_bom(slice: &[u8]) -> bool {
|
fn slice_has_bom(slice: &[u8]) -> bool {
|
||||||
let enc = match encoding_rs::Encoding::for_bom(slice) {
|
let enc = match encoding_rs::Encoding::for_bom(slice) {
|
||||||
None => return false,
|
None => return false,
|
||||||
Some((enc, _)) => enc,
|
Some((enc, _)) => enc,
|
||||||
};
|
};
|
||||||
[encoding_rs::UTF_16LE, encoding_rs::UTF_16BE].contains(&enc)
|
[encoding_rs::UTF_16LE, encoding_rs::UTF_16BE, encoding_rs::UTF_8]
|
||||||
|
.contains(&enc)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@ -1009,4 +1011,21 @@ mod tests {
|
|||||||
let res = searcher.search_slice(matcher, &[], sink);
|
let res = searcher.search_slice(matcher, &[], sink);
|
||||||
assert!(res.is_err());
|
assert!(res.is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn uft8_bom_sniffing() {
|
||||||
|
// See: https://github.com/BurntSushi/ripgrep/issues/1638
|
||||||
|
// ripgrep must sniff utf-8 BOM, just like it does with utf-16
|
||||||
|
let matcher = RegexMatcher::new("foo");
|
||||||
|
let haystack: &[u8] = &[0xef, 0xbb, 0xbf, 0x66, 0x6f, 0x6f];
|
||||||
|
|
||||||
|
let mut sink = KitchenSink::new();
|
||||||
|
let mut searcher = SearcherBuilder::new().build();
|
||||||
|
|
||||||
|
let res = searcher.search_slice(matcher, haystack, &mut sink);
|
||||||
|
assert!(res.is_ok());
|
||||||
|
|
||||||
|
let sink_output = String::from_utf8(sink.as_bytes().to_vec()).unwrap();
|
||||||
|
assert_eq!(sink_output, "1:0:foo\nbyte count:3\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -867,6 +867,15 @@ use B;
|
|||||||
eqnice!("2\n", cmd.stdout());
|
eqnice!("2\n", cmd.stdout());
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// See: https://github.com/BurntSushi/ripgrep/issues/1638
|
||||||
|
//
|
||||||
|
// Tests that when a UTF-8 BOM is sniffed, the column index is correct.
|
||||||
|
rgtest!(r1638, |dir: Dir, mut cmd: TestCommand| {
|
||||||
|
dir.create_bytes("foo", b"\xef\xbb\xbfx");
|
||||||
|
|
||||||
|
eqnice!("foo:1:1:x\n", cmd.arg("--column").arg("x").stdout());
|
||||||
|
});
|
||||||
|
|
||||||
// See: https://github.com/BurntSushi/ripgrep/issues/1765
|
// See: https://github.com/BurntSushi/ripgrep/issues/1765
|
||||||
rgtest!(r1765, |dir: Dir, mut cmd: TestCommand| {
|
rgtest!(r1765, |dir: Dir, mut cmd: TestCommand| {
|
||||||
dir.create("test", "\n");
|
dir.create("test", "\n");
|
||||||
|
Loading…
x
Reference in New Issue
Block a user