mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2024-12-12 19:18:24 +02:00
cli: error when searching for NUL
Basically, unless the -a/--text flag is given, it is generally always an error to search for an explicit NUL byte because the binary detection will prevent it from matching. Fixes #1838
This commit is contained in:
parent
7bb9f35d2d
commit
9ed7565fcb
@ -34,6 +34,8 @@ Feature enhancements:
|
||||
Add new `--stop-on-nonmatch` flag.
|
||||
* [FEATURE #1814](https://github.com/BurntSushi/ripgrep/issues/1814):
|
||||
Flags are now categorized in `-h/--help` output and ripgrep's man page.
|
||||
* [FEATURE #1838](https://github.com/BurntSushi/ripgrep/issues/1838):
|
||||
An error is shown when searching for NUL bytes with binary detection enabled.
|
||||
* [FEATURE #2195](https://github.com/BurntSushi/ripgrep/issues/2195):
|
||||
When `extra-verbose` mode is enabled in zsh, show extra file type info.
|
||||
* [FEATURE #2298](https://github.com/BurntSushi/ripgrep/issues/2298):
|
||||
|
@ -499,9 +499,14 @@ impl HiArgs {
|
||||
if let Some(limit) = self.dfa_size_limit {
|
||||
builder.dfa_size_limit(limit);
|
||||
}
|
||||
if !self.binary.is_none() {
|
||||
builder.ban_byte(Some(b'\x00'));
|
||||
}
|
||||
let m = match builder.build_many(&self.patterns.patterns) {
|
||||
Ok(m) => m,
|
||||
Err(err) => anyhow::bail!(suggest_multiline(err.to_string())),
|
||||
Err(err) => {
|
||||
anyhow::bail!(suggest_text(suggest_multiline(err.to_string())))
|
||||
}
|
||||
};
|
||||
Ok(PatternMatcher::RustRegex(m))
|
||||
}
|
||||
@ -1144,6 +1149,13 @@ impl BinaryDetection {
|
||||
};
|
||||
BinaryDetection { explicit, implicit }
|
||||
}
|
||||
|
||||
/// Returns true when both implicit and explicit binary detection is
|
||||
/// disabled.
|
||||
pub(crate) fn is_none(&self) -> bool {
|
||||
let none = grep::searcher::BinaryDetection::none();
|
||||
self.explicit == none && self.implicit == none
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the file type matcher from low level arguments.
|
||||
@ -1428,3 +1440,17 @@ When multiline mode is enabled, new line characters can be matched.",
|
||||
msg
|
||||
}
|
||||
}
|
||||
|
||||
/// Possibly suggest the `-a/--text` flag.
|
||||
fn suggest_text(msg: String) -> String {
|
||||
if msg.contains("pattern contains \"\\0\"") {
|
||||
format!(
|
||||
"{msg}
|
||||
|
||||
Consider enabling text mode with the --text flag (or -a for short). Otherwise,
|
||||
binary detection is enabled and matching a NUL byte is impossible.",
|
||||
)
|
||||
} else {
|
||||
msg
|
||||
}
|
||||
}
|
||||
|
83
crates/regex/src/ban.rs
Normal file
83
crates/regex/src/ban.rs
Normal file
@ -0,0 +1,83 @@
|
||||
use regex_syntax::hir::{
|
||||
self, ClassBytesRange, ClassUnicodeRange, Hir, HirKind,
|
||||
};
|
||||
|
||||
use crate::error::{Error, ErrorKind};
|
||||
|
||||
/// Returns an error when a sub-expression in `expr` must match `byte`.
|
||||
pub(crate) fn check(expr: &Hir, byte: u8) -> Result<(), Error> {
|
||||
assert!(byte.is_ascii(), "ban byte must be ASCII");
|
||||
let ch = char::from(byte);
|
||||
let invalid = || Err(Error::new(ErrorKind::Banned(byte)));
|
||||
match expr.kind() {
|
||||
HirKind::Empty => {}
|
||||
HirKind::Literal(hir::Literal(ref lit)) => {
|
||||
if lit.iter().find(|&&b| b == byte).is_some() {
|
||||
return invalid();
|
||||
}
|
||||
}
|
||||
HirKind::Class(hir::Class::Unicode(ref cls)) => {
|
||||
if cls.ranges().iter().map(|r| r.len()).sum::<usize>() == 1 {
|
||||
let contains =
|
||||
|r: &&ClassUnicodeRange| r.start() <= ch && ch <= r.end();
|
||||
if cls.ranges().iter().find(contains).is_some() {
|
||||
return invalid();
|
||||
}
|
||||
}
|
||||
}
|
||||
HirKind::Class(hir::Class::Bytes(ref cls)) => {
|
||||
if cls.ranges().iter().map(|r| r.len()).sum::<usize>() == 1 {
|
||||
let contains = |r: &&ClassBytesRange| {
|
||||
r.start() <= byte && byte <= r.end()
|
||||
};
|
||||
if cls.ranges().iter().find(contains).is_some() {
|
||||
return invalid();
|
||||
}
|
||||
}
|
||||
}
|
||||
HirKind::Look(_) => {}
|
||||
HirKind::Repetition(ref x) => check(&x.sub, byte)?,
|
||||
HirKind::Capture(ref x) => check(&x.sub, byte)?,
|
||||
HirKind::Concat(ref xs) => {
|
||||
for x in xs.iter() {
|
||||
check(x, byte)?;
|
||||
}
|
||||
}
|
||||
HirKind::Alternation(ref xs) => {
|
||||
for x in xs.iter() {
|
||||
check(x, byte)?;
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use regex_syntax::Parser;
|
||||
|
||||
/// Returns true when the given pattern is detected to contain the given
|
||||
/// banned byte.
|
||||
fn check(pattern: &str, byte: u8) -> bool {
|
||||
let hir = Parser::new().parse(pattern).unwrap();
|
||||
super::check(&hir, byte).is_err()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn various() {
|
||||
assert!(check(r"\x00", 0));
|
||||
assert!(check(r"a\x00", 0));
|
||||
assert!(check(r"\x00b", 0));
|
||||
assert!(check(r"a\x00b", 0));
|
||||
assert!(check(r"\x00|ab", 0));
|
||||
assert!(check(r"ab|\x00", 0));
|
||||
assert!(check(r"\x00?", 0));
|
||||
assert!(check(r"(\x00)", 0));
|
||||
|
||||
assert!(check(r"[\x00]", 0));
|
||||
assert!(check(r"[^[^\x00]]", 0));
|
||||
|
||||
assert!(!check(r"[^\x00]", 0));
|
||||
assert!(!check(r"[\x00a]", 0));
|
||||
}
|
||||
}
|
@ -8,7 +8,7 @@ use {
|
||||
};
|
||||
|
||||
use crate::{
|
||||
ast::AstAnalysis, error::Error, non_matching::non_matching_bytes,
|
||||
ast::AstAnalysis, ban, error::Error, non_matching::non_matching_bytes,
|
||||
strip::strip_from_match,
|
||||
};
|
||||
|
||||
@ -35,6 +35,7 @@ pub(crate) struct Config {
|
||||
pub(crate) dfa_size_limit: usize,
|
||||
pub(crate) nest_limit: u32,
|
||||
pub(crate) line_terminator: Option<LineTerminator>,
|
||||
pub(crate) ban: Option<u8>,
|
||||
pub(crate) crlf: bool,
|
||||
pub(crate) word: bool,
|
||||
pub(crate) fixed_strings: bool,
|
||||
@ -58,6 +59,7 @@ impl Default for Config {
|
||||
dfa_size_limit: 1000 * (1 << 20),
|
||||
nest_limit: 250,
|
||||
line_terminator: None,
|
||||
ban: None,
|
||||
crlf: false,
|
||||
word: false,
|
||||
fixed_strings: false,
|
||||
@ -205,6 +207,9 @@ impl ConfiguredHIR {
|
||||
.build()
|
||||
.translate(&pattern, &ast)
|
||||
.map_err(Error::generic)?;
|
||||
if let Some(byte) = config.ban {
|
||||
ban::check(&hir, byte)?;
|
||||
}
|
||||
// We don't need to do this for the fixed-strings case above
|
||||
// because is_fixed_strings will return false if any pattern
|
||||
// contains a line terminator. Therefore, we don't need to strip
|
||||
|
@ -60,6 +60,8 @@ pub enum ErrorKind {
|
||||
///
|
||||
/// The invalid byte is included in this error.
|
||||
InvalidLineTerminator(u8),
|
||||
/// Occurs when a banned byte was found in a pattern.
|
||||
Banned(u8),
|
||||
}
|
||||
|
||||
impl std::error::Error for Error {}
|
||||
@ -76,8 +78,15 @@ impl std::fmt::Display for Error {
|
||||
ErrorKind::InvalidLineTerminator(byte) => {
|
||||
write!(
|
||||
f,
|
||||
"line terminators must be ASCII, but {} is not",
|
||||
[byte].as_bstr()
|
||||
"line terminators must be ASCII, but {byte:?} is not",
|
||||
byte = [byte].as_bstr(),
|
||||
)
|
||||
}
|
||||
ErrorKind::Banned(byte) => {
|
||||
write!(
|
||||
f,
|
||||
"pattern contains {byte:?} but it is impossible to match",
|
||||
byte = [byte].as_bstr(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
@ -9,6 +9,7 @@ pub use crate::{
|
||||
};
|
||||
|
||||
mod ast;
|
||||
mod ban;
|
||||
mod config;
|
||||
mod error;
|
||||
mod literal;
|
||||
|
@ -286,6 +286,22 @@ impl RegexMatcherBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Ban a byte from occurring in a regular expression pattern.
|
||||
///
|
||||
/// If this byte is found in the regex pattern, then an error will be
|
||||
/// returned at construction time.
|
||||
///
|
||||
/// This is useful when binary detection is enabled. Callers will likely
|
||||
/// want to ban the same byte that is used to detect binary data, i.e.,
|
||||
/// the NUL byte. The reason for this is that when binary detection is
|
||||
/// enabled, it's impossible to match a NUL byte because binary detection
|
||||
/// will either quit when one is found, or will convert NUL bytes to line
|
||||
/// terminators to avoid exorbitant heap usage.
|
||||
pub fn ban_byte(&mut self, byte: Option<u8>) -> &mut RegexMatcherBuilder {
|
||||
self.config.ban = byte;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the line terminator to `\r\n` and enable CRLF matching for `$` in
|
||||
/// regex patterns.
|
||||
///
|
||||
|
@ -47,7 +47,7 @@ pub(crate) fn alloc_error(limit: usize) -> io::Error {
|
||||
/// is that binary data often indicates data that is undesirable to search
|
||||
/// using textual patterns. Of course, there are many cases in which this isn't
|
||||
/// true, which is why binary detection is disabled by default.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
pub(crate) enum BinaryDetection {
|
||||
/// No binary detection is performed. Data reported by the line buffer may
|
||||
/// contain arbitrary bytes.
|
||||
|
@ -52,7 +52,7 @@ type Range = Match;
|
||||
/// heap, then binary detection is only guaranteed to be applied to the
|
||||
/// parts corresponding to a match. When `Quit` is enabled, then the first
|
||||
/// few KB of the data are searched for binary data.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq)]
|
||||
pub struct BinaryDetection(line_buffer::BinaryDetection);
|
||||
|
||||
impl BinaryDetection {
|
||||
|
@ -977,6 +977,20 @@ rgtest!(r1765, |dir: Dir, mut cmd: TestCommand| {
|
||||
assert!(!cmd.stdout().is_empty());
|
||||
});
|
||||
|
||||
// See: https://github.com/BurntSushi/ripgrep/issues/1838
|
||||
rgtest!(r1838_nul_error_with_binary_detection, |dir: Dir, _: TestCommand| {
|
||||
// We don't support this error reporting with PCRE2 since we can't parse
|
||||
// the pattern (easily) to give a good error message.
|
||||
if dir.is_pcre2() {
|
||||
return;
|
||||
}
|
||||
dir.create("test", "foo\n");
|
||||
|
||||
dir.command().args(&[r"foo\x00?"]).assert_err();
|
||||
eqnice!("test:foo\n", dir.command().args(&["-a", r"foo\x00?"]).stdout());
|
||||
});
|
||||
|
||||
// See: https://github.com/BurntSushi/ripgrep/issues/1866
|
||||
rgtest!(r1866, |dir: Dir, mut cmd: TestCommand| {
|
||||
dir.create("test", "foobar\nfoobar\nfoo quux");
|
||||
cmd.args(&[
|
||||
|
Loading…
Reference in New Issue
Block a user