1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2024-12-12 19:18:24 +02:00

cli: error when searching for NUL

Basically, unless the -a/--text flag is given, it is generally always an
error to search for an explicit NUL byte because the binary detection
will prevent it from matching.

Fixes #1838
This commit is contained in:
Andrew Gallant 2023-11-25 10:39:08 -05:00
parent 7bb9f35d2d
commit 9ed7565fcb
10 changed files with 162 additions and 6 deletions

View File

@ -34,6 +34,8 @@ Feature enhancements:
Add new `--stop-on-nonmatch` flag.
* [FEATURE #1814](https://github.com/BurntSushi/ripgrep/issues/1814):
Flags are now categorized in `-h/--help` output and ripgrep's man page.
* [FEATURE #1838](https://github.com/BurntSushi/ripgrep/issues/1838):
An error is shown when searching for NUL bytes with binary detection enabled.
* [FEATURE #2195](https://github.com/BurntSushi/ripgrep/issues/2195):
When `extra-verbose` mode is enabled in zsh, show extra file type info.
* [FEATURE #2298](https://github.com/BurntSushi/ripgrep/issues/2298):

View File

@ -499,9 +499,14 @@ impl HiArgs {
if let Some(limit) = self.dfa_size_limit {
builder.dfa_size_limit(limit);
}
if !self.binary.is_none() {
builder.ban_byte(Some(b'\x00'));
}
let m = match builder.build_many(&self.patterns.patterns) {
Ok(m) => m,
Err(err) => anyhow::bail!(suggest_multiline(err.to_string())),
Err(err) => {
anyhow::bail!(suggest_text(suggest_multiline(err.to_string())))
}
};
Ok(PatternMatcher::RustRegex(m))
}
@ -1144,6 +1149,13 @@ impl BinaryDetection {
};
BinaryDetection { explicit, implicit }
}
/// Returns true when both implicit and explicit binary detection is
/// disabled.
pub(crate) fn is_none(&self) -> bool {
let none = grep::searcher::BinaryDetection::none();
self.explicit == none && self.implicit == none
}
}
/// Builds the file type matcher from low level arguments.
@ -1428,3 +1440,17 @@ When multiline mode is enabled, new line characters can be matched.",
msg
}
}
/// Possibly suggest the `-a/--text` flag.
fn suggest_text(msg: String) -> String {
if msg.contains("pattern contains \"\\0\"") {
format!(
"{msg}
Consider enabling text mode with the --text flag (or -a for short). Otherwise,
binary detection is enabled and matching a NUL byte is impossible.",
)
} else {
msg
}
}

83
crates/regex/src/ban.rs Normal file
View File

@ -0,0 +1,83 @@
use regex_syntax::hir::{
self, ClassBytesRange, ClassUnicodeRange, Hir, HirKind,
};
use crate::error::{Error, ErrorKind};
/// Returns an error when a sub-expression in `expr` must match `byte`.
pub(crate) fn check(expr: &Hir, byte: u8) -> Result<(), Error> {
assert!(byte.is_ascii(), "ban byte must be ASCII");
let ch = char::from(byte);
let invalid = || Err(Error::new(ErrorKind::Banned(byte)));
match expr.kind() {
HirKind::Empty => {}
HirKind::Literal(hir::Literal(ref lit)) => {
if lit.iter().find(|&&b| b == byte).is_some() {
return invalid();
}
}
HirKind::Class(hir::Class::Unicode(ref cls)) => {
if cls.ranges().iter().map(|r| r.len()).sum::<usize>() == 1 {
let contains =
|r: &&ClassUnicodeRange| r.start() <= ch && ch <= r.end();
if cls.ranges().iter().find(contains).is_some() {
return invalid();
}
}
}
HirKind::Class(hir::Class::Bytes(ref cls)) => {
if cls.ranges().iter().map(|r| r.len()).sum::<usize>() == 1 {
let contains = |r: &&ClassBytesRange| {
r.start() <= byte && byte <= r.end()
};
if cls.ranges().iter().find(contains).is_some() {
return invalid();
}
}
}
HirKind::Look(_) => {}
HirKind::Repetition(ref x) => check(&x.sub, byte)?,
HirKind::Capture(ref x) => check(&x.sub, byte)?,
HirKind::Concat(ref xs) => {
for x in xs.iter() {
check(x, byte)?;
}
}
HirKind::Alternation(ref xs) => {
for x in xs.iter() {
check(x, byte)?;
}
}
};
Ok(())
}
#[cfg(test)]
mod tests {
use regex_syntax::Parser;
/// Returns true when the given pattern is detected to contain the given
/// banned byte.
fn check(pattern: &str, byte: u8) -> bool {
let hir = Parser::new().parse(pattern).unwrap();
super::check(&hir, byte).is_err()
}
#[test]
fn various() {
assert!(check(r"\x00", 0));
assert!(check(r"a\x00", 0));
assert!(check(r"\x00b", 0));
assert!(check(r"a\x00b", 0));
assert!(check(r"\x00|ab", 0));
assert!(check(r"ab|\x00", 0));
assert!(check(r"\x00?", 0));
assert!(check(r"(\x00)", 0));
assert!(check(r"[\x00]", 0));
assert!(check(r"[^[^\x00]]", 0));
assert!(!check(r"[^\x00]", 0));
assert!(!check(r"[\x00a]", 0));
}
}

View File

@ -8,7 +8,7 @@ use {
};
use crate::{
ast::AstAnalysis, error::Error, non_matching::non_matching_bytes,
ast::AstAnalysis, ban, error::Error, non_matching::non_matching_bytes,
strip::strip_from_match,
};
@ -35,6 +35,7 @@ pub(crate) struct Config {
pub(crate) dfa_size_limit: usize,
pub(crate) nest_limit: u32,
pub(crate) line_terminator: Option<LineTerminator>,
pub(crate) ban: Option<u8>,
pub(crate) crlf: bool,
pub(crate) word: bool,
pub(crate) fixed_strings: bool,
@ -58,6 +59,7 @@ impl Default for Config {
dfa_size_limit: 1000 * (1 << 20),
nest_limit: 250,
line_terminator: None,
ban: None,
crlf: false,
word: false,
fixed_strings: false,
@ -205,6 +207,9 @@ impl ConfiguredHIR {
.build()
.translate(&pattern, &ast)
.map_err(Error::generic)?;
if let Some(byte) = config.ban {
ban::check(&hir, byte)?;
}
// We don't need to do this for the fixed-strings case above
// because is_fixed_strings will return false if any pattern
// contains a line terminator. Therefore, we don't need to strip

View File

@ -60,6 +60,8 @@ pub enum ErrorKind {
///
/// The invalid byte is included in this error.
InvalidLineTerminator(u8),
/// Occurs when a banned byte was found in a pattern.
Banned(u8),
}
impl std::error::Error for Error {}
@ -76,8 +78,15 @@ impl std::fmt::Display for Error {
ErrorKind::InvalidLineTerminator(byte) => {
write!(
f,
"line terminators must be ASCII, but {} is not",
[byte].as_bstr()
"line terminators must be ASCII, but {byte:?} is not",
byte = [byte].as_bstr(),
)
}
ErrorKind::Banned(byte) => {
write!(
f,
"pattern contains {byte:?} but it is impossible to match",
byte = [byte].as_bstr(),
)
}
}

View File

@ -9,6 +9,7 @@ pub use crate::{
};
mod ast;
mod ban;
mod config;
mod error;
mod literal;

View File

@ -286,6 +286,22 @@ impl RegexMatcherBuilder {
self
}
/// Ban a byte from occurring in a regular expression pattern.
///
/// If this byte is found in the regex pattern, then an error will be
/// returned at construction time.
///
/// This is useful when binary detection is enabled. Callers will likely
/// want to ban the same byte that is used to detect binary data, i.e.,
/// the NUL byte. The reason for this is that when binary detection is
/// enabled, it's impossible to match a NUL byte because binary detection
/// will either quit when one is found, or will convert NUL bytes to line
/// terminators to avoid exorbitant heap usage.
pub fn ban_byte(&mut self, byte: Option<u8>) -> &mut RegexMatcherBuilder {
self.config.ban = byte;
self
}
/// Set the line terminator to `\r\n` and enable CRLF matching for `$` in
/// regex patterns.
///

View File

@ -47,7 +47,7 @@ pub(crate) fn alloc_error(limit: usize) -> io::Error {
/// is that binary data often indicates data that is undesirable to search
/// using textual patterns. Of course, there are many cases in which this isn't
/// true, which is why binary detection is disabled by default.
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum BinaryDetection {
/// No binary detection is performed. Data reported by the line buffer may
/// contain arbitrary bytes.

View File

@ -52,7 +52,7 @@ type Range = Match;
/// heap, then binary detection is only guaranteed to be applied to the
/// parts corresponding to a match. When `Quit` is enabled, then the first
/// few KB of the data are searched for binary data.
#[derive(Clone, Debug, Default)]
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct BinaryDetection(line_buffer::BinaryDetection);
impl BinaryDetection {

View File

@ -977,6 +977,20 @@ rgtest!(r1765, |dir: Dir, mut cmd: TestCommand| {
assert!(!cmd.stdout().is_empty());
});
// See: https://github.com/BurntSushi/ripgrep/issues/1838
rgtest!(r1838_nul_error_with_binary_detection, |dir: Dir, _: TestCommand| {
// We don't support this error reporting with PCRE2 since we can't parse
// the pattern (easily) to give a good error message.
if dir.is_pcre2() {
return;
}
dir.create("test", "foo\n");
dir.command().args(&[r"foo\x00?"]).assert_err();
eqnice!("test:foo\n", dir.command().args(&["-a", r"foo\x00?"]).stdout());
});
// See: https://github.com/BurntSushi/ripgrep/issues/1866
rgtest!(r1866, |dir: Dir, mut cmd: TestCommand| {
dir.create("test", "foobar\nfoobar\nfoo quux");
cmd.args(&[