From 9ed7565fcb47d02e76398d41f24ba0817f6b9c2f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 25 Nov 2023 10:39:08 -0500 Subject: [PATCH] cli: error when searching for NUL Basically, unless the -a/--text flag is given, it is generally always an error to search for an explicit NUL byte because the binary detection will prevent it from matching. Fixes #1838 --- CHANGELOG.md | 2 + crates/core/flags/hiargs.rs | 28 +++++++++- crates/regex/src/ban.rs | 83 +++++++++++++++++++++++++++++ crates/regex/src/config.rs | 7 ++- crates/regex/src/error.rs | 13 ++++- crates/regex/src/lib.rs | 1 + crates/regex/src/matcher.rs | 16 ++++++ crates/searcher/src/line_buffer.rs | 2 +- crates/searcher/src/searcher/mod.rs | 2 +- tests/regression.rs | 14 +++++ 10 files changed, 162 insertions(+), 6 deletions(-) create mode 100644 crates/regex/src/ban.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index afdbab2c..feaac274 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,8 @@ Feature enhancements: Add new `--stop-on-nonmatch` flag. * [FEATURE #1814](https://github.com/BurntSushi/ripgrep/issues/1814): Flags are now categorized in `-h/--help` output and ripgrep's man page. +* [FEATURE #1838](https://github.com/BurntSushi/ripgrep/issues/1838): + An error is shown when searching for NUL bytes with binary detection enabled. * [FEATURE #2195](https://github.com/BurntSushi/ripgrep/issues/2195): When `extra-verbose` mode is enabled in zsh, show extra file type info. * [FEATURE #2298](https://github.com/BurntSushi/ripgrep/issues/2298): diff --git a/crates/core/flags/hiargs.rs b/crates/core/flags/hiargs.rs index 41a6bc29..59a178e4 100644 --- a/crates/core/flags/hiargs.rs +++ b/crates/core/flags/hiargs.rs @@ -499,9 +499,14 @@ impl HiArgs { if let Some(limit) = self.dfa_size_limit { builder.dfa_size_limit(limit); } + if !self.binary.is_none() { + builder.ban_byte(Some(b'\x00')); + } let m = match builder.build_many(&self.patterns.patterns) { Ok(m) => m, - Err(err) => anyhow::bail!(suggest_multiline(err.to_string())), + Err(err) => { + anyhow::bail!(suggest_text(suggest_multiline(err.to_string()))) + } }; Ok(PatternMatcher::RustRegex(m)) } @@ -1144,6 +1149,13 @@ impl BinaryDetection { }; BinaryDetection { explicit, implicit } } + + /// Returns true when both implicit and explicit binary detection is + /// disabled. + pub(crate) fn is_none(&self) -> bool { + let none = grep::searcher::BinaryDetection::none(); + self.explicit == none && self.implicit == none + } } /// Builds the file type matcher from low level arguments. @@ -1428,3 +1440,17 @@ When multiline mode is enabled, new line characters can be matched.", msg } } + +/// Possibly suggest the `-a/--text` flag. +fn suggest_text(msg: String) -> String { + if msg.contains("pattern contains \"\\0\"") { + format!( + "{msg} + +Consider enabling text mode with the --text flag (or -a for short). Otherwise, +binary detection is enabled and matching a NUL byte is impossible.", + ) + } else { + msg + } +} diff --git a/crates/regex/src/ban.rs b/crates/regex/src/ban.rs new file mode 100644 index 00000000..1bd62e28 --- /dev/null +++ b/crates/regex/src/ban.rs @@ -0,0 +1,83 @@ +use regex_syntax::hir::{ + self, ClassBytesRange, ClassUnicodeRange, Hir, HirKind, +}; + +use crate::error::{Error, ErrorKind}; + +/// Returns an error when a sub-expression in `expr` must match `byte`. +pub(crate) fn check(expr: &Hir, byte: u8) -> Result<(), Error> { + assert!(byte.is_ascii(), "ban byte must be ASCII"); + let ch = char::from(byte); + let invalid = || Err(Error::new(ErrorKind::Banned(byte))); + match expr.kind() { + HirKind::Empty => {} + HirKind::Literal(hir::Literal(ref lit)) => { + if lit.iter().find(|&&b| b == byte).is_some() { + return invalid(); + } + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if cls.ranges().iter().map(|r| r.len()).sum::() == 1 { + let contains = + |r: &&ClassUnicodeRange| r.start() <= ch && ch <= r.end(); + if cls.ranges().iter().find(contains).is_some() { + return invalid(); + } + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if cls.ranges().iter().map(|r| r.len()).sum::() == 1 { + let contains = |r: &&ClassBytesRange| { + r.start() <= byte && byte <= r.end() + }; + if cls.ranges().iter().find(contains).is_some() { + return invalid(); + } + } + } + HirKind::Look(_) => {} + HirKind::Repetition(ref x) => check(&x.sub, byte)?, + HirKind::Capture(ref x) => check(&x.sub, byte)?, + HirKind::Concat(ref xs) => { + for x in xs.iter() { + check(x, byte)?; + } + } + HirKind::Alternation(ref xs) => { + for x in xs.iter() { + check(x, byte)?; + } + } + }; + Ok(()) +} + +#[cfg(test)] +mod tests { + use regex_syntax::Parser; + + /// Returns true when the given pattern is detected to contain the given + /// banned byte. + fn check(pattern: &str, byte: u8) -> bool { + let hir = Parser::new().parse(pattern).unwrap(); + super::check(&hir, byte).is_err() + } + + #[test] + fn various() { + assert!(check(r"\x00", 0)); + assert!(check(r"a\x00", 0)); + assert!(check(r"\x00b", 0)); + assert!(check(r"a\x00b", 0)); + assert!(check(r"\x00|ab", 0)); + assert!(check(r"ab|\x00", 0)); + assert!(check(r"\x00?", 0)); + assert!(check(r"(\x00)", 0)); + + assert!(check(r"[\x00]", 0)); + assert!(check(r"[^[^\x00]]", 0)); + + assert!(!check(r"[^\x00]", 0)); + assert!(!check(r"[\x00a]", 0)); + } +} diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs index 8c69ef54..7ebe035d 100644 --- a/crates/regex/src/config.rs +++ b/crates/regex/src/config.rs @@ -8,7 +8,7 @@ use { }; use crate::{ - ast::AstAnalysis, error::Error, non_matching::non_matching_bytes, + ast::AstAnalysis, ban, error::Error, non_matching::non_matching_bytes, strip::strip_from_match, }; @@ -35,6 +35,7 @@ pub(crate) struct Config { pub(crate) dfa_size_limit: usize, pub(crate) nest_limit: u32, pub(crate) line_terminator: Option, + pub(crate) ban: Option, pub(crate) crlf: bool, pub(crate) word: bool, pub(crate) fixed_strings: bool, @@ -58,6 +59,7 @@ impl Default for Config { dfa_size_limit: 1000 * (1 << 20), nest_limit: 250, line_terminator: None, + ban: None, crlf: false, word: false, fixed_strings: false, @@ -205,6 +207,9 @@ impl ConfiguredHIR { .build() .translate(&pattern, &ast) .map_err(Error::generic)?; + if let Some(byte) = config.ban { + ban::check(&hir, byte)?; + } // We don't need to do this for the fixed-strings case above // because is_fixed_strings will return false if any pattern // contains a line terminator. Therefore, we don't need to strip diff --git a/crates/regex/src/error.rs b/crates/regex/src/error.rs index 88a8adbe..a75f1f5d 100644 --- a/crates/regex/src/error.rs +++ b/crates/regex/src/error.rs @@ -60,6 +60,8 @@ pub enum ErrorKind { /// /// The invalid byte is included in this error. InvalidLineTerminator(u8), + /// Occurs when a banned byte was found in a pattern. + Banned(u8), } impl std::error::Error for Error {} @@ -76,8 +78,15 @@ impl std::fmt::Display for Error { ErrorKind::InvalidLineTerminator(byte) => { write!( f, - "line terminators must be ASCII, but {} is not", - [byte].as_bstr() + "line terminators must be ASCII, but {byte:?} is not", + byte = [byte].as_bstr(), + ) + } + ErrorKind::Banned(byte) => { + write!( + f, + "pattern contains {byte:?} but it is impossible to match", + byte = [byte].as_bstr(), ) } } diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs index 4693bff1..93e477b6 100644 --- a/crates/regex/src/lib.rs +++ b/crates/regex/src/lib.rs @@ -9,6 +9,7 @@ pub use crate::{ }; mod ast; +mod ban; mod config; mod error; mod literal; diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs index f3f673ff..003d415c 100644 --- a/crates/regex/src/matcher.rs +++ b/crates/regex/src/matcher.rs @@ -286,6 +286,22 @@ impl RegexMatcherBuilder { self } + /// Ban a byte from occurring in a regular expression pattern. + /// + /// If this byte is found in the regex pattern, then an error will be + /// returned at construction time. + /// + /// This is useful when binary detection is enabled. Callers will likely + /// want to ban the same byte that is used to detect binary data, i.e., + /// the NUL byte. The reason for this is that when binary detection is + /// enabled, it's impossible to match a NUL byte because binary detection + /// will either quit when one is found, or will convert NUL bytes to line + /// terminators to avoid exorbitant heap usage. + pub fn ban_byte(&mut self, byte: Option) -> &mut RegexMatcherBuilder { + self.config.ban = byte; + self + } + /// Set the line terminator to `\r\n` and enable CRLF matching for `$` in /// regex patterns. /// diff --git a/crates/searcher/src/line_buffer.rs b/crates/searcher/src/line_buffer.rs index badcc438..822bdee4 100644 --- a/crates/searcher/src/line_buffer.rs +++ b/crates/searcher/src/line_buffer.rs @@ -47,7 +47,7 @@ pub(crate) fn alloc_error(limit: usize) -> io::Error { /// is that binary data often indicates data that is undesirable to search /// using textual patterns. Of course, there are many cases in which this isn't /// true, which is why binary detection is disabled by default. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] pub(crate) enum BinaryDetection { /// No binary detection is performed. Data reported by the line buffer may /// contain arbitrary bytes. diff --git a/crates/searcher/src/searcher/mod.rs b/crates/searcher/src/searcher/mod.rs index ff1bea59..d4bd1563 100644 --- a/crates/searcher/src/searcher/mod.rs +++ b/crates/searcher/src/searcher/mod.rs @@ -52,7 +52,7 @@ type Range = Match; /// heap, then binary detection is only guaranteed to be applied to the /// parts corresponding to a match. When `Quit` is enabled, then the first /// few KB of the data are searched for binary data. -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, Eq, PartialEq)] pub struct BinaryDetection(line_buffer::BinaryDetection); impl BinaryDetection { diff --git a/tests/regression.rs b/tests/regression.rs index 04138fdf..54490b98 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -977,6 +977,20 @@ rgtest!(r1765, |dir: Dir, mut cmd: TestCommand| { assert!(!cmd.stdout().is_empty()); }); +// See: https://github.com/BurntSushi/ripgrep/issues/1838 +rgtest!(r1838_nul_error_with_binary_detection, |dir: Dir, _: TestCommand| { + // We don't support this error reporting with PCRE2 since we can't parse + // the pattern (easily) to give a good error message. + if dir.is_pcre2() { + return; + } + dir.create("test", "foo\n"); + + dir.command().args(&[r"foo\x00?"]).assert_err(); + eqnice!("test:foo\n", dir.command().args(&["-a", r"foo\x00?"]).stdout()); +}); + +// See: https://github.com/BurntSushi/ripgrep/issues/1866 rgtest!(r1866, |dir: Dir, mut cmd: TestCommand| { dir.create("test", "foobar\nfoobar\nfoo quux"); cmd.args(&[