1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-06-14 22:15:13 +02:00

cli: error when searching for NUL

Basically, unless the -a/--text flag is given, it is generally always an
error to search for an explicit NUL byte because the binary detection
will prevent it from matching.

Fixes #1838
This commit is contained in:
Andrew Gallant
2023-11-25 10:39:08 -05:00
parent 7bb9f35d2d
commit 9ed7565fcb
10 changed files with 162 additions and 6 deletions

83
crates/regex/src/ban.rs Normal file
View File

@ -0,0 +1,83 @@
use regex_syntax::hir::{
self, ClassBytesRange, ClassUnicodeRange, Hir, HirKind,
};
use crate::error::{Error, ErrorKind};
/// Returns an error when a sub-expression in `expr` must match `byte`.
pub(crate) fn check(expr: &Hir, byte: u8) -> Result<(), Error> {
assert!(byte.is_ascii(), "ban byte must be ASCII");
let ch = char::from(byte);
let invalid = || Err(Error::new(ErrorKind::Banned(byte)));
match expr.kind() {
HirKind::Empty => {}
HirKind::Literal(hir::Literal(ref lit)) => {
if lit.iter().find(|&&b| b == byte).is_some() {
return invalid();
}
}
HirKind::Class(hir::Class::Unicode(ref cls)) => {
if cls.ranges().iter().map(|r| r.len()).sum::<usize>() == 1 {
let contains =
|r: &&ClassUnicodeRange| r.start() <= ch && ch <= r.end();
if cls.ranges().iter().find(contains).is_some() {
return invalid();
}
}
}
HirKind::Class(hir::Class::Bytes(ref cls)) => {
if cls.ranges().iter().map(|r| r.len()).sum::<usize>() == 1 {
let contains = |r: &&ClassBytesRange| {
r.start() <= byte && byte <= r.end()
};
if cls.ranges().iter().find(contains).is_some() {
return invalid();
}
}
}
HirKind::Look(_) => {}
HirKind::Repetition(ref x) => check(&x.sub, byte)?,
HirKind::Capture(ref x) => check(&x.sub, byte)?,
HirKind::Concat(ref xs) => {
for x in xs.iter() {
check(x, byte)?;
}
}
HirKind::Alternation(ref xs) => {
for x in xs.iter() {
check(x, byte)?;
}
}
};
Ok(())
}
#[cfg(test)]
mod tests {
use regex_syntax::Parser;
/// Returns true when the given pattern is detected to contain the given
/// banned byte.
fn check(pattern: &str, byte: u8) -> bool {
let hir = Parser::new().parse(pattern).unwrap();
super::check(&hir, byte).is_err()
}
#[test]
fn various() {
assert!(check(r"\x00", 0));
assert!(check(r"a\x00", 0));
assert!(check(r"\x00b", 0));
assert!(check(r"a\x00b", 0));
assert!(check(r"\x00|ab", 0));
assert!(check(r"ab|\x00", 0));
assert!(check(r"\x00?", 0));
assert!(check(r"(\x00)", 0));
assert!(check(r"[\x00]", 0));
assert!(check(r"[^[^\x00]]", 0));
assert!(!check(r"[^\x00]", 0));
assert!(!check(r"[\x00a]", 0));
}
}

View File

@ -8,7 +8,7 @@ use {
};
use crate::{
ast::AstAnalysis, error::Error, non_matching::non_matching_bytes,
ast::AstAnalysis, ban, error::Error, non_matching::non_matching_bytes,
strip::strip_from_match,
};
@ -35,6 +35,7 @@ pub(crate) struct Config {
pub(crate) dfa_size_limit: usize,
pub(crate) nest_limit: u32,
pub(crate) line_terminator: Option<LineTerminator>,
pub(crate) ban: Option<u8>,
pub(crate) crlf: bool,
pub(crate) word: bool,
pub(crate) fixed_strings: bool,
@ -58,6 +59,7 @@ impl Default for Config {
dfa_size_limit: 1000 * (1 << 20),
nest_limit: 250,
line_terminator: None,
ban: None,
crlf: false,
word: false,
fixed_strings: false,
@ -205,6 +207,9 @@ impl ConfiguredHIR {
.build()
.translate(&pattern, &ast)
.map_err(Error::generic)?;
if let Some(byte) = config.ban {
ban::check(&hir, byte)?;
}
// We don't need to do this for the fixed-strings case above
// because is_fixed_strings will return false if any pattern
// contains a line terminator. Therefore, we don't need to strip

View File

@ -60,6 +60,8 @@ pub enum ErrorKind {
///
/// The invalid byte is included in this error.
InvalidLineTerminator(u8),
/// Occurs when a banned byte was found in a pattern.
Banned(u8),
}
impl std::error::Error for Error {}
@ -76,8 +78,15 @@ impl std::fmt::Display for Error {
ErrorKind::InvalidLineTerminator(byte) => {
write!(
f,
"line terminators must be ASCII, but {} is not",
[byte].as_bstr()
"line terminators must be ASCII, but {byte:?} is not",
byte = [byte].as_bstr(),
)
}
ErrorKind::Banned(byte) => {
write!(
f,
"pattern contains {byte:?} but it is impossible to match",
byte = [byte].as_bstr(),
)
}
}

View File

@ -9,6 +9,7 @@ pub use crate::{
};
mod ast;
mod ban;
mod config;
mod error;
mod literal;

View File

@ -286,6 +286,22 @@ impl RegexMatcherBuilder {
self
}
/// Ban a byte from occurring in a regular expression pattern.
///
/// If this byte is found in the regex pattern, then an error will be
/// returned at construction time.
///
/// This is useful when binary detection is enabled. Callers will likely
/// want to ban the same byte that is used to detect binary data, i.e.,
/// the NUL byte. The reason for this is that when binary detection is
/// enabled, it's impossible to match a NUL byte because binary detection
/// will either quit when one is found, or will convert NUL bytes to line
/// terminators to avoid exorbitant heap usage.
pub fn ban_byte(&mut self, byte: Option<u8>) -> &mut RegexMatcherBuilder {
self.config.ban = byte;
self
}
/// Set the line terminator to `\r\n` and enable CRLF matching for `$` in
/// regex patterns.
///