diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml index 52293a33..f564b0e6 100644 --- a/crates/regex/Cargo.toml +++ b/crates/regex/Cargo.toml @@ -11,7 +11,7 @@ repository = "https://github.com/BurntSushi/ripgrep/tree/master/crates/regex" readme = "README.md" keywords = ["regex", "grep", "search", "pattern", "line"] license = "Unlicense OR MIT" -edition = "2018" +edition = "2021" [dependencies] aho-corasick = "1.0.2" diff --git a/crates/regex/src/error.rs b/crates/regex/src/error.rs index 2320276c..cd7939dc 100644 --- a/crates/regex/src/error.rs +++ b/crates/regex/src/error.rs @@ -1,8 +1,3 @@ -use std::error; -use std::fmt; - -use crate::util; - /// An error that can occur in this crate. /// /// Generally, this error corresponds to problems building a regular @@ -32,7 +27,7 @@ impl Error { } } - pub(crate) fn generic(err: E) -> Error { + pub(crate) fn generic(err: E) -> Error { Error { kind: ErrorKind::Regex(err.to_string()) } } @@ -68,18 +63,23 @@ pub enum ErrorKind { InvalidLineTerminator(u8), } -impl error::Error for Error {} +impl std::error::Error for Error {} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use bstr::ByteSlice; -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.kind { ErrorKind::Regex(ref s) => write!(f, "{}", s), ErrorKind::NotAllowed(ref lit) => { - write!(f, "the literal '{:?}' is not allowed in a regex", lit) + write!(f, "the literal {:?} is not allowed in a regex", lit) } ErrorKind::InvalidLineTerminator(byte) => { - let x = util::show_bytes(&[byte]); - write!(f, "line terminators must be ASCII, but '{}' is not", x) + write!( + f, + "line terminators must be ASCII, but {} is not", + [byte].as_bstr() + ) } } } diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs index 35a3258d..a677b787 100644 --- a/crates/regex/src/lib.rs +++ b/crates/regex/src/lib.rs @@ -14,5 +14,4 @@ mod matcher; mod multi; mod 
non_matching; mod strip; -mod util; mod word; diff --git a/crates/regex/src/strip.rs b/crates/regex/src/strip.rs index 39bd1318..3e141563 100644 --- a/crates/regex/src/strip.rs +++ b/crates/regex/src/strip.rs @@ -1,5 +1,7 @@ -use grep_matcher::LineTerminator; -use regex_syntax::hir::{self, Hir, HirKind}; +use { + grep_matcher::LineTerminator, + regex_syntax::hir::{self, Hir, HirKind}, +}; use crate::error::{Error, ErrorKind}; @@ -15,7 +17,26 @@ use crate::error::{Error, ErrorKind}; /// /// If the given line terminator is not ASCII, then this function returns an /// error. -pub fn strip_from_match( +/// +/// Note that as of regex 1.9, this routine could theoretically be implemented +/// without returning an error. Namely, for example, we could turn +/// `foo\nbar` into `foo[a&&b]bar`. That is, replace line terminators with a +/// sub-expression that can never match anything. Thus, ripgrep would accept +/// such regexes and just silently not match anything. Regex versions prior +/// to 1.8 don't support such constructs. I ended up deciding to leave the +/// existing behavior of returning an error instead. For example: +/// +/// ```text +/// $ echo -n 'foo\nbar\n' | rg 'foo\nbar' +/// the literal "\n" is not allowed in a regex +/// +/// Consider enabling multiline mode with the --multiline flag (or -U for short). +/// When multiline mode is enabled, new line characters can be matched. +/// ``` +/// +/// This looks like a good error message to me, and even suggests a flag that +/// the user can use instead. 
+pub(crate) fn strip_from_match( expr: Hir, line_term: LineTerminator, ) -> Result { @@ -23,23 +44,20 @@ pub fn strip_from_match( let expr1 = strip_from_match_ascii(expr, b'\r')?; strip_from_match_ascii(expr1, b'\n') } else { - let b = line_term.as_byte(); - if b > 0x7F { - return Err(Error::new(ErrorKind::InvalidLineTerminator(b))); - } - strip_from_match_ascii(expr, b) + strip_from_match_ascii(expr, line_term.as_byte()) } } -/// The implementation of strip_from_match. The given byte must be ASCII. This -/// function panics otherwise. +/// The implementation of strip_from_match. The given byte must be ASCII. +/// This function returns an error otherwise. It also returns an error if +/// it couldn't remove the byte from the given regex without leaving an empty +/// character class in its place. fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result { - assert!(byte <= 0x7F); - let chr = byte as char; - assert_eq!(chr.len_utf8(), 1); - - let invalid = || Err(Error::new(ErrorKind::NotAllowed(chr.to_string()))); - + if !byte.is_ascii() { + return Err(Error::new(ErrorKind::InvalidLineTerminator(byte))); + } + let ch = char::from(byte); + let invalid = || Err(Error::new(ErrorKind::NotAllowed(ch.to_string()))); Ok(match expr.into_kind() { HirKind::Empty => Hir::empty(), HirKind::Literal(hir::Literal(lit)) => { @@ -50,7 +68,7 @@ fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result { } HirKind::Class(hir::Class::Unicode(mut cls)) => { let remove = hir::ClassUnicode::new(Some( - hir::ClassUnicodeRange::new(chr, chr), + hir::ClassUnicodeRange::new(ch, ch), )); cls.difference(&remove); if cls.ranges().is_empty() { diff --git a/crates/regex/src/util.rs b/crates/regex/src/util.rs deleted file mode 100644 index aef4f139..00000000 --- a/crates/regex/src/util.rs +++ /dev/null @@ -1,30 +0,0 @@ -/// Converts an arbitrary sequence of bytes to a literal suitable for building -/// a regular expression. 
-#[allow(dead_code)] -pub fn bytes_to_regex(bs: &[u8]) -> String { - use regex_syntax::is_meta_character; - use std::fmt::Write; - - let mut s = String::with_capacity(bs.len()); - for &b in bs { - if b <= 0x7F && !is_meta_character(b as char) { - write!(s, r"{}", b as char).unwrap(); - } else { - write!(s, r"\x{:02x}", b).unwrap(); - } - } - s -} - -/// Converts arbitrary bytes to a nice string. -pub fn show_bytes(bs: &[u8]) -> String { - use std::ascii::escape_default; - use std::str; - - let mut nice = String::new(); - for &b in bs { - let part: Vec = escape_default(b).collect(); - nice.push_str(str::from_utf8(&part).unwrap()); - } - nice -}