diff --git a/Cargo.lock b/Cargo.lock index 6029cc1a..9d8c9109 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "aho-corasick" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f2135563fb5c609d2b2b87c1e8ce7bc41b0b45430fa9661f457981503dd5bf0" +checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab" dependencies = [ "memchr", ] @@ -219,7 +219,6 @@ dependencies = [ name = "grep-regex" version = "0.1.11" dependencies = [ - "aho-corasick", "bstr", "grep-matcher", "log", diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml index 280eacf0..f0ca8394 100644 --- a/crates/regex/Cargo.toml +++ b/crates/regex/Cargo.toml @@ -14,9 +14,8 @@ license = "Unlicense OR MIT" edition = "2021" [dependencies] -aho-corasick = "1.0.2" -bstr = "1.6.0" +bstr = "1.6.2" grep-matcher = { version = "0.1.6", path = "../matcher" } -log = "0.4.19" -regex-automata = { version = "0.3.0" } -regex-syntax = "0.7.2" +log = "0.4.20" +regex-automata = { version = "0.3.8" } +regex-syntax = "0.7.5" diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs index 9175be9d..068c7c71 100644 --- a/crates/regex/src/lib.rs +++ b/crates/regex/src/lib.rs @@ -3,8 +3,10 @@ An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine. */ #![deny(missing_docs)] -pub use crate::error::{Error, ErrorKind}; -pub use crate::matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder}; +pub use crate::{ + error::{Error, ErrorKind}, + matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder}, +}; mod ast; mod config; diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs index 7b132476..65c61d27 100644 --- a/crates/regex/src/matcher.rs +++ b/crates/regex/src/matcher.rs @@ -831,9 +831,10 @@ impl RegexCaptures { #[cfg(test)] mod tests { - use super::*; use grep_matcher::{LineMatchKind, Matcher}; + use super::*; + // Test that enabling word matches does the right thing and demonstrate // the difference between it and surrounding the regex in `\b`. #[test] diff --git a/crates/regex/src/multi.rs b/crates/regex/src/multi.rs deleted file mode 100644 index 8c24a845..00000000 --- a/crates/regex/src/multi.rs +++ /dev/null @@ -1,112 +0,0 @@ -use aho_corasick::{AhoCorasick, MatchKind}; -use grep_matcher::{Match, Matcher, NoError}; -use regex_syntax::hir::{Hir, HirKind}; - -use crate::error::Error; -use crate::matcher::RegexCaptures; - -/// A matcher for an alternation of literals. -/// -/// Ideally, this optimization would be pushed down into the regex engine, but -/// making this work correctly there would require quite a bit of refactoring. -/// Moreover, doing it one layer above lets us do thing like, "if we -/// specifically only want to search for literals, then don't bother with -/// regex parsing at all." -#[derive(Clone, Debug)] -pub struct MultiLiteralMatcher { - /// The Aho-Corasick automaton. - ac: AhoCorasick, -} - -impl MultiLiteralMatcher { - /// Create a new multi-literal matcher from the given literals. - pub fn new>( - literals: &[B], - ) -> Result { - let ac = AhoCorasick::builder() - .match_kind(MatchKind::LeftmostFirst) - .build(literals) - .map_err(Error::generic)?; - Ok(MultiLiteralMatcher { ac }) - } -} - -impl Matcher for MultiLiteralMatcher { - type Captures = RegexCaptures; - type Error = NoError; - - fn find_at( - &self, - haystack: &[u8], - at: usize, - ) -> Result, NoError> { - match self.ac.find(&haystack[at..]) { - None => Ok(None), - Some(m) => Ok(Some(Match::new(at + m.start(), at + m.end()))), - } - } - - fn new_captures(&self) -> Result { - Ok(RegexCaptures::simple()) - } - - fn capture_count(&self) -> usize { - 1 - } - - fn capture_index(&self, _: &str) -> Option { - None - } - - fn captures_at( - &self, - haystack: &[u8], - at: usize, - caps: &mut RegexCaptures, - ) -> Result { - caps.set_simple(None); - let mat = self.find_at(haystack, at)?; - caps.set_simple(mat); - Ok(mat.is_some()) - } - - // We specifically do not implement other methods like find_iter. Namely, - // the iter methods are guaranteed to be correct by virtue of implementing - // find_at above. -} - -/// Alternation literals checks if the given HIR is a simple alternation of -/// literals, and if so, returns them. Otherwise, this returns None. -pub fn alternation_literals(expr: &Hir) -> Option>> { - // This is pretty hacky, but basically, if `is_alternation_literal` is - // true, then we can make several assumptions about the structure of our - // HIR. This is what justifies the `unreachable!` statements below. - - if !expr.properties().is_alternation_literal() { - return None; - } - let alts = match *expr.kind() { - HirKind::Alternation(ref alts) => alts, - _ => return None, // one literal isn't worth it - }; - - let mut lits = vec![]; - for alt in alts { - let mut lit = vec![]; - match *alt.kind() { - HirKind::Empty => {} - HirKind::Literal(ref x) => lit.extend_from_slice(&x.0), - HirKind::Concat(ref exprs) => { - for e in exprs { - match *e.kind() { - HirKind::Literal(ref x) => lit.extend_from_slice(&x.0), - _ => unreachable!("expected literal, got {:?}", e), - } - } - } - _ => unreachable!("expected literal or concat, got {:?}", alt), - } - lits.push(lit); - } - Some(lits) -} diff --git a/crates/regex/src/strip.rs b/crates/regex/src/strip.rs index f0da9446..1e960d22 100644 --- a/crates/regex/src/strip.rs +++ b/crates/regex/src/strip.rs @@ -20,11 +20,11 @@ use crate::error::{Error, ErrorKind}; /// /// Note that as of regex 1.9, this routine could theoretically be implemented /// without returning an error. Namely, for example, we could turn -/// `foo\nbar` into `foo[a&&b]bar`. That is, replace line terminators with a +/// `foo\nbar` into `foo[a&&b]bar`. That is, replace line terminator with a /// sub-expression that can never match anything. Thus, ripgrep would accept -/// such regexes and just silently not match anything. Regex versions prior to 1.8 -/// don't support such constructs. I ended up deciding to leave the existing -/// behavior of returning an error instead. For example: +/// such regexes and just silently not match anything. Regex versions prior +/// to 1.8 don't support such constructs. I ended up deciding to leave the +/// existing behavior of returning an error instead. For example: /// /// ```text /// $ echo -n 'foo\nbar\n' | rg 'foo\nbar'