From 9626f167573527858f9736a3054882de87d6cd79 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 9 Oct 2023 18:23:36 -0400 Subject: [PATCH] progress --- Cargo.lock | 24 +- Cargo.toml | 5 + crates/globset/Cargo.toml | 4 +- crates/ignore/Cargo.toml | 2 +- crates/regex/Cargo.toml | 4 +- crates/regex/src/ast.rs | 6 +- crates/regex/src/config.rs | 91 +------ crates/regex/src/error.rs | 4 - crates/regex/src/lib.rs | 1 - crates/regex/src/matcher.rs | 409 +++++-------------------------- crates/regex/src/non_matching.rs | 9 +- crates/regex/src/word.rs | 341 -------------------------- tests/misc.rs | 12 + tests/regression.rs | 9 +- 14 files changed, 113 insertions(+), 808 deletions(-) delete mode 100644 crates/regex/src/word.rs diff --git a/Cargo.lock b/Cargo.lock index 4605c11e..73e6ad5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "aho-corasick" -version = "1.1.1" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ "memchr", ] @@ -31,9 +31,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bstr" -version = "1.6.2" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" +checksum = "c79ad7fb2dd38f3dabd76b09c6a5a20c038fc0213ef1e9afd30eb777f120f019" dependencies = [ "memchr", "regex-automata", @@ -305,9 +305,9 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "memchr" -version = "2.6.3" +version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" [[package]] name = "memmap2" @@ -395,9 +395,7 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff" +version = "1.10.0" dependencies = [ "aho-corasick", "memchr", @@ -407,9 +405,7 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" +version = "0.4.1" dependencies = [ "aho-corasick", "memchr", @@ -418,9 +414,7 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +version = "0.8.0" [[package]] name = "ripgrep" diff --git a/Cargo.toml b/Cargo.toml index 3a905569..1a3a70c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,11 @@ members = [ "crates/ignore", ] +[patch.crates-io] +regex = { path = "/home/andrew/rust/regex" } +regex-automata = { path = "/home/andrew/rust/regex/regex-automata" } +regex-syntax = { path = "/home/andrew/rust/regex/regex-syntax" } + [dependencies] bstr = "1.6.0" grep = { version = "0.2.12", path = "crates/grep" } diff --git a/crates/globset/Cargo.toml b/crates/globset/Cargo.toml index b0602239..decc7804 100644 --- a/crates/globset/Cargo.toml +++ b/crates/globset/Cargo.toml @@ -26,12 +26,12 @@ log = { version = "0.4.20", optional = true } serde = { version = "1.0.188", optional = true } [dependencies.regex-syntax] -version = "0.7.5" +version = "0.8.0" default-features = false features = ["std"] [dependencies.regex-automata] -version = "0.3.8" +version = "0.4.0" default-features = false features = ["std", "perf", "syntax", "meta", "nfa", "hybrid"] diff --git a/crates/ignore/Cargo.toml b/crates/ignore/Cargo.toml index 31771f17..81dc9284 100644 --- a/crates/ignore/Cargo.toml +++ b/crates/ignore/Cargo.toml @@ -27,7 +27,7 @@ same-file = "1.0.6" walkdir = "2.4.0" [dependencies.regex-automata] -version = "0.3.8" +version = "0.4.0" default-features = false features = ["std", "perf", "syntax", "meta", "nfa", "hybrid", "dfa-onepass"] diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml index f0ca8394..f3266081 100644 --- a/crates/regex/Cargo.toml +++ b/crates/regex/Cargo.toml @@ -17,5 +17,5 @@ edition = "2021" bstr = "1.6.2" grep-matcher = { version = "0.1.6", path = "../matcher" } log = "0.4.20" -regex-automata = { version = "0.3.8" } -regex-syntax = "0.7.5" +regex-automata = { version = "0.4.0" } +regex-syntax = "0.8.0" diff --git a/crates/regex/src/ast.rs b/crates/regex/src/ast.rs index 4d170565..a5a0573a 100644 --- a/crates/regex/src/ast.rs +++ b/crates/regex/src/ast.rs @@ -62,12 +62,12 @@ impl AstAnalysis { Ast::Flags(_) | Ast::Dot(_) | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => {} + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => {} Ast::Literal(ref x) => { self.from_ast_literal(x); } - Ast::Class(ast::Class::Bracketed(ref x)) => { + Ast::ClassBracketed(ref x) => { self.from_ast_class_set(&x.kind); } Ast::Repetition(ref x) => { diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs index 79642580..8c69ef54 100644 --- a/crates/regex/src/config.rs +++ b/crates/regex/src/config.rs @@ -3,7 +3,7 @@ use { regex_automata::meta::Regex, regex_syntax::{ ast, - hir::{self, Hir, HirKind}, + hir::{self, Hir}, }, }; @@ -296,35 +296,6 @@ impl ConfiguredHIR { } } - /// Turns this configured HIR into one that only matches when both sides of - /// the match correspond to a word boundary. - /// - /// Note that the HIR returned is like turning `pat` into - /// `(?m:^|\W)(pat)(?m:$|\W)`. That is, the true match is at capture group - /// `1` and not `0`. - pub(crate) fn into_word(self) -> Result { - // In theory building the HIR for \W should never fail, but there are - // likely some pathological cases (particularly with respect to certain - // values of limits) where it could in theory fail. - let non_word = { - let mut config = self.config.clone(); - config.fixed_strings = false; - ConfiguredHIR::new(config, &[r"\W"])? - }; - let line_anchor_start = Hir::look(self.line_anchor_start()); - let line_anchor_end = Hir::look(self.line_anchor_end()); - let hir = Hir::concat(vec![ - Hir::alternation(vec![line_anchor_start, non_word.hir.clone()]), - Hir::capture(hir::Capture { - index: 1, - name: None, - sub: Box::new(renumber_capture_indices(self.hir)?), - }), - Hir::alternation(vec![non_word.hir, line_anchor_end]), - ]); - Ok(ConfiguredHIR { config: self.config, hir }) - } - /// Turns this configured HIR into an equivalent one, but where it must /// match at the start and end of a line. pub(crate) fn into_whole_line(self) -> ConfiguredHIR { @@ -336,12 +307,20 @@ impl ConfiguredHIR { } /// Turns this configured HIR into an equivalent one, but where it must - /// match at the start and end of the haystack. - pub(crate) fn into_anchored(self) -> ConfiguredHIR { + /// match at word boundaries. + pub(crate) fn into_word(self) -> ConfiguredHIR { let hir = Hir::concat(vec![ - Hir::look(hir::Look::Start), + Hir::look(if self.config.unicode { + hir::Look::WordStartHalfUnicode + } else { + hir::Look::WordStartHalfAscii + }), self.hir, - Hir::look(hir::Look::End), + Hir::look(if self.config.unicode { + hir::Look::WordEndHalfUnicode + } else { + hir::Look::WordEndHalfAscii + }), ]); ConfiguredHIR { config: self.config, hir } } @@ -365,50 +344,6 @@ impl ConfiguredHIR { } } -/// This increments the index of every capture group in the given hir by 1. If -/// any increment results in an overflow, then an error is returned. -fn renumber_capture_indices(hir: Hir) -> Result { - Ok(match hir.into_kind() { - HirKind::Empty => Hir::empty(), - HirKind::Literal(hir::Literal(lit)) => Hir::literal(lit), - HirKind::Class(cls) => Hir::class(cls), - HirKind::Look(x) => Hir::look(x), - HirKind::Repetition(mut x) => { - x.sub = Box::new(renumber_capture_indices(*x.sub)?); - Hir::repetition(x) - } - HirKind::Capture(mut cap) => { - cap.index = match cap.index.checked_add(1) { - Some(index) => index, - None => { - // This error message kind of sucks, but it's probably - // impossible for it to happen. The only way a capture - // index can overflow addition is if the regex is huge - // (or something else has gone horribly wrong). - let msg = "could not renumber capture index, too big"; - return Err(Error::any(msg)); - } - }; - cap.sub = Box::new(renumber_capture_indices(*cap.sub)?); - Hir::capture(cap) - } - HirKind::Concat(subs) => { - let subs = subs - .into_iter() - .map(|sub| renumber_capture_indices(sub)) - .collect::, Error>>()?; - Hir::concat(subs) - } - HirKind::Alternation(subs) => { - let subs = subs - .into_iter() - .map(|sub| renumber_capture_indices(sub)) - .collect::, Error>>()?; - Hir::alternation(subs) - } - }) -} - /// Returns true if the given literal string contains any byte from the line /// terminator given. fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool { diff --git a/crates/regex/src/error.rs b/crates/regex/src/error.rs index 1c921773..88a8adbe 100644 --- a/crates/regex/src/error.rs +++ b/crates/regex/src/error.rs @@ -30,10 +30,6 @@ impl Error { Error { kind: ErrorKind::Regex(err.to_string()) } } - pub(crate) fn any(msg: E) -> Error { - Error { kind: ErrorKind::Regex(msg.to_string()) } - } - /// Return the kind of this error. pub fn kind(&self) -> &ErrorKind { &self.kind diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs index 068c7c71..4693bff1 100644 --- a/crates/regex/src/lib.rs +++ b/crates/regex/src/lib.rs @@ -15,4 +15,3 @@ mod literal; mod matcher; mod non_matching; mod strip; -mod word; diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs index 65c61d27..f3f673ff 100644 --- a/crates/regex/src/matcher.rs +++ b/crates/regex/src/matcher.rs @@ -1,5 +1,3 @@ -use std::sync::Arc; - use { grep_matcher::{ ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher, @@ -11,12 +9,7 @@ use { }, }; -use crate::{ - config::{Config, ConfiguredHIR}, - error::Error, - literal::InnerLiterals, - word::WordMatcher, -}; +use crate::{config::Config, error::Error, literal::InnerLiterals}; /// A builder for constructing a `Matcher` using regular expressions. /// @@ -61,9 +54,15 @@ impl RegexMatcherBuilder { &self, patterns: &[P], ) -> Result { - let chir = self.config.build_many(patterns)?; - let matcher = RegexMatcherImpl::new(chir)?; - let (chir, re) = (matcher.chir(), matcher.regex()); + let mut chir = self.config.build_many(patterns)?; + // 'whole_line' is a strict subset of 'word', so when it is enabled, + // we don't need to both with any specific to word matching. + if chir.config().whole_line { + chir = chir.into_whole_line(); + } else if chir.config().word { + chir = chir.into_word(); + } + let regex = chir.to_regex()?; log::trace!("final regex: {:?}", chir.hir().to_string()); let non_matching_bytes = chir.non_matching_bytes(); @@ -76,18 +75,13 @@ impl RegexMatcherBuilder { // then run the original regex on only that line. (In this case, the // regex engine is likely to handle this case for us since it's so // simple, but the idea applies.) - let fast_line_regex = InnerLiterals::new(chir, re).one_regex()?; + let fast_line_regex = InnerLiterals::new(&chir, ®ex).one_regex()?; // We override the line terminator in case the configured HIR doesn't // support it. let mut config = self.config.clone(); config.line_terminator = chir.line_terminator(); - Ok(RegexMatcher { - config, - matcher, - fast_line_regex, - non_matching_bytes, - }) + Ok(RegexMatcher { config, regex, fast_line_regex, non_matching_bytes }) } /// Build a new matcher from a plain alternation of literals. @@ -357,8 +351,9 @@ impl RegexMatcherBuilder { pub struct RegexMatcher { /// The configuration specified by the caller. config: Config, - /// The underlying matcher implementation. - matcher: RegexMatcherImpl, + /// The regular expression compiled from the pattern provided by the + /// caller. + regex: Regex, /// A regex that never reports false negatives but may report false /// positives that is believed to be capable of being matched more quickly /// than `regex`. Typically, this is a single literal or an alternation @@ -392,53 +387,6 @@ impl RegexMatcher { } } -/// An encapsulation of the type of matcher we use in `RegexMatcher`. -#[derive(Clone, Debug)] -enum RegexMatcherImpl { - /// The standard matcher used for all regular expressions. - Standard(StandardMatcher), - /// A matcher that only matches at word boundaries. This transforms the - /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`. - /// Because of this, the WordMatcher provides its own implementation of - /// `Matcher` to encapsulate its use of capture groups to make them - /// invisible to the caller. - Word(WordMatcher), -} - -impl RegexMatcherImpl { - /// Based on the configuration, create a new implementation of the - /// `Matcher` trait. - fn new(mut chir: ConfiguredHIR) -> Result { - // When whole_line is set, we don't use a word matcher even if word - // matching was requested. Why? Because `(?m:^)(pat)(?m:$)` implies - // word matching. - Ok(if chir.config().word && !chir.config().whole_line { - RegexMatcherImpl::Word(WordMatcher::new(chir)?) - } else { - if chir.config().whole_line { - chir = chir.into_whole_line(); - } - RegexMatcherImpl::Standard(StandardMatcher::new(chir)?) - }) - } - - /// Return the underlying regex object used. - fn regex(&self) -> &Regex { - match *self { - RegexMatcherImpl::Word(ref x) => x.regex(), - RegexMatcherImpl::Standard(ref x) => &x.regex, - } - } - - /// Return the underlying HIR of the regex used for searching. - fn chir(&self) -> &ConfiguredHIR { - match *self { - RegexMatcherImpl::Word(ref x) => x.chir(), - RegexMatcherImpl::Standard(ref x) => &x.chir, - } - } -} - // This implementation just dispatches on the internal matcher impl except // for the line terminator optimization, which is possibly executed via // `fast_line_regex`. @@ -446,265 +394,7 @@ impl Matcher for RegexMatcher { type Captures = RegexCaptures; type Error = NoError; - fn find_at( - &self, - haystack: &[u8], - at: usize, - ) -> Result, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.find_at(haystack, at), - Word(ref m) => m.find_at(haystack, at), - } - } - - fn new_captures(&self) -> Result { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.new_captures(), - Word(ref m) => m.new_captures(), - } - } - - fn capture_count(&self) -> usize { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.capture_count(), - Word(ref m) => m.capture_count(), - } - } - - fn capture_index(&self, name: &str) -> Option { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.capture_index(name), - Word(ref m) => m.capture_index(name), - } - } - - fn find(&self, haystack: &[u8]) -> Result, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.find(haystack), - Word(ref m) => m.find(haystack), - } - } - - fn find_iter(&self, haystack: &[u8], matched: F) -> Result<(), NoError> - where - F: FnMut(Match) -> bool, - { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.find_iter(haystack, matched), - Word(ref m) => m.find_iter(haystack, matched), - } - } - - fn try_find_iter( - &self, - haystack: &[u8], - matched: F, - ) -> Result, NoError> - where - F: FnMut(Match) -> Result, - { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.try_find_iter(haystack, matched), - Word(ref m) => m.try_find_iter(haystack, matched), - } - } - - fn captures( - &self, - haystack: &[u8], - caps: &mut RegexCaptures, - ) -> Result { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.captures(haystack, caps), - Word(ref m) => m.captures(haystack, caps), - } - } - - fn captures_iter( - &self, - haystack: &[u8], - caps: &mut RegexCaptures, - matched: F, - ) -> Result<(), NoError> - where - F: FnMut(&RegexCaptures) -> bool, - { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.captures_iter(haystack, caps, matched), - Word(ref m) => m.captures_iter(haystack, caps, matched), - } - } - - fn try_captures_iter( - &self, - haystack: &[u8], - caps: &mut RegexCaptures, - matched: F, - ) -> Result, NoError> - where - F: FnMut(&RegexCaptures) -> Result, - { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.try_captures_iter(haystack, caps, matched), - Word(ref m) => m.try_captures_iter(haystack, caps, matched), - } - } - - fn captures_at( - &self, - haystack: &[u8], - at: usize, - caps: &mut RegexCaptures, - ) -> Result { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.captures_at(haystack, at, caps), - Word(ref m) => m.captures_at(haystack, at, caps), - } - } - - fn replace( - &self, - haystack: &[u8], - dst: &mut Vec, - append: F, - ) -> Result<(), NoError> - where - F: FnMut(Match, &mut Vec) -> bool, - { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.replace(haystack, dst, append), - Word(ref m) => m.replace(haystack, dst, append), - } - } - - fn replace_with_captures( - &self, - haystack: &[u8], - caps: &mut RegexCaptures, - dst: &mut Vec, - append: F, - ) -> Result<(), NoError> - where - F: FnMut(&Self::Captures, &mut Vec) -> bool, - { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => { - m.replace_with_captures(haystack, caps, dst, append) - } - Word(ref m) => { - m.replace_with_captures(haystack, caps, dst, append) - } - } - } - - fn is_match(&self, haystack: &[u8]) -> Result { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.is_match(haystack), - Word(ref m) => m.is_match(haystack), - } - } - - fn is_match_at( - &self, - haystack: &[u8], - at: usize, - ) -> Result { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.is_match_at(haystack, at), - Word(ref m) => m.is_match_at(haystack, at), - } - } - - fn shortest_match( - &self, - haystack: &[u8], - ) -> Result, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.shortest_match(haystack), - Word(ref m) => m.shortest_match(haystack), - } - } - - fn shortest_match_at( - &self, - haystack: &[u8], - at: usize, - ) -> Result, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.shortest_match_at(haystack, at), - Word(ref m) => m.shortest_match_at(haystack, at), - } - } - - fn non_matching_bytes(&self) -> Option<&ByteSet> { - Some(&self.non_matching_bytes) - } - - fn line_terminator(&self) -> Option { - self.config.line_terminator - } - - fn find_candidate_line( - &self, - haystack: &[u8], - ) -> Result, NoError> { - Ok(match self.fast_line_regex { - Some(ref regex) => { - let input = Input::new(haystack); - regex - .search_half(&input) - .map(|hm| LineMatchKind::Candidate(hm.offset())) - } - None => { - self.shortest_match(haystack)?.map(LineMatchKind::Confirmed) - } - }) - } -} - -/// The implementation of the standard regex matcher. -#[derive(Clone, Debug)] -struct StandardMatcher { - /// The regular expression compiled from the pattern provided by the - /// caller. - regex: Regex, - /// The HIR that produced this regex. - /// - /// We put this in an `Arc` because by the time it gets here, it won't - /// change. And because cloning and dropping an `Hir` is somewhat expensive - /// due to its deep recursive representation. - chir: Arc, -} - -impl StandardMatcher { - fn new(chir: ConfiguredHIR) -> Result { - let chir = Arc::new(chir); - let regex = chir.to_regex()?; - Ok(StandardMatcher { regex, chir }) - } -} - -impl Matcher for StandardMatcher { - type Captures = RegexCaptures; - type Error = NoError; - + #[inline] fn find_at( &self, haystack: &[u8], @@ -714,18 +404,22 @@ impl Matcher for StandardMatcher { Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end()))) } + #[inline] fn new_captures(&self) -> Result { Ok(RegexCaptures::new(self.regex.create_captures())) } + #[inline] fn capture_count(&self) -> usize { self.regex.captures_len() } + #[inline] fn capture_index(&self, name: &str) -> Option { self.regex.group_info().to_index(PatternID::ZERO, name) } + #[inline] fn try_find_iter( &self, haystack: &[u8], @@ -744,6 +438,7 @@ impl Matcher for StandardMatcher { Ok(Ok(())) } + #[inline] fn captures_at( &self, haystack: &[u8], @@ -756,6 +451,7 @@ impl Matcher for StandardMatcher { Ok(caps.is_match()) } + #[inline] fn shortest_match_at( &self, haystack: &[u8], @@ -764,6 +460,34 @@ impl Matcher for StandardMatcher { let input = Input::new(haystack).span(at..haystack.len()); Ok(self.regex.search_half(&input).map(|hm| hm.offset())) } + + #[inline] + fn non_matching_bytes(&self) -> Option<&ByteSet> { + Some(&self.non_matching_bytes) + } + + #[inline] + fn line_terminator(&self) -> Option { + self.config.line_terminator + } + + #[inline] + fn find_candidate_line( + &self, + haystack: &[u8], + ) -> Result, NoError> { + Ok(match self.fast_line_regex { + Some(ref regex) => { + let input = Input::new(haystack); + regex + .search_half(&input) + .map(|hm| LineMatchKind::Candidate(hm.offset())) + } + None => { + self.shortest_match(haystack)?.map(LineMatchKind::Confirmed) + } + }) + } } /// Represents the match offsets of each capturing group in a match. @@ -784,46 +508,27 @@ impl Matcher for StandardMatcher { pub struct RegexCaptures { /// Where the captures are stored. caps: AutomataCaptures, - /// These captures behave as if the capturing groups begin at the given - /// offset. When set to `0`, this has no affect and capture groups are - /// indexed like normal. - /// - /// This is useful when building matchers that wrap arbitrary regular - /// expressions. For example, `WordMatcher` takes an existing regex - /// `re` and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that - /// the regex has been wrapped from the caller. In order to do this, - /// the matcher and the capturing groups must behave as if `(re)` is - /// the `0`th capture group. - offset: usize, } impl Captures for RegexCaptures { + #[inline] fn len(&self) -> usize { - self.caps - .group_info() - .all_group_len() - .checked_sub(self.offset) - .unwrap() + self.caps.group_info().all_group_len() } + #[inline] fn get(&self, i: usize) -> Option { - let actual = i.checked_add(self.offset).unwrap(); - self.caps.get_group(actual).map(|sp| Match::new(sp.start, sp.end)) + self.caps.get_group(i).map(|sp| Match::new(sp.start, sp.end)) } } impl RegexCaptures { + #[inline] pub(crate) fn new(caps: AutomataCaptures) -> RegexCaptures { - RegexCaptures::with_offset(caps, 0) - } - - pub(crate) fn with_offset( - caps: AutomataCaptures, - offset: usize, - ) -> RegexCaptures { - RegexCaptures { caps, offset } + RegexCaptures { caps } } + #[inline] pub(crate) fn captures_mut(&mut self) -> &mut AutomataCaptures { &mut self.caps } diff --git a/crates/regex/src/non_matching.rs b/crates/regex/src/non_matching.rs index 7fde6c46..f93ed13b 100644 --- a/crates/regex/src/non_matching.rs +++ b/crates/regex/src/non_matching.rs @@ -19,7 +19,14 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) { match *expr.kind() { HirKind::Empty | HirKind::Look(Look::WordAscii | Look::WordAsciiNegate) - | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {} + | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) + | HirKind::Look(Look::WordStartAscii | Look::WordStartUnicode) + | HirKind::Look(Look::WordEndAscii | Look::WordEndUnicode) + | HirKind::Look( + Look::WordStartHalfAscii | Look::WordStartHalfUnicode, + ) + | HirKind::Look(Look::WordEndHalfAscii | Look::WordEndHalfUnicode) => { + } HirKind::Look(Look::Start | Look::End) => { // FIXME: This is wrong, but not doing this leads to incorrect // results because of how anchored searches are implemented in diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs deleted file mode 100644 index 52fb61ce..00000000 --- a/crates/regex/src/word.rs +++ /dev/null @@ -1,341 +0,0 @@ -use std::{ - collections::HashMap, - panic::{RefUnwindSafe, UnwindSafe}, - sync::Arc, -}; - -use { - grep_matcher::{Match, Matcher, NoError}, - regex_automata::{ - meta::Regex, util::captures::Captures, util::pool::Pool, Input, - PatternID, - }, -}; - -use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures}; - -type PoolFn = - Box Captures + Send + Sync + UnwindSafe + RefUnwindSafe>; - -/// A matcher for implementing "word match" semantics. -#[derive(Debug)] -pub(crate) struct WordMatcher { - /// The regex which is roughly `(?:^|\W)()(?:$|\W)`. - regex: Regex, - /// The HIR that produced the regex above. We don't keep the HIR for the - /// `original` regex. - /// - /// We put this in an `Arc` because by the time it gets here, it won't - /// change. And because cloning and dropping an `Hir` is somewhat expensive - /// due to its deep recursive representation. - chir: Arc, - /// The original regex supplied by the user, which we use in a fast path - /// to try and detect matches before deferring to slower engines. - original: Regex, - /// A map from capture group name to capture group index. - names: HashMap, - /// A thread-safe pool of reusable buffers for finding the match offset of - /// the inner group. - caps: Arc>, -} - -impl Clone for WordMatcher { - fn clone(&self) -> WordMatcher { - // We implement Clone manually so that we get a fresh Pool such that it - // can set its own thread owner. This permits each thread usings `caps` - // to hit the fast path. - // - // Note that cloning a regex is "cheap" since it uses reference - // counting internally. - let re = self.regex.clone(); - WordMatcher { - regex: self.regex.clone(), - chir: Arc::clone(&self.chir), - original: self.original.clone(), - names: self.names.clone(), - caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))), - } - } -} - -impl WordMatcher { - /// Create a new matcher from the given pattern that only produces matches - /// that are considered "words." - /// - /// The given options are used to construct the regular expression - /// internally. - pub(crate) fn new(chir: ConfiguredHIR) -> Result { - let original = chir.clone().into_anchored().to_regex()?; - let chir = Arc::new(chir.into_word()?); - let regex = chir.to_regex()?; - let caps = Arc::new(Pool::new({ - let regex = regex.clone(); - Box::new(move || regex.create_captures()) as PoolFn - })); - - let mut names = HashMap::new(); - let it = regex.group_info().pattern_names(PatternID::ZERO); - for (i, optional_name) in it.enumerate() { - if let Some(name) = optional_name { - names.insert(name.to_string(), i.checked_sub(1).unwrap()); - } - } - Ok(WordMatcher { regex, chir, original, names, caps }) - } - - /// Return the underlying regex used to match at word boundaries. - /// - /// The original regex is in the capture group at index 1. - pub(crate) fn regex(&self) -> &Regex { - &self.regex - } - - /// Return the underlying HIR for the regex used to match at word - /// boundaries. - pub(crate) fn chir(&self) -> &ConfiguredHIR { - &self.chir - } - - /// Attempt to do a fast confirmation of a word match that covers a subset - /// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned - /// when a match is found. Ok(None) is returned when there is definitively - /// no match. Err(()) is returned when this routine could not detect - /// whether there was a match or not. - fn fast_find( - &self, - haystack: &[u8], - at: usize, - ) -> Result, ()> { - // This is a bit hairy. The whole point here is to avoid running a - // slower regex engine to extract capture groups. Remember, our word - // regex looks like this: - // - // (^|\W)()(\W|$) - // - // What we want are the match offsets of . So in the - // easy/common case, the original regex will be sandwiched between - // two codepoints that are in the \W class. So our approach here is to - // look for a match of the overall word regexp, strip the \W ends and - // then check whether the original regex matches what's left. If so, - // then we are guaranteed a correct match. - // - // This only works though if we know that the match is sandwiched - // between two \W codepoints. This only occurs when neither ^ nor $ - // match. This in turn only occurs when the match is at either the - // beginning or end of the haystack. In either of those cases, we - // declare defeat and defer to the slower implementation. - // - // The reason why we cannot handle the ^/$ cases here is because we - // can't assume anything about the original pattern. (Try commenting - // out the checks for ^/$ below and run the tests to see examples.) - // - // NOTE(2023-07-31): After fixing #2574, this logic honestly still - // doesn't seem correct. Regex composition is hard. - let input = Input::new(haystack).span(at..haystack.len()); - let mut cand = match self.regex.find(input) { - None => return Ok(None), - Some(m) => Match::new(m.start(), m.end()), - }; - if cand.start() == 0 || cand.end() == haystack.len() { - return Err(()); - } - // We decode the chars on either side of the match. If either char is - // a word character, then that means the ^/$ matched and not \W. In - // that case, we defer to the slower engine. - let (ch, slen) = bstr::decode_utf8(&haystack[cand]); - if ch.map_or(true, regex_syntax::is_word_character) { - return Err(()); - } - let (ch, elen) = bstr::decode_last_utf8(&haystack[cand]); - if ch.map_or(true, regex_syntax::is_word_character) { - return Err(()); - } - let new_start = cand.start() + slen; - let new_end = cand.end() - elen; - // This occurs the original regex can match the empty string. In this - // case, just bail instead of trying to get it right here since it's - // likely a pathological case. - if new_start > new_end { - return Err(()); - } - cand = cand.with_start(new_start).with_end(new_end); - if self.original.is_match(&haystack[cand]) { - Ok(Some(cand)) - } else { - Err(()) - } - } -} - -impl Matcher for WordMatcher { - type Captures = RegexCaptures; - type Error = NoError; - - fn find_at( - &self, - haystack: &[u8], - at: usize, - ) -> Result, NoError> { - // To make this easy to get right, we extract captures here instead of - // calling `find_at`. The actual match is at capture group `1` instead - // of `0`. We *could* use `find_at` here and then trim the match after - // the fact, but that's a bit harder to get right, and it's not clear - // if it's worth it. - // - // OK, well, it turns out that it is worth it! But it is quite tricky. - // See `fast_find` for details. Effectively, this lets us skip running - // a slower regex engine to extract capture groups in the vast majority - // of cases. However, the slower engine is I believe required for full - // correctness. - match self.fast_find(haystack, at) { - Ok(Some(m)) => return Ok(Some(m)), - Ok(None) => return Ok(None), - Err(()) => {} - } - - let input = Input::new(haystack).span(at..haystack.len()); - let mut caps = self.caps.get(); - self.regex.search_captures(&input, &mut caps); - Ok(caps.get_group(1).map(|sp| Match::new(sp.start, sp.end))) - } - - fn new_captures(&self) -> Result { - Ok(RegexCaptures::with_offset(self.regex.create_captures(), 1)) - } - - fn capture_count(&self) -> usize { - self.regex.captures_len().checked_sub(1).unwrap() - } - - fn capture_index(&self, name: &str) -> Option { - self.names.get(name).map(|i| *i) - } - - fn captures_at( - &self, - haystack: &[u8], - at: usize, - caps: &mut RegexCaptures, - ) -> Result { - let input = Input::new(haystack).span(at..haystack.len()); - let caps = caps.captures_mut(); - self.regex.search_captures(&input, caps); - Ok(caps.is_match()) - } - - // We specifically do not implement other methods like find_iter or - // captures_iter. Namely, the iter methods are guaranteed to be correct - // by virtue of implementing find_at and captures_at above. -} - -#[cfg(test)] -mod tests { - use super::WordMatcher; - use crate::config::Config; - use grep_matcher::{Captures, Match, Matcher}; - - fn matcher(pattern: &str) -> WordMatcher { - let chir = Config::default().build_many(&[pattern]).unwrap(); - WordMatcher::new(chir).unwrap() - } - - fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> { - matcher(pattern) - .find(haystack.as_bytes()) - .unwrap() - .map(|m| (m.start(), m.end())) - } - - fn find_by_caps(pattern: &str, haystack: &str) -> Option<(usize, usize)> { - let m = matcher(pattern); - let mut caps = m.new_captures().unwrap(); - if !m.captures(haystack.as_bytes(), &mut caps).unwrap() { - None - } else { - caps.get(0).map(|m| (m.start(), m.end())) - } - } - - // Test that the standard `find` API reports offsets correctly. - #[test] - fn various_find() { - assert_eq!(Some((0, 3)), find(r"foo", "foo")); - assert_eq!(Some((0, 3)), find(r"foo", "foo(")); - assert_eq!(Some((1, 4)), find(r"foo", "!foo(")); - assert_eq!(None, find(r"foo", "!afoo(")); - - assert_eq!(Some((0, 3)), find(r"foo", "foo☃")); - assert_eq!(None, find(r"foo", "fooб")); - - assert_eq!(Some((0, 4)), find(r"foo5", "foo5")); - assert_eq!(None, find(r"foo", "foo5")); - - assert_eq!(Some((1, 4)), find(r"foo", "!foo!")); - assert_eq!(Some((1, 5)), find(r"foo!", "!foo!")); - assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!")); - - assert_eq!(Some((0, 3)), find(r"foo", "foo\n")); - assert_eq!(Some((1, 4)), find(r"foo", "!foo!\n")); - assert_eq!(Some((1, 5)), find(r"foo!", "!foo!\n")); - assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!\n")); - - assert_eq!(Some((1, 6)), find(r"!?foo!?", "!!foo!!")); - assert_eq!(Some((0, 5)), find(r"!?foo!?", "!foo!")); - assert_eq!(Some((2, 5)), find(r"!?foo!?", "a!foo!a")); - - assert_eq!(Some((2, 7)), find(r"!?foo!?", "##!foo!\n")); - assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!##")); - assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!\n##")); - assert_eq!(Some((3, 7)), find(r"f?oo!?", "##\nfoo!##")); - assert_eq!(Some((2, 5)), find(r"(?-u)foo[^a]*", "#!foo☃aaa")); - } - - // See: https://github.com/BurntSushi/ripgrep/issues/389 - #[test] - fn regression_dash() { - assert_eq!(Some((0, 2)), find(r"-2", "-2")); - } - - // Test that the captures API also reports offsets correctly, just as - // find does. This exercises a different path in the code since captures - // are handled differently. - #[test] - fn various_captures() { - assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo")); - assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo(")); - assert_eq!(Some((1, 4)), find_by_caps(r"foo", "!foo(")); - assert_eq!(None, find_by_caps(r"foo", "!afoo(")); - - assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo☃")); - assert_eq!(None, find_by_caps(r"foo", "fooб")); - // assert_eq!(Some((0, 3)), find_by_caps(r"foo", "fooб")); - - // See: https://github.com/BurntSushi/ripgrep/issues/389 - assert_eq!(Some((0, 2)), find_by_caps(r"-2", "-2")); - } - - // Test that the capture reporting methods work as advertised. - #[test] - fn capture_indexing() { - let m = matcher(r"(a)(?Pb)(c)"); - assert_eq!(4, m.capture_count()); - assert_eq!(Some(2), m.capture_index("foo")); - - let mut caps = m.new_captures().unwrap(); - assert_eq!(4, caps.len()); - - assert!(m.captures(b"abc", &mut caps).unwrap()); - assert_eq!(caps.get(0), Some(Match::new(0, 3))); - assert_eq!(caps.get(1), Some(Match::new(0, 1))); - assert_eq!(caps.get(2), Some(Match::new(1, 2))); - assert_eq!(caps.get(3), Some(Match::new(2, 3))); - assert_eq!(caps.get(4), None); - - assert!(m.captures(b"#abc#", &mut caps).unwrap()); - assert_eq!(caps.get(0), Some(Match::new(1, 4))); - assert_eq!(caps.get(1), Some(Match::new(1, 2))); - assert_eq!(caps.get(2), Some(Match::new(2, 3))); - assert_eq!(caps.get(3), Some(Match::new(3, 4))); - assert_eq!(caps.get(4), None); - } -} diff --git a/tests/misc.rs b/tests/misc.rs index 4fa8632e..40779056 100644 --- a/tests/misc.rs +++ b/tests/misc.rs @@ -144,6 +144,18 @@ For the Doctor Watsons of this world, as opposed to the Sherlock eqnice!(expected, cmd.stdout()); }); +rgtest!(word_period, |dir: Dir, mut cmd: TestCommand| { + dir.create("haystack", "..."); + cmd.arg("-ow").arg(".").arg("haystack"); + + let expected = "\ +. +. +. +"; + eqnice!(expected, cmd.stdout()); +}); + rgtest!(line, |dir: Dir, mut cmd: TestCommand| { dir.create("sherlock", SHERLOCK); cmd.args(&[ diff --git a/tests/regression.rs b/tests/regression.rs index 24551fc0..994006a7 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -1046,17 +1046,10 @@ rgtest!(r1878, |dir: Dir, _: TestCommand| { // See: https://github.com/BurntSushi/ripgrep/issues/1891 rgtest!(r1891, |dir: Dir, mut cmd: TestCommand| { - // TODO: Sadly, PCRE2 has different behavior here. Not clear why. We should - // look into this and see if there's a fix needed at the regex engine - // level. - if dir.is_pcre2() { - return; - } - dir.create("test", "\n##\n"); // N.B. We use -o here to force the issue to occur, which seems to only // happen when each match needs to be detected. - eqnice!("1:\n2:\n2:\n", cmd.args(&["-won", "", "test"]).stdout()); + eqnice!("1:\n2:\n2:\n2:\n", cmd.args(&["-won", "", "test"]).stdout()); }); // See: https://github.com/BurntSushi/ripgrep/issues/2095