diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs index d767def6..79642580 100644 --- a/crates/regex/src/config.rs +++ b/crates/regex/src/config.rs @@ -8,8 +8,8 @@ use { }; use crate::{ - ast::AstAnalysis, error::Error, literal::LiteralSets, - non_matching::non_matching_bytes, strip::strip_from_match, + ast::AstAnalysis, error::Error, non_matching::non_matching_bytes, + strip::strip_from_match, }; /// Config represents the configuration of a regex matcher in this crate. @@ -228,6 +228,11 @@ impl ConfiguredHIR { &self.config } + /// Return a reference to the underyling HIR. + pub(crate) fn hir(&self) -> &Hir { + &self.hir + } + /// Convert this HIR to a regex that can be used for matching. pub(crate) fn to_regex(&self) -> Result { let meta = Regex::config() @@ -240,8 +245,8 @@ impl ConfiguredHIR { // Same deal here. The default limit for full DFAs is VERY small, // but with ripgrep we can afford to spend a bit more time on // building them I think. - .dfa_size_limit(Some(10 * (1 << 20))) - .dfa_state_limit(Some(10_000)) + .dfa_size_limit(Some(1 * (1 << 20))) + .dfa_state_limit(Some(1_000)) .hybrid_cache_capacity(self.config.dfa_size_limit); Regex::builder() .configure(meta) @@ -249,31 +254,6 @@ impl ConfiguredHIR { .map_err(Error::regex) } - /// Convert this HIR to its concrete syntax. - pub(crate) fn to_pattern(&self) -> String { - self.hir.to_string() - } - - /// Attempt to extract a "fast" regex that can be used for quickly finding - /// candidates lines for a match. - /// - /// If no line terminator was configured, then this always returns - /// `Ok(None)`. If a line terminator is configured, then this may return a - /// regex. - pub(crate) fn to_fast_line_regex(&self) -> Result, Error> { - if self.config.line_terminator.is_none() { - return Ok(None); - } - match LiteralSets::new(&self.hir).one_regex(self.config.word) { - None => Ok(None), - Some(pattern) => { - let config = self.config.clone(); - let chir = ConfiguredHIR::new(config, &[pattern])?; - Ok(Some(chir.to_regex()?)) - } - } - } - /// Compute the set of non-matching bytes for this HIR expression. pub(crate) fn non_matching_bytes(&self) -> ByteSet { non_matching_bytes(&self.hir) diff --git a/crates/regex/src/literal.rs b/crates/regex/src/literal.rs index 19d0ccc2..518e54bf 100644 --- a/crates/regex/src/literal.rs +++ b/crates/regex/src/literal.rs @@ -1,32 +1,42 @@ -use regex_syntax::hir::Hir; +#![allow(warnings)] -// BREADCRUMBS: -// -// The way we deal with line terminators in the regex is clunky, but probably -// the least bad option for now unfortunately. -// -// The `non_matching_bytes` routine currently hardcodes line terminators for -// anchors. But it's not really clear it should even care about line terminators -// anyway, since anchors aren't actually part of a match. If we fix that -// though, that currently reveals a different bug elsewhere: '(?-m:^)' isn't -// implemented correctly in multi-line search, because it defers to the fast -// line-by-line strategy, which ends up being wrong. I think the way forward -// there is to: -// -// 1) Adding something in the grep-matcher interface that exposes a way to -// query for \A and \z specifically. If they're in the pattern, then we can -// decide how to handle them. -// -// 2) Perhaps provide a way to "translate \A/\z to ^/$" for cases when -// mulit-line search is not enabled. +use { + regex_automata::meta::Regex, + regex_syntax::hir::{ + literal::{self, Seq}, + Hir, + }, +}; + +use crate::config::ConfiguredHIR; #[derive(Clone, Debug)] -pub struct LiteralSets {} +pub(crate) struct InnerLiterals { + seq: Seq, +} -impl LiteralSets { - /// Create a set of literals from the given HIR expression. - pub fn new(_: &Hir) -> LiteralSets { - LiteralSets {} +impl InnerLiterals { + /// Create a set of inner literals from the given HIR expression. + /// + /// If no line terminator was configured, then this always declines to + /// extract literals because the inner literal optimization may not be + /// valid. + /// + /// Note that this requires the actual regex that will be used for a search + /// because it will query some state about the compiled regex. That state + /// may influence inner literal extraction. + pub(crate) fn new(chir: &ConfiguredHIR, re: &Regex) -> InnerLiterals { + let seq = Seq::infinite(); + if chir.config().line_terminator.is_none() || re.is_accelerated() { + return InnerLiterals::none(); + } + InnerLiterals { seq } + } + + /// Returns a infinite set of inner literals, such that it can never + /// produce a matcher. + pub(crate) fn none() -> InnerLiterals { + InnerLiterals { seq: Seq::infinite() } } /// If it is deemed advantageuous to do so (via various suspicious @@ -35,7 +45,10 @@ impl LiteralSets { /// generated these literal sets. The idea here is that the pattern /// returned by this method is much cheaper to search for. i.e., It is /// usually a single literal or an alternation of literals. - pub fn one_regex(&self, _word: bool) -> Option { + pub(crate) fn one_regex(&self) -> Option { + if !self.seq.is_finite() { + return None; + } None } } diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs index 95d25c43..73863374 100644 --- a/crates/regex/src/matcher.rs +++ b/crates/regex/src/matcher.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use { grep_matcher::{ ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher, @@ -12,6 +14,7 @@ use { use crate::{ config::{Config, ConfiguredHIR}, error::Error, + literal::InnerLiterals, word::WordMatcher, }; @@ -59,8 +62,26 @@ impl RegexMatcherBuilder { patterns: &[P], ) -> Result { let chir = self.config.build_many(patterns)?; - let fast_line_regex = chir.to_fast_line_regex()?; + let matcher = RegexMatcherImpl::new(chir)?; + let (chir, re) = (matcher.chir(), matcher.regex()); + log::trace!("final regex: {:?}", chir.hir().to_string()); + let non_matching_bytes = chir.non_matching_bytes(); + // If we can pick out some literals from the regex, then we might be + // able to build a faster regex that quickly identifies candidate + // matching lines. The regex engine will do what it can on its own, but + // we can specifically do a little more when a line terminator is set. + // For example, for a regex like `\w+foo\w+`, we can look for `foo`, + // and when a match is found, look for the line containing `foo` and + // then run the original regex on only that line. (In this case, the + // regex engine is likely to handle this case for us since it's so + // simple, but the idea applies.) + let fast_line_regex = match InnerLiterals::new(chir, re).one_regex() { + None => None, + Some(pattern) => { + Some(chir.config().build_many(&[pattern])?.to_regex()?) + } + }; if let Some(ref re) = fast_line_regex { log::debug!("extracted fast line regex: {:?}", re); } @@ -69,8 +90,6 @@ impl RegexMatcherBuilder { // support it. let mut config = self.config.clone(); config.line_terminator = chir.line_terminator(); - let matcher = RegexMatcherImpl::new(chir)?; - log::trace!("final regex: {:?}", matcher.regex()); Ok(RegexMatcher { config, matcher, @@ -412,10 +431,18 @@ impl RegexMatcherImpl { } /// Return the underlying regex object used. - fn regex(&self) -> String { + fn regex(&self) -> &Regex { match *self { - RegexMatcherImpl::Word(ref x) => x.pattern().to_string(), - RegexMatcherImpl::Standard(ref x) => x.pattern.clone(), + RegexMatcherImpl::Word(ref x) => x.regex(), + RegexMatcherImpl::Standard(ref x) => &x.regex, + } + } + + /// Return the underlying HIR of the regex used for searching. + fn chir(&self) -> &ConfiguredHIR { + match *self { + RegexMatcherImpl::Word(ref x) => x.chir(), + RegexMatcherImpl::Standard(ref x) => &x.chir, } } } @@ -666,15 +693,19 @@ struct StandardMatcher { /// The regular expression compiled from the pattern provided by the /// caller. regex: Regex, - /// The underlying pattern string for the regex. - pattern: String, + /// The HIR that produced this regex. + /// + /// We put this in an `Arc` because by the time it gets here, it won't + /// change. And because cloning and dropping an `Hir` is somewhat expensive + /// due to its deep recursive representation. + chir: Arc, } impl StandardMatcher { fn new(chir: ConfiguredHIR) -> Result { + let chir = Arc::new(chir); let regex = chir.to_regex()?; - let pattern = chir.to_pattern(); - Ok(StandardMatcher { regex, pattern }) + Ok(StandardMatcher { regex, chir }) } } diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs index 643ae76a..af4480ab 100644 --- a/crates/regex/src/word.rs +++ b/crates/regex/src/word.rs @@ -22,8 +22,13 @@ type PoolFn = pub(crate) struct WordMatcher { /// The regex which is roughly `(?:^|\W)()(?:$|\W)`. regex: Regex, - /// The pattern string corresponding to the above regex. - pattern: String, + /// The HIR that produced the regex above. We don't keep the HIR for the + /// `original` regex. + /// + /// We put this in an `Arc` because by the time it gets here, it won't + /// change. And because cloning and dropping an `Hir` is somewhat expensive + /// due to its deep recursive representation. + chir: Arc, /// The original regex supplied by the user, which we use in a fast path /// to try and detect matches before deferring to slower engines. original: Regex, @@ -45,7 +50,7 @@ impl Clone for WordMatcher { let re = self.regex.clone(); WordMatcher { regex: self.regex.clone(), - pattern: self.pattern.clone(), + chir: Arc::clone(&self.chir), original: self.original.clone(), names: self.names.clone(), caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))), @@ -61,9 +66,8 @@ impl WordMatcher { /// internally. pub(crate) fn new(chir: ConfiguredHIR) -> Result { let original = chir.clone().into_anchored().to_regex()?; - let word_chir = chir.into_word()?; - let regex = word_chir.to_regex()?; - let pattern = word_chir.to_pattern(); + let chir = Arc::new(chir.into_word()?); + let regex = chir.to_regex()?; let caps = Arc::new(Pool::new({ let regex = regex.clone(); Box::new(move || regex.create_captures()) as PoolFn @@ -76,13 +80,20 @@ impl WordMatcher { names.insert(name.to_string(), i.checked_sub(1).unwrap()); } } - Ok(WordMatcher { regex, pattern, original, names, caps }) + Ok(WordMatcher { regex, chir, original, names, caps }) } - /// Return the underlying pattern string for the regex used by this - /// matcher. - pub(crate) fn pattern(&self) -> &str { - &self.pattern + /// Return the underlying regex used to match at word boundaries. + /// + /// The original regex is in the capture group at index 1. + pub(crate) fn regex(&self) -> &Regex { + &self.regex + } + + /// Return the underlying HIR for the regex used to match at word + /// boundaries. + pub(crate) fn chir(&self) -> &ConfiguredHIR { + &self.chir } /// Attempt to do a fast confirmation of a word match that covers a subset