1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-01-29 22:01:04 +02:00

regex: refactor matcher construction

This does a little bit of refactoring so that we can pass both a
ConfiguredHIR and a Regex to the inner literal extraction routine.

One downside of this approach is that a regex object hangs on to a
ConfiguredHIR. But the extra memory usage is probably negligible. A
benefit though is that converting the HIR to its concrete syntax is now
lazy and only happens when logging is enabled.
This commit is contained in:
Andrew Gallant 2023-06-20 13:05:57 -04:00
parent 04dde9a4eb
commit 8ac66a9e04
4 changed files with 111 additions and 76 deletions

View File

@ -8,8 +8,8 @@ use {
};
use crate::{
ast::AstAnalysis, error::Error, literal::LiteralSets,
non_matching::non_matching_bytes, strip::strip_from_match,
ast::AstAnalysis, error::Error, non_matching::non_matching_bytes,
strip::strip_from_match,
};
/// Config represents the configuration of a regex matcher in this crate.
@ -228,6 +228,11 @@ impl ConfiguredHIR {
&self.config
}
/// Return a reference to the underyling HIR.
pub(crate) fn hir(&self) -> &Hir {
&self.hir
}
/// Convert this HIR to a regex that can be used for matching.
pub(crate) fn to_regex(&self) -> Result<Regex, Error> {
let meta = Regex::config()
@ -240,8 +245,8 @@ impl ConfiguredHIR {
// Same deal here. The default limit for full DFAs is VERY small,
// but with ripgrep we can afford to spend a bit more time on
// building them I think.
.dfa_size_limit(Some(10 * (1 << 20)))
.dfa_state_limit(Some(10_000))
.dfa_size_limit(Some(1 * (1 << 20)))
.dfa_state_limit(Some(1_000))
.hybrid_cache_capacity(self.config.dfa_size_limit);
Regex::builder()
.configure(meta)
@ -249,31 +254,6 @@ impl ConfiguredHIR {
.map_err(Error::regex)
}
/// Convert this HIR to its concrete syntax.
pub(crate) fn to_pattern(&self) -> String {
self.hir.to_string()
}
/// Attempt to extract a "fast" regex that can be used for quickly finding
/// candidates lines for a match.
///
/// If no line terminator was configured, then this always returns
/// `Ok(None)`. If a line terminator is configured, then this may return a
/// regex.
pub(crate) fn to_fast_line_regex(&self) -> Result<Option<Regex>, Error> {
if self.config.line_terminator.is_none() {
return Ok(None);
}
match LiteralSets::new(&self.hir).one_regex(self.config.word) {
None => Ok(None),
Some(pattern) => {
let config = self.config.clone();
let chir = ConfiguredHIR::new(config, &[pattern])?;
Ok(Some(chir.to_regex()?))
}
}
}
/// Compute the set of non-matching bytes for this HIR expression.
pub(crate) fn non_matching_bytes(&self) -> ByteSet {
non_matching_bytes(&self.hir)

View File

@ -1,32 +1,42 @@
use regex_syntax::hir::Hir;
#![allow(warnings)]
// BREADCRUMBS:
//
// The way we deal with line terminators in the regex is clunky, but probably
// the least bad option for now unfortunately.
//
// The `non_matching_bytes` routine currently hardcodes line terminators for
// anchors. But it's not really clear it should even care about line terminators
// anyway, since anchors aren't actually part of a match. If we fix that
// though, that currently reveals a different bug elsewhere: '(?-m:^)' isn't
// implemented correctly in multi-line search, because it defers to the fast
// line-by-line strategy, which ends up being wrong. I think the way forward
// there is to:
//
// 1) Adding something in the grep-matcher interface that exposes a way to
// query for \A and \z specifically. If they're in the pattern, then we can
// decide how to handle them.
//
// 2) Perhaps provide a way to "translate \A/\z to ^/$" for cases when
// mulit-line search is not enabled.
use {
regex_automata::meta::Regex,
regex_syntax::hir::{
literal::{self, Seq},
Hir,
},
};
use crate::config::ConfiguredHIR;
#[derive(Clone, Debug)]
pub struct LiteralSets {}
pub(crate) struct InnerLiterals {
seq: Seq,
}
impl LiteralSets {
/// Create a set of literals from the given HIR expression.
pub fn new(_: &Hir) -> LiteralSets {
LiteralSets {}
impl InnerLiterals {
/// Create a set of inner literals from the given HIR expression.
///
/// If no line terminator was configured, then this always declines to
/// extract literals because the inner literal optimization may not be
/// valid.
///
/// Note that this requires the actual regex that will be used for a search
/// because it will query some state about the compiled regex. That state
/// may influence inner literal extraction.
pub(crate) fn new(chir: &ConfiguredHIR, re: &Regex) -> InnerLiterals {
let seq = Seq::infinite();
if chir.config().line_terminator.is_none() || re.is_accelerated() {
return InnerLiterals::none();
}
InnerLiterals { seq }
}
/// Returns a infinite set of inner literals, such that it can never
/// produce a matcher.
pub(crate) fn none() -> InnerLiterals {
InnerLiterals { seq: Seq::infinite() }
}
/// If it is deemed advantageuous to do so (via various suspicious
@ -35,7 +45,10 @@ impl LiteralSets {
/// generated these literal sets. The idea here is that the pattern
/// returned by this method is much cheaper to search for. i.e., It is
/// usually a single literal or an alternation of literals.
pub fn one_regex(&self, _word: bool) -> Option<String> {
pub(crate) fn one_regex(&self) -> Option<String> {
if !self.seq.is_finite() {
return None;
}
None
}
}

View File

@ -1,3 +1,5 @@
use std::sync::Arc;
use {
grep_matcher::{
ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher,
@ -12,6 +14,7 @@ use {
use crate::{
config::{Config, ConfiguredHIR},
error::Error,
literal::InnerLiterals,
word::WordMatcher,
};
@ -59,8 +62,26 @@ impl RegexMatcherBuilder {
patterns: &[P],
) -> Result<RegexMatcher, Error> {
let chir = self.config.build_many(patterns)?;
let fast_line_regex = chir.to_fast_line_regex()?;
let matcher = RegexMatcherImpl::new(chir)?;
let (chir, re) = (matcher.chir(), matcher.regex());
log::trace!("final regex: {:?}", chir.hir().to_string());
let non_matching_bytes = chir.non_matching_bytes();
// If we can pick out some literals from the regex, then we might be
// able to build a faster regex that quickly identifies candidate
// matching lines. The regex engine will do what it can on its own, but
// we can specifically do a little more when a line terminator is set.
// For example, for a regex like `\w+foo\w+`, we can look for `foo`,
// and when a match is found, look for the line containing `foo` and
// then run the original regex on only that line. (In this case, the
// regex engine is likely to handle this case for us since it's so
// simple, but the idea applies.)
let fast_line_regex = match InnerLiterals::new(chir, re).one_regex() {
None => None,
Some(pattern) => {
Some(chir.config().build_many(&[pattern])?.to_regex()?)
}
};
if let Some(ref re) = fast_line_regex {
log::debug!("extracted fast line regex: {:?}", re);
}
@ -69,8 +90,6 @@ impl RegexMatcherBuilder {
// support it.
let mut config = self.config.clone();
config.line_terminator = chir.line_terminator();
let matcher = RegexMatcherImpl::new(chir)?;
log::trace!("final regex: {:?}", matcher.regex());
Ok(RegexMatcher {
config,
matcher,
@ -412,10 +431,18 @@ impl RegexMatcherImpl {
}
/// Return the underlying regex object used.
fn regex(&self) -> String {
fn regex(&self) -> &Regex {
match *self {
RegexMatcherImpl::Word(ref x) => x.pattern().to_string(),
RegexMatcherImpl::Standard(ref x) => x.pattern.clone(),
RegexMatcherImpl::Word(ref x) => x.regex(),
RegexMatcherImpl::Standard(ref x) => &x.regex,
}
}
/// Return the underlying HIR of the regex used for searching.
fn chir(&self) -> &ConfiguredHIR {
match *self {
RegexMatcherImpl::Word(ref x) => x.chir(),
RegexMatcherImpl::Standard(ref x) => &x.chir,
}
}
}
@ -666,15 +693,19 @@ struct StandardMatcher {
/// The regular expression compiled from the pattern provided by the
/// caller.
regex: Regex,
/// The underlying pattern string for the regex.
pattern: String,
/// The HIR that produced this regex.
///
/// We put this in an `Arc` because by the time it gets here, it won't
/// change. And because cloning and dropping an `Hir` is somewhat expensive
/// due to its deep recursive representation.
chir: Arc<ConfiguredHIR>,
}
impl StandardMatcher {
fn new(chir: ConfiguredHIR) -> Result<StandardMatcher, Error> {
let chir = Arc::new(chir);
let regex = chir.to_regex()?;
let pattern = chir.to_pattern();
Ok(StandardMatcher { regex, pattern })
Ok(StandardMatcher { regex, chir })
}
}

View File

@ -22,8 +22,13 @@ type PoolFn =
pub(crate) struct WordMatcher {
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
regex: Regex,
/// The pattern string corresponding to the above regex.
pattern: String,
/// The HIR that produced the regex above. We don't keep the HIR for the
/// `original` regex.
///
/// We put this in an `Arc` because by the time it gets here, it won't
/// change. And because cloning and dropping an `Hir` is somewhat expensive
/// due to its deep recursive representation.
chir: Arc<ConfiguredHIR>,
/// The original regex supplied by the user, which we use in a fast path
/// to try and detect matches before deferring to slower engines.
original: Regex,
@ -45,7 +50,7 @@ impl Clone for WordMatcher {
let re = self.regex.clone();
WordMatcher {
regex: self.regex.clone(),
pattern: self.pattern.clone(),
chir: Arc::clone(&self.chir),
original: self.original.clone(),
names: self.names.clone(),
caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))),
@ -61,9 +66,8 @@ impl WordMatcher {
/// internally.
pub(crate) fn new(chir: ConfiguredHIR) -> Result<WordMatcher, Error> {
let original = chir.clone().into_anchored().to_regex()?;
let word_chir = chir.into_word()?;
let regex = word_chir.to_regex()?;
let pattern = word_chir.to_pattern();
let chir = Arc::new(chir.into_word()?);
let regex = chir.to_regex()?;
let caps = Arc::new(Pool::new({
let regex = regex.clone();
Box::new(move || regex.create_captures()) as PoolFn
@ -76,13 +80,20 @@ impl WordMatcher {
names.insert(name.to_string(), i.checked_sub(1).unwrap());
}
}
Ok(WordMatcher { regex, pattern, original, names, caps })
Ok(WordMatcher { regex, chir, original, names, caps })
}
/// Return the underlying pattern string for the regex used by this
/// matcher.
pub(crate) fn pattern(&self) -> &str {
&self.pattern
/// Return the underlying regex used to match at word boundaries.
///
/// The original regex is in the capture group at index 1.
pub(crate) fn regex(&self) -> &Regex {
&self.regex
}
/// Return the underlying HIR for the regex used to match at word
/// boundaries.
pub(crate) fn chir(&self) -> &ConfiguredHIR {
&self.chir
}
/// Attempt to do a fast confirmation of a word match that covers a subset