mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-01-29 22:01:04 +02:00
regex: refactor matcher construction
This does a little bit of refactoring so that we can pass both a ConfiguredHIR and a Regex to the inner literal extraction routine. One downside of this approach is that a regex object hangs on to a ConfiguredHIR. But the extra memory usage is probably negligible. A benefit though is that converting the HIR to its concrete syntax is now lazy and only happens when logging is enabled.
This commit is contained in:
parent
04dde9a4eb
commit
8ac66a9e04
@ -8,8 +8,8 @@ use {
|
||||
};
|
||||
|
||||
use crate::{
|
||||
ast::AstAnalysis, error::Error, literal::LiteralSets,
|
||||
non_matching::non_matching_bytes, strip::strip_from_match,
|
||||
ast::AstAnalysis, error::Error, non_matching::non_matching_bytes,
|
||||
strip::strip_from_match,
|
||||
};
|
||||
|
||||
/// Config represents the configuration of a regex matcher in this crate.
|
||||
@ -228,6 +228,11 @@ impl ConfiguredHIR {
|
||||
&self.config
|
||||
}
|
||||
|
||||
/// Return a reference to the underyling HIR.
|
||||
pub(crate) fn hir(&self) -> &Hir {
|
||||
&self.hir
|
||||
}
|
||||
|
||||
/// Convert this HIR to a regex that can be used for matching.
|
||||
pub(crate) fn to_regex(&self) -> Result<Regex, Error> {
|
||||
let meta = Regex::config()
|
||||
@ -240,8 +245,8 @@ impl ConfiguredHIR {
|
||||
// Same deal here. The default limit for full DFAs is VERY small,
|
||||
// but with ripgrep we can afford to spend a bit more time on
|
||||
// building them I think.
|
||||
.dfa_size_limit(Some(10 * (1 << 20)))
|
||||
.dfa_state_limit(Some(10_000))
|
||||
.dfa_size_limit(Some(1 * (1 << 20)))
|
||||
.dfa_state_limit(Some(1_000))
|
||||
.hybrid_cache_capacity(self.config.dfa_size_limit);
|
||||
Regex::builder()
|
||||
.configure(meta)
|
||||
@ -249,31 +254,6 @@ impl ConfiguredHIR {
|
||||
.map_err(Error::regex)
|
||||
}
|
||||
|
||||
/// Convert this HIR to its concrete syntax.
|
||||
pub(crate) fn to_pattern(&self) -> String {
|
||||
self.hir.to_string()
|
||||
}
|
||||
|
||||
/// Attempt to extract a "fast" regex that can be used for quickly finding
|
||||
/// candidates lines for a match.
|
||||
///
|
||||
/// If no line terminator was configured, then this always returns
|
||||
/// `Ok(None)`. If a line terminator is configured, then this may return a
|
||||
/// regex.
|
||||
pub(crate) fn to_fast_line_regex(&self) -> Result<Option<Regex>, Error> {
|
||||
if self.config.line_terminator.is_none() {
|
||||
return Ok(None);
|
||||
}
|
||||
match LiteralSets::new(&self.hir).one_regex(self.config.word) {
|
||||
None => Ok(None),
|
||||
Some(pattern) => {
|
||||
let config = self.config.clone();
|
||||
let chir = ConfiguredHIR::new(config, &[pattern])?;
|
||||
Ok(Some(chir.to_regex()?))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the set of non-matching bytes for this HIR expression.
|
||||
pub(crate) fn non_matching_bytes(&self) -> ByteSet {
|
||||
non_matching_bytes(&self.hir)
|
||||
|
@ -1,32 +1,42 @@
|
||||
use regex_syntax::hir::Hir;
|
||||
#![allow(warnings)]
|
||||
|
||||
// BREADCRUMBS:
|
||||
//
|
||||
// The way we deal with line terminators in the regex is clunky, but probably
|
||||
// the least bad option for now unfortunately.
|
||||
//
|
||||
// The `non_matching_bytes` routine currently hardcodes line terminators for
|
||||
// anchors. But it's not really clear it should even care about line terminators
|
||||
// anyway, since anchors aren't actually part of a match. If we fix that
|
||||
// though, that currently reveals a different bug elsewhere: '(?-m:^)' isn't
|
||||
// implemented correctly in multi-line search, because it defers to the fast
|
||||
// line-by-line strategy, which ends up being wrong. I think the way forward
|
||||
// there is to:
|
||||
//
|
||||
// 1) Adding something in the grep-matcher interface that exposes a way to
|
||||
// query for \A and \z specifically. If they're in the pattern, then we can
|
||||
// decide how to handle them.
|
||||
//
|
||||
// 2) Perhaps provide a way to "translate \A/\z to ^/$" for cases when
|
||||
// mulit-line search is not enabled.
|
||||
use {
|
||||
regex_automata::meta::Regex,
|
||||
regex_syntax::hir::{
|
||||
literal::{self, Seq},
|
||||
Hir,
|
||||
},
|
||||
};
|
||||
|
||||
use crate::config::ConfiguredHIR;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LiteralSets {}
|
||||
pub(crate) struct InnerLiterals {
|
||||
seq: Seq,
|
||||
}
|
||||
|
||||
impl LiteralSets {
|
||||
/// Create a set of literals from the given HIR expression.
|
||||
pub fn new(_: &Hir) -> LiteralSets {
|
||||
LiteralSets {}
|
||||
impl InnerLiterals {
|
||||
/// Create a set of inner literals from the given HIR expression.
|
||||
///
|
||||
/// If no line terminator was configured, then this always declines to
|
||||
/// extract literals because the inner literal optimization may not be
|
||||
/// valid.
|
||||
///
|
||||
/// Note that this requires the actual regex that will be used for a search
|
||||
/// because it will query some state about the compiled regex. That state
|
||||
/// may influence inner literal extraction.
|
||||
pub(crate) fn new(chir: &ConfiguredHIR, re: &Regex) -> InnerLiterals {
|
||||
let seq = Seq::infinite();
|
||||
if chir.config().line_terminator.is_none() || re.is_accelerated() {
|
||||
return InnerLiterals::none();
|
||||
}
|
||||
InnerLiterals { seq }
|
||||
}
|
||||
|
||||
/// Returns a infinite set of inner literals, such that it can never
|
||||
/// produce a matcher.
|
||||
pub(crate) fn none() -> InnerLiterals {
|
||||
InnerLiterals { seq: Seq::infinite() }
|
||||
}
|
||||
|
||||
/// If it is deemed advantageuous to do so (via various suspicious
|
||||
@ -35,7 +45,10 @@ impl LiteralSets {
|
||||
/// generated these literal sets. The idea here is that the pattern
|
||||
/// returned by this method is much cheaper to search for. i.e., It is
|
||||
/// usually a single literal or an alternation of literals.
|
||||
pub fn one_regex(&self, _word: bool) -> Option<String> {
|
||||
pub(crate) fn one_regex(&self) -> Option<String> {
|
||||
if !self.seq.is_finite() {
|
||||
return None;
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,5 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use {
|
||||
grep_matcher::{
|
||||
ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher,
|
||||
@ -12,6 +14,7 @@ use {
|
||||
use crate::{
|
||||
config::{Config, ConfiguredHIR},
|
||||
error::Error,
|
||||
literal::InnerLiterals,
|
||||
word::WordMatcher,
|
||||
};
|
||||
|
||||
@ -59,8 +62,26 @@ impl RegexMatcherBuilder {
|
||||
patterns: &[P],
|
||||
) -> Result<RegexMatcher, Error> {
|
||||
let chir = self.config.build_many(patterns)?;
|
||||
let fast_line_regex = chir.to_fast_line_regex()?;
|
||||
let matcher = RegexMatcherImpl::new(chir)?;
|
||||
let (chir, re) = (matcher.chir(), matcher.regex());
|
||||
log::trace!("final regex: {:?}", chir.hir().to_string());
|
||||
|
||||
let non_matching_bytes = chir.non_matching_bytes();
|
||||
// If we can pick out some literals from the regex, then we might be
|
||||
// able to build a faster regex that quickly identifies candidate
|
||||
// matching lines. The regex engine will do what it can on its own, but
|
||||
// we can specifically do a little more when a line terminator is set.
|
||||
// For example, for a regex like `\w+foo\w+`, we can look for `foo`,
|
||||
// and when a match is found, look for the line containing `foo` and
|
||||
// then run the original regex on only that line. (In this case, the
|
||||
// regex engine is likely to handle this case for us since it's so
|
||||
// simple, but the idea applies.)
|
||||
let fast_line_regex = match InnerLiterals::new(chir, re).one_regex() {
|
||||
None => None,
|
||||
Some(pattern) => {
|
||||
Some(chir.config().build_many(&[pattern])?.to_regex()?)
|
||||
}
|
||||
};
|
||||
if let Some(ref re) = fast_line_regex {
|
||||
log::debug!("extracted fast line regex: {:?}", re);
|
||||
}
|
||||
@ -69,8 +90,6 @@ impl RegexMatcherBuilder {
|
||||
// support it.
|
||||
let mut config = self.config.clone();
|
||||
config.line_terminator = chir.line_terminator();
|
||||
let matcher = RegexMatcherImpl::new(chir)?;
|
||||
log::trace!("final regex: {:?}", matcher.regex());
|
||||
Ok(RegexMatcher {
|
||||
config,
|
||||
matcher,
|
||||
@ -412,10 +431,18 @@ impl RegexMatcherImpl {
|
||||
}
|
||||
|
||||
/// Return the underlying regex object used.
|
||||
fn regex(&self) -> String {
|
||||
fn regex(&self) -> &Regex {
|
||||
match *self {
|
||||
RegexMatcherImpl::Word(ref x) => x.pattern().to_string(),
|
||||
RegexMatcherImpl::Standard(ref x) => x.pattern.clone(),
|
||||
RegexMatcherImpl::Word(ref x) => x.regex(),
|
||||
RegexMatcherImpl::Standard(ref x) => &x.regex,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the underlying HIR of the regex used for searching.
|
||||
fn chir(&self) -> &ConfiguredHIR {
|
||||
match *self {
|
||||
RegexMatcherImpl::Word(ref x) => x.chir(),
|
||||
RegexMatcherImpl::Standard(ref x) => &x.chir,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -666,15 +693,19 @@ struct StandardMatcher {
|
||||
/// The regular expression compiled from the pattern provided by the
|
||||
/// caller.
|
||||
regex: Regex,
|
||||
/// The underlying pattern string for the regex.
|
||||
pattern: String,
|
||||
/// The HIR that produced this regex.
|
||||
///
|
||||
/// We put this in an `Arc` because by the time it gets here, it won't
|
||||
/// change. And because cloning and dropping an `Hir` is somewhat expensive
|
||||
/// due to its deep recursive representation.
|
||||
chir: Arc<ConfiguredHIR>,
|
||||
}
|
||||
|
||||
impl StandardMatcher {
|
||||
fn new(chir: ConfiguredHIR) -> Result<StandardMatcher, Error> {
|
||||
let chir = Arc::new(chir);
|
||||
let regex = chir.to_regex()?;
|
||||
let pattern = chir.to_pattern();
|
||||
Ok(StandardMatcher { regex, pattern })
|
||||
Ok(StandardMatcher { regex, chir })
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -22,8 +22,13 @@ type PoolFn =
|
||||
pub(crate) struct WordMatcher {
|
||||
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
|
||||
regex: Regex,
|
||||
/// The pattern string corresponding to the above regex.
|
||||
pattern: String,
|
||||
/// The HIR that produced the regex above. We don't keep the HIR for the
|
||||
/// `original` regex.
|
||||
///
|
||||
/// We put this in an `Arc` because by the time it gets here, it won't
|
||||
/// change. And because cloning and dropping an `Hir` is somewhat expensive
|
||||
/// due to its deep recursive representation.
|
||||
chir: Arc<ConfiguredHIR>,
|
||||
/// The original regex supplied by the user, which we use in a fast path
|
||||
/// to try and detect matches before deferring to slower engines.
|
||||
original: Regex,
|
||||
@ -45,7 +50,7 @@ impl Clone for WordMatcher {
|
||||
let re = self.regex.clone();
|
||||
WordMatcher {
|
||||
regex: self.regex.clone(),
|
||||
pattern: self.pattern.clone(),
|
||||
chir: Arc::clone(&self.chir),
|
||||
original: self.original.clone(),
|
||||
names: self.names.clone(),
|
||||
caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))),
|
||||
@ -61,9 +66,8 @@ impl WordMatcher {
|
||||
/// internally.
|
||||
pub(crate) fn new(chir: ConfiguredHIR) -> Result<WordMatcher, Error> {
|
||||
let original = chir.clone().into_anchored().to_regex()?;
|
||||
let word_chir = chir.into_word()?;
|
||||
let regex = word_chir.to_regex()?;
|
||||
let pattern = word_chir.to_pattern();
|
||||
let chir = Arc::new(chir.into_word()?);
|
||||
let regex = chir.to_regex()?;
|
||||
let caps = Arc::new(Pool::new({
|
||||
let regex = regex.clone();
|
||||
Box::new(move || regex.create_captures()) as PoolFn
|
||||
@ -76,13 +80,20 @@ impl WordMatcher {
|
||||
names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
||||
}
|
||||
}
|
||||
Ok(WordMatcher { regex, pattern, original, names, caps })
|
||||
Ok(WordMatcher { regex, chir, original, names, caps })
|
||||
}
|
||||
|
||||
/// Return the underlying pattern string for the regex used by this
|
||||
/// matcher.
|
||||
pub(crate) fn pattern(&self) -> &str {
|
||||
&self.pattern
|
||||
/// Return the underlying regex used to match at word boundaries.
|
||||
///
|
||||
/// The original regex is in the capture group at index 1.
|
||||
pub(crate) fn regex(&self) -> &Regex {
|
||||
&self.regex
|
||||
}
|
||||
|
||||
/// Return the underlying HIR for the regex used to match at word
|
||||
/// boundaries.
|
||||
pub(crate) fn chir(&self) -> &ConfiguredHIR {
|
||||
&self.chir
|
||||
}
|
||||
|
||||
/// Attempt to do a fast confirmation of a word match that covers a subset
|
||||
|
Loading…
x
Reference in New Issue
Block a user