1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2024-12-12 19:18:24 +02:00
ripgrep/grep-regex/src/config.rs
Andrew Gallant 9f15e3b671 regex: fix a perf bug when using -w flag
When looking for an inner literal to speed up searches, if only a prefix
is found, then we generally give up doing inner literal optimizations since
the regex engine will generally handle it for us. Unfortunately, this
decision was being made *before* we wrap the regex in (^|\W)...($|\W) when
using the -w/--word-regexp flag, which would then defeat the literal
optimizations inside the regex engine.

We fix this with a bit of a hack that says, "if we're doing a word regexp,
then give me back any literal you find, even if it's a prefix."
2019-04-05 23:24:08 -04:00

274 lines
11 KiB
Rust

use grep_matcher::{ByteSet, LineTerminator};
use regex::bytes::{Regex, RegexBuilder};
use regex_syntax::ast::{self, Ast};
use regex_syntax::hir::Hir;
use ast::AstAnalysis;
use crlf::crlfify;
use error::Error;
use literal::LiteralSets;
use non_matching::non_matching_bytes;
use strip::strip_from_match;
/// Config represents the configuration of a regex matcher in this crate.
/// The configuration is itself a rough combination of the knobs found in
/// the `regex` crate itself, along with additional `grep-matcher` specific
/// options.
///
/// The configuration can be used to build a "configured" HIR expression. A
/// configured HIR expression is an HIR expression that is aware of the
/// configuration which generated it, and provides transformation on that HIR
/// such that the configuration is preserved.
#[derive(Clone, Debug)]
pub struct Config {
pub case_insensitive: bool,
pub case_smart: bool,
pub multi_line: bool,
pub dot_matches_new_line: bool,
pub swap_greed: bool,
pub ignore_whitespace: bool,
pub unicode: bool,
pub octal: bool,
pub size_limit: usize,
pub dfa_size_limit: usize,
pub nest_limit: u32,
pub line_terminator: Option<LineTerminator>,
pub crlf: bool,
pub word: bool,
}
impl Default for Config {
fn default() -> Config {
Config {
case_insensitive: false,
case_smart: false,
multi_line: false,
dot_matches_new_line: false,
swap_greed: false,
ignore_whitespace: false,
unicode: true,
octal: false,
// These size limits are much bigger than what's in the regex
// crate.
size_limit: 100 * (1<<20),
dfa_size_limit: 1000 * (1<<20),
nest_limit: 250,
line_terminator: None,
crlf: false,
word: false,
}
}
}
impl Config {
/// Parse the given pattern and returned its HIR expression along with
/// the current configuration.
///
/// If there was a problem parsing the given expression then an error
/// is returned.
pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
let analysis = self.analysis(pattern)?;
let expr = ::regex_syntax::ParserBuilder::new()
.nest_limit(self.nest_limit)
.octal(self.octal)
.allow_invalid_utf8(true)
.ignore_whitespace(self.ignore_whitespace)
.case_insensitive(self.is_case_insensitive(&analysis)?)
.multi_line(self.multi_line)
.dot_matches_new_line(self.dot_matches_new_line)
.swap_greed(self.swap_greed)
.unicode(self.unicode)
.build()
.parse(pattern)
.map_err(Error::regex)?;
let expr = match self.line_terminator {
None => expr,
Some(line_term) => strip_from_match(expr, line_term)?,
};
Ok(ConfiguredHIR {
original: pattern.to_string(),
config: self.clone(),
analysis: analysis,
// If CRLF mode is enabled, replace `$` with `(?:\r?$)`.
expr: if self.crlf { crlfify(expr) } else { expr },
})
}
/// Accounting for the `smart_case` config knob, return true if and only if
/// this pattern should be matched case insensitively.
fn is_case_insensitive(
&self,
analysis: &AstAnalysis,
) -> Result<bool, Error> {
if self.case_insensitive {
return Ok(true);
}
if !self.case_smart {
return Ok(false);
}
Ok(analysis.any_literal() && !analysis.any_uppercase())
}
/// Perform analysis on the AST of this pattern.
///
/// This returns an error if the given pattern failed to parse.
fn analysis(&self, pattern: &str) -> Result<AstAnalysis, Error> {
Ok(AstAnalysis::from_ast(&self.ast(pattern)?))
}
/// Parse the given pattern into its abstract syntax.
///
/// This returns an error if the given pattern failed to parse.
fn ast(&self, pattern: &str) -> Result<Ast, Error> {
ast::parse::ParserBuilder::new()
.nest_limit(self.nest_limit)
.octal(self.octal)
.ignore_whitespace(self.ignore_whitespace)
.build()
.parse(pattern)
.map_err(Error::regex)
}
}
/// A "configured" HIR expression, which is aware of the configuration which
/// produced this HIR.
///
/// Since the configuration is tracked, values with this type can be
/// transformed into other HIR expressions (or regular expressions) in a way
/// that preserves the configuration. For example, the `fast_line_regex`
/// method will apply literal extraction to the inner HIR and use that to build
/// a new regex that matches the extracted literals in a way that is
/// consistent with the configuration that produced this HIR. For example, the
/// size limits set on the configured HIR will be propagated out to any
/// subsequently constructed HIR or regular expression.
#[derive(Clone, Debug)]
pub struct ConfiguredHIR {
original: String,
config: Config,
analysis: AstAnalysis,
expr: Hir,
}
impl ConfiguredHIR {
/// Return the configuration for this HIR expression.
pub fn config(&self) -> &Config {
&self.config
}
/// Compute the set of non-matching bytes for this HIR expression.
pub fn non_matching_bytes(&self) -> ByteSet {
non_matching_bytes(&self.expr)
}
/// Returns true if and only if this regex needs to have its match offsets
/// tweaked because of CRLF support. Specifically, this occurs when the
/// CRLF hack is enabled and the regex is line anchored at the end. In
/// this case, matches that end with a `\r` have the `\r` stripped.
pub fn needs_crlf_stripped(&self) -> bool {
self.config.crlf && self.expr.is_line_anchored_end()
}
/// Builds a regular expression from this HIR expression.
pub fn regex(&self) -> Result<Regex, Error> {
self.pattern_to_regex(&self.expr.to_string())
}
/// Applies the given function to the concrete syntax of this HIR and then
/// generates a new HIR based on the result of the function in a way that
/// preserves the configuration.
///
/// For example, this can be used to wrap a user provided regular
/// expression with additional semantics. e.g., See the `WordMatcher`.
pub fn with_pattern<F: FnMut(&str) -> String>(
&self,
mut f: F,
) -> Result<ConfiguredHIR, Error>
{
self.pattern_to_hir(&f(&self.expr.to_string()))
}
/// If the current configuration has a line terminator set and if useful
/// literals could be extracted, then a regular expression matching those
/// literals is returned. If no line terminator is set, then `None` is
/// returned.
///
/// If compiling the resulting regular expression failed, then an error
/// is returned.
///
/// This method only returns something when a line terminator is set
/// because matches from this regex are generally candidates that must be
/// confirmed before reporting a match. When performing a line oriented
/// search, confirmation is easy: just extend the candidate match to its
/// respective line boundaries and then re-search that line for a full
/// match. This only works when the line terminator is set because the line
/// terminator setting guarantees that the regex itself can never match
/// through the line terminator byte.
pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
if self.config.line_terminator.is_none() {
return Ok(None);
}
match LiteralSets::new(&self.expr).one_regex(self.config.word) {
None => Ok(None),
Some(pattern) => self.pattern_to_regex(&pattern).map(Some),
}
}
/// Create a regex from the given pattern using this HIR's configuration.
fn pattern_to_regex(&self, pattern: &str) -> Result<Regex, Error> {
// The settings we explicitly set here are intentionally a subset
// of the settings we have. The key point here is that our HIR
// expression is computed with the settings in mind, such that setting
// them here could actually lead to unintended behavior. For example,
// consider the pattern `(?U)a+`. This will get folded into the HIR
// as a non-greedy repetition operator which will in turn get printed
// to the concrete syntax as `a+?`, which is correct. But if we
// set the `swap_greed` option again, then we'll wind up with `(?U)a+?`
// which is equal to `a+` which is not the same as what we were given.
//
// We also don't need to apply `case_insensitive` since this gets
// folded into the HIR and would just cause us to do redundant work.
//
// Finally, we don't need to set `ignore_whitespace` since the concrete
// syntax emitted by the HIR printer never needs it.
//
// We set the rest of the options. Some of them are important, such as
// the size limit, and some of them are necessary to preserve the
// intention of the original pattern. For example, the Unicode flag
// will impact how the WordMatcher functions, namely, whether its
// word boundaries are Unicode aware or not.
RegexBuilder::new(&pattern)
.nest_limit(self.config.nest_limit)
.octal(self.config.octal)
.multi_line(self.config.multi_line)
.dot_matches_new_line(self.config.dot_matches_new_line)
.unicode(self.config.unicode)
.size_limit(self.config.size_limit)
.dfa_size_limit(self.config.dfa_size_limit)
.build()
.map_err(Error::regex)
}
/// Create an HIR expression from the given pattern using this HIR's
/// configuration.
fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
// See `pattern_to_regex` comment for explanation of why we only set
// a subset of knobs here. e.g., `swap_greed` is explicitly left out.
let expr = ::regex_syntax::ParserBuilder::new()
.nest_limit(self.config.nest_limit)
.octal(self.config.octal)
.allow_invalid_utf8(true)
.multi_line(self.config.multi_line)
.dot_matches_new_line(self.config.dot_matches_new_line)
.unicode(self.config.unicode)
.build()
.parse(pattern)
.map_err(Error::regex)?;
Ok(ConfiguredHIR {
original: self.original.clone(),
config: self.config.clone(),
analysis: self.analysis.clone(),
expr: expr,
})
}
}