mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2024-12-12 19:18:24 +02:00
9f15e3b671
When looking for an inner literal to speed up searches, if only a prefix is found, then we generally give up doing inner literal optimizations since the regex engine will generally handle it for us. Unfortunately, this decision was being made *before* we wrap the regex in (^|\W)...($|\W) when using the -w/--word-regexp flag, which would then defeat the literal optimizations inside the regex engine. We fix this with a bit of a hack that says, "if we're doing a word regexp, then give me back any literal you find, even if it's a prefix."
274 lines
11 KiB
Rust
274 lines
11 KiB
Rust
use grep_matcher::{ByteSet, LineTerminator};
|
|
use regex::bytes::{Regex, RegexBuilder};
|
|
use regex_syntax::ast::{self, Ast};
|
|
use regex_syntax::hir::Hir;
|
|
|
|
use ast::AstAnalysis;
|
|
use crlf::crlfify;
|
|
use error::Error;
|
|
use literal::LiteralSets;
|
|
use non_matching::non_matching_bytes;
|
|
use strip::strip_from_match;
|
|
|
|
/// Config represents the configuration of a regex matcher in this crate.
|
|
/// The configuration is itself a rough combination of the knobs found in
|
|
/// the `regex` crate itself, along with additional `grep-matcher` specific
|
|
/// options.
|
|
///
|
|
/// The configuration can be used to build a "configured" HIR expression. A
|
|
/// configured HIR expression is an HIR expression that is aware of the
|
|
/// configuration which generated it, and provides transformation on that HIR
|
|
/// such that the configuration is preserved.
|
|
#[derive(Clone, Debug)]
|
|
pub struct Config {
|
|
pub case_insensitive: bool,
|
|
pub case_smart: bool,
|
|
pub multi_line: bool,
|
|
pub dot_matches_new_line: bool,
|
|
pub swap_greed: bool,
|
|
pub ignore_whitespace: bool,
|
|
pub unicode: bool,
|
|
pub octal: bool,
|
|
pub size_limit: usize,
|
|
pub dfa_size_limit: usize,
|
|
pub nest_limit: u32,
|
|
pub line_terminator: Option<LineTerminator>,
|
|
pub crlf: bool,
|
|
pub word: bool,
|
|
}
|
|
|
|
impl Default for Config {
|
|
fn default() -> Config {
|
|
Config {
|
|
case_insensitive: false,
|
|
case_smart: false,
|
|
multi_line: false,
|
|
dot_matches_new_line: false,
|
|
swap_greed: false,
|
|
ignore_whitespace: false,
|
|
unicode: true,
|
|
octal: false,
|
|
// These size limits are much bigger than what's in the regex
|
|
// crate.
|
|
size_limit: 100 * (1<<20),
|
|
dfa_size_limit: 1000 * (1<<20),
|
|
nest_limit: 250,
|
|
line_terminator: None,
|
|
crlf: false,
|
|
word: false,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Config {
|
|
/// Parse the given pattern and returned its HIR expression along with
|
|
/// the current configuration.
|
|
///
|
|
/// If there was a problem parsing the given expression then an error
|
|
/// is returned.
|
|
pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
|
|
let analysis = self.analysis(pattern)?;
|
|
let expr = ::regex_syntax::ParserBuilder::new()
|
|
.nest_limit(self.nest_limit)
|
|
.octal(self.octal)
|
|
.allow_invalid_utf8(true)
|
|
.ignore_whitespace(self.ignore_whitespace)
|
|
.case_insensitive(self.is_case_insensitive(&analysis)?)
|
|
.multi_line(self.multi_line)
|
|
.dot_matches_new_line(self.dot_matches_new_line)
|
|
.swap_greed(self.swap_greed)
|
|
.unicode(self.unicode)
|
|
.build()
|
|
.parse(pattern)
|
|
.map_err(Error::regex)?;
|
|
let expr = match self.line_terminator {
|
|
None => expr,
|
|
Some(line_term) => strip_from_match(expr, line_term)?,
|
|
};
|
|
Ok(ConfiguredHIR {
|
|
original: pattern.to_string(),
|
|
config: self.clone(),
|
|
analysis: analysis,
|
|
// If CRLF mode is enabled, replace `$` with `(?:\r?$)`.
|
|
expr: if self.crlf { crlfify(expr) } else { expr },
|
|
})
|
|
}
|
|
|
|
/// Accounting for the `smart_case` config knob, return true if and only if
|
|
/// this pattern should be matched case insensitively.
|
|
fn is_case_insensitive(
|
|
&self,
|
|
analysis: &AstAnalysis,
|
|
) -> Result<bool, Error> {
|
|
if self.case_insensitive {
|
|
return Ok(true);
|
|
}
|
|
if !self.case_smart {
|
|
return Ok(false);
|
|
}
|
|
Ok(analysis.any_literal() && !analysis.any_uppercase())
|
|
}
|
|
|
|
/// Perform analysis on the AST of this pattern.
|
|
///
|
|
/// This returns an error if the given pattern failed to parse.
|
|
fn analysis(&self, pattern: &str) -> Result<AstAnalysis, Error> {
|
|
Ok(AstAnalysis::from_ast(&self.ast(pattern)?))
|
|
}
|
|
|
|
/// Parse the given pattern into its abstract syntax.
|
|
///
|
|
/// This returns an error if the given pattern failed to parse.
|
|
fn ast(&self, pattern: &str) -> Result<Ast, Error> {
|
|
ast::parse::ParserBuilder::new()
|
|
.nest_limit(self.nest_limit)
|
|
.octal(self.octal)
|
|
.ignore_whitespace(self.ignore_whitespace)
|
|
.build()
|
|
.parse(pattern)
|
|
.map_err(Error::regex)
|
|
}
|
|
}
|
|
|
|
/// A "configured" HIR expression, which is aware of the configuration which
|
|
/// produced this HIR.
|
|
///
|
|
/// Since the configuration is tracked, values with this type can be
|
|
/// transformed into other HIR expressions (or regular expressions) in a way
|
|
/// that preserves the configuration. For example, the `fast_line_regex`
|
|
/// method will apply literal extraction to the inner HIR and use that to build
|
|
/// a new regex that matches the extracted literals in a way that is
|
|
/// consistent with the configuration that produced this HIR. For example, the
|
|
/// size limits set on the configured HIR will be propagated out to any
|
|
/// subsequently constructed HIR or regular expression.
|
|
#[derive(Clone, Debug)]
|
|
pub struct ConfiguredHIR {
|
|
original: String,
|
|
config: Config,
|
|
analysis: AstAnalysis,
|
|
expr: Hir,
|
|
}
|
|
|
|
impl ConfiguredHIR {
|
|
/// Return the configuration for this HIR expression.
|
|
pub fn config(&self) -> &Config {
|
|
&self.config
|
|
}
|
|
|
|
/// Compute the set of non-matching bytes for this HIR expression.
|
|
pub fn non_matching_bytes(&self) -> ByteSet {
|
|
non_matching_bytes(&self.expr)
|
|
}
|
|
|
|
/// Returns true if and only if this regex needs to have its match offsets
|
|
/// tweaked because of CRLF support. Specifically, this occurs when the
|
|
/// CRLF hack is enabled and the regex is line anchored at the end. In
|
|
/// this case, matches that end with a `\r` have the `\r` stripped.
|
|
pub fn needs_crlf_stripped(&self) -> bool {
|
|
self.config.crlf && self.expr.is_line_anchored_end()
|
|
}
|
|
|
|
/// Builds a regular expression from this HIR expression.
|
|
pub fn regex(&self) -> Result<Regex, Error> {
|
|
self.pattern_to_regex(&self.expr.to_string())
|
|
}
|
|
|
|
/// Applies the given function to the concrete syntax of this HIR and then
|
|
/// generates a new HIR based on the result of the function in a way that
|
|
/// preserves the configuration.
|
|
///
|
|
/// For example, this can be used to wrap a user provided regular
|
|
/// expression with additional semantics. e.g., See the `WordMatcher`.
|
|
pub fn with_pattern<F: FnMut(&str) -> String>(
|
|
&self,
|
|
mut f: F,
|
|
) -> Result<ConfiguredHIR, Error>
|
|
{
|
|
self.pattern_to_hir(&f(&self.expr.to_string()))
|
|
}
|
|
|
|
/// If the current configuration has a line terminator set and if useful
|
|
/// literals could be extracted, then a regular expression matching those
|
|
/// literals is returned. If no line terminator is set, then `None` is
|
|
/// returned.
|
|
///
|
|
/// If compiling the resulting regular expression failed, then an error
|
|
/// is returned.
|
|
///
|
|
/// This method only returns something when a line terminator is set
|
|
/// because matches from this regex are generally candidates that must be
|
|
/// confirmed before reporting a match. When performing a line oriented
|
|
/// search, confirmation is easy: just extend the candidate match to its
|
|
/// respective line boundaries and then re-search that line for a full
|
|
/// match. This only works when the line terminator is set because the line
|
|
/// terminator setting guarantees that the regex itself can never match
|
|
/// through the line terminator byte.
|
|
pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
|
|
if self.config.line_terminator.is_none() {
|
|
return Ok(None);
|
|
}
|
|
match LiteralSets::new(&self.expr).one_regex(self.config.word) {
|
|
None => Ok(None),
|
|
Some(pattern) => self.pattern_to_regex(&pattern).map(Some),
|
|
}
|
|
}
|
|
|
|
/// Create a regex from the given pattern using this HIR's configuration.
|
|
fn pattern_to_regex(&self, pattern: &str) -> Result<Regex, Error> {
|
|
// The settings we explicitly set here are intentionally a subset
|
|
// of the settings we have. The key point here is that our HIR
|
|
// expression is computed with the settings in mind, such that setting
|
|
// them here could actually lead to unintended behavior. For example,
|
|
// consider the pattern `(?U)a+`. This will get folded into the HIR
|
|
// as a non-greedy repetition operator which will in turn get printed
|
|
// to the concrete syntax as `a+?`, which is correct. But if we
|
|
// set the `swap_greed` option again, then we'll wind up with `(?U)a+?`
|
|
// which is equal to `a+` which is not the same as what we were given.
|
|
//
|
|
// We also don't need to apply `case_insensitive` since this gets
|
|
// folded into the HIR and would just cause us to do redundant work.
|
|
//
|
|
// Finally, we don't need to set `ignore_whitespace` since the concrete
|
|
// syntax emitted by the HIR printer never needs it.
|
|
//
|
|
// We set the rest of the options. Some of them are important, such as
|
|
// the size limit, and some of them are necessary to preserve the
|
|
// intention of the original pattern. For example, the Unicode flag
|
|
// will impact how the WordMatcher functions, namely, whether its
|
|
// word boundaries are Unicode aware or not.
|
|
RegexBuilder::new(&pattern)
|
|
.nest_limit(self.config.nest_limit)
|
|
.octal(self.config.octal)
|
|
.multi_line(self.config.multi_line)
|
|
.dot_matches_new_line(self.config.dot_matches_new_line)
|
|
.unicode(self.config.unicode)
|
|
.size_limit(self.config.size_limit)
|
|
.dfa_size_limit(self.config.dfa_size_limit)
|
|
.build()
|
|
.map_err(Error::regex)
|
|
}
|
|
|
|
/// Create an HIR expression from the given pattern using this HIR's
|
|
/// configuration.
|
|
fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
|
|
// See `pattern_to_regex` comment for explanation of why we only set
|
|
// a subset of knobs here. e.g., `swap_greed` is explicitly left out.
|
|
let expr = ::regex_syntax::ParserBuilder::new()
|
|
.nest_limit(self.config.nest_limit)
|
|
.octal(self.config.octal)
|
|
.allow_invalid_utf8(true)
|
|
.multi_line(self.config.multi_line)
|
|
.dot_matches_new_line(self.config.dot_matches_new_line)
|
|
.unicode(self.config.unicode)
|
|
.build()
|
|
.parse(pattern)
|
|
.map_err(Error::regex)?;
|
|
Ok(ConfiguredHIR {
|
|
original: self.original.clone(),
|
|
config: self.config.clone(),
|
|
analysis: self.analysis.clone(),
|
|
expr: expr,
|
|
})
|
|
}
|
|
}
|