mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-03-23 04:34:39 +02:00
This does a little bit of refactoring so that we can pass both a ConfiguredHIR and a Regex to the inner literal extraction routine. One downside of this approach is that a regex object hangs on to a ConfiguredHIR. But the extra memory usage is probably negligible. A benefit though is that converting the HIR to its concrete syntax is now lazy and only happens when logging is enabled.
421 lines
17 KiB
Rust
421 lines
17 KiB
Rust
use {
|
|
grep_matcher::{ByteSet, LineTerminator},
|
|
regex_automata::meta::Regex,
|
|
regex_syntax::{
|
|
ast,
|
|
hir::{self, Hir, HirKind},
|
|
},
|
|
};
|
|
|
|
use crate::{
|
|
ast::AstAnalysis, error::Error, non_matching::non_matching_bytes,
|
|
strip::strip_from_match,
|
|
};
|
|
|
|
/// Config represents the configuration of a regex matcher in this crate.
|
|
/// The configuration is itself a rough combination of the knobs found in
|
|
/// the `regex` crate itself, along with additional `grep-matcher` specific
|
|
/// options.
|
|
///
|
|
/// The configuration can be used to build a "configured" HIR expression. A
|
|
/// configured HIR expression is an HIR expression that is aware of the
|
|
/// configuration which generated it, and provides transformation on that HIR
|
|
/// such that the configuration is preserved.
|
|
#[derive(Clone, Debug)]
|
|
pub(crate) struct Config {
|
|
pub(crate) case_insensitive: bool,
|
|
pub(crate) case_smart: bool,
|
|
pub(crate) multi_line: bool,
|
|
pub(crate) dot_matches_new_line: bool,
|
|
pub(crate) swap_greed: bool,
|
|
pub(crate) ignore_whitespace: bool,
|
|
pub(crate) unicode: bool,
|
|
pub(crate) octal: bool,
|
|
pub(crate) size_limit: usize,
|
|
pub(crate) dfa_size_limit: usize,
|
|
pub(crate) nest_limit: u32,
|
|
pub(crate) line_terminator: Option<LineTerminator>,
|
|
pub(crate) crlf: bool,
|
|
pub(crate) word: bool,
|
|
pub(crate) fixed_strings: bool,
|
|
pub(crate) whole_line: bool,
|
|
}
|
|
|
|
impl Default for Config {
|
|
fn default() -> Config {
|
|
Config {
|
|
case_insensitive: false,
|
|
case_smart: false,
|
|
multi_line: false,
|
|
dot_matches_new_line: false,
|
|
swap_greed: false,
|
|
ignore_whitespace: false,
|
|
unicode: true,
|
|
octal: false,
|
|
// These size limits are much bigger than what's in the regex
|
|
// crate by default.
|
|
size_limit: 100 * (1 << 20),
|
|
dfa_size_limit: 1000 * (1 << 20),
|
|
nest_limit: 250,
|
|
line_terminator: None,
|
|
crlf: false,
|
|
word: false,
|
|
fixed_strings: false,
|
|
whole_line: false,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Config {
|
|
/// Use this configuration to build an HIR from the given patterns. The HIR
|
|
/// returned corresponds to a single regex that is an alternation of the
|
|
/// patterns given.
|
|
pub(crate) fn build_many<P: AsRef<str>>(
|
|
&self,
|
|
patterns: &[P],
|
|
) -> Result<ConfiguredHIR, Error> {
|
|
ConfiguredHIR::new(self.clone(), patterns)
|
|
}
|
|
|
|
/// Accounting for the `smart_case` config knob, return true if and only if
|
|
/// this pattern should be matched case insensitively.
|
|
fn is_case_insensitive(&self, analysis: &AstAnalysis) -> bool {
|
|
if self.case_insensitive {
|
|
return true;
|
|
}
|
|
if !self.case_smart {
|
|
return false;
|
|
}
|
|
analysis.any_literal() && !analysis.any_uppercase()
|
|
}
|
|
|
|
/// Returns whether the given patterns should be treated as "fixed strings"
|
|
/// literals. This is different from just querying the `fixed_strings` knob
|
|
/// in that if the knob is false, this will still return true in some cases
|
|
/// if the patterns are themselves indistinguishable from literals.
|
|
///
|
|
/// The main idea here is that if this returns true, then it is safe
|
|
/// to build an `regex_syntax::hir::Hir` value directly from the given
|
|
/// patterns as an alternation of `hir::Literal` values.
|
|
fn is_fixed_strings<P: AsRef<str>>(&self, patterns: &[P]) -> bool {
|
|
// When these are enabled, we really need to parse the patterns and
|
|
// let them go through the standard HIR translation process in order
|
|
// for case folding transforms to be applied.
|
|
if self.case_insensitive || self.case_smart {
|
|
return false;
|
|
}
|
|
// Even if whole_line or word is enabled, both of those things can
|
|
// be implemented by wrapping the Hir generated by an alternation of
|
|
// fixed string literals. So for here at least, we don't care about the
|
|
// word or whole_line settings.
|
|
if self.fixed_strings {
|
|
// ... but if any literal contains a line terminator, then we've
|
|
// got to bail out because this will ultimately result in an error.
|
|
if let Some(lineterm) = self.line_terminator {
|
|
for p in patterns.iter() {
|
|
if has_line_terminator(lineterm, p.as_ref()) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
// In this case, the only way we can hand construct the Hir is if none
|
|
// of the patterns contain meta characters. If they do, then we need to
|
|
// send them through the standard parsing/translation process.
|
|
for p in patterns.iter() {
|
|
let p = p.as_ref();
|
|
if p.chars().any(regex_syntax::is_meta_character) {
|
|
return false;
|
|
}
|
|
// Same deal as when fixed_strings is set above. If the pattern has
|
|
// a line terminator anywhere, then we need to bail out and let
|
|
// an error occur.
|
|
if let Some(lineterm) = self.line_terminator {
|
|
if has_line_terminator(lineterm, p) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
true
|
|
}
|
|
}
|
|
|
|
/// A "configured" HIR expression, which is aware of the configuration which
|
|
/// produced this HIR.
|
|
///
|
|
/// Since the configuration is tracked, values with this type can be
|
|
/// transformed into other HIR expressions (or regular expressions) in a way
|
|
/// that preserves the configuration. For example, the `fast_line_regex`
|
|
/// method will apply literal extraction to the inner HIR and use that to build
|
|
/// a new regex that matches the extracted literals in a way that is
|
|
/// consistent with the configuration that produced this HIR. For example, the
|
|
/// size limits set on the configured HIR will be propagated out to any
|
|
/// subsequently constructed HIR or regular expression.
|
|
#[derive(Clone, Debug)]
|
|
pub(crate) struct ConfiguredHIR {
|
|
config: Config,
|
|
hir: Hir,
|
|
}
|
|
|
|
impl ConfiguredHIR {
|
|
/// Parse the given patterns into a single HIR expression that represents
|
|
/// an alternation of the patterns given.
|
|
fn new<P: AsRef<str>>(
|
|
config: Config,
|
|
patterns: &[P],
|
|
) -> Result<ConfiguredHIR, Error> {
|
|
let hir = if config.is_fixed_strings(patterns) {
|
|
let mut alts = vec![];
|
|
for p in patterns.iter() {
|
|
alts.push(Hir::literal(p.as_ref().as_bytes()));
|
|
}
|
|
log::debug!(
|
|
"assembling HIR from {} fixed string literals",
|
|
alts.len()
|
|
);
|
|
let hir = Hir::alternation(alts);
|
|
hir
|
|
} else {
|
|
let mut alts = vec![];
|
|
for p in patterns.iter() {
|
|
alts.push(if config.fixed_strings {
|
|
format!("(?:{})", regex_syntax::escape(p.as_ref()))
|
|
} else {
|
|
format!("(?:{})", p.as_ref())
|
|
});
|
|
}
|
|
let pattern = alts.join("|");
|
|
let ast = ast::parse::ParserBuilder::new()
|
|
.nest_limit(config.nest_limit)
|
|
.octal(config.octal)
|
|
.ignore_whitespace(config.ignore_whitespace)
|
|
.build()
|
|
.parse(&pattern)
|
|
.map_err(Error::generic)?;
|
|
let analysis = AstAnalysis::from_ast(&ast);
|
|
let mut hir = hir::translate::TranslatorBuilder::new()
|
|
.utf8(false)
|
|
.case_insensitive(config.is_case_insensitive(&analysis))
|
|
.multi_line(config.multi_line)
|
|
.dot_matches_new_line(config.dot_matches_new_line)
|
|
.crlf(config.crlf)
|
|
.swap_greed(config.swap_greed)
|
|
.unicode(config.unicode)
|
|
.build()
|
|
.translate(&pattern, &ast)
|
|
.map_err(Error::generic)?;
|
|
// We don't need to do this for the fixed-strings case above
|
|
// because is_fixed_strings will return false if any pattern
|
|
// contains a line terminator. Therefore, we don't need to strip
|
|
// it.
|
|
//
|
|
// We go to some pains to avoid doing this in the fixed-strings
|
|
// case because this can result in building a new HIR when ripgrep
|
|
// is given a huge set of literals to search for. And this can
|
|
// actually take a little time. It's not huge, but it's noticeable.
|
|
hir = match config.line_terminator {
|
|
None => hir,
|
|
Some(line_term) => strip_from_match(hir, line_term)?,
|
|
};
|
|
hir
|
|
};
|
|
Ok(ConfiguredHIR { config, hir })
|
|
}
|
|
|
|
/// Return a reference to the underlying configuration.
|
|
pub(crate) fn config(&self) -> &Config {
|
|
&self.config
|
|
}
|
|
|
|
/// Return a reference to the underyling HIR.
|
|
pub(crate) fn hir(&self) -> &Hir {
|
|
&self.hir
|
|
}
|
|
|
|
/// Convert this HIR to a regex that can be used for matching.
|
|
pub(crate) fn to_regex(&self) -> Result<Regex, Error> {
|
|
let meta = Regex::config()
|
|
.utf8_empty(false)
|
|
.nfa_size_limit(Some(self.config.size_limit))
|
|
// We don't expose a knob for this because the one-pass DFA is
|
|
// usually not a perf bottleneck for ripgrep. But we give it some
|
|
// extra room than the default.
|
|
.onepass_size_limit(Some(10 * (1 << 20)))
|
|
// Same deal here. The default limit for full DFAs is VERY small,
|
|
// but with ripgrep we can afford to spend a bit more time on
|
|
// building them I think.
|
|
.dfa_size_limit(Some(1 * (1 << 20)))
|
|
.dfa_state_limit(Some(1_000))
|
|
.hybrid_cache_capacity(self.config.dfa_size_limit);
|
|
Regex::builder()
|
|
.configure(meta)
|
|
.build_from_hir(&self.hir)
|
|
.map_err(Error::regex)
|
|
}
|
|
|
|
/// Compute the set of non-matching bytes for this HIR expression.
|
|
pub(crate) fn non_matching_bytes(&self) -> ByteSet {
|
|
non_matching_bytes(&self.hir)
|
|
}
|
|
|
|
/// Returns the line terminator configured on this expression.
|
|
///
|
|
/// When we have beginning/end anchors (NOT line anchors), the fast line
|
|
/// searching path isn't quite correct. Or at least, doesn't match the slow
|
|
/// path. Namely, the slow path strips line terminators while the fast path
|
|
/// does not. Since '$' (when multi-line mode is disabled) doesn't match at
|
|
/// line boundaries, the existence of a line terminator might cause it to
|
|
/// not match when it otherwise would with the line terminator stripped.
|
|
///
|
|
/// Since searching with text anchors is exceptionally rare in the context
|
|
/// of line oriented searching (multi-line mode is basically always
|
|
/// enabled), we just disable this optimization when there are text
|
|
/// anchors. We disable it by not returning a line terminator, since
|
|
/// without a line terminator, the fast search path can't be executed.
|
|
///
|
|
/// Actually, the above is no longer quite correct. Later on, another
|
|
/// optimization was added where if the line terminator was in the set of
|
|
/// bytes that was guaranteed to never be part of a match, then the higher
|
|
/// level search infrastructure assumes that the fast line-by-line search
|
|
/// path can still be taken. This optimization applies when multi-line
|
|
/// search (not multi-line mode) is enabled. In that case, there is no
|
|
/// configured line terminator since the regex is permitted to match a
|
|
/// line terminator. But if the regex is guaranteed to never match across
|
|
/// multiple lines despite multi-line search being requested, we can still
|
|
/// do the faster and more flexible line-by-line search. This is why the
|
|
/// non-matching extraction routine removes `\n` when `\A` and `\z` are
|
|
/// present even though that's not quite correct...
|
|
///
|
|
/// See: <https://github.com/BurntSushi/ripgrep/issues/2260>
|
|
pub(crate) fn line_terminator(&self) -> Option<LineTerminator> {
|
|
if self.hir.properties().look_set().contains_anchor_haystack() {
|
|
None
|
|
} else {
|
|
self.config.line_terminator
|
|
}
|
|
}
|
|
|
|
/// Turns this configured HIR into one that only matches when both sides of
|
|
/// the match correspond to a word boundary.
|
|
///
|
|
/// Note that the HIR returned is like turning `pat` into
|
|
/// `(?m:^|\W)(pat)(?m:$|\W)`. That is, the true match is at capture group
|
|
/// `1` and not `0`.
|
|
pub(crate) fn into_word(self) -> Result<ConfiguredHIR, Error> {
|
|
// In theory building the HIR for \W should never fail, but there are
|
|
// likely some pathological cases (particularly with respect to certain
|
|
// values of limits) where it could in theory fail.
|
|
let non_word = {
|
|
let mut config = self.config.clone();
|
|
config.fixed_strings = false;
|
|
ConfiguredHIR::new(config, &[r"\W"])?
|
|
};
|
|
let line_anchor_start = Hir::look(self.line_anchor_start());
|
|
let line_anchor_end = Hir::look(self.line_anchor_end());
|
|
let hir = Hir::concat(vec![
|
|
Hir::alternation(vec![line_anchor_start, non_word.hir.clone()]),
|
|
Hir::capture(hir::Capture {
|
|
index: 1,
|
|
name: None,
|
|
sub: Box::new(renumber_capture_indices(self.hir)?),
|
|
}),
|
|
Hir::alternation(vec![non_word.hir, line_anchor_end]),
|
|
]);
|
|
Ok(ConfiguredHIR { config: self.config, hir })
|
|
}
|
|
|
|
/// Turns this configured HIR into an equivalent one, but where it must
|
|
/// match at the start and end of a line.
|
|
pub(crate) fn into_whole_line(self) -> ConfiguredHIR {
|
|
let line_anchor_start = Hir::look(self.line_anchor_start());
|
|
let line_anchor_end = Hir::look(self.line_anchor_end());
|
|
let hir =
|
|
Hir::concat(vec![line_anchor_start, self.hir, line_anchor_end]);
|
|
ConfiguredHIR { config: self.config, hir }
|
|
}
|
|
|
|
/// Turns this configured HIR into an equivalent one, but where it must
|
|
/// match at the start and end of the haystack.
|
|
pub(crate) fn into_anchored(self) -> ConfiguredHIR {
|
|
let hir = Hir::concat(vec![
|
|
Hir::look(hir::Look::Start),
|
|
self.hir,
|
|
Hir::look(hir::Look::End),
|
|
]);
|
|
ConfiguredHIR { config: self.config, hir }
|
|
}
|
|
|
|
/// Returns the "start line" anchor for this configuration.
|
|
fn line_anchor_start(&self) -> hir::Look {
|
|
if self.config.crlf {
|
|
hir::Look::StartCRLF
|
|
} else {
|
|
hir::Look::StartLF
|
|
}
|
|
}
|
|
|
|
/// Returns the "end line" anchor for this configuration.
|
|
fn line_anchor_end(&self) -> hir::Look {
|
|
if self.config.crlf {
|
|
hir::Look::EndCRLF
|
|
} else {
|
|
hir::Look::EndLF
|
|
}
|
|
}
|
|
}
|
|
|
|
/// This increments the index of every capture group in the given hir by 1. If
|
|
/// any increment results in an overflow, then an error is returned.
|
|
fn renumber_capture_indices(hir: Hir) -> Result<Hir, Error> {
|
|
Ok(match hir.into_kind() {
|
|
HirKind::Empty => Hir::empty(),
|
|
HirKind::Literal(hir::Literal(lit)) => Hir::literal(lit),
|
|
HirKind::Class(cls) => Hir::class(cls),
|
|
HirKind::Look(x) => Hir::look(x),
|
|
HirKind::Repetition(mut x) => {
|
|
x.sub = Box::new(renumber_capture_indices(*x.sub)?);
|
|
Hir::repetition(x)
|
|
}
|
|
HirKind::Capture(mut cap) => {
|
|
cap.index = match cap.index.checked_add(1) {
|
|
Some(index) => index,
|
|
None => {
|
|
// This error message kind of sucks, but it's probably
|
|
// impossible for it to happen. The only way a capture
|
|
// index can overflow addition is if the regex is huge
|
|
// (or something else has gone horribly wrong).
|
|
let msg = "could not renumber capture index, too big";
|
|
return Err(Error::any(msg));
|
|
}
|
|
};
|
|
cap.sub = Box::new(renumber_capture_indices(*cap.sub)?);
|
|
Hir::capture(cap)
|
|
}
|
|
HirKind::Concat(subs) => {
|
|
let subs = subs
|
|
.into_iter()
|
|
.map(|sub| renumber_capture_indices(sub))
|
|
.collect::<Result<Vec<Hir>, Error>>()?;
|
|
Hir::concat(subs)
|
|
}
|
|
HirKind::Alternation(subs) => {
|
|
let subs = subs
|
|
.into_iter()
|
|
.map(|sub| renumber_capture_indices(sub))
|
|
.collect::<Result<Vec<Hir>, Error>>()?;
|
|
Hir::alternation(subs)
|
|
}
|
|
})
|
|
}
|
|
|
|
/// Returns true if the given literal string contains any byte from the line
|
|
/// terminator given.
|
|
fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool {
|
|
if lineterm.is_crlf() {
|
|
literal.as_bytes().iter().copied().any(|b| b == b'\r' || b == b'\n')
|
|
} else {
|
|
literal.as_bytes().iter().copied().any(|b| b == lineterm.as_byte())
|
|
}
|
|
}
|