use { grep_matcher::{ByteSet, LineTerminator}, regex_automata::meta::Regex, regex_syntax::{ ast, hir::{self, Hir}, }, }; use crate::{ ast::AstAnalysis, ban, error::Error, non_matching::non_matching_bytes, strip::strip_from_match, }; /// Config represents the configuration of a regex matcher in this crate. /// The configuration is itself a rough combination of the knobs found in /// the `regex` crate itself, along with additional `grep-matcher` specific /// options. /// /// The configuration can be used to build a "configured" HIR expression. A /// configured HIR expression is an HIR expression that is aware of the /// configuration which generated it, and provides transformation on that HIR /// such that the configuration is preserved. #[derive(Clone, Debug)] pub(crate) struct Config { pub(crate) case_insensitive: bool, pub(crate) case_smart: bool, pub(crate) multi_line: bool, pub(crate) dot_matches_new_line: bool, pub(crate) swap_greed: bool, pub(crate) ignore_whitespace: bool, pub(crate) unicode: bool, pub(crate) octal: bool, pub(crate) size_limit: usize, pub(crate) dfa_size_limit: usize, pub(crate) nest_limit: u32, pub(crate) line_terminator: Option, pub(crate) ban: Option, pub(crate) crlf: bool, pub(crate) word: bool, pub(crate) fixed_strings: bool, pub(crate) whole_line: bool, } impl Default for Config { fn default() -> Config { Config { case_insensitive: false, case_smart: false, multi_line: false, dot_matches_new_line: false, swap_greed: false, ignore_whitespace: false, unicode: true, octal: false, // These size limits are much bigger than what's in the regex // crate by default. size_limit: 100 * (1 << 20), dfa_size_limit: 1000 * (1 << 20), nest_limit: 250, line_terminator: None, ban: None, crlf: false, word: false, fixed_strings: false, whole_line: false, } } } impl Config { /// Use this configuration to build an HIR from the given patterns. The HIR /// returned corresponds to a single regex that is an alternation of the /// patterns given. pub(crate) fn build_many>( &self, patterns: &[P], ) -> Result { ConfiguredHIR::new(self.clone(), patterns) } /// Accounting for the `smart_case` config knob, return true if and only if /// this pattern should be matched case insensitively. fn is_case_insensitive(&self, analysis: &AstAnalysis) -> bool { if self.case_insensitive { return true; } if !self.case_smart { return false; } analysis.any_literal() && !analysis.any_uppercase() } /// Returns whether the given patterns should be treated as "fixed strings" /// literals. This is different from just querying the `fixed_strings` knob /// in that if the knob is false, this will still return true in some cases /// if the patterns are themselves indistinguishable from literals. /// /// The main idea here is that if this returns true, then it is safe /// to build an `regex_syntax::hir::Hir` value directly from the given /// patterns as an alternation of `hir::Literal` values. fn is_fixed_strings>(&self, patterns: &[P]) -> bool { // When these are enabled, we really need to parse the patterns and // let them go through the standard HIR translation process in order // for case folding transforms to be applied. if self.case_insensitive || self.case_smart { return false; } // Even if whole_line or word is enabled, both of those things can // be implemented by wrapping the Hir generated by an alternation of // fixed string literals. So for here at least, we don't care about the // word or whole_line settings. if self.fixed_strings { // ... but if any literal contains a line terminator, then we've // got to bail out because this will ultimately result in an error. if let Some(lineterm) = self.line_terminator { for p in patterns.iter() { if has_line_terminator(lineterm, p.as_ref()) { return false; } } } return true; } // In this case, the only way we can hand construct the Hir is if none // of the patterns contain meta characters. If they do, then we need to // send them through the standard parsing/translation process. for p in patterns.iter() { let p = p.as_ref(); if p.chars().any(regex_syntax::is_meta_character) { return false; } // Same deal as when fixed_strings is set above. If the pattern has // a line terminator anywhere, then we need to bail out and let // an error occur. if let Some(lineterm) = self.line_terminator { if has_line_terminator(lineterm, p) { return false; } } } true } } /// A "configured" HIR expression, which is aware of the configuration which /// produced this HIR. /// /// Since the configuration is tracked, values with this type can be /// transformed into other HIR expressions (or regular expressions) in a way /// that preserves the configuration. For example, the `fast_line_regex` /// method will apply literal extraction to the inner HIR and use that to build /// a new regex that matches the extracted literals in a way that is /// consistent with the configuration that produced this HIR. For example, the /// size limits set on the configured HIR will be propagated out to any /// subsequently constructed HIR or regular expression. #[derive(Clone, Debug)] pub(crate) struct ConfiguredHIR { config: Config, hir: Hir, } impl ConfiguredHIR { /// Parse the given patterns into a single HIR expression that represents /// an alternation of the patterns given. fn new>( config: Config, patterns: &[P], ) -> Result { let hir = if config.is_fixed_strings(patterns) { let mut alts = vec![]; for p in patterns.iter() { alts.push(Hir::literal(p.as_ref().as_bytes())); } log::debug!( "assembling HIR from {} fixed string literals", alts.len() ); let hir = Hir::alternation(alts); hir } else { let mut alts = vec![]; for p in patterns.iter() { alts.push(if config.fixed_strings { format!("(?:{})", regex_syntax::escape(p.as_ref())) } else { format!("(?:{})", p.as_ref()) }); } let pattern = alts.join("|"); let ast = ast::parse::ParserBuilder::new() .nest_limit(config.nest_limit) .octal(config.octal) .ignore_whitespace(config.ignore_whitespace) .build() .parse(&pattern) .map_err(Error::generic)?; let analysis = AstAnalysis::from_ast(&ast); let mut hir = hir::translate::TranslatorBuilder::new() .utf8(false) .case_insensitive(config.is_case_insensitive(&analysis)) .multi_line(config.multi_line) .dot_matches_new_line(config.dot_matches_new_line) .crlf(config.crlf) .swap_greed(config.swap_greed) .unicode(config.unicode) .build() .translate(&pattern, &ast) .map_err(Error::generic)?; if let Some(byte) = config.ban { ban::check(&hir, byte)?; } // We don't need to do this for the fixed-strings case above // because is_fixed_strings will return false if any pattern // contains a line terminator. Therefore, we don't need to strip // it. // // We go to some pains to avoid doing this in the fixed-strings // case because this can result in building a new HIR when ripgrep // is given a huge set of literals to search for. And this can // actually take a little time. It's not huge, but it's noticeable. hir = match config.line_terminator { None => hir, Some(line_term) => strip_from_match(hir, line_term)?, }; hir }; Ok(ConfiguredHIR { config, hir }) } /// Return a reference to the underlying configuration. pub(crate) fn config(&self) -> &Config { &self.config } /// Return a reference to the underyling HIR. pub(crate) fn hir(&self) -> &Hir { &self.hir } /// Convert this HIR to a regex that can be used for matching. pub(crate) fn to_regex(&self) -> Result { let meta = Regex::config() .utf8_empty(false) .nfa_size_limit(Some(self.config.size_limit)) // We don't expose a knob for this because the one-pass DFA is // usually not a perf bottleneck for ripgrep. But we give it some // extra room than the default. .onepass_size_limit(Some(10 * (1 << 20))) // Same deal here. The default limit for full DFAs is VERY small, // but with ripgrep we can afford to spend a bit more time on // building them I think. .dfa_size_limit(Some(1 * (1 << 20))) .dfa_state_limit(Some(1_000)) .hybrid_cache_capacity(self.config.dfa_size_limit); Regex::builder() .configure(meta) .build_from_hir(&self.hir) .map_err(Error::regex) } /// Compute the set of non-matching bytes for this HIR expression. pub(crate) fn non_matching_bytes(&self) -> ByteSet { non_matching_bytes(&self.hir) } /// Returns the line terminator configured on this expression. /// /// When we have beginning/end anchors (NOT line anchors), the fast line /// searching path isn't quite correct. Or at least, doesn't match the slow /// path. Namely, the slow path strips line terminators while the fast path /// does not. Since '$' (when multi-line mode is disabled) doesn't match at /// line boundaries, the existence of a line terminator might cause it to /// not match when it otherwise would with the line terminator stripped. /// /// Since searching with text anchors is exceptionally rare in the context /// of line oriented searching (multi-line mode is basically always /// enabled), we just disable this optimization when there are text /// anchors. We disable it by not returning a line terminator, since /// without a line terminator, the fast search path can't be executed. /// /// Actually, the above is no longer quite correct. Later on, another /// optimization was added where if the line terminator was in the set of /// bytes that was guaranteed to never be part of a match, then the higher /// level search infrastructure assumes that the fast line-by-line search /// path can still be taken. This optimization applies when multi-line /// search (not multi-line mode) is enabled. In that case, there is no /// configured line terminator since the regex is permitted to match a /// line terminator. But if the regex is guaranteed to never match across /// multiple lines despite multi-line search being requested, we can still /// do the faster and more flexible line-by-line search. This is why the /// non-matching extraction routine removes `\n` when `\A` and `\z` are /// present even though that's not quite correct... /// /// See: pub(crate) fn line_terminator(&self) -> Option { if self.hir.properties().look_set().contains_anchor_haystack() { None } else { self.config.line_terminator } } /// Turns this configured HIR into an equivalent one, but where it must /// match at the start and end of a line. pub(crate) fn into_whole_line(self) -> ConfiguredHIR { let line_anchor_start = Hir::look(self.line_anchor_start()); let line_anchor_end = Hir::look(self.line_anchor_end()); let hir = Hir::concat(vec![line_anchor_start, self.hir, line_anchor_end]); ConfiguredHIR { config: self.config, hir } } /// Turns this configured HIR into an equivalent one, but where it must /// match at word boundaries. pub(crate) fn into_word(self) -> ConfiguredHIR { let hir = Hir::concat(vec![ Hir::look(if self.config.unicode { hir::Look::WordStartHalfUnicode } else { hir::Look::WordStartHalfAscii }), self.hir, Hir::look(if self.config.unicode { hir::Look::WordEndHalfUnicode } else { hir::Look::WordEndHalfAscii }), ]); ConfiguredHIR { config: self.config, hir } } /// Returns the "start line" anchor for this configuration. fn line_anchor_start(&self) -> hir::Look { if self.config.crlf { hir::Look::StartCRLF } else { hir::Look::StartLF } } /// Returns the "end line" anchor for this configuration. fn line_anchor_end(&self) -> hir::Look { if self.config.crlf { hir::Look::EndCRLF } else { hir::Look::EndLF } } } /// Returns true if the given literal string contains any byte from the line /// terminator given. fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool { if lineterm.is_crlf() { literal.as_bytes().iter().copied().any(|b| b == b'\r' || b == b'\n') } else { literal.as_bytes().iter().copied().any(|b| b == lineterm.as_byte()) } }