ripgrep/crates/regex/src/config.rs

use {
    grep_matcher::{ByteSet, LineTerminator},
    regex_automata::meta::Regex,
    regex_syntax::{
        ast,
        hir::{self, Hir},
    },
};

use crate::{
    ast::AstAnalysis, ban, error::Error, non_matching::non_matching_bytes,
    strip::strip_from_match,
};

/// Config represents the configuration of a regex matcher in this crate.
/// The configuration is itself a rough combination of the knobs found in
/// the `regex` crate itself, along with additional `grep-matcher` specific
/// options.
///
/// The configuration can be used to build a "configured" HIR expression. A
/// configured HIR expression is an HIR expression that is aware of the
/// configuration which generated it, and provides transformation on that HIR
/// such that the configuration is preserved.
#[derive(Clone, Debug)]
pub(crate) struct Config {
    pub(crate) case_insensitive: bool,
    pub(crate) case_smart: bool,
    pub(crate) multi_line: bool,
    pub(crate) dot_matches_new_line: bool,
    pub(crate) swap_greed: bool,
    pub(crate) ignore_whitespace: bool,
    pub(crate) unicode: bool,
    pub(crate) octal: bool,
    pub(crate) size_limit: usize,
    pub(crate) dfa_size_limit: usize,
    pub(crate) nest_limit: u32,
    pub(crate) line_terminator: Option<LineTerminator>,
    pub(crate) ban: Option<u8>,
    pub(crate) crlf: bool,
    pub(crate) word: bool,
    pub(crate) fixed_strings: bool,
    pub(crate) whole_line: bool,
}

impl Default for Config {
    fn default() -> Config {
        Config {
            case_insensitive: false,
            case_smart: false,
            multi_line: false,
            dot_matches_new_line: false,
            swap_greed: false,
            ignore_whitespace: false,
            unicode: true,
            octal: false,
            // These size limits are much bigger than what's in the regex
            // crate by default.
            size_limit: 100 * (1 << 20),
            dfa_size_limit: 1000 * (1 << 20),
            nest_limit: 250,
            line_terminator: None,
            ban: None,
            crlf: false,
            word: false,
            fixed_strings: false,
            whole_line: false,
        }
    }
}

impl Config {
    /// Use this configuration to build an HIR from the given patterns. The HIR
    /// returned corresponds to a single regex that is an alternation of the
    /// patterns given.
    pub(crate) fn build_many<P: AsRef<str>>(
        &self,
        patterns: &[P],
    ) -> Result<ConfiguredHIR, Error> {
        ConfiguredHIR::new(self.clone(), patterns)
    }

    /// Accounting for the `smart_case` config knob, return true if and only if
    /// this pattern should be matched case insensitively.
    fn is_case_insensitive(&self, analysis: &AstAnalysis) -> bool {
        if self.case_insensitive {
            return true;
        }
        if !self.case_smart {
            return false;
        }
        analysis.any_literal() && !analysis.any_uppercase()
    }

    /// Returns whether the given patterns should be treated as "fixed strings"
    /// literals. This is different from just querying the `fixed_strings` knob
    /// in that if the knob is false, this will still return true in some cases
    /// if the patterns are themselves indistinguishable from literals.
    ///
    /// The main idea here is that if this returns true, then it is safe
    /// to build an `regex_syntax::hir::Hir` value directly from the given
    /// patterns as an alternation of `hir::Literal` values.
    fn is_fixed_strings<P: AsRef<str>>(&self, patterns: &[P]) -> bool {
        // When these are enabled, we really need to parse the patterns and
        // let them go through the standard HIR translation process in order
        // for case folding transforms to be applied.
        if self.case_insensitive || self.case_smart {
            return false;
        }
        // Even if whole_line or word is enabled, both of those things can
        // be implemented by wrapping the Hir generated by an alternation of
        // fixed string literals. So for here at least, we don't care about the
        // word or whole_line settings.
        if self.fixed_strings {
            // ... but if any literal contains a line terminator, then we've
            // got to bail out because this will ultimately result in an error.
            if let Some(lineterm) = self.line_terminator {
                for p in patterns.iter() {
                    if has_line_terminator(lineterm, p.as_ref()) {
                        return false;
                    }
                }
            }
            return true;
        }
        // In this case, the only way we can hand construct the Hir is if none
        // of the patterns contain meta characters. If they do, then we need to
        // send them through the standard parsing/translation process.
        for p in patterns.iter() {
            let p = p.as_ref();
            if p.chars().any(regex_syntax::is_meta_character) {
                return false;
            }
            // Same deal as when fixed_strings is set above. If the pattern has
            // a line terminator anywhere, then we need to bail out and let
            // an error occur.
            if let Some(lineterm) = self.line_terminator {
                if has_line_terminator(lineterm, p) {
                    return false;
                }
            }
        }
        true
    }
}

/// A "configured" HIR expression, which is aware of the configuration which
/// produced this HIR.
///
/// Since the configuration is tracked, values with this type can be
/// transformed into other HIR expressions (or regular expressions) in a way
/// that preserves the configuration. For example, the `fast_line_regex`
/// method will apply literal extraction to the inner HIR and use that to build
/// a new regex that matches the extracted literals in a way that is
/// consistent with the configuration that produced this HIR. For example, the
/// size limits set on the configured HIR will be propagated out to any
/// subsequently constructed HIR or regular expression.
#[derive(Clone, Debug)]
pub(crate) struct ConfiguredHIR {
    config: Config,
    hir: Hir,
}

impl ConfiguredHIR {
    /// Parse the given patterns into a single HIR expression that represents
    /// an alternation of the patterns given.
    fn new<P: AsRef<str>>(
        config: Config,
        patterns: &[P],
    ) -> Result<ConfiguredHIR, Error> {
        let hir = if config.is_fixed_strings(patterns) {
            let mut alts = vec![];
            for p in patterns.iter() {
                alts.push(Hir::literal(p.as_ref().as_bytes()));
            }
            log::debug!(
                "assembling HIR from {} fixed string literals",
                alts.len()
            );
            let hir = Hir::alternation(alts);
            hir
        } else {
            let mut alts = vec![];
            for p in patterns.iter() {
                alts.push(if config.fixed_strings {
                    format!("(?:{})", regex_syntax::escape(p.as_ref()))
                } else {
                    format!("(?:{})", p.as_ref())
                });
            }
            let pattern = alts.join("|");
            let ast = ast::parse::ParserBuilder::new()
                .nest_limit(config.nest_limit)
                .octal(config.octal)
                .ignore_whitespace(config.ignore_whitespace)
                .build()
                .parse(&pattern)
                .map_err(Error::generic)?;
            let analysis = AstAnalysis::from_ast(&ast);
            let mut hir = hir::translate::TranslatorBuilder::new()
                .utf8(false)
                .case_insensitive(config.is_case_insensitive(&analysis))
                .multi_line(config.multi_line)
                .dot_matches_new_line(config.dot_matches_new_line)
                .crlf(config.crlf)
                .swap_greed(config.swap_greed)
                .unicode(config.unicode)
                .build()
                .translate(&pattern, &ast)
                .map_err(Error::generic)?;
            if let Some(byte) = config.ban {
                ban::check(&hir, byte)?;
            }
            // We don't need to do this for the fixed-strings case above
            // because is_fixed_strings will return false if any pattern
            // contains a line terminator. Therefore, we don't need to strip
            // it.
            //
            // We go to some pains to avoid doing this in the fixed-strings
            // case because this can result in building a new HIR when ripgrep
            // is given a huge set of literals to search for. And this can
            // actually take a little time. It's not huge, but it's noticeable.
            hir = match config.line_terminator {
                None => hir,
                Some(line_term) => strip_from_match(hir, line_term)?,
            };
            hir
        };
        Ok(ConfiguredHIR { config, hir })
    }

    /// Return a reference to the underlying configuration.
    pub(crate) fn config(&self) -> &Config {
        &self.config
    }

    /// Return a reference to the underyling HIR.
    pub(crate) fn hir(&self) -> &Hir {
        &self.hir
    }

    /// Convert this HIR to a regex that can be used for matching.
    pub(crate) fn to_regex(&self) -> Result<Regex, Error> {
        let meta = Regex::config()
            .utf8_empty(false)
            .nfa_size_limit(Some(self.config.size_limit))
            // We don't expose a knob for this because the one-pass DFA is
            // usually not a perf bottleneck for ripgrep. But we give it some
            // extra room than the default.
            .onepass_size_limit(Some(10 * (1 << 20)))
            // Same deal here. The default limit for full DFAs is VERY small,
            // but with ripgrep we can afford to spend a bit more time on
            // building them I think.
            .dfa_size_limit(Some(1 * (1 << 20)))
            .dfa_state_limit(Some(1_000))
            .hybrid_cache_capacity(self.config.dfa_size_limit);
        Regex::builder()
            .configure(meta)
            .build_from_hir(&self.hir)
            .map_err(Error::regex)
    }

    /// Compute the set of non-matching bytes for this HIR expression.
    pub(crate) fn non_matching_bytes(&self) -> ByteSet {
        non_matching_bytes(&self.hir)
    }

    /// Returns the line terminator configured on this expression.
    ///
    /// When we have beginning/end anchors (NOT line anchors), the fast line
    /// searching path isn't quite correct. Or at least, doesn't match the slow
    /// path. Namely, the slow path strips line terminators while the fast path
    /// does not. Since '$' (when multi-line mode is disabled) doesn't match at
    /// line boundaries, the existence of a line terminator might cause it to
    /// not match when it otherwise would with the line terminator stripped.
    ///
    /// Since searching with text anchors is exceptionally rare in the context
    /// of line oriented searching (multi-line mode is basically always
    /// enabled), we just disable this optimization when there are text
    /// anchors. We disable it by not returning a line terminator, since
    /// without a line terminator, the fast search path can't be executed.
    ///
    /// Actually, the above is no longer quite correct. Later on, another
    /// optimization was added where if the line terminator was in the set of
    /// bytes that was guaranteed to never be part of a match, then the higher
    /// level search infrastructure assumes that the fast line-by-line search
    /// path can still be taken. This optimization applies when multi-line
    /// search (not multi-line mode) is enabled. In that case, there is no
    /// configured line terminator since the regex is permitted to match a
    /// line terminator. But if the regex is guaranteed to never match across
    /// multiple lines despite multi-line search being requested, we can still
    /// do the faster and more flexible line-by-line search. This is why the
    /// non-matching extraction routine removes `\n` when `\A` and `\z` are
    /// present even though that's not quite correct...
    ///
    /// See: <https://github.com/BurntSushi/ripgrep/issues/2260>
    pub(crate) fn line_terminator(&self) -> Option<LineTerminator> {
        if self.hir.properties().look_set().contains_anchor_haystack() {
            None
        } else {
            self.config.line_terminator
        }
    }

    /// Turns this configured HIR into an equivalent one, but where it must
    /// match at the start and end of a line.
    pub(crate) fn into_whole_line(self) -> ConfiguredHIR {
        let line_anchor_start = Hir::look(self.line_anchor_start());
        let line_anchor_end = Hir::look(self.line_anchor_end());
        let hir =
            Hir::concat(vec![line_anchor_start, self.hir, line_anchor_end]);
        ConfiguredHIR { config: self.config, hir }
    }

    /// Turns this configured HIR into an equivalent one, but where it must
    /// match at word boundaries.
    pub(crate) fn into_word(self) -> ConfiguredHIR {
        let hir = Hir::concat(vec![
            Hir::look(if self.config.unicode {
                hir::Look::WordStartHalfUnicode
            } else {
                hir::Look::WordStartHalfAscii
            }),
            self.hir,
            Hir::look(if self.config.unicode {
                hir::Look::WordEndHalfUnicode
            } else {
                hir::Look::WordEndHalfAscii
            }),
        ]);
        ConfiguredHIR { config: self.config, hir }
    }

    /// Returns the "start line" anchor for this configuration.
    fn line_anchor_start(&self) -> hir::Look {
        if self.config.crlf {
            hir::Look::StartCRLF
        } else {
            hir::Look::StartLF
        }
    }

    /// Returns the "end line" anchor for this configuration.
    fn line_anchor_end(&self) -> hir::Look {
        if self.config.crlf {
            hir::Look::EndCRLF
        } else {
            hir::Look::EndLF
        }
    }
}

/// Returns true if the given literal string contains any byte from the line
/// terminator given.
fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool {
    if lineterm.is_crlf() {
        literal.as_bytes().iter().copied().any(|b| b == b'\r' || b == b'\n')
    } else {
        literal.as_bytes().iter().copied().any(|b| b == lineterm.as_byte())
    }
}