mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-03-17 20:28:03 +02:00
Basically, unless the -a/--text flag is given, it is generally always an error to search for an explicit NUL byte because the binary detection will prevent it from matching. Fixes #1838
361 lines
14 KiB
Rust
361 lines
14 KiB
Rust
use {
|
|
grep_matcher::{ByteSet, LineTerminator},
|
|
regex_automata::meta::Regex,
|
|
regex_syntax::{
|
|
ast,
|
|
hir::{self, Hir},
|
|
},
|
|
};
|
|
|
|
use crate::{
|
|
ast::AstAnalysis, ban, error::Error, non_matching::non_matching_bytes,
|
|
strip::strip_from_match,
|
|
};
|
|
|
|
/// Config represents the configuration of a regex matcher in this crate.
|
|
/// The configuration is itself a rough combination of the knobs found in
|
|
/// the `regex` crate itself, along with additional `grep-matcher` specific
|
|
/// options.
|
|
///
|
|
/// The configuration can be used to build a "configured" HIR expression. A
|
|
/// configured HIR expression is an HIR expression that is aware of the
|
|
/// configuration which generated it, and provides transformation on that HIR
|
|
/// such that the configuration is preserved.
|
|
#[derive(Clone, Debug)]
|
|
pub(crate) struct Config {
|
|
pub(crate) case_insensitive: bool,
|
|
pub(crate) case_smart: bool,
|
|
pub(crate) multi_line: bool,
|
|
pub(crate) dot_matches_new_line: bool,
|
|
pub(crate) swap_greed: bool,
|
|
pub(crate) ignore_whitespace: bool,
|
|
pub(crate) unicode: bool,
|
|
pub(crate) octal: bool,
|
|
pub(crate) size_limit: usize,
|
|
pub(crate) dfa_size_limit: usize,
|
|
pub(crate) nest_limit: u32,
|
|
pub(crate) line_terminator: Option<LineTerminator>,
|
|
pub(crate) ban: Option<u8>,
|
|
pub(crate) crlf: bool,
|
|
pub(crate) word: bool,
|
|
pub(crate) fixed_strings: bool,
|
|
pub(crate) whole_line: bool,
|
|
}
|
|
|
|
impl Default for Config {
|
|
fn default() -> Config {
|
|
Config {
|
|
case_insensitive: false,
|
|
case_smart: false,
|
|
multi_line: false,
|
|
dot_matches_new_line: false,
|
|
swap_greed: false,
|
|
ignore_whitespace: false,
|
|
unicode: true,
|
|
octal: false,
|
|
// These size limits are much bigger than what's in the regex
|
|
// crate by default.
|
|
size_limit: 100 * (1 << 20),
|
|
dfa_size_limit: 1000 * (1 << 20),
|
|
nest_limit: 250,
|
|
line_terminator: None,
|
|
ban: None,
|
|
crlf: false,
|
|
word: false,
|
|
fixed_strings: false,
|
|
whole_line: false,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Config {
|
|
/// Use this configuration to build an HIR from the given patterns. The HIR
|
|
/// returned corresponds to a single regex that is an alternation of the
|
|
/// patterns given.
|
|
pub(crate) fn build_many<P: AsRef<str>>(
|
|
&self,
|
|
patterns: &[P],
|
|
) -> Result<ConfiguredHIR, Error> {
|
|
ConfiguredHIR::new(self.clone(), patterns)
|
|
}
|
|
|
|
/// Accounting for the `smart_case` config knob, return true if and only if
|
|
/// this pattern should be matched case insensitively.
|
|
fn is_case_insensitive(&self, analysis: &AstAnalysis) -> bool {
|
|
if self.case_insensitive {
|
|
return true;
|
|
}
|
|
if !self.case_smart {
|
|
return false;
|
|
}
|
|
analysis.any_literal() && !analysis.any_uppercase()
|
|
}
|
|
|
|
/// Returns whether the given patterns should be treated as "fixed strings"
|
|
/// literals. This is different from just querying the `fixed_strings` knob
|
|
/// in that if the knob is false, this will still return true in some cases
|
|
/// if the patterns are themselves indistinguishable from literals.
|
|
///
|
|
/// The main idea here is that if this returns true, then it is safe
|
|
/// to build an `regex_syntax::hir::Hir` value directly from the given
|
|
/// patterns as an alternation of `hir::Literal` values.
|
|
fn is_fixed_strings<P: AsRef<str>>(&self, patterns: &[P]) -> bool {
|
|
// When these are enabled, we really need to parse the patterns and
|
|
// let them go through the standard HIR translation process in order
|
|
// for case folding transforms to be applied.
|
|
if self.case_insensitive || self.case_smart {
|
|
return false;
|
|
}
|
|
// Even if whole_line or word is enabled, both of those things can
|
|
// be implemented by wrapping the Hir generated by an alternation of
|
|
// fixed string literals. So for here at least, we don't care about the
|
|
// word or whole_line settings.
|
|
if self.fixed_strings {
|
|
// ... but if any literal contains a line terminator, then we've
|
|
// got to bail out because this will ultimately result in an error.
|
|
if let Some(lineterm) = self.line_terminator {
|
|
for p in patterns.iter() {
|
|
if has_line_terminator(lineterm, p.as_ref()) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
// In this case, the only way we can hand construct the Hir is if none
|
|
// of the patterns contain meta characters. If they do, then we need to
|
|
// send them through the standard parsing/translation process.
|
|
for p in patterns.iter() {
|
|
let p = p.as_ref();
|
|
if p.chars().any(regex_syntax::is_meta_character) {
|
|
return false;
|
|
}
|
|
// Same deal as when fixed_strings is set above. If the pattern has
|
|
// a line terminator anywhere, then we need to bail out and let
|
|
// an error occur.
|
|
if let Some(lineterm) = self.line_terminator {
|
|
if has_line_terminator(lineterm, p) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
true
|
|
}
|
|
}
|
|
|
|
/// A "configured" HIR expression, which is aware of the configuration which
|
|
/// produced this HIR.
|
|
///
|
|
/// Since the configuration is tracked, values with this type can be
|
|
/// transformed into other HIR expressions (or regular expressions) in a way
|
|
/// that preserves the configuration. For example, the `fast_line_regex`
|
|
/// method will apply literal extraction to the inner HIR and use that to build
|
|
/// a new regex that matches the extracted literals in a way that is
|
|
/// consistent with the configuration that produced this HIR. For example, the
|
|
/// size limits set on the configured HIR will be propagated out to any
|
|
/// subsequently constructed HIR or regular expression.
|
|
#[derive(Clone, Debug)]
|
|
pub(crate) struct ConfiguredHIR {
|
|
config: Config,
|
|
hir: Hir,
|
|
}
|
|
|
|
impl ConfiguredHIR {
|
|
/// Parse the given patterns into a single HIR expression that represents
|
|
/// an alternation of the patterns given.
|
|
fn new<P: AsRef<str>>(
|
|
config: Config,
|
|
patterns: &[P],
|
|
) -> Result<ConfiguredHIR, Error> {
|
|
let hir = if config.is_fixed_strings(patterns) {
|
|
let mut alts = vec![];
|
|
for p in patterns.iter() {
|
|
alts.push(Hir::literal(p.as_ref().as_bytes()));
|
|
}
|
|
log::debug!(
|
|
"assembling HIR from {} fixed string literals",
|
|
alts.len()
|
|
);
|
|
let hir = Hir::alternation(alts);
|
|
hir
|
|
} else {
|
|
let mut alts = vec![];
|
|
for p in patterns.iter() {
|
|
alts.push(if config.fixed_strings {
|
|
format!("(?:{})", regex_syntax::escape(p.as_ref()))
|
|
} else {
|
|
format!("(?:{})", p.as_ref())
|
|
});
|
|
}
|
|
let pattern = alts.join("|");
|
|
let ast = ast::parse::ParserBuilder::new()
|
|
.nest_limit(config.nest_limit)
|
|
.octal(config.octal)
|
|
.ignore_whitespace(config.ignore_whitespace)
|
|
.build()
|
|
.parse(&pattern)
|
|
.map_err(Error::generic)?;
|
|
let analysis = AstAnalysis::from_ast(&ast);
|
|
let mut hir = hir::translate::TranslatorBuilder::new()
|
|
.utf8(false)
|
|
.case_insensitive(config.is_case_insensitive(&analysis))
|
|
.multi_line(config.multi_line)
|
|
.dot_matches_new_line(config.dot_matches_new_line)
|
|
.crlf(config.crlf)
|
|
.swap_greed(config.swap_greed)
|
|
.unicode(config.unicode)
|
|
.build()
|
|
.translate(&pattern, &ast)
|
|
.map_err(Error::generic)?;
|
|
if let Some(byte) = config.ban {
|
|
ban::check(&hir, byte)?;
|
|
}
|
|
// We don't need to do this for the fixed-strings case above
|
|
// because is_fixed_strings will return false if any pattern
|
|
// contains a line terminator. Therefore, we don't need to strip
|
|
// it.
|
|
//
|
|
// We go to some pains to avoid doing this in the fixed-strings
|
|
// case because this can result in building a new HIR when ripgrep
|
|
// is given a huge set of literals to search for. And this can
|
|
// actually take a little time. It's not huge, but it's noticeable.
|
|
hir = match config.line_terminator {
|
|
None => hir,
|
|
Some(line_term) => strip_from_match(hir, line_term)?,
|
|
};
|
|
hir
|
|
};
|
|
Ok(ConfiguredHIR { config, hir })
|
|
}
|
|
|
|
/// Return a reference to the underlying configuration.
|
|
pub(crate) fn config(&self) -> &Config {
|
|
&self.config
|
|
}
|
|
|
|
/// Return a reference to the underyling HIR.
|
|
pub(crate) fn hir(&self) -> &Hir {
|
|
&self.hir
|
|
}
|
|
|
|
/// Convert this HIR to a regex that can be used for matching.
|
|
pub(crate) fn to_regex(&self) -> Result<Regex, Error> {
|
|
let meta = Regex::config()
|
|
.utf8_empty(false)
|
|
.nfa_size_limit(Some(self.config.size_limit))
|
|
// We don't expose a knob for this because the one-pass DFA is
|
|
// usually not a perf bottleneck for ripgrep. But we give it some
|
|
// extra room than the default.
|
|
.onepass_size_limit(Some(10 * (1 << 20)))
|
|
// Same deal here. The default limit for full DFAs is VERY small,
|
|
// but with ripgrep we can afford to spend a bit more time on
|
|
// building them I think.
|
|
.dfa_size_limit(Some(1 * (1 << 20)))
|
|
.dfa_state_limit(Some(1_000))
|
|
.hybrid_cache_capacity(self.config.dfa_size_limit);
|
|
Regex::builder()
|
|
.configure(meta)
|
|
.build_from_hir(&self.hir)
|
|
.map_err(Error::regex)
|
|
}
|
|
|
|
/// Compute the set of non-matching bytes for this HIR expression.
|
|
pub(crate) fn non_matching_bytes(&self) -> ByteSet {
|
|
non_matching_bytes(&self.hir)
|
|
}
|
|
|
|
/// Returns the line terminator configured on this expression.
|
|
///
|
|
/// When we have beginning/end anchors (NOT line anchors), the fast line
|
|
/// searching path isn't quite correct. Or at least, doesn't match the slow
|
|
/// path. Namely, the slow path strips line terminators while the fast path
|
|
/// does not. Since '$' (when multi-line mode is disabled) doesn't match at
|
|
/// line boundaries, the existence of a line terminator might cause it to
|
|
/// not match when it otherwise would with the line terminator stripped.
|
|
///
|
|
/// Since searching with text anchors is exceptionally rare in the context
|
|
/// of line oriented searching (multi-line mode is basically always
|
|
/// enabled), we just disable this optimization when there are text
|
|
/// anchors. We disable it by not returning a line terminator, since
|
|
/// without a line terminator, the fast search path can't be executed.
|
|
///
|
|
/// Actually, the above is no longer quite correct. Later on, another
|
|
/// optimization was added where if the line terminator was in the set of
|
|
/// bytes that was guaranteed to never be part of a match, then the higher
|
|
/// level search infrastructure assumes that the fast line-by-line search
|
|
/// path can still be taken. This optimization applies when multi-line
|
|
/// search (not multi-line mode) is enabled. In that case, there is no
|
|
/// configured line terminator since the regex is permitted to match a
|
|
/// line terminator. But if the regex is guaranteed to never match across
|
|
/// multiple lines despite multi-line search being requested, we can still
|
|
/// do the faster and more flexible line-by-line search. This is why the
|
|
/// non-matching extraction routine removes `\n` when `\A` and `\z` are
|
|
/// present even though that's not quite correct...
|
|
///
|
|
/// See: <https://github.com/BurntSushi/ripgrep/issues/2260>
|
|
pub(crate) fn line_terminator(&self) -> Option<LineTerminator> {
|
|
if self.hir.properties().look_set().contains_anchor_haystack() {
|
|
None
|
|
} else {
|
|
self.config.line_terminator
|
|
}
|
|
}
|
|
|
|
/// Turns this configured HIR into an equivalent one, but where it must
|
|
/// match at the start and end of a line.
|
|
pub(crate) fn into_whole_line(self) -> ConfiguredHIR {
|
|
let line_anchor_start = Hir::look(self.line_anchor_start());
|
|
let line_anchor_end = Hir::look(self.line_anchor_end());
|
|
let hir =
|
|
Hir::concat(vec![line_anchor_start, self.hir, line_anchor_end]);
|
|
ConfiguredHIR { config: self.config, hir }
|
|
}
|
|
|
|
/// Turns this configured HIR into an equivalent one, but where it must
|
|
/// match at word boundaries.
|
|
pub(crate) fn into_word(self) -> ConfiguredHIR {
|
|
let hir = Hir::concat(vec![
|
|
Hir::look(if self.config.unicode {
|
|
hir::Look::WordStartHalfUnicode
|
|
} else {
|
|
hir::Look::WordStartHalfAscii
|
|
}),
|
|
self.hir,
|
|
Hir::look(if self.config.unicode {
|
|
hir::Look::WordEndHalfUnicode
|
|
} else {
|
|
hir::Look::WordEndHalfAscii
|
|
}),
|
|
]);
|
|
ConfiguredHIR { config: self.config, hir }
|
|
}
|
|
|
|
/// Returns the "start line" anchor for this configuration.
|
|
fn line_anchor_start(&self) -> hir::Look {
|
|
if self.config.crlf {
|
|
hir::Look::StartCRLF
|
|
} else {
|
|
hir::Look::StartLF
|
|
}
|
|
}
|
|
|
|
/// Returns the "end line" anchor for this configuration.
|
|
fn line_anchor_end(&self) -> hir::Look {
|
|
if self.config.crlf {
|
|
hir::Look::EndCRLF
|
|
} else {
|
|
hir::Look::EndLF
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Returns true if the given literal string contains any byte from the line
|
|
/// terminator given.
|
|
fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool {
|
|
if lineterm.is_crlf() {
|
|
literal.as_bytes().iter().copied().any(|b| b == b'\r' || b == b'\n')
|
|
} else {
|
|
literal.as_bytes().iter().copied().any(|b| b == lineterm.as_byte())
|
|
}
|
|
}
|