mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-01-29 22:01:04 +02:00
Fix a performance bug where using -w could result in very bad performance.
The specific issue is that -w causes the regex to be wrapped in Unicode word boundaries. Regrettably, Unicode word boundaries are the one thing our regex engine can't handle well in the presence of non-ASCII text. We work around this slowness by stripping word boundaries in some circumstances and using the resulting expression to produce match candidates, which are then verified by the full original regex. This doesn't fix every pattern that contains Unicode word boundaries, but it should fix all cases where -w is used.
This commit is contained in:
parent
4d6b3c727e
commit
2a2b1506d4
@ -19,6 +19,7 @@ pub use search::{Grep, GrepBuilder, Iter, Match};
|
|||||||
mod literals;
|
mod literals;
|
||||||
mod nonl;
|
mod nonl;
|
||||||
mod search;
|
mod search;
|
||||||
|
mod word_boundary;
|
||||||
|
|
||||||
/// Result is a convenient type alias that fixes the type of the error to
|
/// Result is a convenient type alias that fixes the type of the error to
|
||||||
/// the `Error` type defined in this crate.
|
/// the `Error` type defined in this crate.
|
||||||
|
@ -4,6 +4,8 @@ use syntax;
|
|||||||
|
|
||||||
use literals::LiteralSets;
|
use literals::LiteralSets;
|
||||||
use nonl;
|
use nonl;
|
||||||
|
use syntax::Expr;
|
||||||
|
use word_boundary::strip_unicode_word_boundaries;
|
||||||
use Result;
|
use Result;
|
||||||
|
|
||||||
/// A matched line.
|
/// A matched line.
|
||||||
@ -127,7 +129,25 @@ impl GrepBuilder {
|
|||||||
pub fn build(self) -> Result<Grep> {
|
pub fn build(self) -> Result<Grep> {
|
||||||
let expr = try!(self.parse());
|
let expr = try!(self.parse());
|
||||||
let literals = LiteralSets::create(&expr);
|
let literals = LiteralSets::create(&expr);
|
||||||
let re = try!(
|
let re = try!(self.regex(&expr));
|
||||||
|
let required = literals.to_regex().or_else(|| {
|
||||||
|
let expr = match strip_unicode_word_boundaries(&expr) {
|
||||||
|
None => return None,
|
||||||
|
Some(expr) => expr,
|
||||||
|
};
|
||||||
|
debug!("Stripped Unicode word boundaries. New AST:\n{:?}", expr);
|
||||||
|
self.regex(&expr).ok()
|
||||||
|
});
|
||||||
|
Ok(Grep {
|
||||||
|
re: re,
|
||||||
|
required: required,
|
||||||
|
opts: self.opts,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a new regex from the given expression with the current
|
||||||
|
/// configuration.
|
||||||
|
fn regex(&self, expr: &Expr) -> Result<Regex> {
|
||||||
RegexBuilder::new(&expr.to_string())
|
RegexBuilder::new(&expr.to_string())
|
||||||
.case_insensitive(self.opts.case_insensitive)
|
.case_insensitive(self.opts.case_insensitive)
|
||||||
.multi_line(true)
|
.multi_line(true)
|
||||||
@ -135,12 +155,7 @@ impl GrepBuilder {
|
|||||||
.size_limit(self.opts.size_limit)
|
.size_limit(self.opts.size_limit)
|
||||||
.dfa_size_limit(self.opts.dfa_size_limit)
|
.dfa_size_limit(self.opts.dfa_size_limit)
|
||||||
.compile()
|
.compile()
|
||||||
);
|
.map_err(From::from)
|
||||||
Ok(Grep {
|
|
||||||
re: re,
|
|
||||||
required: literals.to_regex(),
|
|
||||||
opts: self.opts,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parses the underlying pattern and ensures the pattern can never match
|
/// Parses the underlying pattern and ensures the pattern can never match
|
||||||
|
54
grep/src/word_boundary.rs
Normal file
54
grep/src/word_boundary.rs
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
use syntax::Expr;
|
||||||
|
|
||||||
|
/// Strips Unicode word boundaries from the given expression.
|
||||||
|
///
|
||||||
|
/// The key invariant this maintains is that the expression returned will match
|
||||||
|
/// *at least* every where the expression given will match. Namely, a match of
|
||||||
|
/// the returned expression can report false positives but it will never report
|
||||||
|
/// false negatives.
|
||||||
|
///
|
||||||
|
/// If no word boundaries could be stripped, then None is returned.
|
||||||
|
pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
|
||||||
|
// The real reason we do this is because Unicode word boundaries are the
|
||||||
|
// one thing that Rust's regex DFA engine can't handle. When it sees a
|
||||||
|
// Unicode word boundary among non-ASCII text, it falls back to one of the
|
||||||
|
// slower engines. We work around this limitation by attempting to use
|
||||||
|
// a regex to find candidate matches without a Unicode word boundary. We'll
|
||||||
|
// only then use the full (and slower) regex to confirm a candidate as a
|
||||||
|
// match or not during search.
|
||||||
|
use syntax::Expr::*;
|
||||||
|
|
||||||
|
match *expr {
|
||||||
|
Concat(ref es) if !es.is_empty() => {
|
||||||
|
let first = is_unicode_word_boundary(&es[0]);
|
||||||
|
let last = is_unicode_word_boundary(es.last().unwrap());
|
||||||
|
// Be careful not to strip word boundaries if there are no other
|
||||||
|
// expressions to match.
|
||||||
|
match (first, last) {
|
||||||
|
(true, false) if es.len() > 1 => {
|
||||||
|
Some(Concat(es[1..].to_vec()))
|
||||||
|
}
|
||||||
|
(false, true) if es.len() > 1 => {
|
||||||
|
Some(Concat(es[..es.len() - 1].to_vec()))
|
||||||
|
}
|
||||||
|
(true, true) if es.len() > 2 => {
|
||||||
|
Some(Concat(es[1..es.len() - 1].to_vec()))
|
||||||
|
}
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if the given expression is a Unicode word boundary.
|
||||||
|
fn is_unicode_word_boundary(expr: &Expr) -> bool {
|
||||||
|
use syntax::Expr::*;
|
||||||
|
|
||||||
|
match *expr {
|
||||||
|
WordBoundary => true,
|
||||||
|
NotWordBoundary => true,
|
||||||
|
Group { ref e, .. } => is_unicode_word_boundary(e),
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user