Fix a performance bug where using -w could result in very bad performance.

The specific issue is that -w causes the regex to be wrapped in Unicode word boundaries. Regrettably, Unicode word boundaries are the one thing our regex engine can't handle well in the presence of non-ASCII text. We work around its slowness by stripping word boundaries in some circumstances, and using the resulting expression as a way to produce match candidates that are then verified by the full original regex. This doesn't fix all cases, but it should fix all cases where -w is used.
2025-07-11 14:30:24 +02:00 · 2016-09-21 19:12:07 -04:00
parent 4d6b3c727e
commit 2a2b1506d4
3 changed files with 80 additions and 10 deletions
--- a/grep/src/lib.rs
+++ b/grep/src/lib.rs
@ -19,6 +19,7 @@ pub use search::{Grep, GrepBuilder, Iter, Match};
 mod literals;
 mod nonl;
 mod search;
 mod word_boundary;
 /// Result is a convenient type alias that fixes the type of the error to
 /// the `Error` type defined in this crate.
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@ -4,6 +4,8 @@ use syntax;
 use literals::LiteralSets;
 use nonl;
 use syntax::Expr;
 use word_boundary::strip_unicode_word_boundaries;
 use Result;
 /// A matched line.
@ -127,7 +129,25 @@ impl GrepBuilder {
    pub fn build(self) -> Result<Grep> {
        let expr = try!(self.parse());
        let literals = LiteralSets::create(&expr);
-        let re = try!(
+        let re = try!(self.regex(&expr));
        let required = literals.to_regex().or_else(|| {
            let expr = match strip_unicode_word_boundaries(&expr) {
                None => return None,
                Some(expr) => expr,
            };
            debug!("Stripped Unicode word boundaries. New AST:\n{:?}", expr);
            self.regex(&expr).ok()
        });
        Ok(Grep {
            re: re,
            required: required,
            opts: self.opts,
        })
    }
    /// Creates a new regex from the given expression with the current
    /// configuration.
    fn regex(&self, expr: &Expr) -> Result<Regex> {
        RegexBuilder::new(&expr.to_string())
            .case_insensitive(self.opts.case_insensitive)
            .multi_line(true)
@ -135,12 +155,7 @@ impl GrepBuilder {
            .size_limit(self.opts.size_limit)
            .dfa_size_limit(self.opts.dfa_size_limit)
            .compile()
-        );
+            .map_err(From::from)
        Ok(Grep {
            re: re,
            required: literals.to_regex(),
            opts: self.opts,
        })
    }
    /// Parses the underlying pattern and ensures the pattern can never match
--- a/grep/src/word_boundary.rs
+++ b/grep/src/word_boundary.rs
@ -0,0 +1,54 @@
 use syntax::Expr;
 /// Strips Unicode word boundaries from the given expression.
 ///
 /// The key invariant this maintains is that the expression returned will match
 /// *at least* every where the expression given will match. Namely, a match of
 /// the returned expression can report false positives but it will never report
 /// false negatives.
 ///
 /// If no word boundaries could be stripped, then None is returned.
 pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
    // The real reason we do this is because Unicode word boundaries are the
    // one thing that Rust's regex DFA engine can't handle. When it sees a
    // Unicode word boundary among non-ASCII text, it falls back to one of the
    // slower engines. We work around this limitation by attempting to use
    // a regex to find candidate matches without a Unicode word boundary. We'll
    // only then use the full (and slower) regex to confirm a candidate as a
    // match or not during search.
    use syntax::Expr::*;
    match *expr {
        Concat(ref es) if !es.is_empty() => {
            let first = is_unicode_word_boundary(&es[0]);
            let last = is_unicode_word_boundary(es.last().unwrap());
            // Be careful not to strip word boundaries if there are no other
            // expressions to match.
            match (first, last) {
                (true, false) if es.len() > 1 => {
                    Some(Concat(es[1..].to_vec()))
                }
                (false, true) if es.len() > 1 => {
                    Some(Concat(es[..es.len() - 1].to_vec()))
                }
                (true, true) if es.len() > 2 => {
                    Some(Concat(es[1..es.len() - 1].to_vec()))
                }
                _ => None,
            }
        }
        _ => None,
    }
 }
 /// Returns true if the given expression is a Unicode word boundary.
 fn is_unicode_word_boundary(expr: &Expr) -> bool {
    use syntax::Expr::*;
    match *expr {
        WordBoundary => true,
        NotWordBoundary => true,
        Group { ref e, .. } => is_unicode_word_boundary(e),
        _ => false,
    }
 }