1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-03-03 14:32:22 +02:00

grep-regex: improve literal detection with -w

When the -w/--word-regexp flag was used, ripgrep would in many cases fail to
apply literal optimizations. This occurs specifically when the regex
given by the user is an alternation of literals with no common prefixes
or suffixes, e.g.,

    rg -w 'foo|bar|baz|quux'

In this case, the inner literal detector fails. Normally, this would
result in literal prefixes being detected by the regex engine. But
because of the -w/--word-regexp flag, the actual regex that we run ends
up looking like this:

    (^|\W)(foo|bar|baz|quux)($|\W)

which of course defeats any prefix or suffix literal optimizations in
the regex crate's somewhat naive extractor. (A better extractor could
still do literal optimizations in the above case.)

So this commit fixes this by falling back to prefix or suffix literals
when they're available instead of prematurely giving up and assuming the
regex engine will do the rest.
This commit is contained in:
Andrew Gallant 2020-02-16 10:36:38 -05:00
parent ad97e9c93f
commit 6a0e0147e0
3 changed files with 60 additions and 2 deletions

View File

@ -10,6 +10,8 @@ Performance improvements:
Improve inner literal detection to cover more cases more effectively.
e.g., ` +Sherlock Holmes +` now has ` Sherlock Holmes ` extracted instead
of ` `.
* PERF:
Improve literal detection when the `-w/--word-regexp` flag is used.
Feature enhancements:

View File

@ -110,7 +110,63 @@ impl LiteralSets {
// We're matching raw bytes, so disable Unicode mode.
Some(format!("(?-u:{})", alts.join("|")))
} else if lit.is_empty() {
None
// If we're here, then we have no LCP. No LCS. And no detected
// inner required literals. In theory this shouldn't happen, but
// the inner literal detector isn't as nice as we hope and doesn't
// actually support returning a set of alternating required
// literals. (Instead, it only returns a set where EVERY literal
// in it is required. It cannot currently express "either P or Q
// is required.")
//
// In this case, it is possible that we still have meaningful
// prefixes or suffixes to use. So we look for the set of literals
// with the highest minimum length and use that to build our "fast"
// regex.
//
// This manifests in fairly common scenarios. e.g.,
//
// rg -w 'foo|bar|baz|quux'
//
// Normally, without the `-w`, the regex engine itself would
// detect the prefix correctly. Unfortunately, the `-w` option
// turns the regex into something like this:
//
// rg '(^|\W)(foo|bar|baz|quux)($|\W)'
//
// Which will defeat all prefix and suffix literal optimizations.
// (Not in theory---it could be better. But the current
// implementation isn't good enough.) ... So we make up for it
// here.
let p_min_len = self.prefixes.min_len();
let s_min_len = self.suffixes.min_len();
let lits = match (p_min_len, s_min_len) {
(None, None) => return None,
(Some(_), None) => {
debug!("prefix literals found");
self.prefixes.literals()
}
(None, Some(_)) => {
debug!("suffix literals found");
self.suffixes.literals()
}
(Some(p), Some(s)) => {
if p >= s {
debug!("prefix literals found");
self.prefixes.literals()
} else {
debug!("suffix literals found");
self.suffixes.literals()
}
}
};
debug!("prefix/suffix literals found: {:?}", lits);
let alts: Vec<String> = lits
.into_iter()
.map(|x| util::bytes_to_regex(x))
.collect();
// We're matching raw bytes, so disable Unicode mode.
Some(format!("(?-u:{})", alts.join("|")))
} else {
debug!("required literal found: {:?}", util::show_bytes(lit));
Some(format!("(?-u:{})", util::bytes_to_regex(&lit)))

View File

@ -49,7 +49,7 @@ impl RegexMatcherBuilder {
let fast_line_regex = chir.fast_line_regex()?;
let non_matching_bytes = chir.non_matching_bytes();
if let Some(ref re) = fast_line_regex {
trace!("extracted fast line regex: {:?}", re);
debug!("extracted fast line regex: {:?}", re);
}
let matcher = RegexMatcherImpl::new(&chir)?;