1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-06-14 22:15:13 +02:00

grep-regex: fix inner literal detection

It seems the inner literal detector fails spectacularly in cases of
concatenations that involve groups. The issue here is that if the prefix
of a group inside a concatenation can match the empty string, then any
literals generated to that point in the concatenation need to be cut
such that they are never extended. The detector isn't really built to
handle this case, so we just act conservative cut literals whenever we
see a sub-group. This may make some regexes slower, but the inner
literal detector already misses plenty of cases.

Literal detection (including in the regex engine) is a key component
that needs to be completely rethought at some point.

Fixes #1064
This commit is contained in:
Andrew Gallant
2018-09-24 19:24:33 -04:00
parent f72c2dfd90
commit ba503eb677
2 changed files with 30 additions and 2 deletions

View File

@ -166,10 +166,10 @@ fn union_required(expr: &Hir, lits: &mut Literals) {
lits.cut();
continue;
}
if lits2.contains_empty() {
if lits2.contains_empty() || !is_simple(&e) {
lits.cut();
}
if !lits.cross_product(&lits2) {
if !lits.cross_product(&lits2) || !lits2.any_complete() {
// If this expression couldn't yield any literal that
// could be extended, then we need to quit. Since we're
// short-circuiting, we also need to freeze every member.
@ -250,6 +250,20 @@ fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
}
}
fn is_simple(expr: &Hir) -> bool {
match *expr.kind() {
HirKind::Empty
| HirKind::Literal(_)
| HirKind::Class(_)
| HirKind::Repetition(_)
| HirKind::Concat(_)
| HirKind::Alternation(_) => true,
HirKind::Anchor(_)
| HirKind::WordBoundary(_)
| HirKind::Group(_) => false,
}
}
/// Return the number of characters in the given class.
fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
@ -301,4 +315,12 @@ mod tests {
// assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz"));
// assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz"));
}
#[test]
fn regression_1064() {
// Regression from:
// https://github.com/BurntSushi/ripgrep/issues/1064
// assert_eq!(one_regex(r"a.*c"), pat("a"));
assert_eq!(one_regex(r"a(.*c)"), pat("a"));
}
}