grep-regex: fix inner literal extraction bug

This appears to be another transcription bug, introduced when this code was copied from the prefix literal detection inside the regex crate. Namely, when it comes to inner literals, we want to treat counted repetition as only two separate cases: the case when the minimum match is 0 and the case when the minimum match is more than 0. In the former case, we treat `e{0,n}` as `e*` and in the latter we treat `e{m,n}` where `m >= 1` as just `e`. We could definitely do better here. For example, this means regexes like `(foo){10}` will only have `foo` extracted as a literal, whereas searching for the full literal would likely be faster. The actual bug here was that we were not implementing this logic correctly. Namely, we weren't always "cutting" the literals in the second case to prevent them from being expanded. In particular, for an exact repetition such as `[ATCG]{2}` (where the minimum equals the maximum and is within the literal size limit), no cut was performed, so literals extracted from a single repetition were incorrectly extended with whatever followed the repetition. Fixes #1319, Closes #1367
2025-11-23 21:54:45 +02:00 · 2019-09-05 13:39:08 +00:00
parent f8e70294d5
commit b435eaafc8
3 changed files with 21 additions and 10 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,6 +30,8 @@ Bug fixes:

 * [BUG #1291](https://github.com/BurntSushi/ripgrep/issues/1291):
  ripgrep now works in non-existent directories.
+* [BUG #1319](https://github.com/BurntSushi/ripgrep/issues/1319):
+  Fix match bug due to errant literal detection.
 * [**BUG #1335**](https://github.com/BurntSushi/ripgrep/issues/1335):
  Fixes a performance bug when searching plain text files with very long lines.
  This was a serious performance regression in some cases.
--- a/grep-regex/src/literal.rs
+++ b/grep-regex/src/literal.rs
@@ -5,8 +5,6 @@ the regex engine doesn't look for inner literals. Since we're doing line based
 searching, we can use them, so we need to do it ourselves.
 */

-use std::cmp;
-
 use regex_syntax::hir::{self, Hir, HirKind};
 use regex_syntax::hir::literal::{Literal, Literals};

@@ -248,7 +246,7 @@ fn union_required(expr: &Hir, lits: &mut Literals) {
 fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
    e: &Hir,
    min: u32,
-    max: Option<u32>,
+    _max: Option<u32>,
    _greedy: bool,
    lits: &mut Literals,
    mut f: F,
@@ -259,19 +257,13 @@ fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
        // just treat it as `e*`.
        lits.cut();
    } else {
-        let n = cmp::min(lits.limit_size(), min as usize);
        // We only extract literals from a single repetition, even though
        // we could do more. e.g., `a{3}` will have `a` extracted instead of
        // `aaa`. The reason is that inner literal extraction can't be unioned
        // across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}`
        // is wrong.
        f(e, lits);
-        if n < min as usize {
-            lits.cut();
-        }
-        if max.map_or(true, |max| min < max) {
-            lits.cut();
-        }
+        lits.cut();
    }
 }

@@ -383,4 +375,13 @@ mod tests {
        // assert_eq!(one_regex(r"a.*c"), pat("a"));
        assert_eq!(one_regex(r"a(.*c)"), pat("a"));
    }
+
+    #[test]
+    fn regression_1319() {
+        // Regression from:
+        // https://github.com/BurntSushi/ripgrep/issues/1319
+        assert_eq!(one_regex(r"TTGAGTCCAGGAG[ATCG]{2}C"),
+            pat("TTGAGTCCAGGAGA|TTGAGTCCAGGAGC|\
+                 TTGAGTCCAGGAGG|TTGAGTCCAGGAGT"));
+    }
 }
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -729,6 +729,14 @@ rgtest!(r1259_drop_last_byte_nonl, |dir: Dir, mut cmd: TestCommand| {
    eqnice!("fz\n", cmd.arg("-f").arg("patterns-nl").arg("test").stdout());
 });

+// See: https://github.com/BurntSushi/ripgrep/issues/1319
+rgtest!(r1319, |dir: Dir, mut cmd: TestCommand| {
+    dir.create("input", "CCAGCTACTCGGGAGGCTGAGGCTGGAGGATCGCTTGAGTCCAGGAGTTC");
+    eqnice!(
+        "input:CCAGCTACTCGGGAGGCTGAGGCTGGAGGATCGCTTGAGTCCAGGAGTTC\n",
+        cmd.arg("TTGAGTCCAGGAG[ATCG]{2}C").stdout());
+});
+
 // See: https://github.com/BurntSushi/ripgrep/issues/1334
 rgtest!(r1334_crazy_literals, |dir: Dir, mut cmd: TestCommand| {
    dir.create("patterns", &"1.208.0.0/12\n".repeat(40));