regex: fix inner literal extraction that resulted in false negatives

In some rare cases, it was possible for ripgrep's inner literal detector to extract a set of literals that could produce a false negative. #2884 gives an example: `(?i:e.x|ex)`. In this case, the set extracted can be discovered by running `rg '(?i:e.x|ex) --trace`: Seq[E("EX"), E("Ex"), E("eX"), E("ex")] This extraction leads to building a multi-substring matcher for `EX`, `Ex`, `eX` and `ex`. Searching the haystack `e-x` produces no match, and thus, ripgrep shows no matches. But the regex `(?i:e.x|ex)` matches `e-x`. The issue at play here was that when two extracted literal sequences were unioned, we were correctly unioning their "prefix" attribute. And this in turn leads to those literal sequences being combined incorrectly via cross product. This case in particular triggers it because two different optimizations combine to produce an incorrect result. Firslty, the regex has a common prefix extracted and is rewritten as `(?i:e(?:.x|x))`. Secondly, the `x` in the first branch of the alternation has its `prefix` attribute set to `false` (correctly), which means it can't be cross producted with another concatenation. But in this case, it is unioned with the `x` from the second branch, and this results in the union result having `prefix` set to `true`. This in turn pops up and lets it get cross producted with the `e` prefix, producing an incorrect literal sequence. We fix this by changing the implementation of `union` to return `prefix` set to `true` only when *both* literal sequences being unioned have `prefix` set to `true`. Doing this exposed a second bug that was present, but was purely cosmetic: the extracted literals in this case, after the fix, are `X` and `x`. They were considered "exact" (i.e., lead to a match), but of course they are not. Observing an `X` or an `x` does not mean there is a match. This was fixed by making `choose` always return an inexact literal sequence. This is perhaps too conservative in aggregate in some cases, but always correct. The idea here is that if one is choosing between two concatenations, then it is likely the case that the sequence returned should be considered inexact. The issue is that this can lead to avoiding cross products in some cases that would otherwise be correct. This is bad because it means extracting shorter literals in some cases. (In general, the longer the literal the better.) But we prioritize correctness for now and fix it. You can see a few tests where this shortens some extracted literals. Fixes #2884
2025-08-04 21:52:54 +02:00 · 2024-09-08 21:44:26 -04:00
parent 6c5108ed17
commit 9d738ad0c0
2 changed files with 23 additions and 28 deletions
--- a/crates/regex/src/literal.rs
+++ b/crates/regex/src/literal.rs
@ -430,6 +430,7 @@ impl Extractor {
        }
        seq1.union(seq2);
        assert!(seq1.len().map_or(true, |x| x <= self.limit_total));
+        seq1.prefix = seq1.prefix && seq2.prefix;
        seq1
    }

@ -586,10 +587,15 @@ impl TSeq {
        lits.iter().any(is_poisonous)
    }

-    /// Compare the two sequences and return the one that is believed to be best
-    /// according to a hodge podge of heuristics.
+    /// Compare the two sequences and return the one that is believed to be
+    /// best according to a hodge podge of heuristics.
    fn choose(self, other: TSeq) -> TSeq {
-        let (seq1, seq2) = (self, other);
+        let (mut seq1, mut seq2) = (self, other);
+        // Whichever one we pick, by virtue of picking one, we choose
+        // to not take the other. So we must consider the result inexact.
+        seq1.make_inexact();
+        seq2.make_inexact();
+
        if !seq1.is_finite() {
            return seq2;
        } else if !seq2.is_finite() {
@ -681,7 +687,7 @@ mod tests {
        assert_eq!(e(r"foo"), seq([E("foo")]));
        assert_eq!(e(r"[a-z]foo[a-z]"), seq([I("foo")]));
        assert_eq!(e(r"[a-z](foo)(bar)[a-z]"), seq([I("foobar")]));
-        assert_eq!(e(r"[a-z]([a-z]foo)(bar[a-z])[a-z]"), seq([I("foobar")]));
+        assert_eq!(e(r"[a-z]([a-z]foo)(bar[a-z])[a-z]"), seq([I("foo")]));
        assert_eq!(e(r"[a-z]([a-z]foo)([a-z]foo)[a-z]"), seq([I("foo")]));
        assert_eq!(e(r"(\d{1,3}\.){3}\d{1,3}"), seq([I(".")]));
        assert_eq!(e(r"[a-z]([a-z]foo){3}[a-z]"), seq([I("foo")]));
@ -689,7 +695,7 @@ mod tests {
        assert_eq!(e(r"[a-z]([a-z]foo[a-z]){3}[a-z]"), seq([I("foo")]));
        assert_eq!(
            e(r"[a-z]([a-z]foo){3}(bar[a-z]){3}[a-z]"),
-            seq([I("foobar")])
+            seq([I("foo")])
        );
    }

@ -935,14 +941,14 @@ mod tests {
        assert_eq!(Seq::infinite(), e(r"[A-Z]+"));
        assert_eq!(seq([I("1")]), e(r"1[A-Z]"));
        assert_eq!(seq([I("1")]), e(r"1[A-Z]2"));
-        assert_eq!(seq([E("123")]), e(r"[A-Z]+123"));
+        assert_eq!(seq([I("123")]), e(r"[A-Z]+123"));
        assert_eq!(seq([I("123")]), e(r"[A-Z]+123[A-Z]+"));
        assert_eq!(Seq::infinite(), e(r"1|[A-Z]|3"));
        assert_eq!(seq([E("1"), I("2"), E("3")]), e(r"1|2[A-Z]|3"),);
        assert_eq!(seq([E("1"), I("2"), E("3")]), e(r"1|[A-Z]2[A-Z]|3"),);
-        assert_eq!(seq([E("1"), E("2"), E("3")]), e(r"1|[A-Z]2|3"),);
+        assert_eq!(seq([E("1"), I("2"), E("3")]), e(r"1|[A-Z]2|3"),);
        assert_eq!(seq([E("1"), I("2"), E("4")]), e(r"1|2[A-Z]3|4"),);
-        assert_eq!(seq([E("2")]), e(r"(?:|1)[A-Z]2"));
+        assert_eq!(seq([I("2")]), e(r"(?:|1)[A-Z]2"));
        assert_eq!(inexact([I("a")]), e(r"a.z"));
    }

@ -1005,4 +1011,11 @@ mod tests {
        let s = e(r"foobarfoo|foo| |foofoo");
        assert_eq!(Seq::infinite(), s);
    }
+
+    // Regression test for: https://github.com/BurntSushi/ripgrep/issues/2884
+    #[test]
+    fn case_insensitive_alternation() {
+        let s = e(r"(?i:e.x|ex)");
+        assert_eq!(s, seq([I("X"), I("x")]));
+    }
 }
--- a/tests/util.rs
+++ b/tests/util.rs
@ -284,16 +284,7 @@ impl TestCommand {
    /// Runs and captures the stdout of the given command.
    pub fn stdout(&mut self) -> String {
        let o = self.output();
-        let stdout = String::from_utf8_lossy(&o.stdout);
-        match stdout.parse() {
-            Ok(t) => t,
-            Err(err) => {
-                panic!(
-                    "could not convert from string: {:?}\n\n{}",
-                    err, stdout
-                );
-            }
-        }
+        String::from_utf8_lossy(&o.stdout).into_owned()
    }

    /// Pipe `input` to a command, and collect the output.
@ -313,16 +304,7 @@ impl TestCommand {
        let output = self.expect_success(child.wait_with_output().unwrap());
        worker.join().unwrap().unwrap();

-        let stdout = String::from_utf8_lossy(&output.stdout);
-        match stdout.parse() {
-            Ok(t) => t,
-            Err(err) => {
-                panic!(
-                    "could not convert from string: {:?}\n\n{}",
-                    err, stdout
-                );
-            }
-        }
+        String::from_utf8_lossy(&output.stdout).into_owned()
    }

    /// Gets the output of a command. If the command failed, then this panics.