diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1b435273..c637aeae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,8 @@ Bug fixes:
   Fix bug when using inline regex flags with `-e/--regexp`.
 * [BUG #2523](https://github.com/BurntSushi/ripgrep/issues/2523):
   Make executable searching take `.com` into account on Windows.
+* [BUG #2574](https://github.com/BurntSushi/ripgrep/issues/2574):
+  Fix bug in `-w/--word-regexp` that would result in incorrect match offsets.
 
 
 13.0.0 (2021-06-12)
diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs
index af4480ab..52fb61ce 100644
--- a/crates/regex/src/word.rs
+++ b/crates/regex/src/word.rs
@@ -128,6 +128,9 @@ impl WordMatcher {
         // The reason why we cannot handle the ^/$ cases here is because we
         // can't assume anything about the original pattern. (Try commenting
         // out the checks for ^/$ below and run the tests to see examples.)
+        //
+        // NOTE(2023-07-31): After fixing #2574, this logic honestly still
+        // doesn't seem correct. Regex composition is hard.
         let input = Input::new(haystack).span(at..haystack.len());
         let mut cand = match self.regex.find(input) {
             None => return Ok(None),
@@ -136,8 +139,17 @@ impl WordMatcher {
         if cand.start() == 0 || cand.end() == haystack.len() {
             return Err(());
         }
-        let (_, slen) = bstr::decode_utf8(&haystack[cand]);
-        let (_, elen) = bstr::decode_last_utf8(&haystack[cand]);
+        // We decode the chars on either side of the match. If either char is
+        // a word character, then that means the ^/$ matched and not \W. In
+        // that case, we defer to the slower engine.
+        let (ch, slen) = bstr::decode_utf8(&haystack[cand]);
+        if ch.map_or(true, regex_syntax::is_word_character) {
+            return Err(());
+        }
+        let (ch, elen) = bstr::decode_last_utf8(&haystack[cand]);
+        if ch.map_or(true, regex_syntax::is_word_character) {
+            return Err(());
+        }
         let new_start = cand.start() + slen;
         let new_end = cand.end() - elen;
         // This occurs the original regex can match the empty string. In this
diff --git a/tests/regression.rs b/tests/regression.rs
index b9076803..5ef741cf 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -1173,3 +1173,18 @@ rgtest!(r2480, |dir: Dir, mut cmd: TestCommand| {
     cmd.args(&["--only-matching", "-e", "(?i)notfoo", "-e", "bar", "file"]);
     cmd.assert_err();
 });
+
+// See: https://github.com/BurntSushi/ripgrep/issues/2574
+rgtest!(r2574, |dir: Dir, mut cmd: TestCommand| {
+    dir.create("haystack", "some.domain.com\nsome.domain.com/x\n");
+    let got = cmd
+        .args(&[
+            "--no-filename",
+            "--no-unicode",
+            "-w",
+            "-o",
+            r"(\w+\.)*domain\.(\w+)",
+        ])
+        .stdout();
+    eqnice!("some.domain.com\nsome.domain.com\n", got);
+});