mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-04-14 00:58:43 +02:00
It turns out our fast path for -w/--word-regexp wasn't quite correct in some cases. Namely, we use `(?m:^|\W)(<original-regex>)(?m:\W|$)` as the implementation of -w/--word-regexp since `\b(<original-regex>)\b` has some unintuitive results in certain cases, specifically when <original-regex> matches non-word characters at match boundaries. The problem is that using this formulation means that you need to extract the capture group around <original-regex> to find the "real" match, since the surrounding (^|\W) and (\W|$) aren't part of the match. This is fine, but the capture group engine is usually slow, so we have a fast path where we try to deduce the correct match boundary after an initial match (before running capture groups). The problem is that doing this is rather tricky because it's hard to know, in general, whether the `^` or the `\W` matched. This still doesn't seem quite right overall, but we at least fix one more case. Fixes #2574
342 lines
13 KiB
Rust
342 lines
13 KiB
Rust
use std::{
|
|
collections::HashMap,
|
|
panic::{RefUnwindSafe, UnwindSafe},
|
|
sync::Arc,
|
|
};
|
|
|
|
use {
|
|
grep_matcher::{Match, Matcher, NoError},
|
|
regex_automata::{
|
|
meta::Regex, util::captures::Captures, util::pool::Pool, Input,
|
|
PatternID,
|
|
},
|
|
};
|
|
|
|
use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};
|
|
|
|
type PoolFn =
|
|
Box<dyn Fn() -> Captures + Send + Sync + UnwindSafe + RefUnwindSafe>;
|
|
|
|
/// A matcher for implementing "word match" semantics.
|
|
#[derive(Debug)]
|
|
pub(crate) struct WordMatcher {
|
|
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
|
|
regex: Regex,
|
|
/// The HIR that produced the regex above. We don't keep the HIR for the
|
|
/// `original` regex.
|
|
///
|
|
/// We put this in an `Arc` because by the time it gets here, it won't
|
|
/// change. And because cloning and dropping an `Hir` is somewhat expensive
|
|
/// due to its deep recursive representation.
|
|
chir: Arc<ConfiguredHIR>,
|
|
/// The original regex supplied by the user, which we use in a fast path
|
|
/// to try and detect matches before deferring to slower engines.
|
|
original: Regex,
|
|
/// A map from capture group name to capture group index.
|
|
names: HashMap<String, usize>,
|
|
/// A thread-safe pool of reusable buffers for finding the match offset of
|
|
/// the inner group.
|
|
caps: Arc<Pool<Captures, PoolFn>>,
|
|
}
|
|
|
|
impl Clone for WordMatcher {
|
|
fn clone(&self) -> WordMatcher {
|
|
// We implement Clone manually so that we get a fresh Pool such that it
|
|
// can set its own thread owner. This permits each thread usings `caps`
|
|
// to hit the fast path.
|
|
//
|
|
// Note that cloning a regex is "cheap" since it uses reference
|
|
// counting internally.
|
|
let re = self.regex.clone();
|
|
WordMatcher {
|
|
regex: self.regex.clone(),
|
|
chir: Arc::clone(&self.chir),
|
|
original: self.original.clone(),
|
|
names: self.names.clone(),
|
|
caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl WordMatcher {
|
|
/// Create a new matcher from the given pattern that only produces matches
|
|
/// that are considered "words."
|
|
///
|
|
/// The given options are used to construct the regular expression
|
|
/// internally.
|
|
pub(crate) fn new(chir: ConfiguredHIR) -> Result<WordMatcher, Error> {
|
|
let original = chir.clone().into_anchored().to_regex()?;
|
|
let chir = Arc::new(chir.into_word()?);
|
|
let regex = chir.to_regex()?;
|
|
let caps = Arc::new(Pool::new({
|
|
let regex = regex.clone();
|
|
Box::new(move || regex.create_captures()) as PoolFn
|
|
}));
|
|
|
|
let mut names = HashMap::new();
|
|
let it = regex.group_info().pattern_names(PatternID::ZERO);
|
|
for (i, optional_name) in it.enumerate() {
|
|
if let Some(name) = optional_name {
|
|
names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
|
}
|
|
}
|
|
Ok(WordMatcher { regex, chir, original, names, caps })
|
|
}
|
|
|
|
/// Return the underlying regex used to match at word boundaries.
|
|
///
|
|
/// The original regex is in the capture group at index 1.
|
|
pub(crate) fn regex(&self) -> &Regex {
|
|
&self.regex
|
|
}
|
|
|
|
/// Return the underlying HIR for the regex used to match at word
|
|
/// boundaries.
|
|
pub(crate) fn chir(&self) -> &ConfiguredHIR {
|
|
&self.chir
|
|
}
|
|
|
|
/// Attempt to do a fast confirmation of a word match that covers a subset
|
|
/// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned
|
|
/// when a match is found. Ok(None) is returned when there is definitively
|
|
/// no match. Err(()) is returned when this routine could not detect
|
|
/// whether there was a match or not.
|
|
fn fast_find(
|
|
&self,
|
|
haystack: &[u8],
|
|
at: usize,
|
|
) -> Result<Option<Match>, ()> {
|
|
// This is a bit hairy. The whole point here is to avoid running a
|
|
// slower regex engine to extract capture groups. Remember, our word
|
|
// regex looks like this:
|
|
//
|
|
// (^|\W)(<original regex>)(\W|$)
|
|
//
|
|
// What we want are the match offsets of <original regex>. So in the
|
|
// easy/common case, the original regex will be sandwiched between
|
|
// two codepoints that are in the \W class. So our approach here is to
|
|
// look for a match of the overall word regexp, strip the \W ends and
|
|
// then check whether the original regex matches what's left. If so,
|
|
// then we are guaranteed a correct match.
|
|
//
|
|
// This only works though if we know that the match is sandwiched
|
|
// between two \W codepoints. This only occurs when neither ^ nor $
|
|
// match. This in turn only occurs when the match is at either the
|
|
// beginning or end of the haystack. In either of those cases, we
|
|
// declare defeat and defer to the slower implementation.
|
|
//
|
|
// The reason why we cannot handle the ^/$ cases here is because we
|
|
// can't assume anything about the original pattern. (Try commenting
|
|
// out the checks for ^/$ below and run the tests to see examples.)
|
|
//
|
|
// NOTE(2023-07-31): After fixing #2574, this logic honestly still
|
|
// doesn't seem correct. Regex composition is hard.
|
|
let input = Input::new(haystack).span(at..haystack.len());
|
|
let mut cand = match self.regex.find(input) {
|
|
None => return Ok(None),
|
|
Some(m) => Match::new(m.start(), m.end()),
|
|
};
|
|
if cand.start() == 0 || cand.end() == haystack.len() {
|
|
return Err(());
|
|
}
|
|
// We decode the chars on either side of the match. If either char is
|
|
// a word character, then that means the ^/$ matched and not \W. In
|
|
// that case, we defer to the slower engine.
|
|
let (ch, slen) = bstr::decode_utf8(&haystack[cand]);
|
|
if ch.map_or(true, regex_syntax::is_word_character) {
|
|
return Err(());
|
|
}
|
|
let (ch, elen) = bstr::decode_last_utf8(&haystack[cand]);
|
|
if ch.map_or(true, regex_syntax::is_word_character) {
|
|
return Err(());
|
|
}
|
|
let new_start = cand.start() + slen;
|
|
let new_end = cand.end() - elen;
|
|
// This occurs the original regex can match the empty string. In this
|
|
// case, just bail instead of trying to get it right here since it's
|
|
// likely a pathological case.
|
|
if new_start > new_end {
|
|
return Err(());
|
|
}
|
|
cand = cand.with_start(new_start).with_end(new_end);
|
|
if self.original.is_match(&haystack[cand]) {
|
|
Ok(Some(cand))
|
|
} else {
|
|
Err(())
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Matcher for WordMatcher {
|
|
type Captures = RegexCaptures;
|
|
type Error = NoError;
|
|
|
|
fn find_at(
|
|
&self,
|
|
haystack: &[u8],
|
|
at: usize,
|
|
) -> Result<Option<Match>, NoError> {
|
|
// To make this easy to get right, we extract captures here instead of
|
|
// calling `find_at`. The actual match is at capture group `1` instead
|
|
// of `0`. We *could* use `find_at` here and then trim the match after
|
|
// the fact, but that's a bit harder to get right, and it's not clear
|
|
// if it's worth it.
|
|
//
|
|
// OK, well, it turns out that it is worth it! But it is quite tricky.
|
|
// See `fast_find` for details. Effectively, this lets us skip running
|
|
// a slower regex engine to extract capture groups in the vast majority
|
|
// of cases. However, the slower engine is I believe required for full
|
|
// correctness.
|
|
match self.fast_find(haystack, at) {
|
|
Ok(Some(m)) => return Ok(Some(m)),
|
|
Ok(None) => return Ok(None),
|
|
Err(()) => {}
|
|
}
|
|
|
|
let input = Input::new(haystack).span(at..haystack.len());
|
|
let mut caps = self.caps.get();
|
|
self.regex.search_captures(&input, &mut caps);
|
|
Ok(caps.get_group(1).map(|sp| Match::new(sp.start, sp.end)))
|
|
}
|
|
|
|
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
|
Ok(RegexCaptures::with_offset(self.regex.create_captures(), 1))
|
|
}
|
|
|
|
fn capture_count(&self) -> usize {
|
|
self.regex.captures_len().checked_sub(1).unwrap()
|
|
}
|
|
|
|
fn capture_index(&self, name: &str) -> Option<usize> {
|
|
self.names.get(name).map(|i| *i)
|
|
}
|
|
|
|
fn captures_at(
|
|
&self,
|
|
haystack: &[u8],
|
|
at: usize,
|
|
caps: &mut RegexCaptures,
|
|
) -> Result<bool, NoError> {
|
|
let input = Input::new(haystack).span(at..haystack.len());
|
|
let caps = caps.captures_mut();
|
|
self.regex.search_captures(&input, caps);
|
|
Ok(caps.is_match())
|
|
}
|
|
|
|
// We specifically do not implement other methods like find_iter or
|
|
// captures_iter. Namely, the iter methods are guaranteed to be correct
|
|
// by virtue of implementing find_at and captures_at above.
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::WordMatcher;
    use crate::config::Config;
    use grep_matcher::{Captures, Match, Matcher};

    /// Match offsets as a `(start, end)` pair, or `None` for no match.
    type Offsets = Option<(usize, usize)>;

    /// A single test case: expected offsets, pattern and haystack.
    type Case = (Offsets, &'static str, &'static str);

    /// Build a word matcher for `pattern` using the default configuration.
    fn matcher(pattern: &str) -> WordMatcher {
        let chir = Config::default().build_many(&[pattern]).unwrap();
        WordMatcher::new(chir).unwrap()
    }

    /// Search `haystack` with the `find` API and return the match offsets.
    fn find(pattern: &str, haystack: &str) -> Offsets {
        let found = matcher(pattern).find(haystack.as_bytes()).unwrap();
        found.map(|m| (m.start(), m.end()))
    }

    /// Search `haystack` with the captures API and return the offsets of
    /// group 0.
    fn find_by_caps(pattern: &str, haystack: &str) -> Offsets {
        let m = matcher(pattern);
        let mut caps = m.new_captures().unwrap();
        let matched = m.captures(haystack.as_bytes(), &mut caps).unwrap();
        if !matched {
            return None;
        }
        caps.get(0).map(|m| (m.start(), m.end()))
    }

    /// Run `search` over every case and compare against the expectation.
    fn check_all(search: fn(&str, &str) -> Offsets, cases: &[Case]) {
        for &(expected, pattern, haystack) in cases {
            assert_eq!(
                expected,
                search(pattern, haystack),
                "pattern: {:?}, haystack: {:?}",
                pattern,
                haystack,
            );
        }
    }

    // Test that the standard `find` API reports offsets correctly.
    #[test]
    fn various_find() {
        check_all(
            find,
            &[
                (Some((0, 3)), r"foo", "foo"),
                (Some((0, 3)), r"foo", "foo("),
                (Some((1, 4)), r"foo", "!foo("),
                (None, r"foo", "!afoo("),
                (Some((0, 3)), r"foo", "foo☃"),
                (None, r"foo", "fooб"),
                (Some((0, 4)), r"foo5", "foo5"),
                (None, r"foo", "foo5"),
                (Some((1, 4)), r"foo", "!foo!"),
                (Some((1, 5)), r"foo!", "!foo!"),
                (Some((0, 5)), r"!foo!", "!foo!"),
                (Some((0, 3)), r"foo", "foo\n"),
                (Some((1, 4)), r"foo", "!foo!\n"),
                (Some((1, 5)), r"foo!", "!foo!\n"),
                (Some((0, 5)), r"!foo!", "!foo!\n"),
                (Some((1, 6)), r"!?foo!?", "!!foo!!"),
                (Some((0, 5)), r"!?foo!?", "!foo!"),
                (Some((2, 5)), r"!?foo!?", "a!foo!a"),
                (Some((2, 7)), r"!?foo!?", "##!foo!\n"),
                (Some((3, 8)), r"!?foo!?", "##\n!foo!##"),
                (Some((3, 8)), r"!?foo!?", "##\n!foo!\n##"),
                (Some((3, 7)), r"f?oo!?", "##\nfoo!##"),
                (Some((2, 5)), r"(?-u)foo[^a]*", "#!foo☃aaa"),
            ],
        );
    }

    // See: https://github.com/BurntSushi/ripgrep/issues/389
    #[test]
    fn regression_dash() {
        assert_eq!(Some((0, 2)), find(r"-2", "-2"));
    }

    // Test that the captures API reports the same offsets that find does.
    // Captures are handled by a different code path, so this is checked
    // separately.
    #[test]
    fn various_captures() {
        check_all(
            find_by_caps,
            &[
                (Some((0, 3)), r"foo", "foo"),
                (Some((0, 3)), r"foo", "foo("),
                (Some((1, 4)), r"foo", "!foo("),
                (None, r"foo", "!afoo("),
                (Some((0, 3)), r"foo", "foo☃"),
                (None, r"foo", "fooб"),
                // See: https://github.com/BurntSushi/ripgrep/issues/389
                (Some((0, 2)), r"-2", "-2"),
            ],
        );
    }

    // Test that the capture reporting methods work as advertised.
    #[test]
    fn capture_indexing() {
        let m = matcher(r"(a)(?P<foo>b)(c)");
        assert_eq!(4, m.capture_count());
        assert_eq!(Some(2), m.capture_index("foo"));

        let mut caps = m.new_captures().unwrap();
        assert_eq!(4, caps.len());

        assert!(m.captures(b"abc", &mut caps).unwrap());
        let bare = [
            Some(Match::new(0, 3)),
            Some(Match::new(0, 1)),
            Some(Match::new(1, 2)),
            Some(Match::new(2, 3)),
            None,
        ];
        for (group, want) in bare.iter().enumerate() {
            assert_eq!(caps.get(group), *want);
        }

        assert!(m.captures(b"#abc#", &mut caps).unwrap());
        let wrapped = [
            Some(Match::new(1, 4)),
            Some(Match::new(1, 2)),
            Some(Match::new(2, 3)),
            Some(Match::new(3, 4)),
            None,
        ];
        for (group, want) in wrapped.iter().enumerate() {
            assert_eq!(caps.get(group), *want);
        }
    }
}
|