1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-06-25 14:22:54 +02:00
Files
ripgrep/crates/regex/src/word.rs
Andrew Gallant bc76a30c23 regex: fix -w when regex can match empty string
This is a weird bug where our optimization for handling -w more quickly
than we would otherwise failed. In particular, if the original regex can
match the empty string, then our word boundary detection would produce
invalid indices to the start the next search at. We "fix" it by simply
bailing when the indices are known to be incorrect.

This wasn't a problem in a previous release since ripgrep 13 tweaked how
word boundaries are detected in commit efd9cfb2.

Fixes #1891
2021-06-12 14:18:53 -04:00

302 lines
11 KiB
Rust

use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::Arc;
use grep_matcher::{Match, Matcher, NoError};
use regex::bytes::{CaptureLocations, Regex};
use thread_local::ThreadLocal;
use crate::config::ConfiguredHIR;
use crate::error::Error;
use crate::matcher::RegexCaptures;
/// A matcher for implementing "word match" semantics.
#[derive(Debug)]
pub struct WordMatcher {
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
regex: Regex,
/// The original regex supplied by the user, which we use in a fast path
/// to try and detect matches before deferring to slower engines.
original: Regex,
/// A map from capture group name to capture group index.
names: HashMap<String, usize>,
/// A reusable buffer for finding the match location of the inner group.
locs: Arc<ThreadLocal<RefCell<CaptureLocations>>>,
}
impl Clone for WordMatcher {
fn clone(&self) -> WordMatcher {
// We implement Clone manually so that we get a fresh ThreadLocal such
// that it can set its own thread owner. This permits each thread
// usings `locs` to hit the fast path.
WordMatcher {
regex: self.regex.clone(),
original: self.original.clone(),
names: self.names.clone(),
locs: Arc::new(ThreadLocal::new()),
}
}
}
impl WordMatcher {
/// Create a new matcher from the given pattern that only produces matches
/// that are considered "words."
///
/// The given options are used to construct the regular expression
/// internally.
pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
let original =
expr.with_pattern(|pat| format!("^(?:{})$", pat))?.regex()?;
let word_expr = expr.with_pattern(|pat| {
let pat = format!(r"(?:(?m:^)|\W)({})(?:\W|(?m:$))", pat);
log::debug!("word regex: {:?}", pat);
pat
})?;
let regex = word_expr.regex()?;
let locs = Arc::new(ThreadLocal::new());
let mut names = HashMap::new();
for (i, optional_name) in regex.capture_names().enumerate() {
if let Some(name) = optional_name {
names.insert(name.to_string(), i.checked_sub(1).unwrap());
}
}
Ok(WordMatcher { regex, original, names, locs })
}
/// Return the underlying regex used by this matcher.
pub fn regex(&self) -> &Regex {
&self.regex
}
/// Attempt to do a fast confirmation of a word match that covers a subset
/// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned
/// when a match is found. Ok(None) is returned when there is definitively
/// no match. Err(()) is returned when this routine could not detect
/// whether there was a match or not.
fn fast_find(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<Match>, ()> {
// This is a bit hairy. The whole point here is to avoid running an
// NFA simulation in the regex engine. Remember, our word regex looks
// like this:
//
// (^|\W)(<original regex>)($|\W)
// where ^ and $ have multiline mode DISABLED
//
// What we want are the match offsets of <original regex>. So in the
// easy/common case, the original regex will be sandwiched between
// two codepoints that are in the \W class. So our approach here is to
// look for a match of the overall word regexp, strip the \W ends and
// then check whether the original regex matches what's left. If so,
// then we are guaranteed a correct match.
//
// This only works though if we know that the match is sandwiched
// between two \W codepoints. This only occurs when neither ^ nor $
// match. This in turn only occurs when the match is at either the
// beginning or end of the haystack. In either of those cases, we
// declare defeat and defer to the slower implementation.
//
// The reason why we cannot handle the ^/$ cases here is because we
// can't assume anything about the original pattern. (Try commenting
// out the checks for ^/$ below and run the tests to see examples.)
let mut cand = match self.regex.find_at(haystack, at) {
None => return Ok(None),
Some(m) => Match::new(m.start(), m.end()),
};
if cand.start() == 0 || cand.end() == haystack.len() {
return Err(());
}
let (_, slen) = bstr::decode_utf8(&haystack[cand]);
let (_, elen) = bstr::decode_last_utf8(&haystack[cand]);
let new_start = cand.start() + slen;
let new_end = cand.end() - elen;
// This occurs the original regex can match the empty string. In this
// case, just bail instead of trying to get it right here since it's
// likely a pathological case.
if new_start > new_end {
return Err(());
}
cand = cand.with_start(new_start).with_end(new_end);
if self.original.is_match(&haystack[cand]) {
Ok(Some(cand))
} else {
Err(())
}
}
}
impl Matcher for WordMatcher {
type Captures = RegexCaptures;
type Error = NoError;
fn find_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<Match>, NoError> {
// To make this easy to get right, we extract captures here instead of
// calling `find_at`. The actual match is at capture group `1` instead
// of `0`. We *could* use `find_at` here and then trim the match after
// the fact, but that's a bit harder to get right, and it's not clear
// if it's worth it.
//
// OK, well, it turns out that it is worth it! But it is quite tricky.
// See `fast_find` for details. Effectively, this lets us skip running
// the NFA simulation in the regex engine in the vast majority of
// cases. However, the NFA simulation is required for full correctness.
match self.fast_find(haystack, at) {
Ok(Some(m)) => return Ok(Some(m)),
Ok(None) => return Ok(None),
Err(()) => {}
}
let cell =
self.locs.get_or(|| RefCell::new(self.regex.capture_locations()));
let mut caps = cell.borrow_mut();
self.regex.captures_read_at(&mut caps, haystack, at);
Ok(caps.get(1).map(|m| Match::new(m.0, m.1)))
}
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1))
}
fn capture_count(&self) -> usize {
self.regex.captures_len().checked_sub(1).unwrap()
}
fn capture_index(&self, name: &str) -> Option<usize> {
self.names.get(name).map(|i| *i)
}
fn captures_at(
&self,
haystack: &[u8],
at: usize,
caps: &mut RegexCaptures,
) -> Result<bool, NoError> {
let r =
self.regex.captures_read_at(caps.locations_mut(), haystack, at);
Ok(r.is_some())
}
// We specifically do not implement other methods like find_iter or
// captures_iter. Namely, the iter methods are guaranteed to be correct
// by virtue of implementing find_at and captures_at above.
}
#[cfg(test)]
mod tests {
use super::WordMatcher;
use crate::config::Config;
use grep_matcher::{Captures, Match, Matcher};
fn matcher(pattern: &str) -> WordMatcher {
let chir = Config::default().hir(pattern).unwrap();
WordMatcher::new(&chir).unwrap()
}
fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
matcher(pattern)
.find(haystack.as_bytes())
.unwrap()
.map(|m| (m.start(), m.end()))
}
fn find_by_caps(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
let m = matcher(pattern);
let mut caps = m.new_captures().unwrap();
if !m.captures(haystack.as_bytes(), &mut caps).unwrap() {
None
} else {
caps.get(0).map(|m| (m.start(), m.end()))
}
}
// Test that the standard `find` API reports offsets correctly.
#[test]
fn various_find() {
assert_eq!(Some((0, 3)), find(r"foo", "foo"));
assert_eq!(Some((0, 3)), find(r"foo", "foo("));
assert_eq!(Some((1, 4)), find(r"foo", "!foo("));
assert_eq!(None, find(r"foo", "!afoo("));
assert_eq!(Some((0, 3)), find(r"foo", "foo☃"));
assert_eq!(None, find(r"foo", "fooб"));
assert_eq!(Some((0, 4)), find(r"foo5", "foo5"));
assert_eq!(None, find(r"foo", "foo5"));
assert_eq!(Some((1, 4)), find(r"foo", "!foo!"));
assert_eq!(Some((1, 5)), find(r"foo!", "!foo!"));
assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!"));
assert_eq!(Some((0, 3)), find(r"foo", "foo\n"));
assert_eq!(Some((1, 4)), find(r"foo", "!foo!\n"));
assert_eq!(Some((1, 5)), find(r"foo!", "!foo!\n"));
assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!\n"));
assert_eq!(Some((1, 6)), find(r"!?foo!?", "!!foo!!"));
assert_eq!(Some((0, 5)), find(r"!?foo!?", "!foo!"));
assert_eq!(Some((2, 5)), find(r"!?foo!?", "a!foo!a"));
assert_eq!(Some((2, 7)), find(r"!?foo!?", "##!foo!\n"));
assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!##"));
assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!\n##"));
assert_eq!(Some((3, 7)), find(r"f?oo!?", "##\nfoo!##"));
assert_eq!(Some((2, 5)), find(r"(?-u)foo[^a]*", "#!foo☃aaa"));
}
// See: https://github.com/BurntSushi/ripgrep/issues/389
#[test]
fn regression_dash() {
assert_eq!(Some((0, 2)), find(r"-2", "-2"));
}
// Test that the captures API also reports offsets correctly, just as
// find does. This exercises a different path in the code since captures
// are handled differently.
#[test]
fn various_captures() {
assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo"));
assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo("));
assert_eq!(Some((1, 4)), find_by_caps(r"foo", "!foo("));
assert_eq!(None, find_by_caps(r"foo", "!afoo("));
assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo☃"));
assert_eq!(None, find_by_caps(r"foo", "fooб"));
// assert_eq!(Some((0, 3)), find_by_caps(r"foo", "fooб"));
// See: https://github.com/BurntSushi/ripgrep/issues/389
assert_eq!(Some((0, 2)), find_by_caps(r"-2", "-2"));
}
// Test that the capture reporting methods work as advertised.
#[test]
fn capture_indexing() {
let m = matcher(r"(a)(?P<foo>b)(c)");
assert_eq!(4, m.capture_count());
assert_eq!(Some(2), m.capture_index("foo"));
let mut caps = m.new_captures().unwrap();
assert_eq!(4, caps.len());
assert!(m.captures(b"abc", &mut caps).unwrap());
assert_eq!(caps.get(0), Some(Match::new(0, 3)));
assert_eq!(caps.get(1), Some(Match::new(0, 1)));
assert_eq!(caps.get(2), Some(Match::new(1, 2)));
assert_eq!(caps.get(3), Some(Match::new(2, 3)));
assert_eq!(caps.get(4), None);
assert!(m.captures(b"#abc#", &mut caps).unwrap());
assert_eq!(caps.get(0), Some(Match::new(1, 4)));
assert_eq!(caps.get(1), Some(Match::new(1, 2)));
assert_eq!(caps.get(2), Some(Match::new(2, 3)));
assert_eq!(caps.get(3), Some(Match::new(3, 4)));
assert_eq!(caps.get(4), None);
}
}