2018-04-29 09:29:52 -04:00
|
|
|
use std::cell::RefCell;
|
2020-02-17 18:08:47 -05:00
|
|
|
use std::collections::HashMap;
|
2018-04-29 09:29:52 -04:00
|
|
|
use std::sync::Arc;
|
|
|
|
|
|
|
|
use grep_matcher::{Match, Matcher, NoError};
|
|
|
|
use regex::bytes::{CaptureLocations, Regex};
|
|
|
|
use thread_local::CachedThreadLocal;
|
|
|
|
|
|
|
|
use config::ConfiguredHIR;
|
|
|
|
use error::Error;
|
|
|
|
use matcher::RegexCaptures;
|
|
|
|
|
|
|
|
/// A matcher for implementing "word match" semantics.
|
|
|
|
#[derive(Debug)]
|
|
|
|
pub struct WordMatcher {
|
|
|
|
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
|
|
|
|
regex: Regex,
|
2020-02-16 10:43:26 -05:00
|
|
|
/// The original regex supplied by the user, which we use in a fast path
|
|
|
|
/// to try and detect matches before deferring to slower engines.
|
|
|
|
original: Regex,
|
2018-04-29 09:29:52 -04:00
|
|
|
/// A map from capture group name to capture group index.
|
|
|
|
names: HashMap<String, usize>,
|
|
|
|
/// A reusable buffer for finding the match location of the inner group.
|
|
|
|
locs: Arc<CachedThreadLocal<RefCell<CaptureLocations>>>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Clone for WordMatcher {
|
|
|
|
fn clone(&self) -> WordMatcher {
|
|
|
|
// We implement Clone manually so that we get a fresh CachedThreadLocal
|
|
|
|
// such that it can set its own thread owner. This permits each thread
|
|
|
|
// usings `locs` to hit the fast path.
|
|
|
|
WordMatcher {
|
|
|
|
regex: self.regex.clone(),
|
2020-02-16 10:43:26 -05:00
|
|
|
original: self.original.clone(),
|
2018-04-29 09:29:52 -04:00
|
|
|
names: self.names.clone(),
|
|
|
|
locs: Arc::new(CachedThreadLocal::new()),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl WordMatcher {
|
|
|
|
/// Create a new matcher from the given pattern that only produces matches
|
|
|
|
/// that are considered "words."
|
|
|
|
///
|
|
|
|
/// The given options are used to construct the regular expression
|
|
|
|
/// internally.
|
|
|
|
pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
|
2020-02-17 18:08:47 -05:00
|
|
|
let original =
|
|
|
|
expr.with_pattern(|pat| format!("^(?:{})$", pat))?.regex()?;
|
2018-04-29 09:29:52 -04:00
|
|
|
let word_expr = expr.with_pattern(|pat| {
|
2020-02-16 10:43:26 -05:00
|
|
|
let pat = format!(r"(?:(?-m:^)|\W)({})(?:(?-m:$)|\W)", pat);
|
|
|
|
debug!("word regex: {:?}", pat);
|
|
|
|
pat
|
2018-04-29 09:29:52 -04:00
|
|
|
})?;
|
|
|
|
let regex = word_expr.regex()?;
|
|
|
|
let locs = Arc::new(CachedThreadLocal::new());
|
|
|
|
|
|
|
|
let mut names = HashMap::new();
|
|
|
|
for (i, optional_name) in regex.capture_names().enumerate() {
|
|
|
|
if let Some(name) = optional_name {
|
|
|
|
names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
|
|
|
}
|
|
|
|
}
|
2020-02-16 10:43:26 -05:00
|
|
|
Ok(WordMatcher { regex, original, names, locs })
|
2018-04-29 09:29:52 -04:00
|
|
|
}
|
2019-04-05 21:03:22 -04:00
|
|
|
|
|
|
|
/// Return the underlying regex used by this matcher.
|
|
|
|
pub fn regex(&self) -> &Regex {
|
|
|
|
&self.regex
|
|
|
|
}
|
2020-02-16 10:43:26 -05:00
|
|
|
|
|
|
|
/// Attempt to do a fast confirmation of a word match that covers a subset
|
|
|
|
/// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned
|
|
|
|
/// when a match is found. Ok(None) is returned when there is definitively
|
|
|
|
/// no match. Err(()) is returned when this routine could not detect
|
|
|
|
/// whether there was a match or not.
|
|
|
|
fn fast_find(
|
|
|
|
&self,
|
|
|
|
haystack: &[u8],
|
|
|
|
at: usize,
|
|
|
|
) -> Result<Option<Match>, ()> {
|
|
|
|
// This is a bit hairy. The whole point here is to avoid running an
|
|
|
|
// NFA simulation in the regex engine. Remember, our word regex looks
|
|
|
|
// like this:
|
|
|
|
//
|
|
|
|
// (^|\W)(<original regex>)($|\W)
|
|
|
|
// where ^ and $ have multiline mode DISABLED
|
|
|
|
//
|
|
|
|
// What we want are the match offsets of <original regex>. So in the
|
|
|
|
// easy/common case, the original regex will be sandwiched between
|
|
|
|
// two codepoints that are in the \W class. So our approach here is to
|
|
|
|
// look for a match of the overall word regexp, strip the \W ends and
|
|
|
|
// then check whether the original regex matches what's left. If so,
|
|
|
|
// then we are guaranteed a correct match.
|
|
|
|
//
|
|
|
|
// This only works though if we know that the match is sandwiched
|
|
|
|
// between two \W codepoints. This only occurs when neither ^ nor $
|
|
|
|
// match. This in turn only occurs when the match is at either the
|
|
|
|
// beginning or end of the haystack. In either of those cases, we
|
|
|
|
// declare defeat and defer to the slower implementation.
|
|
|
|
//
|
|
|
|
// The reason why we cannot handle the ^/$ cases here is because we
|
|
|
|
// can't assume anything about the original pattern. (Try commenting
|
|
|
|
// out the checks for ^/$ below and run the tests to see examples.)
|
|
|
|
let mut cand = match self.regex.find_at(haystack, at) {
|
|
|
|
None => return Ok(None),
|
|
|
|
Some(m) => Match::new(m.start(), m.end()),
|
|
|
|
};
|
|
|
|
if cand.start() == 0 || cand.end() == haystack.len() {
|
|
|
|
return Err(());
|
|
|
|
}
|
|
|
|
let (_, slen) = bstr::decode_utf8(&haystack[cand]);
|
|
|
|
let (_, elen) = bstr::decode_last_utf8(&haystack[cand]);
|
2020-02-17 18:08:47 -05:00
|
|
|
cand =
|
|
|
|
cand.with_start(cand.start() + slen).with_end(cand.end() - elen);
|
2020-02-16 10:43:26 -05:00
|
|
|
if self.original.is_match(&haystack[cand]) {
|
|
|
|
Ok(Some(cand))
|
|
|
|
} else {
|
|
|
|
Err(())
|
|
|
|
}
|
|
|
|
}
|
2018-04-29 09:29:52 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
impl Matcher for WordMatcher {
|
|
|
|
type Captures = RegexCaptures;
|
|
|
|
type Error = NoError;
|
|
|
|
|
|
|
|
fn find_at(
|
|
|
|
&self,
|
|
|
|
haystack: &[u8],
|
|
|
|
at: usize,
|
|
|
|
) -> Result<Option<Match>, NoError> {
|
|
|
|
// To make this easy to get right, we extract captures here instead of
|
|
|
|
// calling `find_at`. The actual match is at capture group `1` instead
|
|
|
|
// of `0`. We *could* use `find_at` here and then trim the match after
|
|
|
|
// the fact, but that's a bit harder to get right, and it's not clear
|
|
|
|
// if it's worth it.
|
2020-02-16 10:43:26 -05:00
|
|
|
//
|
|
|
|
// OK, well, it turns out that it is worth it! But it is quite tricky.
|
|
|
|
// See `fast_find` for details. Effectively, this lets us skip running
|
|
|
|
// the NFA simulation in the regex engine in the vast majority of
|
|
|
|
// cases. However, the NFA simulation is required for full correctness.
|
|
|
|
match self.fast_find(haystack, at) {
|
|
|
|
Ok(Some(m)) => return Ok(Some(m)),
|
|
|
|
Ok(None) => return Ok(None),
|
|
|
|
Err(()) => {}
|
|
|
|
}
|
2018-04-29 09:29:52 -04:00
|
|
|
|
2020-02-17 18:08:47 -05:00
|
|
|
let cell =
|
|
|
|
self.locs.get_or(|| RefCell::new(self.regex.capture_locations()));
|
2018-04-29 09:29:52 -04:00
|
|
|
let mut caps = cell.borrow_mut();
|
|
|
|
self.regex.captures_read_at(&mut caps, haystack, at);
|
|
|
|
Ok(caps.get(1).map(|m| Match::new(m.0, m.1)))
|
|
|
|
}
|
|
|
|
|
|
|
|
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
|
|
|
Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1))
|
|
|
|
}
|
|
|
|
|
|
|
|
fn capture_count(&self) -> usize {
|
|
|
|
self.regex.captures_len().checked_sub(1).unwrap()
|
|
|
|
}
|
|
|
|
|
|
|
|
fn capture_index(&self, name: &str) -> Option<usize> {
|
|
|
|
self.names.get(name).map(|i| *i)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn captures_at(
|
|
|
|
&self,
|
|
|
|
haystack: &[u8],
|
|
|
|
at: usize,
|
|
|
|
caps: &mut RegexCaptures,
|
|
|
|
) -> Result<bool, NoError> {
|
2020-02-17 18:08:47 -05:00
|
|
|
let r =
|
|
|
|
self.regex.captures_read_at(caps.locations_mut(), haystack, at);
|
2018-04-29 09:29:52 -04:00
|
|
|
Ok(r.is_some())
|
|
|
|
}
|
|
|
|
|
|
|
|
// We specifically do not implement other methods like find_iter or
|
|
|
|
// captures_iter. Namely, the iter methods are guaranteed to be correct
|
|
|
|
// by virtue of implementing find_at and captures_at above.
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::WordMatcher;
    use config::Config;
    use grep_matcher::{Captures, Match, Matcher};

    /// Build a word matcher for `pattern` using the default configuration.
    fn matcher(pattern: &str) -> WordMatcher {
        let chir = Config::default().hir(pattern).unwrap();
        WordMatcher::new(&chir).unwrap()
    }

    /// Return the offsets of the first match as reported by `find`.
    fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
        let found = matcher(pattern).find(haystack.as_bytes()).unwrap();
        found.map(|m| (m.start(), m.end()))
    }

    /// Return the offsets of the first match as reported by group `0` of
    /// the captures API.
    fn find_by_caps(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
        let word = matcher(pattern);
        let mut caps = word.new_captures().unwrap();
        if word.captures(haystack.as_bytes(), &mut caps).unwrap() {
            caps.get(0).map(|m| (m.start(), m.end()))
        } else {
            None
        }
    }

    // Test that the standard `find` API reports offsets correctly.
    #[test]
    fn various_find() {
        assert_eq!(Some((0, 3)), find(r"foo", "foo"));
        assert_eq!(Some((0, 3)), find(r"foo", "foo("));
        assert_eq!(Some((1, 4)), find(r"foo", "!foo("));
        assert_eq!(None, find(r"foo", "!afoo("));

        assert_eq!(Some((0, 3)), find(r"foo", "foo☃"));
        assert_eq!(None, find(r"foo", "fooб"));

        assert_eq!(Some((0, 4)), find(r"foo5", "foo5"));
        assert_eq!(None, find(r"foo", "foo5"));

        assert_eq!(Some((1, 4)), find(r"foo", "!foo!"));
        assert_eq!(Some((1, 5)), find(r"foo!", "!foo!"));
        assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!"));

        assert_eq!(Some((0, 3)), find(r"foo", "foo\n"));
        assert_eq!(Some((1, 4)), find(r"foo", "!foo!\n"));
        assert_eq!(Some((1, 5)), find(r"foo!", "!foo!\n"));
        assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!\n"));

        assert_eq!(Some((1, 6)), find(r"!?foo!?", "!!foo!!"));
        assert_eq!(Some((0, 5)), find(r"!?foo!?", "!foo!"));
        assert_eq!(Some((2, 5)), find(r"!?foo!?", "a!foo!a"));

        assert_eq!(Some((2, 7)), find(r"!?foo!?", "##!foo!\n"));
        assert_eq!(Some((3, 7)), find(r"f?oo!?", "##\nfoo!##"));
        assert_eq!(Some((2, 5)), find(r"(?-u)foo[^a]*", "#!foo☃aaa"));
    }

    // See: https://github.com/BurntSushi/ripgrep/issues/389
    #[test]
    fn regression_dash() {
        assert_eq!(Some((0, 2)), find(r"-2", "-2"));
    }

    // Test that the captures API also reports offsets correctly, just as
    // find does. This exercises a different path in the code since captures
    // are handled differently.
    #[test]
    fn various_captures() {
        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo"));
        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo("));
        assert_eq!(Some((1, 4)), find_by_caps(r"foo", "!foo("));
        assert_eq!(None, find_by_caps(r"foo", "!afoo("));

        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo☃"));
        assert_eq!(None, find_by_caps(r"foo", "fooб"));

        // See: https://github.com/BurntSushi/ripgrep/issues/389
        assert_eq!(Some((0, 2)), find_by_caps(r"-2", "-2"));
    }

    // Test that the capture reporting methods work as advertised.
    #[test]
    fn capture_indexing() {
        let m = matcher(r"(a)(?P<foo>b)(c)");
        assert_eq!(4, m.capture_count());
        assert_eq!(Some(2), m.capture_index("foo"));

        let mut caps = m.new_captures().unwrap();
        assert_eq!(4, caps.len());

        assert!(m.captures(b"abc", &mut caps).unwrap());
        assert_eq!(caps.get(0), Some(Match::new(0, 3)));
        assert_eq!(caps.get(1), Some(Match::new(0, 1)));
        assert_eq!(caps.get(2), Some(Match::new(1, 2)));
        assert_eq!(caps.get(3), Some(Match::new(2, 3)));
        assert_eq!(caps.get(4), None);

        assert!(m.captures(b"#abc#", &mut caps).unwrap());
        assert_eq!(caps.get(0), Some(Match::new(1, 4)));
        assert_eq!(caps.get(1), Some(Match::new(1, 2)));
        assert_eq!(caps.get(2), Some(Match::new(2, 3)));
        assert_eq!(caps.get(3), Some(Match::new(3, 4)));
        assert_eq!(caps.get(4), None);
    }
}
|