2018-04-29 09:29:52 -04:00
|
|
|
use std::cell::RefCell;
|
2020-02-17 18:08:47 -05:00
|
|
|
use std::collections::HashMap;
|
2018-04-29 09:29:52 -04:00
|
|
|
use std::sync::Arc;
|
|
|
|
|
|
|
|
use grep_matcher::{Match, Matcher, NoError};
|
|
|
|
use regex::bytes::{CaptureLocations, Regex};
|
2021-01-25 10:36:53 -05:00
|
|
|
use thread_local::ThreadLocal;
|
2018-04-29 09:29:52 -04:00
|
|
|
|
2021-06-01 19:29:50 -04:00
|
|
|
use crate::config::ConfiguredHIR;
|
|
|
|
use crate::error::Error;
|
|
|
|
use crate::matcher::RegexCaptures;
|
2018-04-29 09:29:52 -04:00
|
|
|
|
|
|
|
/// A matcher for implementing "word match" semantics.
///
/// This wraps the caller's pattern in word-boundary assertions (see
/// `WordMatcher::new` for the exact regex), so that reported matches
/// correspond to the inner capture group containing the original pattern.
#[derive(Debug)]
pub struct WordMatcher {
    /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
    regex: Regex,
    /// The original regex supplied by the user, which we use in a fast path
    /// to try and detect matches before deferring to slower engines.
    original: Regex,
    /// A map from capture group name to capture group index.
    ///
    /// Indices are shifted down by one relative to `regex` (see `new`),
    /// since group 1 of `regex` wraps the entire original pattern and
    /// callers should see the original pattern's group numbering.
    names: HashMap<String, usize>,
    /// A reusable buffer for finding the match location of the inner group.
    ///
    /// Kept thread-local so that one matcher can be used from many threads
    /// without contending on a single buffer.
    locs: Arc<ThreadLocal<RefCell<CaptureLocations>>>,
}
|
|
|
|
|
|
|
|
impl Clone for WordMatcher {
|
|
|
|
fn clone(&self) -> WordMatcher {
|
2021-01-25 10:36:53 -05:00
|
|
|
// We implement Clone manually so that we get a fresh ThreadLocal such
|
|
|
|
// that it can set its own thread owner. This permits each thread
|
2018-04-29 09:29:52 -04:00
|
|
|
// usings `locs` to hit the fast path.
|
|
|
|
WordMatcher {
|
|
|
|
regex: self.regex.clone(),
|
2020-02-16 10:43:26 -05:00
|
|
|
original: self.original.clone(),
|
2018-04-29 09:29:52 -04:00
|
|
|
names: self.names.clone(),
|
2021-01-25 10:36:53 -05:00
|
|
|
locs: Arc::new(ThreadLocal::new()),
|
2018-04-29 09:29:52 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl WordMatcher {
    /// Create a new matcher from the given pattern that only produces matches
    /// that are considered "words."
    ///
    /// The given options are used to construct the regular expression
    /// internally.
    pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
        // `original` is the user's pattern anchored on both sides. It is
        // used by `fast_find` to confirm a candidate match without
        // re-running the full word regex.
        let original =
            expr.with_pattern(|pat| format!("^(?:{})$", pat))?.regex()?;
        // The word regex proper: the original pattern (capture group 1) must
        // be bounded on each side by a `\W` codepoint or a multiline
        // start/end-of-line anchor.
        let word_expr = expr.with_pattern(|pat| {
            let pat = format!(r"(?:(?m:^)|\W)({})(?:\W|(?m:$))", pat);
            log::debug!("word regex: {:?}", pat);
            pat
        })?;
        let regex = word_expr.regex()?;
        let locs = Arc::new(ThreadLocal::new());

        // Map capture names to the indices callers expect. Because the whole
        // original pattern is wrapped in group 1, every group index in
        // `regex` is one greater than in the user's pattern, hence the
        // `checked_sub(1)`. (Presumably group 0 is always unnamed, so the
        // subtraction cannot underflow.)
        let mut names = HashMap::new();
        for (i, optional_name) in regex.capture_names().enumerate() {
            if let Some(name) = optional_name {
                names.insert(name.to_string(), i.checked_sub(1).unwrap());
            }
        }
        Ok(WordMatcher { regex, original, names, locs })
    }

    /// Return the underlying regex used by this matcher.
    pub fn regex(&self) -> &Regex {
        &self.regex
    }

    /// Attempt to do a fast confirmation of a word match that covers a subset
    /// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned
    /// when a match is found. Ok(None) is returned when there is definitively
    /// no match. Err(()) is returned when this routine could not detect
    /// whether there was a match or not.
    fn fast_find(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, ()> {
        // This is a bit hairy. The whole point here is to avoid running an
        // NFA simulation in the regex engine. Remember, our word regex looks
        // like this (see `new`):
        //
        //     (?:(?m:^)|\W)(<original regex>)(?:\W|(?m:$))
        //
        // What we want are the match offsets of <original regex>. So in the
        // easy/common case, the original regex will be sandwiched between
        // two codepoints that are in the \W class. So our approach here is to
        // look for a match of the overall word regexp, strip the \W ends and
        // then check whether the original regex matches what's left. If so,
        // then we are guaranteed a correct match.
        //
        // This only works though if we know that the match is sandwiched
        // between two \W codepoints. That isn't guaranteed when the match
        // abuts the beginning or end of the haystack, where an anchor rather
        // than \W may have matched. In either of those cases, we declare
        // defeat and defer to the slower implementation.
        //
        // The reason why we cannot handle the ^/$ cases here is because we
        // can't assume anything about the original pattern. (Try commenting
        // out the checks for ^/$ below and run the tests to see examples.)
        //
        // NOTE(review): the anchors are multiline (`(?m:...)`), so in
        // principle they can also match at interior line boundaries. A `\n`
        // is itself in `\W`, which appears to be why only the haystack edges
        // are special-cased here — confirm against the tests below.
        let mut cand = match self.regex.find_at(haystack, at) {
            None => return Ok(None),
            Some(m) => Match::new(m.start(), m.end()),
        };
        if cand.start() == 0 || cand.end() == haystack.len() {
            return Err(());
        }
        // Strip exactly one codepoint from each end (the \W boundary
        // characters consumed by the word regex).
        let (_, slen) = bstr::decode_utf8(&haystack[cand]);
        let (_, elen) = bstr::decode_last_utf8(&haystack[cand]);
        cand =
            cand.with_start(cand.start() + slen).with_end(cand.end() - elen);
        if self.original.is_match(&haystack[cand]) {
            Ok(Some(cand))
        } else {
            Err(())
        }
    }
}
|
|
|
|
|
|
|
|
impl Matcher for WordMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    /// Find the next word match at or after `at`, reporting the offsets of
    /// the original pattern (i.e., without the surrounding boundary
    /// codepoints).
    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        // To make this easy to get right, we extract captures here instead of
        // calling `find_at`. The actual match is at capture group `1` instead
        // of `0`. We *could* use `find_at` here and then trim the match after
        // the fact, but that's a bit harder to get right, and it's not clear
        // if it's worth it.
        //
        // OK, well, it turns out that it is worth it! But it is quite tricky.
        // See `fast_find` for details. Effectively, this lets us skip running
        // the NFA simulation in the regex engine in the vast majority of
        // cases. However, the NFA simulation is required for full correctness.
        match self.fast_find(haystack, at) {
            Ok(Some(m)) => return Ok(Some(m)),
            Ok(None) => return Ok(None),
            Err(()) => {}
        }

        // Slow path: run the full word regex and report the location of
        // capture group 1 (the original pattern). The capture-locations
        // buffer is thread-local, so concurrent callers don't contend.
        // NOTE(review): returning `caps.get(1)` after an ignored search
        // result relies on `captures_read_at` resetting the capture slots
        // when there is no match — confirm with the `regex` crate docs.
        let cell =
            self.locs.get_or(|| RefCell::new(self.regex.capture_locations()));
        let mut caps = cell.borrow_mut();
        self.regex.captures_read_at(&mut caps, haystack, at);
        Ok(caps.get(1).map(|m| Match::new(m.0, m.1)))
    }

    /// Build an empty capture set sized for this matcher's regex, with an
    /// offset of 1 so callers see the original pattern's group numbering.
    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1))
    }

    /// Number of capture groups, excluding the wrapper group that this
    /// matcher adds around the original pattern (hence the `- 1`).
    fn capture_count(&self) -> usize {
        self.regex.captures_len().checked_sub(1).unwrap()
    }

    /// Look up a named group's index, as numbered in the original pattern
    /// (the shift was applied when `names` was built in `new`).
    fn capture_index(&self, name: &str) -> Option<usize> {
        self.names.get(name).map(|i| *i)
    }

    /// Run the word regex and fill `caps`. The group-index shift is handled
    /// by `RegexCaptures` itself (see `new_captures`), so the raw capture
    /// locations can be written directly.
    fn captures_at(
        &self,
        haystack: &[u8],
        at: usize,
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        let r =
            self.regex.captures_read_at(caps.locations_mut(), haystack, at);
        Ok(r.is_some())
    }

    // We specifically do not implement other methods like find_iter or
    // captures_iter. Namely, the iter methods are guaranteed to be correct
    // by virtue of implementing find_at and captures_at above.
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::WordMatcher;
    use crate::config::Config;
    use grep_matcher::{Captures, Match, Matcher};

    /// Build a `WordMatcher` for `pattern`, panicking on invalid patterns.
    fn matcher(pattern: &str) -> WordMatcher {
        let chir = Config::default().hir(pattern).unwrap();
        WordMatcher::new(&chir).unwrap()
    }

    /// Run the standard `find` API and report the match as `(start, end)`.
    fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
        let m = matcher(pattern);
        match m.find(haystack.as_bytes()).unwrap() {
            Some(mat) => Some((mat.start(), mat.end())),
            None => None,
        }
    }

    /// Like `find`, but goes through the captures API (capture group 0).
    fn find_by_caps(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
        let m = matcher(pattern);
        let mut caps = m.new_captures().unwrap();
        if m.captures(haystack.as_bytes(), &mut caps).unwrap() {
            caps.get(0).map(|mat| (mat.start(), mat.end()))
        } else {
            None
        }
    }

    // Test that the standard `find` API reports offsets correctly.
    #[test]
    fn various_find() {
        assert_eq!(Some((0, 3)), find(r"foo", "foo"));
        assert_eq!(Some((0, 3)), find(r"foo", "foo("));
        assert_eq!(Some((1, 4)), find(r"foo", "!foo("));
        assert_eq!(None, find(r"foo", "!afoo("));

        assert_eq!(Some((0, 3)), find(r"foo", "foo☃"));
        assert_eq!(None, find(r"foo", "fooб"));

        assert_eq!(Some((0, 4)), find(r"foo5", "foo5"));
        assert_eq!(None, find(r"foo", "foo5"));

        assert_eq!(Some((1, 4)), find(r"foo", "!foo!"));
        assert_eq!(Some((1, 5)), find(r"foo!", "!foo!"));
        assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!"));

        assert_eq!(Some((0, 3)), find(r"foo", "foo\n"));
        assert_eq!(Some((1, 4)), find(r"foo", "!foo!\n"));
        assert_eq!(Some((1, 5)), find(r"foo!", "!foo!\n"));
        assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!\n"));

        assert_eq!(Some((1, 6)), find(r"!?foo!?", "!!foo!!"));
        assert_eq!(Some((0, 5)), find(r"!?foo!?", "!foo!"));
        assert_eq!(Some((2, 5)), find(r"!?foo!?", "a!foo!a"));

        assert_eq!(Some((2, 7)), find(r"!?foo!?", "##!foo!\n"));
        assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!##"));
        assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!\n##"));
        assert_eq!(Some((3, 7)), find(r"f?oo!?", "##\nfoo!##"));
        assert_eq!(Some((2, 5)), find(r"(?-u)foo[^a]*", "#!foo☃aaa"));
    }

    // See: https://github.com/BurntSushi/ripgrep/issues/389
    #[test]
    fn regression_dash() {
        assert_eq!(Some((0, 2)), find(r"-2", "-2"));
    }

    // Test that the captures API also reports offsets correctly, just as
    // find does. This exercises a different path in the code since captures
    // are handled differently.
    #[test]
    fn various_captures() {
        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo"));
        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo("));
        assert_eq!(Some((1, 4)), find_by_caps(r"foo", "!foo("));
        assert_eq!(None, find_by_caps(r"foo", "!afoo("));

        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo☃"));
        assert_eq!(None, find_by_caps(r"foo", "fooб"));

        // See: https://github.com/BurntSushi/ripgrep/issues/389
        assert_eq!(Some((0, 2)), find_by_caps(r"-2", "-2"));
    }

    // Test that the capture reporting methods work as advertised.
    #[test]
    fn capture_indexing() {
        let m = matcher(r"(a)(?P<foo>b)(c)");
        assert_eq!(4, m.capture_count());
        assert_eq!(Some(2), m.capture_index("foo"));

        let mut caps = m.new_captures().unwrap();
        assert_eq!(4, caps.len());

        assert!(m.captures(b"abc", &mut caps).unwrap());
        assert_eq!(caps.get(0), Some(Match::new(0, 3)));
        assert_eq!(caps.get(1), Some(Match::new(0, 1)));
        assert_eq!(caps.get(2), Some(Match::new(1, 2)));
        assert_eq!(caps.get(3), Some(Match::new(2, 3)));
        assert_eq!(caps.get(4), None);

        assert!(m.captures(b"#abc#", &mut caps).unwrap());
        assert_eq!(caps.get(0), Some(Match::new(1, 4)));
        assert_eq!(caps.get(1), Some(Match::new(1, 2)));
        assert_eq!(caps.get(2), Some(Match::new(2, 3)));
        assert_eq!(caps.get(3), Some(Match::new(3, 4)));
        assert_eq!(caps.get(4), None);
    }
}
|