ripgrep/grep-regex/src/word.rs

use std::collections::HashMap;
use std::cell::RefCell;
use std::sync::Arc;

use grep_matcher::{Match, Matcher, NoError};
use regex::bytes::{CaptureLocations, Regex};
use thread_local::CachedThreadLocal;

use config::ConfiguredHIR;
use error::Error;
use matcher::RegexCaptures;

/// A matcher for implementing "word match" semantics.
///
/// The caller's pattern is wrapped in one extra capture group, so the span
/// that callers care about is group `1` of the internal regex, not group `0`.
#[derive(Debug)]
pub struct WordMatcher {
    /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
    regex: Regex,
    /// A map from capture group name to capture group index.
    ///
    /// The indices stored here are one less than the corresponding indices
    /// in `regex`, which hides the extra wrapping group from callers.
    names: HashMap<String, usize>,
    /// A reusable buffer for finding the match location of the inner group.
    ///
    /// It is kept per thread and behind a `RefCell` so that `find_at`, which
    /// only has `&self`, can still write capture locations into it.
    locs: Arc<CachedThreadLocal<RefCell<CaptureLocations>>>,
}

impl Clone for WordMatcher {
    /// Copy the regex and the capture name map, but hand the copy its own
    /// empty `locs` cache rather than sharing this matcher's cache.
    fn clone(&self) -> WordMatcher {
        // Clone is written by hand so that every copy starts with a fresh
        // CachedThreadLocal, which can then set its own thread owner. This
        // permits each thread using `locs` to hit the fast path.
        let regex = self.regex.clone();
        let names = self.names.clone();
        let locs = Arc::new(CachedThreadLocal::new());
        WordMatcher { regex, names, locs }
    }
}

impl WordMatcher {
    /// Build a matcher from `expr` whose matches are restricted to "words."
    ///
    /// The pattern in `expr` is wrapped in a capture group and surrounded
    /// by "start of line or non-word character" on the left and "end of
    /// line or non-word character" on the right. The configuration carried
    /// by `expr` is reused to compile the wrapped pattern.
    ///
    /// An error is returned if rewriting the pattern or compiling the
    /// resulting regex fails.
    pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
        let wrapped = expr.with_pattern(|pat| {
            format!(r"(?:(?m:^)|\W)({})(?:(?m:$)|\W)", pat)
        })?;
        let regex = wrapped.regex()?;

        // Record every named group, shifted down by one so that the extra
        // wrapping group is invisible to callers. Named groups never sit at
        // index 0 (the implicit, unnamed whole-match group), so the
        // subtraction cannot underflow.
        let names: HashMap<String, usize> = regex
            .capture_names()
            .enumerate()
            .filter_map(|(index, maybe_name)| {
                maybe_name.map(|name| {
                    (name.to_string(), index.checked_sub(1).unwrap())
                })
            })
            .collect();

        let locs = Arc::new(CachedThreadLocal::new());
        Ok(WordMatcher { regex, names, locs })
    }

    /// Borrow the compiled regex, including the word-boundary wrapping, that
    /// this matcher runs.
    pub fn regex(&self) -> &Regex {
        &self.regex
    }
}

impl Matcher for WordMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    /// Find the first word match in `haystack` starting the search at `at`.
    ///
    /// The returned span covers only the caller's pattern (capture group
    /// `1` of the internal regex), never the surrounding non-word
    /// characters that the wrapping regex may also have consumed.
    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        // To make this easy to get right, we extract captures here instead of
        // calling `find_at`. The actual match is at capture group `1` instead
        // of `0`. We *could* use `find_at` here and then trim the match after
        // the fact, but that's a bit harder to get right, and it's not clear
        // if it's worth it.

        let cell = self.locs.get_or(|| {
            Box::new(RefCell::new(self.regex.capture_locations()))
        });
        let mut caps = cell.borrow_mut();
        // The buffer is reused across calls, so only trust its contents when
        // this search actually matched. Otherwise we would be relying on the
        // regex engine to have wiped the locations left by a previous call.
        if self.regex.captures_read_at(&mut caps, haystack, at).is_none() {
            return Ok(None);
        }
        Ok(caps.get(1).map(|(start, end)| Match::new(start, end)))
    }

    /// Create capture storage sized for the internal regex, built with an
    /// offset of `1` so that the extra wrapping group is skipped when
    /// callers index into it.
    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1))
    }

    /// The number of capture groups visible to callers, which is one fewer
    /// than the internal regex has because of the extra wrapping group.
    fn capture_count(&self) -> usize {
        self.regex.captures_len().checked_sub(1).unwrap()
    }

    /// Look up the caller-visible index of the capture group called `name`,
    /// if there is one.
    fn capture_index(&self, name: &str) -> Option<usize> {
        self.names.get(name).cloned()
    }

    /// Run the search starting at `at`, filling `caps` with the capture
    /// locations, and report whether anything matched.
    fn captures_at(
        &self,
        haystack: &[u8],
        at: usize,
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        let found = self
            .regex
            .captures_read_at(caps.locations_mut(), haystack, at)
            .is_some();
        Ok(found)
    }

    // We specifically do not implement other methods like find_iter or
    // captures_iter. Namely, the iter methods are guaranteed to be correct
    // by virtue of implementing find_at and captures_at above.
}

#[cfg(test)]
mod tests {
    use grep_matcher::{Captures, Match, Matcher};
    use config::Config;
    use super::WordMatcher;

    /// The (start, end) offsets of a match, or `None` when nothing matched.
    type Span = Option<(usize, usize)>;

    /// Build a word matcher for `pattern` using the default configuration.
    fn matcher(pattern: &str) -> WordMatcher {
        let chir = Config::default().hir(pattern).unwrap();
        WordMatcher::new(&chir).unwrap()
    }

    /// Search via the plain `find` API and report the match offsets.
    fn find(pattern: &str, haystack: &str) -> Span {
        let found = matcher(pattern).find(haystack.as_bytes()).unwrap();
        found.map(|m| (m.start(), m.end()))
    }

    /// Search via the captures API and report the offsets of group 0.
    fn find_by_caps(pattern: &str, haystack: &str) -> Span {
        let m = matcher(pattern);
        let mut caps = m.new_captures().unwrap();
        if m.captures(haystack.as_bytes(), &mut caps).unwrap() {
            caps.get(0).map(|m| (m.start(), m.end()))
        } else {
            None
        }
    }

    /// Run the shared table of offset expectations through `search`.
    fn assert_word_offsets(search: fn(&str, &str) -> Span) {
        let cases: &[(Span, &str, &str)] = &[
            (Some((0, 3)), r"foo", "foo"),
            (Some((0, 3)), r"foo", "foo("),
            (Some((1, 4)), r"foo", "!foo("),
            (None, r"foo", "!afoo("),
            // A trailing `☃` is expected to end the word, whereas a
            // trailing `б` is expected to extend it, so only the former
            // yields a match.
            (Some((0, 3)), r"foo", "foo☃"),
            (None, r"foo", "fooб"),
            // See: https://github.com/BurntSushi/ripgrep/issues/389
            (Some((0, 2)), r"-2", "-2"),
        ];
        for &(expected, pattern, haystack) in cases {
            assert_eq!(
                expected,
                search(pattern, haystack),
                "pattern {:?} against haystack {:?}",
                pattern,
                haystack
            );
        }
    }

    // Test that the standard `find` API reports offsets correctly.
    #[test]
    fn various_find() {
        assert_word_offsets(find);
    }

    // Test that the captures API also reports offsets correctly, just as
    // find does. This exercises a different path in the code since captures
    // are handled differently.
    #[test]
    fn various_captures() {
        assert_word_offsets(find_by_caps);
    }

    // Test that the capture reporting methods work as advertised.
    #[test]
    fn capture_indexing() {
        let word = matcher(r"(a)(?P<foo>b)(c)");
        assert_eq!(4, word.capture_count());
        assert_eq!(Some(2), word.capture_index("foo"));

        let mut caps = word.new_captures().unwrap();
        assert_eq!(4, caps.len());

        // Each entry is a haystack paired with the offset at which `abc`
        // begins inside it; every group is expected to shift by that much.
        let inputs: [(&[u8], usize); 2] = [(b"abc", 0), (b"#abc#", 1)];
        for &(haystack, base) in inputs.iter() {
            assert!(word.captures(haystack, &mut caps).unwrap());
            assert_eq!(caps.get(0), Some(Match::new(base, base + 3)));
            assert_eq!(caps.get(1), Some(Match::new(base, base + 1)));
            assert_eq!(caps.get(2), Some(Match::new(base + 1, base + 2)));
            assert_eq!(caps.get(3), Some(Match::new(base + 2, base + 3)));
            assert_eq!(caps.get(4), None);
        }
    }
}
libripgrep: initial commit introducing libripgrep libripgrep is not any one library, but rather, a collection of libraries that roughly separate the following key distinct phases in a grep implementation: 1. Pattern matching (e.g., by a regex engine). 2. Searching a file using a pattern matcher. 3. Printing results. Ultimately, both (1) and (3) are defined by de-coupled interfaces, of which there may be multiple implementations. Namely, (1) is satisfied by the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by the `Sink` trait in the `grep2` crate. The searcher (2) ties everything together and finds results using a matcher and reports those results using a `Sink` implementation. Closes #162 2018-04-29 09:29:52 -04:00			`use std::collections::HashMap;`
			`use std::cell::RefCell;`
			`use std::sync::Arc;`

			`use grep_matcher::{Match, Matcher, NoError};`
			`use regex::bytes::{CaptureLocations, Regex};`
			`use thread_local::CachedThreadLocal;`

			`use config::ConfiguredHIR;`
			`use error::Error;`
			`use matcher::RegexCaptures;`

			`/// A matcher for implementing "word match" semantics.`
			`#[derive(Debug)]`
			`pub struct WordMatcher {`
			/// The regex which is roughly `(?:^\|\W)(<original pattern>)(?:$\|\W)`.
			`regex: Regex,`
			`/// A map from capture group name to capture group index.`
			`names: HashMap<String, usize>,`
			`/// A reusable buffer for finding the match location of the inner group.`
			`locs: Arc<CachedThreadLocal<RefCell<CaptureLocations>>>,`
			`}`

			`impl Clone for WordMatcher {`
			`fn clone(&self) -> WordMatcher {`
			`// We implement Clone manually so that we get a fresh CachedThreadLocal`
			`// such that it can set its own thread owner. This permits each thread`
			// using `locs` to hit the fast path.
			`WordMatcher {`
			`regex: self.regex.clone(),`
			`names: self.names.clone(),`
			`locs: Arc::new(CachedThreadLocal::new()),`
			`}`
			`}`
			`}`

			`impl WordMatcher {`
			`/// Create a new matcher from the given pattern that only produces matches`
			`/// that are considered "words."`
			`///`
			`/// The given options are used to construct the regular expression`
			`/// internally.`
			`pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {`
			`let word_expr = expr.with_pattern(\|pat\| {`
			`format!(r"(?:(?m:^)\|\W)({})(?:(?m:$)\|\W)", pat)`
			`})?;`
			`let regex = word_expr.regex()?;`
			`let locs = Arc::new(CachedThreadLocal::new());`

			`let mut names = HashMap::new();`
			`for (i, optional_name) in regex.capture_names().enumerate() {`
			`if let Some(name) = optional_name {`
			`names.insert(name.to_string(), i.checked_sub(1).unwrap());`
			`}`
			`}`
			`Ok(WordMatcher { regex, names, locs })`
			`}`
regex: print out final regex in trace mode This is useful for debugging to see what regex is actually being run. We put this as a trace since the regex can be quite gnarly. (It is not pretty printed.) 2019-04-05 21:03:22 -04:00
			`/// Return the underlying regex used by this matcher.`
			`pub fn regex(&self) -> &Regex {`
			`&self.regex`
			`}`
libripgrep: initial commit introducing libripgrep libripgrep is not any one library, but rather, a collection of libraries that roughly separate the following key distinct phases in a grep implementation: 1. Pattern matching (e.g., by a regex engine). 2. Searching a file using a pattern matcher. 3. Printing results. Ultimately, both (1) and (3) are defined by de-coupled interfaces, of which there may be multiple implementations. Namely, (1) is satisfied by the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by the `Sink` trait in the `grep2` crate. The searcher (2) ties everything together and finds results using a matcher and reports those results using a `Sink` implementation. Closes #162 2018-04-29 09:29:52 -04:00			`}`

			`impl Matcher for WordMatcher {`
			`type Captures = RegexCaptures;`
			`type Error = NoError;`

			`fn find_at(`
			`&self,`
			`haystack: &[u8],`
			`at: usize,`
			`) -> Result<Option<Match>, NoError> {`
			`// To make this easy to get right, we extract captures here instead of`
			// calling `find_at`. The actual match is at capture group `1` instead
			// of `0`. We could use `find_at` here and then trim the match after
			`// the fact, but that's a bit harder to get right, and it's not clear`
			`// if it's worth it.`

			`let cell = self.locs.get_or(\|\| {`
			`Box::new(RefCell::new(self.regex.capture_locations()))`
			`});`
			`let mut caps = cell.borrow_mut();`
			`self.regex.captures_read_at(&mut caps, haystack, at);`
			`Ok(caps.get(1).map(\|m\| Match::new(m.0, m.1)))`
			`}`

			`fn new_captures(&self) -> Result<RegexCaptures, NoError> {`
			`Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1))`
			`}`

			`fn capture_count(&self) -> usize {`
			`self.regex.captures_len().checked_sub(1).unwrap()`
			`}`

			`fn capture_index(&self, name: &str) -> Option<usize> {`
			`self.names.get(name).map(\|i\| *i)`
			`}`

			`fn captures_at(`
			`&self,`
			`haystack: &[u8],`
			`at: usize,`
			`caps: &mut RegexCaptures,`
			`) -> Result<bool, NoError> {`
regex: make multi-literal searcher faster This makes the case of searching for a dictionary of a very large number of literals much much faster. (~10x or so.) In particular, we achieve this by short-circuiting the construction of a full regex when we know we have a simple alternation of literals. Building the regex for a large dictionary (>100,000 literals) turns out to be quite slow, even if it internally will dispatch to Aho-Corasick. Even that isn't quite enough. It turns out that even parsing such a regex is quite slow. So when the -F/--fixed-strings flag is set, we short circuit regex parsing completely and jump straight to Aho-Corasick. We aren't quite as fast as GNU grep here, but it's much closer (less than 2x slower). In general, this is somewhat of a hack. In particular, it seems plausible that this optimization could be implemented entirely in the regex engine. Unfortunately, the regex engine's internals are just not amenable to this at all, so it would require a larger refactoring effort. For now, it's good enough to add this fairly simple hack at a higher level. Unfortunately, if you don't pass -F/--fixed-strings, then ripgrep will be slower, because of the aforementioned missing optimization. Moreover, passing flags like `-i` or `-S` will cause ripgrep to abandon this optimization and fall back to something potentially much slower. Again, this fix really needs to happen inside the regex engine, although we might be able to special case -i when the input literals are pure ASCII via Aho-Corasick's `ascii_case_insensitive`. Fixes #497, Fixes #838 2019-04-07 18:43:01 -04:00			`let r = self.regex.captures_read_at(`
			`caps.locations_mut(), haystack, at,`
			`);`
libripgrep: initial commit introducing libripgrep libripgrep is not any one library, but rather, a collection of libraries that roughly separate the following key distinct phases in a grep implementation: 1. Pattern matching (e.g., by a regex engine). 2. Searching a file using a pattern matcher. 3. Printing results. Ultimately, both (1) and (3) are defined by de-coupled interfaces, of which there may be multiple implementations. Namely, (1) is satisfied by the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by the `Sink` trait in the `grep2` crate. The searcher (2) ties everything together and finds results using a matcher and reports those results using a `Sink` implementation. Closes #162 2018-04-29 09:29:52 -04:00			`Ok(r.is_some())`
			`}`

			`// We specifically do not implement other methods like find_iter or`
			`// captures_iter. Namely, the iter methods are guaranteed to be correct`
			`// by virtue of implementing find_at and captures_at above.`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use grep_matcher::{Captures, Match, Matcher};`
			`use config::Config;`
			`use super::WordMatcher;`

			`fn matcher(pattern: &str) -> WordMatcher {`
			`let chir = Config::default().hir(pattern).unwrap();`
			`WordMatcher::new(&chir).unwrap()`
			`}`

			`fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> {`
			`matcher(pattern)`
			`.find(haystack.as_bytes())`
			`.unwrap()`
			`.map(\|m\| (m.start(), m.end()))`
			`}`

			`fn find_by_caps(pattern: &str, haystack: &str) -> Option<(usize, usize)> {`
			`let m = matcher(pattern);`
			`let mut caps = m.new_captures().unwrap();`
			`if !m.captures(haystack.as_bytes(), &mut caps).unwrap() {`
			`None`
			`} else {`
			`caps.get(0).map(\|m\| (m.start(), m.end()))`
			`}`
			`}`

			// Test that the standard `find` API reports offsets correctly.
			`#[test]`
			`fn various_find() {`
			`assert_eq!(Some((0, 3)), find(r"foo", "foo"));`
			`assert_eq!(Some((0, 3)), find(r"foo", "foo("));`
			`assert_eq!(Some((1, 4)), find(r"foo", "!foo("));`
			`assert_eq!(None, find(r"foo", "!afoo("));`

			`assert_eq!(Some((0, 3)), find(r"foo", "foo☃"));`
			`assert_eq!(None, find(r"foo", "fooб"));`
			`// assert_eq!(Some((0, 3)), find(r"foo", "fooб"));`

			`// See: https://github.com/BurntSushi/ripgrep/issues/389`
			`assert_eq!(Some((0, 2)), find(r"-2", "-2"));`
			`}`

			`// Test that the captures API also reports offsets correctly, just as`
			`// find does. This exercises a different path in the code since captures`
			`// are handled differently.`
			`#[test]`
			`fn various_captures() {`
			`assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo"));`
			`assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo("));`
			`assert_eq!(Some((1, 4)), find_by_caps(r"foo", "!foo("));`
			`assert_eq!(None, find_by_caps(r"foo", "!afoo("));`

			`assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo☃"));`
			`assert_eq!(None, find_by_caps(r"foo", "fooб"));`
			`// assert_eq!(Some((0, 3)), find_by_caps(r"foo", "fooб"));`

			`// See: https://github.com/BurntSushi/ripgrep/issues/389`
			`assert_eq!(Some((0, 2)), find_by_caps(r"-2", "-2"));`
			`}`

			`// Test that the capture reporting methods work as advertised.`
			`#[test]`
			`fn capture_indexing() {`
			`let m = matcher(r"(a)(?P<foo>b)(c)");`
			`assert_eq!(4, m.capture_count());`
			`assert_eq!(Some(2), m.capture_index("foo"));`

			`let mut caps = m.new_captures().unwrap();`
			`assert_eq!(4, caps.len());`

			`assert!(m.captures(b"abc", &mut caps).unwrap());`
			`assert_eq!(caps.get(0), Some(Match::new(0, 3)));`
			`assert_eq!(caps.get(1), Some(Match::new(0, 1)));`
			`assert_eq!(caps.get(2), Some(Match::new(1, 2)));`
			`assert_eq!(caps.get(3), Some(Match::new(2, 3)));`
			`assert_eq!(caps.get(4), None);`

			`assert!(m.captures(b"#abc#", &mut caps).unwrap());`
			`assert_eq!(caps.get(0), Some(Match::new(1, 4)));`
			`assert_eq!(caps.get(1), Some(Match::new(1, 2)));`
			`assert_eq!(caps.get(2), Some(Match::new(2, 3)));`
			`assert_eq!(caps.get(3), Some(Match::new(3, 4)));`
			`assert_eq!(caps.get(4), None);`
			`}`
			`}`