ripgrep/grep-regex/src/crlf.rs

use std::collections::HashMap;

use grep_matcher::{Match, Matcher, NoError};
use regex::bytes::Regex;
use regex_syntax::hir::{self, Hir, HirKind};

use config::ConfiguredHIR;
use error::Error;
use matcher::RegexCaptures;

/// A matcher that wraps a regex and strips a trailing `\r` from the end of
/// the matches it reports.
#[derive(Clone, Debug)]
pub struct CRLFMatcher {
    /// The regex.
    regex: Regex,
    /// A map from capture group name to capture group index. The constructor
    /// stores each index as the regex's own group index minus one.
    names: HashMap<String, usize>,
}

impl CRLFMatcher {
    /// Build a matcher around the regex compiled from `expr`. Matches
    /// reported by the resulting matcher have a trailing `\r` removed.
    ///
    /// # Errors
    ///
    /// Returns an error if `expr.regex()` fails.
    ///
    /// # Panics
    ///
    /// Panics if `expr.needs_crlf_stripped()` is false.
    pub fn new(expr: &ConfiguredHIR) -> Result<CRLFMatcher, Error> {
        assert!(expr.needs_crlf_stripped());

        let regex = expr.regex()?;
        // Record every named group under its regex index shifted down by one.
        // NOTE(review): the shift assumes consumers of these indices do not
        // count the regex's group 0 -- confirm against `RegexCaptures`.
        let names: HashMap<String, usize> = regex
            .capture_names()
            .enumerate()
            .filter_map(|(index, maybe_name)| {
                maybe_name.map(|name| {
                    (name.to_string(), index.checked_sub(1).unwrap())
                })
            })
            .collect();
        Ok(CRLFMatcher { regex, names })
    }

    /// Borrow the regex that this matcher runs.
    pub fn regex(&self) -> &Regex {
        &self.regex
    }
}

impl Matcher for CRLFMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    /// Run the regex's `find_at` from offset `at` and pass any match through
    /// `adjust_match`, which drops a trailing `\r`.
    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        let found = self.regex.find_at(haystack, at).map(|raw| {
            adjust_match(haystack, Match::new(raw.start(), raw.end()))
        });
        Ok(found)
    }

    /// Create capture storage backed by the regex's capture locations.
    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        let locations = self.regex.capture_locations();
        Ok(RegexCaptures::new(locations))
    }

    /// The regex's capture group count minus one.
    fn capture_count(&self) -> usize {
        let total = self.regex.captures_len();
        total.checked_sub(1).unwrap()
    }

    /// Look up the index recorded for the capture group called `name`.
    fn capture_index(&self, name: &str) -> Option<usize> {
        self.names.get(name).cloned()
    }

    /// Fill `caps` with the capture locations of a match found from offset
    /// `at`, returning whether a match was found. CRLF stripping on `caps`
    /// is switched on only when the overall match ends with a `\r`.
    fn captures_at(
        &self,
        haystack: &[u8],
        at: usize,
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        // Start every search with stripping disabled so that state from a
        // previous search on the same captures does not carry over.
        caps.strip_crlf(false);
        let matched = self.regex.captures_read_at(
            caps.locations_mut(), haystack, at,
        );
        if matched.is_none() {
            return Ok(false);
        }

        // When the byte just before the end of the overall match is a `\r`,
        // ask the captures to strip it from all groups ending at that same
        // location.
        let (_, end) = caps.locations().get(0).unwrap();
        let last_byte = end.checked_sub(1).and_then(|i| haystack.get(i));
        if last_byte == Some(&b'\r') {
            caps.strip_crlf(true);
        }
        Ok(true)
    }

    // The remaining trait methods (e.g., find_iter and captures_iter) are
    // deliberately left at their defaults: they are built on top of find_at
    // and captures_at, so they are correct as long as those two are.
}

/// Trim a trailing `\r` off the given match.
///
/// When the byte immediately before the match's end offset is a `\r`, the
/// returned match ends one byte earlier. Otherwise the match is returned
/// unchanged.
pub fn adjust_match(haystack: &[u8], m: Match) -> Match {
    let end = m.end();
    // `checked_sub` covers a match ending at offset zero, where there is no
    // preceding byte to inspect.
    match end.checked_sub(1).and_then(|i| haystack.get(i)) {
        Some(&b'\r') => m.with_end(end - 1),
        _ => m,
    }
}

/// Rewrites every multi-line enabled `$` in the expression as `(?:\r??$)`.
///
/// The rewrite does not preserve the exact semantics of the given
/// expression. It does, however, guarantee that anything matched by the
/// given expression is also matched by the returned one; the returned
/// expression may simply match additional things.
///
/// The principal reason for doing this is that the underlying regex engine
/// doesn't support a CRLF aware `$` look-around. The plan is to fix it at
/// that level, and this kludge fills in until then.
///
/// While the match preserving property is neat, the match positions are
/// messier: `$` only ever matches a position between characters, whereas
/// `\r??` can consume a character and thereby move the match's end offset.
/// This is regrettable, but works out well in most cases, especially when a
/// match is limited to a single line.
pub fn crlfify(expr: Hir) -> Hir {
    match expr.into_kind() {
        // The multi-line `$` is the only node that gets replaced. This arm
        // must stay ahead of the general anchor arm below.
        HirKind::Anchor(hir::Anchor::EndLine) => crlf_end_line(),
        HirKind::Anchor(anchor) => Hir::anchor(anchor),
        // Leaves are rebuilt unchanged.
        HirKind::Empty => Hir::empty(),
        HirKind::Literal(lit) => Hir::literal(lit),
        HirKind::Class(class) => Hir::class(class),
        HirKind::WordBoundary(boundary) => Hir::word_boundary(boundary),
        // Everything else is rebuilt from its rewritten children.
        HirKind::Repetition(mut rep) => {
            rep.hir = crlfify_boxed(rep.hir);
            Hir::repetition(rep)
        }
        HirKind::Group(mut group) => {
            group.hir = crlfify_boxed(group.hir);
            Hir::group(group)
        }
        HirKind::Concat(parts) => {
            let rewritten: Vec<Hir> =
                parts.into_iter().map(crlfify).collect();
            Hir::concat(rewritten)
        }
        HirKind::Alternation(branches) => {
            let rewritten: Vec<Hir> =
                branches.into_iter().map(crlfify).collect();
            Hir::alternation(rewritten)
        }
    }
}

/// Applies `crlfify` to a boxed sub-expression and boxes the result again.
fn crlfify_boxed(sub: Box<Hir>) -> Box<Hir> {
    Box::new(crlfify(*sub))
}

/// Builds the replacement for a multi-line `$`: a non-capturing group
/// holding a non-greedy optional `\r` followed by the multi-line `$`.
fn crlf_end_line() -> Hir {
    let optional_cr = Hir::repetition(hir::Repetition {
        kind: hir::RepetitionKind::ZeroOrOne,
        greedy: false,
        hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))),
    });
    let end_line = Hir::anchor(hir::Anchor::EndLine);
    Hir::group(hir::Group {
        kind: hir::GroupKind::NonCapturing,
        hir: Box::new(Hir::concat(vec![optional_cr, end_line])),
    })
}

#[cfg(test)]
mod tests {
    use regex_syntax::Parser;
    use super::crlfify;

    /// Parses `pattern`, runs it through `crlfify` and renders the result
    /// back into its string form.
    fn roundtrip(pattern: &str) -> String {
        let parsed = Parser::new().parse(pattern).unwrap();
        crlfify(parsed).to_string()
    }

    #[test]
    fn various() {
        // Each entry pairs an input pattern with its expected rendering.
        let cases: &[(&str, &str)] = &[
            (r"(?m)$", "(?:\r??(?m:$))"),
            (r"(?m)$$", "(?:\r??(?m:$))(?:\r??(?m:$))"),
            (
                r"(?m)(?:foo$|bar$)",
                "(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))",
            ),
            (r"(?m)$a", "(?:\r??(?m:$))a"),
            // Without multi-line mode, `$` is left alone.
            (r"$", "\\z"),
            // An escaped `$` is just a literal.
            (r"\$", "\\$"),
        ];
        for &(pattern, expected) in cases {
            assert_eq!(roundtrip(pattern), expected);
        }
    }
}
regex: make CRLF hack more robust This commit improves the CRLF hack to be more robust. In particular, in addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end of a match if and only if the regex has an ending line anchor required for a match. This doesn't quite make the hack 100% correct, but should fix most use cases in practice. An example of a regex that will still be incorrect is `foo\|bar$`, since the analysis isn't quite sophisticated enough to determine that a `\r` can be safely stripped from any match. Even if we fix that, regexes like `foo\r\|bar$` still won't be handled correctly. Alas, more work on this front should really be focused on enabling this in the regex engine itself. The specific cause of this bug was that grep-searcher was sneakily stripping CRLF from matching lines when it really shouldn't have. We remove that code now, and instead rely on better match semantics provided at a lower level. Fixes #1095 2019-01-26 12:25:21 -05:00			`use std::collections::HashMap;`

			`use grep_matcher::{Match, Matcher, NoError};`
			`use regex::bytes::Regex;`
libripgrep: initial commit introducing libripgrep libripgrep is not any one library, but rather, a collection of libraries that roughly separate the following key distinct phases in a grep implementation: 1. Pattern matching (e.g., by a regex engine). 2. Searching a file using a pattern matcher. 3. Printing results. Ultimately, both (1) and (3) are defined by de-coupled interfaces, of which there may be multiple implementations. Namely, (1) is satisfied by the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by the `Sink` trait in the `grep2` crate. The searcher (2) ties everything together and finds results using a matcher and reports those results using a `Sink` implementation. Closes #162 2018-04-29 09:29:52 -04:00			`use regex_syntax::hir::{self, Hir, HirKind};`

regex: make CRLF hack more robust This commit improves the CRLF hack to be more robust. In particular, in addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end of a match if and only if the regex has an ending line anchor required for a match. This doesn't quite make the hack 100% correct, but should fix most use cases in practice. An example of a regex that will still be incorrect is `foo\|bar$`, since the analysis isn't quite sophisticated enough to determine that a `\r` can be safely stripped from any match. Even if we fix that, regexes like `foo\r\|bar$` still won't be handled correctly. Alas, more work on this front should really be focused on enabling this in the regex engine itself. The specific cause of this bug was that grep-searcher was sneakily stripping CRLF from matching lines when it really shouldn't have. We remove that code now, and instead rely on better match semantics provided at a lower level. Fixes #1095 2019-01-26 12:25:21 -05:00			`use config::ConfiguredHIR;`
			`use error::Error;`
			`use matcher::RegexCaptures;`

			`/// A matcher for implementing "word match" semantics.`
			`#[derive(Clone, Debug)]`
			`pub struct CRLFMatcher {`
			`/// The regex.`
			`regex: Regex,`
			`/// A map from capture group name to capture group index.`
			`names: HashMap<String, usize>,`
			`}`

			`impl CRLFMatcher {`
			/// Create a new matcher from the given pattern that strips `\r` from the
			`/// end of every match.`
			`///`
			`/// This panics if the given expression doesn't need its CRLF stripped.`
			`pub fn new(expr: &ConfiguredHIR) -> Result<CRLFMatcher, Error> {`
			`assert!(expr.needs_crlf_stripped());`

			`let regex = expr.regex()?;`
			`let mut names = HashMap::new();`
			`for (i, optional_name) in regex.capture_names().enumerate() {`
			`if let Some(name) = optional_name {`
			`names.insert(name.to_string(), i.checked_sub(1).unwrap());`
			`}`
			`}`
			`Ok(CRLFMatcher { regex, names })`
			`}`
regex: print out final regex in trace mode This is useful for debugging to see what regex is actually being run. We put this as a trace since the regex can be quite gnarly. (It is not pretty printed.) 2019-04-05 21:03:22 -04:00
			`/// Return the underlying regex used by this matcher.`
			`pub fn regex(&self) -> &Regex {`
			`&self.regex`
			`}`
regex: make CRLF hack more robust This commit improves the CRLF hack to be more robust. In particular, in addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end of a match if and only if the regex has an ending line anchor required for a match. This doesn't quite make the hack 100% correct, but should fix most use cases in practice. An example of a regex that will still be incorrect is `foo\|bar$`, since the analysis isn't quite sophisticated enough to determine that a `\r` can be safely stripped from any match. Even if we fix that, regexes like `foo\r\|bar$` still won't be handled correctly. Alas, more work on this front should really be focused on enabling this in the regex engine itself. The specific cause of this bug was that grep-searcher was sneakily stripping CRLF from matching lines when it really shouldn't have. We remove that code now, and instead rely on better match semantics provided at a lower level. Fixes #1095 2019-01-26 12:25:21 -05:00			`}`

			`impl Matcher for CRLFMatcher {`
			`type Captures = RegexCaptures;`
			`type Error = NoError;`

			`fn find_at(`
			`&self,`
			`haystack: &[u8],`
			`at: usize,`
			`) -> Result<Option<Match>, NoError> {`
			`let m = match self.regex.find_at(haystack, at) {`
			`None => return Ok(None),`
			`Some(m) => Match::new(m.start(), m.end()),`
			`};`
			`Ok(Some(adjust_match(haystack, m)))`
			`}`

			`fn new_captures(&self) -> Result<RegexCaptures, NoError> {`
			`Ok(RegexCaptures::new(self.regex.capture_locations()))`
			`}`

			`fn capture_count(&self) -> usize {`
			`self.regex.captures_len().checked_sub(1).unwrap()`
			`}`

			`fn capture_index(&self, name: &str) -> Option<usize> {`
			`self.names.get(name).map(\|i\| *i)`
			`}`

			`fn captures_at(`
			`&self,`
			`haystack: &[u8],`
			`at: usize,`
			`caps: &mut RegexCaptures,`
			`) -> Result<bool, NoError> {`
			`caps.strip_crlf(false);`
regex: make multi-literal searcher faster This makes the case of searching for a dictionary of a very large number of literals much much faster. (~10x or so.) In particular, we achieve this by short-circuiting the construction of a full regex when we know we have a simple alternation of literals. Building the regex for a large dictionary (>100,000 literals) turns out to be quite slow, even if it internally will dispatch to Aho-Corasick. Even that isn't quite enough. It turns out that even parsing such a regex is quite slow. So when the -F/--fixed-strings flag is set, we short circuit regex parsing completely and jump straight to Aho-Corasick. We aren't quite as fast as GNU grep here, but it's much closer (less than 2x slower). In general, this is somewhat of a hack. In particular, it seems plausible that this optimization could be implemented entirely in the regex engine. Unfortunately, the regex engine's internals are just not amenable to this at all, so it would require a larger refactoring effort. For now, it's good enough to add this fairly simple hack at a higher level. Unfortunately, if you don't pass -F/--fixed-strings, then ripgrep will be slower, because of the aforementioned missing optimization. Moreover, passing flags like `-i` or `-S` will cause ripgrep to abandon this optimization and fall back to something potentially much slower. Again, this fix really needs to happen inside the regex engine, although we might be able to special case -i when the input literals are pure ASCII via Aho-Corasick's `ascii_case_insensitive`. Fixes #497, Fixes #838 2019-04-07 18:43:01 -04:00			`let r = self.regex.captures_read_at(`
			`caps.locations_mut(), haystack, at,`
			`);`
regex: make CRLF hack more robust This commit improves the CRLF hack to be more robust. In particular, in addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end of a match if and only if the regex has an ending line anchor required for a match. This doesn't quite make the hack 100% correct, but should fix most use cases in practice. An example of a regex that will still be incorrect is `foo\|bar$`, since the analysis isn't quite sophisticated enough to determine that a `\r` can be safely stripped from any match. Even if we fix that, regexes like `foo\r\|bar$` still won't be handled correctly. Alas, more work on this front should really be focused on enabling this in the regex engine itself. The specific cause of this bug was that grep-searcher was sneakily stripping CRLF from matching lines when it really shouldn't have. We remove that code now, and instead rely on better match semantics provided at a lower level. Fixes #1095 2019-01-26 12:25:21 -05:00			`if !r.is_some() {`
			`return Ok(false);`
			`}`

			// If the end of our match includes a `\r`, then strip it from all
			`// capture groups ending at the same location.`
			`let end = caps.locations().get(0).unwrap().1;`
			`if end > 0 && haystack.get(end - 1) == Some(&b'\r') {`
			`caps.strip_crlf(true);`
			`}`
			`Ok(true)`
			`}`

			`// We specifically do not implement other methods like find_iter or`
			`// captures_iter. Namely, the iter methods are guaranteed to be correct`
			`// by virtue of implementing find_at and captures_at above.`
			`}`

			/// If the given match ends with a `\r`, then return a new match that ends
			/// immediately before the `\r`.
			`pub fn adjust_match(haystack: &[u8], m: Match) -> Match {`
			`if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') {`
			`m.with_end(m.end() - 1)`
			`} else {`
			`m`
			`}`
			`}`

libripgrep: initial commit introducing libripgrep libripgrep is not any one library, but rather, a collection of libraries that roughly separate the following key distinct phases in a grep implementation: 1. Pattern matching (e.g., by a regex engine). 2. Searching a file using a pattern matcher. 3. Printing results. Ultimately, both (1) and (3) are defined by de-coupled interfaces, of which there may be multiple implementations. Namely, (1) is satisfied by the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by the `Sink` trait in the `grep2` crate. The searcher (2) ties everything together and finds results using a matcher and reports those results using a `Sink` implementation. Closes #162 2018-04-29 09:29:52 -04:00			/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`.
			`///`
			`/// This does not preserve the exact semantics of the given expression,`
			`/// however, it does have the useful property that anything that matched the`
			`/// given expression will also match the returned expression. The difference is`
			`/// that the returned expression can match possibly other things as well.`
			`///`
			`/// The principal reason why we do this is because the underlying regex engine`
			/// doesn't support CRLF aware `$` look-around. It's planned to fix it at that
			`/// level, but we perform this kludge in the mean time.`
			`///`
			`/// Note that while the match preserving semantics are nice and neat, the`
			/// match position semantics are quite a bit messier. Namely, `$` only ever
			/// matches the position between characters whereas `\r??` can match a
			`/// character and change the offset. This is regrettable, but works out pretty`
			`/// nicely in most cases, especially when a match is limited to a single line.`
			`pub fn crlfify(expr: Hir) -> Hir {`
			`match expr.into_kind() {`
			`HirKind::Anchor(hir::Anchor::EndLine) => {`
			`let concat = Hir::concat(vec![`
			`Hir::repetition(hir::Repetition {`
			`kind: hir::RepetitionKind::ZeroOrOne,`
			`greedy: false,`
			`hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))),`
			`}),`
			`Hir::anchor(hir::Anchor::EndLine),`
			`]);`
			`Hir::group(hir::Group {`
			`kind: hir::GroupKind::NonCapturing,`
			`hir: Box::new(concat),`
			`})`
			`}`
			`HirKind::Empty => Hir::empty(),`
			`HirKind::Literal(x) => Hir::literal(x),`
			`HirKind::Class(x) => Hir::class(x),`
			`HirKind::Anchor(x) => Hir::anchor(x),`
			`HirKind::WordBoundary(x) => Hir::word_boundary(x),`
			`HirKind::Repetition(mut x) => {`
			`x.hir = Box::new(crlfify(*x.hir));`
			`Hir::repetition(x)`
			`}`
			`HirKind::Group(mut x) => {`
			`x.hir = Box::new(crlfify(*x.hir));`
			`Hir::group(x)`
			`}`
			`HirKind::Concat(xs) => {`
			`Hir::concat(xs.into_iter().map(crlfify).collect())`
			`}`
			`HirKind::Alternation(xs) => {`
			`Hir::alternation(xs.into_iter().map(crlfify).collect())`
			`}`
			`}`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use regex_syntax::Parser;`
			`use super::crlfify;`

			`fn roundtrip(pattern: &str) -> String {`
			`let expr1 = Parser::new().parse(pattern).unwrap();`
			`let expr2 = crlfify(expr1);`
			`expr2.to_string()`
			`}`

			`#[test]`
			`fn various() {`
			`assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))");`
			`assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$))(?:\r??(?m:$))");`
			`assert_eq!(`
			`roundtrip(r"(?m)(?:foo$\|bar$)"),`
			`"(?:foo(?:\r??(?m:$))\|bar(?:\r??(?m:$)))"`
			`);`
			`assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$))a");`

			// Not a multiline `$`, so no crlfifying occurs.
			`assert_eq!(roundtrip(r"$"), "\\z");`
			`// It's a literal, derp.`
			`assert_eq!(roundtrip(r"\$"), "\\$");`
			`}`
			`}`