use std::collections::HashMap; use grep_matcher::{Match, Matcher, NoError}; use regex::bytes::Regex; use regex_syntax::hir::{self, Hir, HirKind}; use crate::config::ConfiguredHIR; use crate::error::Error; use crate::matcher::RegexCaptures; /// A matcher for implementing "word match" semantics. #[derive(Clone, Debug)] pub struct CRLFMatcher { /// The regex. regex: Regex, /// A map from capture group name to capture group index. names: HashMap, } impl CRLFMatcher { /// Create a new matcher from the given pattern that strips `\r` from the /// end of every match. /// /// This panics if the given expression doesn't need its CRLF stripped. pub fn new(expr: &ConfiguredHIR) -> Result { assert!(expr.needs_crlf_stripped()); let regex = expr.regex()?; let mut names = HashMap::new(); for (i, optional_name) in regex.capture_names().enumerate() { if let Some(name) = optional_name { names.insert(name.to_string(), i.checked_sub(1).unwrap()); } } Ok(CRLFMatcher { regex, names }) } /// Return the underlying regex used by this matcher. pub fn regex(&self) -> &Regex { &self.regex } } impl Matcher for CRLFMatcher { type Captures = RegexCaptures; type Error = NoError; fn find_at( &self, haystack: &[u8], at: usize, ) -> Result, NoError> { let m = match self.regex.find_at(haystack, at) { None => return Ok(None), Some(m) => Match::new(m.start(), m.end()), }; Ok(Some(adjust_match(haystack, m))) } fn new_captures(&self) -> Result { Ok(RegexCaptures::new(self.regex.capture_locations())) } fn capture_count(&self) -> usize { self.regex.captures_len().checked_sub(1).unwrap() } fn capture_index(&self, name: &str) -> Option { self.names.get(name).map(|i| *i) } fn captures_at( &self, haystack: &[u8], at: usize, caps: &mut RegexCaptures, ) -> Result { caps.strip_crlf(false); let r = self.regex.captures_read_at(caps.locations_mut(), haystack, at); if !r.is_some() { return Ok(false); } // If the end of our match includes a `\r`, then strip it from all // capture groups ending at the same location. let end = caps.locations().get(0).unwrap().1; if end > 0 && haystack.get(end - 1) == Some(&b'\r') { caps.strip_crlf(true); } Ok(true) } // We specifically do not implement other methods like find_iter or // captures_iter. Namely, the iter methods are guaranteed to be correct // by virtue of implementing find_at and captures_at above. } /// If the given match ends with a `\r`, then return a new match that ends /// immediately before the `\r`. pub fn adjust_match(haystack: &[u8], m: Match) -> Match { if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') { m.with_end(m.end() - 1) } else { m } } /// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`. /// /// This does not preserve the exact semantics of the given expression, /// however, it does have the useful property that anything that matched the /// given expression will also match the returned expression. The difference is /// that the returned expression can match possibly other things as well. /// /// The principle reason why we do this is because the underlying regex engine /// doesn't support CRLF aware `$` look-around. It's planned to fix it at that /// level, but we perform this kludge in the mean time. /// /// Note that while the match preserving semantics are nice and neat, the /// match position semantics are quite a bit messier. Namely, `$` only ever /// matches the position between characters where as `\r??` can match a /// character and change the offset. This is regretable, but works out pretty /// nicely in most cases, especially when a match is limited to a single line. pub fn crlfify(expr: Hir) -> Hir { match expr.into_kind() { HirKind::Anchor(hir::Anchor::EndLine) => { let concat = Hir::concat(vec![ Hir::repetition(hir::Repetition { kind: hir::RepetitionKind::ZeroOrOne, greedy: false, hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))), }), Hir::anchor(hir::Anchor::EndLine), ]); Hir::group(hir::Group { kind: hir::GroupKind::NonCapturing, hir: Box::new(concat), }) } HirKind::Empty => Hir::empty(), HirKind::Literal(x) => Hir::literal(x), HirKind::Class(x) => Hir::class(x), HirKind::Anchor(x) => Hir::anchor(x), HirKind::WordBoundary(x) => Hir::word_boundary(x), HirKind::Repetition(mut x) => { x.hir = Box::new(crlfify(*x.hir)); Hir::repetition(x) } HirKind::Group(mut x) => { x.hir = Box::new(crlfify(*x.hir)); Hir::group(x) } HirKind::Concat(xs) => { Hir::concat(xs.into_iter().map(crlfify).collect()) } HirKind::Alternation(xs) => { Hir::alternation(xs.into_iter().map(crlfify).collect()) } } } #[cfg(test)] mod tests { use super::crlfify; use regex_syntax::Parser; fn roundtrip(pattern: &str) -> String { let expr1 = Parser::new().parse(pattern).unwrap(); let expr2 = crlfify(expr1); expr2.to_string() } #[test] fn various() { assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))"); assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$))(?:\r??(?m:$))"); assert_eq!( roundtrip(r"(?m)(?:foo$|bar$)"), "(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))" ); assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$))a"); // Not a multiline `$`, so no crlfifying occurs. assert_eq!(roundtrip(r"$"), "\\z"); // It's a literal, derp. assert_eq!(roundtrip(r"\$"), "\\$"); } }