diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs index ed1c2f86..8d785b6d 100644 --- a/crates/regex/src/config.rs +++ b/crates/regex/src/config.rs @@ -6,7 +6,7 @@ use { }; use crate::{ - ast::AstAnalysis, crlf::crlfify, error::Error, literal::LiteralSets, + ast::AstAnalysis, error::Error, literal::LiteralSets, multi::alternation_literals, non_matching::non_matching_bytes, strip::strip_from_match, }; @@ -75,6 +75,7 @@ impl Config { .case_insensitive(self.is_case_insensitive(&analysis)) .multi_line(self.multi_line) .dot_matches_new_line(self.dot_matches_new_line) + .crlf(self.crlf) .swap_greed(self.swap_greed) .unicode(self.unicode) .build() @@ -88,8 +89,7 @@ impl Config { original: pattern.to_string(), config: self.clone(), analysis, - // If CRLF mode is enabled, replace `$` with `(?:\r?$)`. - expr: if self.crlf { crlfify(expr) } else { expr }, + expr, }) } @@ -167,19 +167,6 @@ impl ConfiguredHIR { non_matching_bytes(&self.expr) } - /// Returns true if and only if this regex needs to have its match offsets - /// tweaked because of CRLF support. Specifically, this occurs when the - /// CRLF hack is enabled and the regex is line anchored at the end. In - /// this case, matches that end with a `\r` have the `\r` stripped. - pub fn needs_crlf_stripped(&self) -> bool { - self.config.crlf - && self - .expr - .properties() - .look_set_suffix_any() - .contains(hir::Look::EndLF) - } - /// Returns the line terminator configured on this expression. /// /// When we have beginning/end anchors (NOT line anchors), the fast line @@ -298,6 +285,7 @@ impl ConfiguredHIR { .octal(self.config.octal) .multi_line(self.config.multi_line) .dot_matches_new_line(self.config.dot_matches_new_line) + .crlf(self.config.crlf) .unicode(self.config.unicode); let meta = Regex::config() .utf8_empty(false) @@ -321,6 +309,7 @@ impl ConfiguredHIR { .utf8(false) .multi_line(self.config.multi_line) .dot_matches_new_line(self.config.dot_matches_new_line) + .crlf(self.config.crlf) .unicode(self.config.unicode) .build() .parse(pattern) diff --git a/crates/regex/src/crlf.rs b/crates/regex/src/crlf.rs deleted file mode 100644 index e32204ef..00000000 --- a/crates/regex/src/crlf.rs +++ /dev/null @@ -1,189 +0,0 @@ -use std::collections::HashMap; - -use { - grep_matcher::{Match, Matcher, NoError}, - regex_automata::{meta::Regex, Input, PatternID}, - regex_syntax::hir::{self, Hir, HirKind}, -}; - -use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures}; - -/// A matcher for implementing "word match" semantics. -#[derive(Clone, Debug)] -pub struct CRLFMatcher { - /// The regex. - regex: Regex, - /// The pattern string corresponding to the regex above. - pattern: String, - /// A map from capture group name to capture group index. - names: HashMap, -} - -impl CRLFMatcher { - /// Create a new matcher from the given pattern that strips `\r` from the - /// end of every match. - /// - /// This panics if the given expression doesn't need its CRLF stripped. - pub fn new(expr: &ConfiguredHIR) -> Result { - assert!(expr.needs_crlf_stripped()); - - let regex = expr.regex()?; - let pattern = expr.pattern(); - let mut names = HashMap::new(); - let it = regex.group_info().pattern_names(PatternID::ZERO); - for (i, optional_name) in it.enumerate() { - if let Some(name) = optional_name { - names.insert(name.to_string(), i.checked_sub(1).unwrap()); - } - } - Ok(CRLFMatcher { regex, pattern, names }) - } - - /// Return the underlying pattern string for the regex used by this - /// matcher. - pub fn pattern(&self) -> &str { - &self.pattern - } -} - -impl Matcher for CRLFMatcher { - type Captures = RegexCaptures; - type Error = NoError; - - fn find_at( - &self, - haystack: &[u8], - at: usize, - ) -> Result, NoError> { - let input = Input::new(haystack).span(at..haystack.len()); - let m = match self.regex.find(input) { - None => return Ok(None), - Some(m) => Match::new(m.start(), m.end()), - }; - Ok(Some(adjust_match(haystack, m))) - } - - fn new_captures(&self) -> Result { - Ok(RegexCaptures::new(self.regex.create_captures())) - } - - fn capture_count(&self) -> usize { - self.regex.captures_len().checked_sub(1).unwrap() - } - - fn capture_index(&self, name: &str) -> Option { - self.names.get(name).map(|i| *i) - } - - fn captures_at( - &self, - haystack: &[u8], - at: usize, - caps: &mut RegexCaptures, - ) -> Result { - caps.strip_crlf(false); - let input = Input::new(haystack).span(at..haystack.len()); - self.regex.search_captures(&input, caps.locations_mut()); - if !caps.locations().is_match() { - return Ok(false); - } - - // If the end of our match includes a `\r`, then strip it from all - // capture groups ending at the same location. - let end = caps.locations().get_match().unwrap().end(); - if end > 0 && haystack.get(end - 1) == Some(&b'\r') { - caps.strip_crlf(true); - } - Ok(true) - } - - // We specifically do not implement other methods like find_iter or - // captures_iter. Namely, the iter methods are guaranteed to be correct - // by virtue of implementing find_at and captures_at above. -} - -/// If the given match ends with a `\r`, then return a new match that ends -/// immediately before the `\r`. -pub fn adjust_match(haystack: &[u8], m: Match) -> Match { - if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') { - m.with_end(m.end() - 1) - } else { - m - } -} - -/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`. -/// -/// This does not preserve the exact semantics of the given expression, -/// however, it does have the useful property that anything that matched the -/// given expression will also match the returned expression. The difference is -/// that the returned expression can match possibly other things as well. -/// -/// The principle reason why we do this is because the underlying regex engine -/// doesn't support CRLF aware `$` look-around. It's planned to fix it at that -/// level, but we perform this kludge in the mean time. -/// -/// Note that while the match preserving semantics are nice and neat, the -/// match position semantics are quite a bit messier. Namely, `$` only ever -/// matches the position between characters where as `\r??` can match a -/// character and change the offset. This is regretable, but works out pretty -/// nicely in most cases, especially when a match is limited to a single line. -pub fn crlfify(expr: Hir) -> Hir { - match expr.into_kind() { - HirKind::Look(hir::Look::EndLF) => Hir::concat(vec![ - Hir::repetition(hir::Repetition { - min: 0, - max: Some(1), - greedy: false, - sub: Box::new(Hir::literal("\r".as_bytes())), - }), - Hir::look(hir::Look::EndLF), - ]), - HirKind::Empty => Hir::empty(), - HirKind::Literal(hir::Literal(x)) => Hir::literal(x), - HirKind::Class(x) => Hir::class(x), - HirKind::Look(x) => Hir::look(x), - HirKind::Repetition(mut x) => { - x.sub = Box::new(crlfify(*x.sub)); - Hir::repetition(x) - } - HirKind::Capture(mut x) => { - x.sub = Box::new(crlfify(*x.sub)); - Hir::capture(x) - } - HirKind::Concat(xs) => { - Hir::concat(xs.into_iter().map(crlfify).collect()) - } - HirKind::Alternation(xs) => { - Hir::alternation(xs.into_iter().map(crlfify).collect()) - } - } -} - -#[cfg(test)] -mod tests { - use super::crlfify; - use regex_syntax::Parser; - - fn roundtrip(pattern: &str) -> String { - let expr1 = Parser::new().parse(pattern).unwrap(); - let expr2 = crlfify(expr1); - expr2.to_string() - } - - #[test] - fn various() { - assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))"); - assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$)\r??(?m:$))"); - assert_eq!( - roundtrip(r"(?m)(?:foo$|bar$)"), - "(?:(?:(?:foo)\r??(?m:$))|(?:(?:bar)\r??(?m:$)))" - ); - assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$)a)"); - - // Not a multiline `$`, so no crlfifying occurs. - assert_eq!(roundtrip(r"$"), "\\z"); - // It's a literal, derp. - assert_eq!(roundtrip(r"\$"), "\\$"); - } -} diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs index e83b4361..35a3258d 100644 --- a/crates/regex/src/lib.rs +++ b/crates/regex/src/lib.rs @@ -8,7 +8,6 @@ pub use crate::matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder}; mod ast; mod config; -mod crlf; mod error; mod literal; mod matcher; diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs index a32f4f31..a79d14bb 100644 --- a/crates/regex/src/matcher.rs +++ b/crates/regex/src/matcher.rs @@ -11,7 +11,6 @@ use { use crate::{ config::{Config, ConfiguredHIR}, - crlf::CRLFMatcher, error::Error, multi::MultiLiteralMatcher, word::WordMatcher, @@ -428,11 +427,6 @@ enum RegexMatcherImpl { Standard(StandardMatcher), /// A matcher for an alternation of plain literals. MultiLiteral(MultiLiteralMatcher), - /// A matcher that strips `\r` from the end of matches. - /// - /// This is only used when the CRLF hack is enabled and the regex is line - /// anchored at the end. - CRLF(CRLFMatcher), /// A matcher that only matches at word boundaries. This transforms the /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`. /// Because of this, the WordMatcher provides its own implementation of @@ -447,8 +441,6 @@ impl RegexMatcherImpl { fn new(expr: &ConfiguredHIR) -> Result { if expr.config().word { Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?)) - } else if expr.needs_crlf_stripped() { - Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?)) } else { if let Some(lits) = expr.alternation_literals() { if lits.len() >= 40 { @@ -464,7 +456,6 @@ impl RegexMatcherImpl { fn regex(&self) -> String { match *self { RegexMatcherImpl::Word(ref x) => x.pattern().to_string(), - RegexMatcherImpl::CRLF(ref x) => x.pattern().to_string(), RegexMatcherImpl::MultiLiteral(_) => "".to_string(), RegexMatcherImpl::Standard(ref x) => x.pattern.clone(), } @@ -487,7 +478,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.find_at(haystack, at), MultiLiteral(ref m) => m.find_at(haystack, at), - CRLF(ref m) => m.find_at(haystack, at), Word(ref m) => m.find_at(haystack, at), } } @@ -497,7 +487,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.new_captures(), MultiLiteral(ref m) => m.new_captures(), - CRLF(ref m) => m.new_captures(), Word(ref m) => m.new_captures(), } } @@ -507,7 +496,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.capture_count(), MultiLiteral(ref m) => m.capture_count(), - CRLF(ref m) => m.capture_count(), Word(ref m) => m.capture_count(), } } @@ -517,7 +505,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.capture_index(name), MultiLiteral(ref m) => m.capture_index(name), - CRLF(ref m) => m.capture_index(name), Word(ref m) => m.capture_index(name), } } @@ -527,7 +514,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.find(haystack), MultiLiteral(ref m) => m.find(haystack), - CRLF(ref m) => m.find(haystack), Word(ref m) => m.find(haystack), } } @@ -540,7 +526,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.find_iter(haystack, matched), MultiLiteral(ref m) => m.find_iter(haystack, matched), - CRLF(ref m) => m.find_iter(haystack, matched), Word(ref m) => m.find_iter(haystack, matched), } } @@ -557,7 +542,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.try_find_iter(haystack, matched), MultiLiteral(ref m) => m.try_find_iter(haystack, matched), - CRLF(ref m) => m.try_find_iter(haystack, matched), Word(ref m) => m.try_find_iter(haystack, matched), } } @@ -571,7 +555,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.captures(haystack, caps), MultiLiteral(ref m) => m.captures(haystack, caps), - CRLF(ref m) => m.captures(haystack, caps), Word(ref m) => m.captures(haystack, caps), } } @@ -589,7 +572,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.captures_iter(haystack, caps, matched), MultiLiteral(ref m) => m.captures_iter(haystack, caps, matched), - CRLF(ref m) => m.captures_iter(haystack, caps, matched), Word(ref m) => m.captures_iter(haystack, caps, matched), } } @@ -609,7 +591,6 @@ impl Matcher for RegexMatcher { MultiLiteral(ref m) => { m.try_captures_iter(haystack, caps, matched) } - CRLF(ref m) => m.try_captures_iter(haystack, caps, matched), Word(ref m) => m.try_captures_iter(haystack, caps, matched), } } @@ -624,7 +605,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.captures_at(haystack, at, caps), MultiLiteral(ref m) => m.captures_at(haystack, at, caps), - CRLF(ref m) => m.captures_at(haystack, at, caps), Word(ref m) => m.captures_at(haystack, at, caps), } } @@ -642,7 +622,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.replace(haystack, dst, append), MultiLiteral(ref m) => m.replace(haystack, dst, append), - CRLF(ref m) => m.replace(haystack, dst, append), Word(ref m) => m.replace(haystack, dst, append), } } @@ -665,9 +644,6 @@ impl Matcher for RegexMatcher { MultiLiteral(ref m) => { m.replace_with_captures(haystack, caps, dst, append) } - CRLF(ref m) => { - m.replace_with_captures(haystack, caps, dst, append) - } Word(ref m) => { m.replace_with_captures(haystack, caps, dst, append) } @@ -679,7 +655,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.is_match(haystack), MultiLiteral(ref m) => m.is_match(haystack), - CRLF(ref m) => m.is_match(haystack), Word(ref m) => m.is_match(haystack), } } @@ -693,7 +668,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.is_match_at(haystack, at), MultiLiteral(ref m) => m.is_match_at(haystack, at), - CRLF(ref m) => m.is_match_at(haystack, at), Word(ref m) => m.is_match_at(haystack, at), } } @@ -706,7 +680,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.shortest_match(haystack), MultiLiteral(ref m) => m.shortest_match(haystack), - CRLF(ref m) => m.shortest_match(haystack), Word(ref m) => m.shortest_match(haystack), } } @@ -720,7 +693,6 @@ impl Matcher for RegexMatcher { match self.matcher { Standard(ref m) => m.shortest_match_at(haystack, at), MultiLiteral(ref m) => m.shortest_match_at(haystack, at), - CRLF(ref m) => m.shortest_match_at(haystack, at), Word(ref m) => m.shortest_match_at(haystack, at), } } @@ -871,9 +843,6 @@ enum RegexCapturesImp { /// the matcher and the capturing groups must behave as if `(re)` is /// the `0`th capture group. offset: usize, - /// When enable, the end of a match has `\r` stripped from it, if one - /// exists. - strip_crlf: bool, }, } @@ -896,32 +865,9 @@ impl Captures for RegexCaptures { None } } - RegexCapturesImp::Regex { ref locs, offset, strip_crlf } => { - if !strip_crlf { - let actual = i.checked_add(offset).unwrap(); - return locs - .get_group(actual) - .map(|sp| Match::new(sp.start, sp.end)); - } - - // currently don't support capture offsetting with CRLF - // stripping - assert_eq!(offset, 0); - let m = match locs - .get_group(i) - .map(|sp| Match::new(sp.start, sp.end)) - { - None => return None, - Some(m) => m, - }; - // If the end position of this match corresponds to the end - // position of the overall match, then we apply our CRLF - // stripping. Otherwise, we cannot assume stripping is correct. - if i == 0 || m.end() == locs.get_group(0).unwrap().end { - Some(m.with_end(m.end() - 1)) - } else { - Some(m) - } + RegexCapturesImp::Regex { ref locs, offset } => { + let actual = i.checked_add(offset).unwrap(); + locs.get_group(actual).map(|sp| Match::new(sp.start, sp.end)) } } } @@ -940,20 +886,7 @@ impl RegexCaptures { locs: AutomataCaptures, offset: usize, ) -> RegexCaptures { - RegexCaptures(RegexCapturesImp::Regex { - locs, - offset, - strip_crlf: false, - }) - } - - pub(crate) fn locations(&self) -> &AutomataCaptures { - match self.0 { - RegexCapturesImp::AhoCorasick { .. } => { - panic!("getting locations for simple captures is invalid") - } - RegexCapturesImp::Regex { ref locs, .. } => locs, - } + RegexCaptures(RegexCapturesImp::Regex { locs, offset }) } pub(crate) fn locations_mut(&mut self) -> &mut AutomataCaptures { @@ -965,17 +898,6 @@ impl RegexCaptures { } } - pub(crate) fn strip_crlf(&mut self, yes: bool) { - match self.0 { - RegexCapturesImp::AhoCorasick { .. } => { - panic!("setting strip_crlf for simple captures is invalid") - } - RegexCapturesImp::Regex { ref mut strip_crlf, .. } => { - *strip_crlf = yes; - } - } - } - pub(crate) fn set_simple(&mut self, one: Option) { match self.0 { RegexCapturesImp::AhoCorasick { ref mut mat } => {