1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-04-19 09:02:15 +02:00

BREAKING: regex: finally remove CRLF hack

Now that Rust's regex crate finally supports a CRLF mode, we can remove
this giant hack in ripgrep to enable it. (And assuredly did not work in
all cases.)

The way this works in the regex engine is actually subtly different than
what ripgrep previously did. Namely, --crlf would previously treat
either \r\n or \n as a line terminator. But now it treats \r\n, \n and
\r as line terminators. In effect, it is implemented by treating \r and
\n as line terminators, but ^ and $ will never match at a position
between a \r and a \n.

So basically this means that $ will end up matching in more cases than
it might be intended too, but I don't expect this to be a big problem in
practice.

Note that passing --crlf to ripgrep and enabling CRLF mode in the regex
via the `R` inline flag (e.g., `(?R:$)`) are subtly different. The `R`
flag just controls the regex engine, but --crlf instructs all of ripgrep
to use \r\n as a line terminator. There are likely some inconsistencies
or corner cases that are wrong as a result of this cognitive dissonance,
but we choose to leave well enough alone for now.

Fixing this for real will probably require re-thinking how line
terminators are handled in ripgrep. For example, one "problem" with how
they're handled now is that ripgrep will re-insert its own line
terminators when printing output instead of copying the input. This is
maybe not so great and perhaps unexpected. (ripgrep probably can't get
away with not inserting any line terminators. Users probably expect
files that don't end with a line terminator whose last line matches to
have a line terminator inserted.)
This commit is contained in:
Andrew Gallant 2023-06-16 14:36:27 -04:00
parent e028ea3792
commit 9d62eb997a
4 changed files with 9 additions and 288 deletions

View File

@ -6,7 +6,7 @@ use {
};
use crate::{
ast::AstAnalysis, crlf::crlfify, error::Error, literal::LiteralSets,
ast::AstAnalysis, error::Error, literal::LiteralSets,
multi::alternation_literals, non_matching::non_matching_bytes,
strip::strip_from_match,
};
@ -75,6 +75,7 @@ impl Config {
.case_insensitive(self.is_case_insensitive(&analysis))
.multi_line(self.multi_line)
.dot_matches_new_line(self.dot_matches_new_line)
.crlf(self.crlf)
.swap_greed(self.swap_greed)
.unicode(self.unicode)
.build()
@ -88,8 +89,7 @@ impl Config {
original: pattern.to_string(),
config: self.clone(),
analysis,
// If CRLF mode is enabled, replace `$` with `(?:\r?$)`.
expr: if self.crlf { crlfify(expr) } else { expr },
expr,
})
}
@ -167,19 +167,6 @@ impl ConfiguredHIR {
non_matching_bytes(&self.expr)
}
/// Returns true if and only if this regex needs to have its match offsets
/// tweaked because of CRLF support. Specifically, this occurs when the
/// CRLF hack is enabled and the regex is line anchored at the end. In
/// this case, matches that end with a `\r` have the `\r` stripped.
pub fn needs_crlf_stripped(&self) -> bool {
self.config.crlf
&& self
.expr
.properties()
.look_set_suffix_any()
.contains(hir::Look::EndLF)
}
/// Returns the line terminator configured on this expression.
///
/// When we have beginning/end anchors (NOT line anchors), the fast line
@ -298,6 +285,7 @@ impl ConfiguredHIR {
.octal(self.config.octal)
.multi_line(self.config.multi_line)
.dot_matches_new_line(self.config.dot_matches_new_line)
.crlf(self.config.crlf)
.unicode(self.config.unicode);
let meta = Regex::config()
.utf8_empty(false)
@ -321,6 +309,7 @@ impl ConfiguredHIR {
.utf8(false)
.multi_line(self.config.multi_line)
.dot_matches_new_line(self.config.dot_matches_new_line)
.crlf(self.config.crlf)
.unicode(self.config.unicode)
.build()
.parse(pattern)

View File

@ -1,189 +0,0 @@
use std::collections::HashMap;
use {
grep_matcher::{Match, Matcher, NoError},
regex_automata::{meta::Regex, Input, PatternID},
regex_syntax::hir::{self, Hir, HirKind},
};
use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};
/// A matcher for implementing "word match" semantics.
#[derive(Clone, Debug)]
pub struct CRLFMatcher {
/// The regex.
regex: Regex,
/// The pattern string corresponding to the regex above.
pattern: String,
/// A map from capture group name to capture group index.
names: HashMap<String, usize>,
}
impl CRLFMatcher {
/// Create a new matcher from the given pattern that strips `\r` from the
/// end of every match.
///
/// This panics if the given expression doesn't need its CRLF stripped.
pub fn new(expr: &ConfiguredHIR) -> Result<CRLFMatcher, Error> {
assert!(expr.needs_crlf_stripped());
let regex = expr.regex()?;
let pattern = expr.pattern();
let mut names = HashMap::new();
let it = regex.group_info().pattern_names(PatternID::ZERO);
for (i, optional_name) in it.enumerate() {
if let Some(name) = optional_name {
names.insert(name.to_string(), i.checked_sub(1).unwrap());
}
}
Ok(CRLFMatcher { regex, pattern, names })
}
/// Return the underlying pattern string for the regex used by this
/// matcher.
pub fn pattern(&self) -> &str {
&self.pattern
}
}
impl Matcher for CRLFMatcher {
type Captures = RegexCaptures;
type Error = NoError;
fn find_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<Match>, NoError> {
let input = Input::new(haystack).span(at..haystack.len());
let m = match self.regex.find(input) {
None => return Ok(None),
Some(m) => Match::new(m.start(), m.end()),
};
Ok(Some(adjust_match(haystack, m)))
}
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
Ok(RegexCaptures::new(self.regex.create_captures()))
}
fn capture_count(&self) -> usize {
self.regex.captures_len().checked_sub(1).unwrap()
}
fn capture_index(&self, name: &str) -> Option<usize> {
self.names.get(name).map(|i| *i)
}
fn captures_at(
&self,
haystack: &[u8],
at: usize,
caps: &mut RegexCaptures,
) -> Result<bool, NoError> {
caps.strip_crlf(false);
let input = Input::new(haystack).span(at..haystack.len());
self.regex.search_captures(&input, caps.locations_mut());
if !caps.locations().is_match() {
return Ok(false);
}
// If the end of our match includes a `\r`, then strip it from all
// capture groups ending at the same location.
let end = caps.locations().get_match().unwrap().end();
if end > 0 && haystack.get(end - 1) == Some(&b'\r') {
caps.strip_crlf(true);
}
Ok(true)
}
// We specifically do not implement other methods like find_iter or
// captures_iter. Namely, the iter methods are guaranteed to be correct
// by virtue of implementing find_at and captures_at above.
}
/// If the given match ends with a `\r`, then return a new match that ends
/// immediately before the `\r`.
pub fn adjust_match(haystack: &[u8], m: Match) -> Match {
if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') {
m.with_end(m.end() - 1)
} else {
m
}
}
/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`.
///
/// This does not preserve the exact semantics of the given expression,
/// however, it does have the useful property that anything that matched the
/// given expression will also match the returned expression. The difference is
/// that the returned expression can match possibly other things as well.
///
/// The principle reason why we do this is because the underlying regex engine
/// doesn't support CRLF aware `$` look-around. It's planned to fix it at that
/// level, but we perform this kludge in the mean time.
///
/// Note that while the match preserving semantics are nice and neat, the
/// match position semantics are quite a bit messier. Namely, `$` only ever
/// matches the position between characters where as `\r??` can match a
/// character and change the offset. This is regretable, but works out pretty
/// nicely in most cases, especially when a match is limited to a single line.
pub fn crlfify(expr: Hir) -> Hir {
match expr.into_kind() {
HirKind::Look(hir::Look::EndLF) => Hir::concat(vec![
Hir::repetition(hir::Repetition {
min: 0,
max: Some(1),
greedy: false,
sub: Box::new(Hir::literal("\r".as_bytes())),
}),
Hir::look(hir::Look::EndLF),
]),
HirKind::Empty => Hir::empty(),
HirKind::Literal(hir::Literal(x)) => Hir::literal(x),
HirKind::Class(x) => Hir::class(x),
HirKind::Look(x) => Hir::look(x),
HirKind::Repetition(mut x) => {
x.sub = Box::new(crlfify(*x.sub));
Hir::repetition(x)
}
HirKind::Capture(mut x) => {
x.sub = Box::new(crlfify(*x.sub));
Hir::capture(x)
}
HirKind::Concat(xs) => {
Hir::concat(xs.into_iter().map(crlfify).collect())
}
HirKind::Alternation(xs) => {
Hir::alternation(xs.into_iter().map(crlfify).collect())
}
}
}
#[cfg(test)]
mod tests {
use super::crlfify;
use regex_syntax::Parser;
fn roundtrip(pattern: &str) -> String {
let expr1 = Parser::new().parse(pattern).unwrap();
let expr2 = crlfify(expr1);
expr2.to_string()
}
#[test]
fn various() {
assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))");
assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$)\r??(?m:$))");
assert_eq!(
roundtrip(r"(?m)(?:foo$|bar$)"),
"(?:(?:(?:foo)\r??(?m:$))|(?:(?:bar)\r??(?m:$)))"
);
assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$)a)");
// Not a multiline `$`, so no crlfifying occurs.
assert_eq!(roundtrip(r"$"), "\\z");
// It's a literal, derp.
assert_eq!(roundtrip(r"\$"), "\\$");
}
}

View File

@ -8,7 +8,6 @@ pub use crate::matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder};
mod ast;
mod config;
mod crlf;
mod error;
mod literal;
mod matcher;

View File

@ -11,7 +11,6 @@ use {
use crate::{
config::{Config, ConfiguredHIR},
crlf::CRLFMatcher,
error::Error,
multi::MultiLiteralMatcher,
word::WordMatcher,
@ -428,11 +427,6 @@ enum RegexMatcherImpl {
Standard(StandardMatcher),
/// A matcher for an alternation of plain literals.
MultiLiteral(MultiLiteralMatcher),
/// A matcher that strips `\r` from the end of matches.
///
/// This is only used when the CRLF hack is enabled and the regex is line
/// anchored at the end.
CRLF(CRLFMatcher),
/// A matcher that only matches at word boundaries. This transforms the
/// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
/// Because of this, the WordMatcher provides its own implementation of
@ -447,8 +441,6 @@ impl RegexMatcherImpl {
fn new(expr: &ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
if expr.config().word {
Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?))
} else if expr.needs_crlf_stripped() {
Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?))
} else {
if let Some(lits) = expr.alternation_literals() {
if lits.len() >= 40 {
@ -464,7 +456,6 @@ impl RegexMatcherImpl {
fn regex(&self) -> String {
match *self {
RegexMatcherImpl::Word(ref x) => x.pattern().to_string(),
RegexMatcherImpl::CRLF(ref x) => x.pattern().to_string(),
RegexMatcherImpl::MultiLiteral(_) => "<N/A>".to_string(),
RegexMatcherImpl::Standard(ref x) => x.pattern.clone(),
}
@ -487,7 +478,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.find_at(haystack, at),
MultiLiteral(ref m) => m.find_at(haystack, at),
CRLF(ref m) => m.find_at(haystack, at),
Word(ref m) => m.find_at(haystack, at),
}
}
@ -497,7 +487,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.new_captures(),
MultiLiteral(ref m) => m.new_captures(),
CRLF(ref m) => m.new_captures(),
Word(ref m) => m.new_captures(),
}
}
@ -507,7 +496,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.capture_count(),
MultiLiteral(ref m) => m.capture_count(),
CRLF(ref m) => m.capture_count(),
Word(ref m) => m.capture_count(),
}
}
@ -517,7 +505,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.capture_index(name),
MultiLiteral(ref m) => m.capture_index(name),
CRLF(ref m) => m.capture_index(name),
Word(ref m) => m.capture_index(name),
}
}
@ -527,7 +514,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.find(haystack),
MultiLiteral(ref m) => m.find(haystack),
CRLF(ref m) => m.find(haystack),
Word(ref m) => m.find(haystack),
}
}
@ -540,7 +526,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.find_iter(haystack, matched),
MultiLiteral(ref m) => m.find_iter(haystack, matched),
CRLF(ref m) => m.find_iter(haystack, matched),
Word(ref m) => m.find_iter(haystack, matched),
}
}
@ -557,7 +542,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.try_find_iter(haystack, matched),
MultiLiteral(ref m) => m.try_find_iter(haystack, matched),
CRLF(ref m) => m.try_find_iter(haystack, matched),
Word(ref m) => m.try_find_iter(haystack, matched),
}
}
@ -571,7 +555,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.captures(haystack, caps),
MultiLiteral(ref m) => m.captures(haystack, caps),
CRLF(ref m) => m.captures(haystack, caps),
Word(ref m) => m.captures(haystack, caps),
}
}
@ -589,7 +572,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.captures_iter(haystack, caps, matched),
MultiLiteral(ref m) => m.captures_iter(haystack, caps, matched),
CRLF(ref m) => m.captures_iter(haystack, caps, matched),
Word(ref m) => m.captures_iter(haystack, caps, matched),
}
}
@ -609,7 +591,6 @@ impl Matcher for RegexMatcher {
MultiLiteral(ref m) => {
m.try_captures_iter(haystack, caps, matched)
}
CRLF(ref m) => m.try_captures_iter(haystack, caps, matched),
Word(ref m) => m.try_captures_iter(haystack, caps, matched),
}
}
@ -624,7 +605,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.captures_at(haystack, at, caps),
MultiLiteral(ref m) => m.captures_at(haystack, at, caps),
CRLF(ref m) => m.captures_at(haystack, at, caps),
Word(ref m) => m.captures_at(haystack, at, caps),
}
}
@ -642,7 +622,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.replace(haystack, dst, append),
MultiLiteral(ref m) => m.replace(haystack, dst, append),
CRLF(ref m) => m.replace(haystack, dst, append),
Word(ref m) => m.replace(haystack, dst, append),
}
}
@ -665,9 +644,6 @@ impl Matcher for RegexMatcher {
MultiLiteral(ref m) => {
m.replace_with_captures(haystack, caps, dst, append)
}
CRLF(ref m) => {
m.replace_with_captures(haystack, caps, dst, append)
}
Word(ref m) => {
m.replace_with_captures(haystack, caps, dst, append)
}
@ -679,7 +655,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.is_match(haystack),
MultiLiteral(ref m) => m.is_match(haystack),
CRLF(ref m) => m.is_match(haystack),
Word(ref m) => m.is_match(haystack),
}
}
@ -693,7 +668,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.is_match_at(haystack, at),
MultiLiteral(ref m) => m.is_match_at(haystack, at),
CRLF(ref m) => m.is_match_at(haystack, at),
Word(ref m) => m.is_match_at(haystack, at),
}
}
@ -706,7 +680,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.shortest_match(haystack),
MultiLiteral(ref m) => m.shortest_match(haystack),
CRLF(ref m) => m.shortest_match(haystack),
Word(ref m) => m.shortest_match(haystack),
}
}
@ -720,7 +693,6 @@ impl Matcher for RegexMatcher {
match self.matcher {
Standard(ref m) => m.shortest_match_at(haystack, at),
MultiLiteral(ref m) => m.shortest_match_at(haystack, at),
CRLF(ref m) => m.shortest_match_at(haystack, at),
Word(ref m) => m.shortest_match_at(haystack, at),
}
}
@ -871,9 +843,6 @@ enum RegexCapturesImp {
/// the matcher and the capturing groups must behave as if `(re)` is
/// the `0`th capture group.
offset: usize,
/// When enable, the end of a match has `\r` stripped from it, if one
/// exists.
strip_crlf: bool,
},
}
@ -896,32 +865,9 @@ impl Captures for RegexCaptures {
None
}
}
RegexCapturesImp::Regex { ref locs, offset, strip_crlf } => {
if !strip_crlf {
let actual = i.checked_add(offset).unwrap();
return locs
.get_group(actual)
.map(|sp| Match::new(sp.start, sp.end));
}
// currently don't support capture offsetting with CRLF
// stripping
assert_eq!(offset, 0);
let m = match locs
.get_group(i)
.map(|sp| Match::new(sp.start, sp.end))
{
None => return None,
Some(m) => m,
};
// If the end position of this match corresponds to the end
// position of the overall match, then we apply our CRLF
// stripping. Otherwise, we cannot assume stripping is correct.
if i == 0 || m.end() == locs.get_group(0).unwrap().end {
Some(m.with_end(m.end() - 1))
} else {
Some(m)
}
RegexCapturesImp::Regex { ref locs, offset } => {
let actual = i.checked_add(offset).unwrap();
locs.get_group(actual).map(|sp| Match::new(sp.start, sp.end))
}
}
}
@ -940,20 +886,7 @@ impl RegexCaptures {
locs: AutomataCaptures,
offset: usize,
) -> RegexCaptures {
RegexCaptures(RegexCapturesImp::Regex {
locs,
offset,
strip_crlf: false,
})
}
pub(crate) fn locations(&self) -> &AutomataCaptures {
match self.0 {
RegexCapturesImp::AhoCorasick { .. } => {
panic!("getting locations for simple captures is invalid")
}
RegexCapturesImp::Regex { ref locs, .. } => locs,
}
RegexCaptures(RegexCapturesImp::Regex { locs, offset })
}
pub(crate) fn locations_mut(&mut self) -> &mut AutomataCaptures {
@ -965,17 +898,6 @@ impl RegexCaptures {
}
}
pub(crate) fn strip_crlf(&mut self, yes: bool) {
match self.0 {
RegexCapturesImp::AhoCorasick { .. } => {
panic!("setting strip_crlf for simple captures is invalid")
}
RegexCapturesImp::Regex { ref mut strip_crlf, .. } => {
*strip_crlf = yes;
}
}
}
pub(crate) fn set_simple(&mut self, one: Option<Match>) {
match self.0 {
RegexCapturesImp::AhoCorasick { ref mut mat } => {