mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2024-12-12 19:18:24 +02:00
regex: make CRLF hack more robust
This commit improves the CRLF hack to be more robust. In particular, in addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end of a match if and only if the regex has an ending line anchor required for a match. This doesn't quite make the hack 100% correct, but should fix most use cases in practice. An example of a regex that will still be incorrect is `foo|bar$`, since the analysis isn't quite sophisticated enough to determine that a `\r` can be safely stripped from any match. Even if we fix that, regexes like `foo\r|bar$` still won't be handled correctly. Alas, more work on this front should really be focused on enabling this in the regex engine itself. The specific cause of this bug was that grep-searcher was sneakily stripping CRLF from matching lines when it really shouldn't have. We remove that code now, and instead rely on better match semantics provided at a lower level. Fixes #1095
This commit is contained in:
parent
e99b6bda0e
commit
9d703110cf
@ -817,7 +817,8 @@ impl<'a> SubMatches<'a> {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use grep_regex::RegexMatcher;
|
use grep_regex::{RegexMatcher, RegexMatcherBuilder};
|
||||||
|
use grep_matcher::LineTerminator;
|
||||||
use grep_searcher::SearcherBuilder;
|
use grep_searcher::SearcherBuilder;
|
||||||
|
|
||||||
use super::{JSON, JSONBuilder};
|
use super::{JSON, JSONBuilder};
|
||||||
@ -918,4 +919,45 @@ and exhibited clearly, with a label attached.\
|
|||||||
assert_eq!(got.lines().count(), 2);
|
assert_eq!(got.lines().count(), 2);
|
||||||
assert!(got.contains("begin") && got.contains("end"));
|
assert!(got.contains("begin") && got.contains("end"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn missing_crlf() {
|
||||||
|
let haystack = "test\r\n".as_bytes();
|
||||||
|
|
||||||
|
let matcher = RegexMatcherBuilder::new()
|
||||||
|
.build("test")
|
||||||
|
.unwrap();
|
||||||
|
let mut printer = JSONBuilder::new()
|
||||||
|
.build(vec![]);
|
||||||
|
SearcherBuilder::new()
|
||||||
|
.build()
|
||||||
|
.search_reader(&matcher, haystack, printer.sink(&matcher))
|
||||||
|
.unwrap();
|
||||||
|
let got = printer_contents(&mut printer);
|
||||||
|
assert_eq!(got.lines().count(), 3);
|
||||||
|
assert!(
|
||||||
|
got.lines().nth(1).unwrap().contains(r"test\r\n"),
|
||||||
|
r"missing 'test\r\n' in '{}'",
|
||||||
|
got.lines().nth(1).unwrap(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let matcher = RegexMatcherBuilder::new()
|
||||||
|
.crlf(true)
|
||||||
|
.build("test")
|
||||||
|
.unwrap();
|
||||||
|
let mut printer = JSONBuilder::new()
|
||||||
|
.build(vec![]);
|
||||||
|
SearcherBuilder::new()
|
||||||
|
.line_terminator(LineTerminator::crlf())
|
||||||
|
.build()
|
||||||
|
.search_reader(&matcher, haystack, printer.sink(&matcher))
|
||||||
|
.unwrap();
|
||||||
|
let got = printer_contents(&mut printer);
|
||||||
|
assert_eq!(got.lines().count(), 3);
|
||||||
|
assert!(
|
||||||
|
got.lines().nth(1).unwrap().contains(r"test\r\n"),
|
||||||
|
r"missing 'test\r\n' in '{}'",
|
||||||
|
got.lines().nth(1).unwrap(),
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -16,6 +16,6 @@ license = "Unlicense/MIT"
|
|||||||
log = "0.4.5"
|
log = "0.4.5"
|
||||||
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
|
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
|
||||||
regex = "1.1"
|
regex = "1.1"
|
||||||
regex-syntax = "0.6.4"
|
regex-syntax = "0.6.5"
|
||||||
thread_local = "0.3.6"
|
thread_local = "0.3.6"
|
||||||
utf8-ranges = "1.0.1"
|
utf8-ranges = "1.0.1"
|
||||||
|
@ -160,6 +160,14 @@ impl ConfiguredHIR {
|
|||||||
non_matching_bytes(&self.expr)
|
non_matching_bytes(&self.expr)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns true if and only if this regex needs to have its match offsets
|
||||||
|
/// tweaked because of CRLF support. Specifically, this occurs when the
|
||||||
|
/// CRLF hack is enabled and the regex is line anchored at the end. In
|
||||||
|
/// this case, matches that end with a `\r` have the `\r` stripped.
|
||||||
|
pub fn needs_crlf_stripped(&self) -> bool {
|
||||||
|
self.config.crlf && self.expr.is_line_anchored_end()
|
||||||
|
}
|
||||||
|
|
||||||
/// Builds a regular expression from this HIR expression.
|
/// Builds a regular expression from this HIR expression.
|
||||||
pub fn regex(&self) -> Result<Regex, Error> {
|
pub fn regex(&self) -> Result<Regex, Error> {
|
||||||
self.pattern_to_regex(&self.expr.to_string())
|
self.pattern_to_regex(&self.expr.to_string())
|
||||||
|
@ -1,5 +1,105 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use grep_matcher::{Match, Matcher, NoError};
|
||||||
|
use regex::bytes::Regex;
|
||||||
use regex_syntax::hir::{self, Hir, HirKind};
|
use regex_syntax::hir::{self, Hir, HirKind};
|
||||||
|
|
||||||
|
use config::ConfiguredHIR;
|
||||||
|
use error::Error;
|
||||||
|
use matcher::RegexCaptures;
|
||||||
|
|
||||||
|
/// A matcher for implementing "word match" semantics.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct CRLFMatcher {
|
||||||
|
/// The regex.
|
||||||
|
regex: Regex,
|
||||||
|
/// A map from capture group name to capture group index.
|
||||||
|
names: HashMap<String, usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CRLFMatcher {
|
||||||
|
/// Create a new matcher from the given pattern that strips `\r` from the
|
||||||
|
/// end of every match.
|
||||||
|
///
|
||||||
|
/// This panics if the given expression doesn't need its CRLF stripped.
|
||||||
|
pub fn new(expr: &ConfiguredHIR) -> Result<CRLFMatcher, Error> {
|
||||||
|
assert!(expr.needs_crlf_stripped());
|
||||||
|
|
||||||
|
let regex = expr.regex()?;
|
||||||
|
let mut names = HashMap::new();
|
||||||
|
for (i, optional_name) in regex.capture_names().enumerate() {
|
||||||
|
if let Some(name) = optional_name {
|
||||||
|
names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(CRLFMatcher { regex, names })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Matcher for CRLFMatcher {
|
||||||
|
type Captures = RegexCaptures;
|
||||||
|
type Error = NoError;
|
||||||
|
|
||||||
|
fn find_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
) -> Result<Option<Match>, NoError> {
|
||||||
|
let m = match self.regex.find_at(haystack, at) {
|
||||||
|
None => return Ok(None),
|
||||||
|
Some(m) => Match::new(m.start(), m.end()),
|
||||||
|
};
|
||||||
|
Ok(Some(adjust_match(haystack, m)))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||||
|
Ok(RegexCaptures::new(self.regex.capture_locations()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_count(&self) -> usize {
|
||||||
|
self.regex.captures_len().checked_sub(1).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_index(&self, name: &str) -> Option<usize> {
|
||||||
|
self.names.get(name).map(|i| *i)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn captures_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
caps: &mut RegexCaptures,
|
||||||
|
) -> Result<bool, NoError> {
|
||||||
|
caps.strip_crlf(false);
|
||||||
|
let r = self.regex.captures_read_at(caps.locations(), haystack, at);
|
||||||
|
if !r.is_some() {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the end of our match includes a `\r`, then strip it from all
|
||||||
|
// capture groups ending at the same location.
|
||||||
|
let end = caps.locations().get(0).unwrap().1;
|
||||||
|
if end > 0 && haystack.get(end - 1) == Some(&b'\r') {
|
||||||
|
caps.strip_crlf(true);
|
||||||
|
}
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
// We specifically do not implement other methods like find_iter or
|
||||||
|
// captures_iter. Namely, the iter methods are guaranteed to be correct
|
||||||
|
// by virtue of implementing find_at and captures_at above.
|
||||||
|
}
|
||||||
|
|
||||||
|
/// If the given match ends with a `\r`, then return a new match that ends
|
||||||
|
/// immediately before the `\r`.
|
||||||
|
pub fn adjust_match(haystack: &[u8], m: Match) -> Match {
|
||||||
|
if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') {
|
||||||
|
m.with_end(m.end() - 1)
|
||||||
|
} else {
|
||||||
|
m
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`.
|
/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`.
|
||||||
///
|
///
|
||||||
/// This does not preserve the exact semantics of the given expression,
|
/// This does not preserve the exact semantics of the given expression,
|
||||||
|
@ -6,6 +6,7 @@ use grep_matcher::{
|
|||||||
use regex::bytes::{CaptureLocations, Regex};
|
use regex::bytes::{CaptureLocations, Regex};
|
||||||
|
|
||||||
use config::{Config, ConfiguredHIR};
|
use config::{Config, ConfiguredHIR};
|
||||||
|
use crlf::CRLFMatcher;
|
||||||
use error::Error;
|
use error::Error;
|
||||||
use word::WordMatcher;
|
use word::WordMatcher;
|
||||||
|
|
||||||
@ -344,6 +345,11 @@ impl RegexMatcher {
|
|||||||
enum RegexMatcherImpl {
|
enum RegexMatcherImpl {
|
||||||
/// The standard matcher used for all regular expressions.
|
/// The standard matcher used for all regular expressions.
|
||||||
Standard(StandardMatcher),
|
Standard(StandardMatcher),
|
||||||
|
/// A matcher that strips `\r` from the end of matches.
|
||||||
|
///
|
||||||
|
/// This is only used when the CRLF hack is enabled and the regex is line
|
||||||
|
/// anchored at the end.
|
||||||
|
CRLF(CRLFMatcher),
|
||||||
/// A matcher that only matches at word boundaries. This transforms the
|
/// A matcher that only matches at word boundaries. This transforms the
|
||||||
/// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
|
/// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
|
||||||
/// Because of this, the WordMatcher provides its own implementation of
|
/// Because of this, the WordMatcher provides its own implementation of
|
||||||
@ -358,6 +364,8 @@ impl RegexMatcherImpl {
|
|||||||
fn new(expr: &ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
|
fn new(expr: &ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
|
||||||
if expr.config().word {
|
if expr.config().word {
|
||||||
Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?))
|
Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?))
|
||||||
|
} else if expr.needs_crlf_stripped() {
|
||||||
|
Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?))
|
||||||
} else {
|
} else {
|
||||||
Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?))
|
Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?))
|
||||||
}
|
}
|
||||||
@ -379,6 +387,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.find_at(haystack, at),
|
Standard(ref m) => m.find_at(haystack, at),
|
||||||
|
CRLF(ref m) => m.find_at(haystack, at),
|
||||||
Word(ref m) => m.find_at(haystack, at),
|
Word(ref m) => m.find_at(haystack, at),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -387,6 +396,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.new_captures(),
|
Standard(ref m) => m.new_captures(),
|
||||||
|
CRLF(ref m) => m.new_captures(),
|
||||||
Word(ref m) => m.new_captures(),
|
Word(ref m) => m.new_captures(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -395,6 +405,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.capture_count(),
|
Standard(ref m) => m.capture_count(),
|
||||||
|
CRLF(ref m) => m.capture_count(),
|
||||||
Word(ref m) => m.capture_count(),
|
Word(ref m) => m.capture_count(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -403,6 +414,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.capture_index(name),
|
Standard(ref m) => m.capture_index(name),
|
||||||
|
CRLF(ref m) => m.capture_index(name),
|
||||||
Word(ref m) => m.capture_index(name),
|
Word(ref m) => m.capture_index(name),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -411,6 +423,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.find(haystack),
|
Standard(ref m) => m.find(haystack),
|
||||||
|
CRLF(ref m) => m.find(haystack),
|
||||||
Word(ref m) => m.find(haystack),
|
Word(ref m) => m.find(haystack),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -425,6 +438,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.find_iter(haystack, matched),
|
Standard(ref m) => m.find_iter(haystack, matched),
|
||||||
|
CRLF(ref m) => m.find_iter(haystack, matched),
|
||||||
Word(ref m) => m.find_iter(haystack, matched),
|
Word(ref m) => m.find_iter(haystack, matched),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -439,6 +453,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.try_find_iter(haystack, matched),
|
Standard(ref m) => m.try_find_iter(haystack, matched),
|
||||||
|
CRLF(ref m) => m.try_find_iter(haystack, matched),
|
||||||
Word(ref m) => m.try_find_iter(haystack, matched),
|
Word(ref m) => m.try_find_iter(haystack, matched),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -451,6 +466,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.captures(haystack, caps),
|
Standard(ref m) => m.captures(haystack, caps),
|
||||||
|
CRLF(ref m) => m.captures(haystack, caps),
|
||||||
Word(ref m) => m.captures(haystack, caps),
|
Word(ref m) => m.captures(haystack, caps),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -466,6 +482,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.captures_iter(haystack, caps, matched),
|
Standard(ref m) => m.captures_iter(haystack, caps, matched),
|
||||||
|
CRLF(ref m) => m.captures_iter(haystack, caps, matched),
|
||||||
Word(ref m) => m.captures_iter(haystack, caps, matched),
|
Word(ref m) => m.captures_iter(haystack, caps, matched),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -481,6 +498,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
|
Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||||
|
CRLF(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||||
Word(ref m) => m.try_captures_iter(haystack, caps, matched),
|
Word(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -494,6 +512,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.captures_at(haystack, at, caps),
|
Standard(ref m) => m.captures_at(haystack, at, caps),
|
||||||
|
CRLF(ref m) => m.captures_at(haystack, at, caps),
|
||||||
Word(ref m) => m.captures_at(haystack, at, caps),
|
Word(ref m) => m.captures_at(haystack, at, caps),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -509,6 +528,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.replace(haystack, dst, append),
|
Standard(ref m) => m.replace(haystack, dst, append),
|
||||||
|
CRLF(ref m) => m.replace(haystack, dst, append),
|
||||||
Word(ref m) => m.replace(haystack, dst, append),
|
Word(ref m) => m.replace(haystack, dst, append),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -527,6 +547,9 @@ impl Matcher for RegexMatcher {
|
|||||||
Standard(ref m) => {
|
Standard(ref m) => {
|
||||||
m.replace_with_captures(haystack, caps, dst, append)
|
m.replace_with_captures(haystack, caps, dst, append)
|
||||||
}
|
}
|
||||||
|
CRLF(ref m) => {
|
||||||
|
m.replace_with_captures(haystack, caps, dst, append)
|
||||||
|
}
|
||||||
Word(ref m) => {
|
Word(ref m) => {
|
||||||
m.replace_with_captures(haystack, caps, dst, append)
|
m.replace_with_captures(haystack, caps, dst, append)
|
||||||
}
|
}
|
||||||
@ -537,6 +560,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.is_match(haystack),
|
Standard(ref m) => m.is_match(haystack),
|
||||||
|
CRLF(ref m) => m.is_match(haystack),
|
||||||
Word(ref m) => m.is_match(haystack),
|
Word(ref m) => m.is_match(haystack),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -549,6 +573,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.is_match_at(haystack, at),
|
Standard(ref m) => m.is_match_at(haystack, at),
|
||||||
|
CRLF(ref m) => m.is_match_at(haystack, at),
|
||||||
Word(ref m) => m.is_match_at(haystack, at),
|
Word(ref m) => m.is_match_at(haystack, at),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -560,6 +585,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.shortest_match(haystack),
|
Standard(ref m) => m.shortest_match(haystack),
|
||||||
|
CRLF(ref m) => m.shortest_match(haystack),
|
||||||
Word(ref m) => m.shortest_match(haystack),
|
Word(ref m) => m.shortest_match(haystack),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -572,6 +598,7 @@ impl Matcher for RegexMatcher {
|
|||||||
use self::RegexMatcherImpl::*;
|
use self::RegexMatcherImpl::*;
|
||||||
match self.matcher {
|
match self.matcher {
|
||||||
Standard(ref m) => m.shortest_match_at(haystack, at),
|
Standard(ref m) => m.shortest_match_at(haystack, at),
|
||||||
|
CRLF(ref m) => m.shortest_match_at(haystack, at),
|
||||||
Word(ref m) => m.shortest_match_at(haystack, at),
|
Word(ref m) => m.shortest_match_at(haystack, at),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -712,6 +739,9 @@ pub struct RegexCaptures {
|
|||||||
/// and the capturing groups must behave as if `(re)` is the `0`th capture
|
/// and the capturing groups must behave as if `(re)` is the `0`th capture
|
||||||
/// group.
|
/// group.
|
||||||
offset: usize,
|
offset: usize,
|
||||||
|
/// When enable, the end of a match has `\r` stripped from it, if one
|
||||||
|
/// exists.
|
||||||
|
strip_crlf: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Captures for RegexCaptures {
|
impl Captures for RegexCaptures {
|
||||||
@ -720,8 +750,25 @@ impl Captures for RegexCaptures {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn get(&self, i: usize) -> Option<Match> {
|
fn get(&self, i: usize) -> Option<Match> {
|
||||||
let actual = i.checked_add(self.offset).unwrap();
|
if !self.strip_crlf {
|
||||||
self.locs.pos(actual).map(|(s, e)| Match::new(s, e))
|
let actual = i.checked_add(self.offset).unwrap();
|
||||||
|
return self.locs.pos(actual).map(|(s, e)| Match::new(s, e));
|
||||||
|
}
|
||||||
|
|
||||||
|
// currently don't support capture offsetting with CRLF stripping
|
||||||
|
assert_eq!(self.offset, 0);
|
||||||
|
let m = match self.locs.pos(i).map(|(s, e)| Match::new(s, e)) {
|
||||||
|
None => return None,
|
||||||
|
Some(m) => m,
|
||||||
|
};
|
||||||
|
// If the end position of this match corresponds to the end position
|
||||||
|
// of the overall match, then we apply our CRLF stripping. Otherwise,
|
||||||
|
// we cannot assume stripping is correct.
|
||||||
|
if i == 0 || m.end() == self.locs.pos(0).unwrap().1 {
|
||||||
|
Some(m.with_end(m.end() - 1))
|
||||||
|
} else {
|
||||||
|
Some(m)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -734,12 +781,16 @@ impl RegexCaptures {
|
|||||||
locs: CaptureLocations,
|
locs: CaptureLocations,
|
||||||
offset: usize,
|
offset: usize,
|
||||||
) -> RegexCaptures {
|
) -> RegexCaptures {
|
||||||
RegexCaptures { locs, offset }
|
RegexCaptures { locs, offset, strip_crlf: false }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn locations(&mut self) -> &mut CaptureLocations {
|
pub(crate) fn locations(&mut self) -> &mut CaptureLocations {
|
||||||
&mut self.locs
|
&mut self.locs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn strip_crlf(&mut self, yes: bool) {
|
||||||
|
self.strip_crlf = yes;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
@ -424,16 +424,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
|
|||||||
}
|
}
|
||||||
self.count_lines(buf, range.start());
|
self.count_lines(buf, range.start());
|
||||||
let offset = self.absolute_byte_offset + range.start() as u64;
|
let offset = self.absolute_byte_offset + range.start() as u64;
|
||||||
let linebuf =
|
let linebuf = &buf[*range];
|
||||||
if self.config.line_term.is_crlf() {
|
|
||||||
// Normally, a line terminator is never part of a match, but
|
|
||||||
// if the line terminator is CRLF, then it's possible for `\r`
|
|
||||||
// to end up in the match, which we generally don't want. So
|
|
||||||
// we strip it here.
|
|
||||||
lines::without_terminator(&buf[*range], self.config.line_term)
|
|
||||||
} else {
|
|
||||||
&buf[*range]
|
|
||||||
};
|
|
||||||
let keepgoing = self.sink.matched(
|
let keepgoing = self.sink.matched(
|
||||||
&self.searcher,
|
&self.searcher,
|
||||||
&SinkMatch {
|
&SinkMatch {
|
||||||
|
@ -153,7 +153,10 @@ rgtest!(basic, |dir: Dir, mut cmd: TestCommand| {
|
|||||||
msgs[1].unwrap_context(),
|
msgs[1].unwrap_context(),
|
||||||
Context {
|
Context {
|
||||||
path: Some(Data::text("sherlock")),
|
path: Some(Data::text("sherlock")),
|
||||||
lines: Data::text("Holmeses, success in the province of detective work must always\n"),
|
lines: Data::text(
|
||||||
|
"Holmeses, success in the province of \
|
||||||
|
detective work must always\n",
|
||||||
|
),
|
||||||
line_number: Some(2),
|
line_number: Some(2),
|
||||||
absolute_offset: 65,
|
absolute_offset: 65,
|
||||||
submatches: vec![],
|
submatches: vec![],
|
||||||
@ -163,7 +166,10 @@ rgtest!(basic, |dir: Dir, mut cmd: TestCommand| {
|
|||||||
msgs[2].unwrap_match(),
|
msgs[2].unwrap_match(),
|
||||||
Match {
|
Match {
|
||||||
path: Some(Data::text("sherlock")),
|
path: Some(Data::text("sherlock")),
|
||||||
lines: Data::text("be, to a very large extent, the result of luck. Sherlock Holmes\n"),
|
lines: Data::text(
|
||||||
|
"be, to a very large extent, the result of luck. \
|
||||||
|
Sherlock Holmes\n",
|
||||||
|
),
|
||||||
line_number: Some(3),
|
line_number: Some(3),
|
||||||
absolute_offset: 129,
|
absolute_offset: 129,
|
||||||
submatches: vec![
|
submatches: vec![
|
||||||
@ -212,7 +218,9 @@ rgtest!(notutf8, |dir: Dir, mut cmd: TestCommand| {
|
|||||||
let contents = &b"quux\xFFbaz"[..];
|
let contents = &b"quux\xFFbaz"[..];
|
||||||
|
|
||||||
// APFS does not support creating files with invalid UTF-8 bytes, so just
|
// APFS does not support creating files with invalid UTF-8 bytes, so just
|
||||||
// skip the test if we can't create our file.
|
// skip the test if we can't create our file. Presumably we don't need this
|
||||||
|
// check if we're already skipping it on macOS, but maybe other file
|
||||||
|
// systems won't like this test either?
|
||||||
if !dir.try_create_bytes(OsStr::from_bytes(name), contents).is_ok() {
|
if !dir.try_create_bytes(OsStr::from_bytes(name), contents).is_ok() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -305,3 +313,52 @@ rgtest!(crlf, |dir: Dir, mut cmd: TestCommand| {
|
|||||||
},
|
},
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// See: https://github.com/BurntSushi/ripgrep/issues/1095
|
||||||
|
//
|
||||||
|
// This test checks that we don't drop the \r\n in a matching line when --crlf
|
||||||
|
// mode is enabled.
|
||||||
|
rgtest!(r1095_missing_crlf, |dir: Dir, mut cmd: TestCommand| {
|
||||||
|
dir.create("foo", "test\r\n");
|
||||||
|
|
||||||
|
// Check without --crlf flag.
|
||||||
|
let msgs = json_decode(&cmd.arg("--json").arg("test").stdout());
|
||||||
|
assert_eq!(msgs.len(), 4);
|
||||||
|
assert_eq!(msgs[1].unwrap_match().lines, Data::text("test\r\n"));
|
||||||
|
|
||||||
|
// Now check with --crlf flag.
|
||||||
|
let msgs = json_decode(&cmd.arg("--crlf").stdout());
|
||||||
|
assert_eq!(msgs.len(), 4);
|
||||||
|
assert_eq!(msgs[1].unwrap_match().lines, Data::text("test\r\n"));
|
||||||
|
});
|
||||||
|
|
||||||
|
// See: https://github.com/BurntSushi/ripgrep/issues/1095
|
||||||
|
//
|
||||||
|
// This test checks that we don't return empty submatches when matching a `\n`
|
||||||
|
// in CRLF mode.
|
||||||
|
rgtest!(r1095_crlf_empty_match, |dir: Dir, mut cmd: TestCommand| {
|
||||||
|
dir.create("foo", "test\r\n\n");
|
||||||
|
|
||||||
|
// Check without --crlf flag.
|
||||||
|
let msgs = json_decode(&cmd.arg("-U").arg("--json").arg("\n").stdout());
|
||||||
|
assert_eq!(msgs.len(), 5);
|
||||||
|
|
||||||
|
let m = msgs[1].unwrap_match();
|
||||||
|
assert_eq!(m.lines, Data::text("test\r\n"));
|
||||||
|
assert_eq!(m.submatches[0].m, Data::text("\n"));
|
||||||
|
|
||||||
|
let m = msgs[2].unwrap_match();
|
||||||
|
assert_eq!(m.lines, Data::text("\n"));
|
||||||
|
assert_eq!(m.submatches[0].m, Data::text("\n"));
|
||||||
|
|
||||||
|
// Now check with --crlf flag.
|
||||||
|
let msgs = json_decode(&cmd.arg("--crlf").stdout());
|
||||||
|
|
||||||
|
let m = msgs[1].unwrap_match();
|
||||||
|
assert_eq!(m.lines, Data::text("test\r\n"));
|
||||||
|
assert_eq!(m.submatches[0].m, Data::text("\n"));
|
||||||
|
|
||||||
|
let m = msgs[2].unwrap_match();
|
||||||
|
assert_eq!(m.lines, Data::text("\n"));
|
||||||
|
assert_eq!(m.submatches[0].m, Data::text("\n"));
|
||||||
|
});
|
||||||
|
Loading…
Reference in New Issue
Block a user