mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2024-12-12 19:18:24 +02:00
656aa12649
This commit fixes a subtle bug in multi-line replacement of line terminators. The problem is that even though ripgrep supports multi-line searches, it is *still* line oriented. It still needs to print line numbers, for example. For this reason, there are various parts in the printer that iterate over lines in order to format them into the desired output. This turns out to be problematic in some cases. #1311 documents one of those cases (with line numbers enabled to highlight a point later): $ printf "hello\nworld\n" | rg -n -U "\n" -r "?" 1:hello? 2:world? But the desired output is this: $ printf "hello\nworld\n" | rg -n -U "\n" -r "?" 1:hello?world? At first I had thought that the main problem was that the printer was taking ownership of writing line terminators, even if the input already had them. But it's more subtle than that. If we fix that issue, we get output like this instead: $ printf "hello\nworld\n" | rg -n -U "\n" -r "?" 1:hello?2:world? Notice how '2:' is printed before 'world?'. The reason it works this way is because matches are reported to the printer in a line oriented way. That is, the printer gets a block of lines. The searcher guarantees that all matches that start or end in any of those lines also end or start in another line in that same block. As a result, the printer uses this assumption: once it has processed a block of lines, the next match will begin on a new and distinct line. Thus, things like '2:' are printed. This is generally all fine and good, but an impedance mismatch arises when replacements are used. Because now, the replacement can be used to change the "block of lines" approach. Now, in terms of the output, the subsequent match might actually continue the current line since the replacement might get rid of the concept of lines altogether. We can sometimes work around this. For example: $ printf "hello\nworld\n" | rg -U "\n(.)?" -r '?$1' hello?world? Why does this work? 
It's because the '(.)' after the '\n' causes the match to overlap between lines. Thus, the searcher guarantees that the block sent to the printer contains every line. And therein lay the solution: all we need to do is tweak the multi-line searcher so that it combines lines with matches that are directly adjacent, instead of requiring at least one byte of overlap. Fixing that solves the issue above. It does cause some tests to fail: * The binary3 test in the searcher crate fails because adjacent line matches are now part of one block, and that block is scanned for binary data. To preserve the essence of the test, we insert a couple of dummy lines to split up the blocks. * The JSON CRLF test. It was testing that we didn't output any messages with an empty 'submatches' array. That is indeed still the case. The difference is that the messages got combined because of the adjacent line merging behavior. This is a slight change to the output, but is still correct. Fixes #1311
342 lines
9.3 KiB
Rust
342 lines
9.3 KiB
Rust
use std::time;
|
|
|
|
use serde_derive::Deserialize;
|
|
use serde_json as json;
|
|
|
|
use crate::hay::{SHERLOCK, SHERLOCK_CRLF};
|
|
use crate::util::{Dir, TestCommand};
|
|
|
|
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
|
|
#[serde(tag = "type", content = "data")]
|
|
#[serde(rename_all = "snake_case")]
|
|
enum Message {
|
|
Begin(Begin),
|
|
End(End),
|
|
Match(Match),
|
|
Context(Context),
|
|
Summary(Summary),
|
|
}
|
|
|
|
impl Message {
|
|
fn unwrap_begin(&self) -> Begin {
|
|
match *self {
|
|
Message::Begin(ref x) => x.clone(),
|
|
ref x => panic!("expected Message::Begin but got {:?}", x),
|
|
}
|
|
}
|
|
|
|
fn unwrap_end(&self) -> End {
|
|
match *self {
|
|
Message::End(ref x) => x.clone(),
|
|
ref x => panic!("expected Message::End but got {:?}", x),
|
|
}
|
|
}
|
|
|
|
fn unwrap_match(&self) -> Match {
|
|
match *self {
|
|
Message::Match(ref x) => x.clone(),
|
|
ref x => panic!("expected Message::Match but got {:?}", x),
|
|
}
|
|
}
|
|
|
|
fn unwrap_context(&self) -> Context {
|
|
match *self {
|
|
Message::Context(ref x) => x.clone(),
|
|
ref x => panic!("expected Message::Context but got {:?}", x),
|
|
}
|
|
}
|
|
|
|
fn unwrap_summary(&self) -> Summary {
|
|
match *self {
|
|
Message::Summary(ref x) => x.clone(),
|
|
ref x => panic!("expected Message::Summary but got {:?}", x),
|
|
}
|
|
}
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
|
|
struct Begin {
|
|
path: Option<Data>,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
|
|
struct End {
|
|
path: Option<Data>,
|
|
binary_offset: Option<u64>,
|
|
stats: Stats,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
|
|
struct Summary {
|
|
elapsed_total: Duration,
|
|
stats: Stats,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
|
|
struct Match {
|
|
path: Option<Data>,
|
|
lines: Data,
|
|
line_number: Option<u64>,
|
|
absolute_offset: u64,
|
|
submatches: Vec<SubMatch>,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
|
|
struct Context {
|
|
path: Option<Data>,
|
|
lines: Data,
|
|
line_number: Option<u64>,
|
|
absolute_offset: u64,
|
|
submatches: Vec<SubMatch>,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
|
|
struct SubMatch {
|
|
#[serde(rename = "match")]
|
|
m: Data,
|
|
start: usize,
|
|
end: usize,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
|
|
#[serde(untagged)]
|
|
enum Data {
|
|
Text { text: String },
|
|
// This variant is used when the data isn't valid UTF-8. The bytes are
|
|
// base64 encoded, so using a String here is OK.
|
|
Bytes { bytes: String },
|
|
}
|
|
|
|
impl Data {
|
|
fn text(s: &str) -> Data {
|
|
Data::Text { text: s.to_string() }
|
|
}
|
|
fn bytes(s: &str) -> Data {
|
|
Data::Bytes { bytes: s.to_string() }
|
|
}
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
|
|
struct Stats {
|
|
elapsed: Duration,
|
|
searches: u64,
|
|
searches_with_match: u64,
|
|
bytes_searched: u64,
|
|
bytes_printed: u64,
|
|
matched_lines: u64,
|
|
matches: u64,
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
|
|
struct Duration {
|
|
#[serde(flatten)]
|
|
duration: time::Duration,
|
|
human: String,
|
|
}
|
|
|
|
/// Decode JSON Lines into a Vec<Message>. If there was an error decoding,
|
|
/// this function panics.
|
|
fn json_decode(jsonlines: &str) -> Vec<Message> {
|
|
json::Deserializer::from_str(jsonlines)
|
|
.into_iter()
|
|
.collect::<Result<Vec<Message>, _>>()
|
|
.unwrap()
|
|
}
|
|
|
|
// Searches the Sherlock fixture with one line of leading context and checks
// the begin, context, match, end and summary messages of the JSON output.
rgtest!(basic, |dir: Dir, mut cmd: TestCommand| {
    dir.create("sherlock", SHERLOCK);
    cmd.arg("--json").arg("-B1").arg("Sherlock Holmes").arg("sherlock");

    let msgs = json_decode(&cmd.stdout());
    let path = Some(Data::text("sherlock"));

    // Message 0: the file is announced.
    let want_begin = Begin { path: path.clone() };
    assert_eq!(msgs[0].unwrap_begin(), want_begin);

    // Message 1: the context line requested by -B1, with no submatches.
    let want_context = Context {
        path: path.clone(),
        lines: Data::text(
            "Holmeses, success in the province of \
             detective work must always\n",
        ),
        line_number: Some(2),
        absolute_offset: 65,
        submatches: vec![],
    };
    assert_eq!(msgs[1].unwrap_context(), want_context);

    // Message 2: the matching line and the span of the match within it.
    let want_match = Match {
        path: path.clone(),
        lines: Data::text(
            "be, to a very large extent, the result of luck. \
             Sherlock Holmes\n",
        ),
        line_number: Some(3),
        absolute_offset: 129,
        submatches: vec![SubMatch {
            m: Data::text("Sherlock Holmes"),
            start: 48,
            end: 63,
        }],
    };
    assert_eq!(msgs[2].unwrap_match(), want_match);

    // Message 3: end of the file, with no binary offset reported.
    let end = msgs[3].unwrap_end();
    assert_eq!(end.path, path);
    assert_eq!(end.binary_offset, None);

    // Message 4: the overall summary.
    let totals = msgs[4].unwrap_summary().stats;
    assert_eq!(totals.searches_with_match, 1);
    assert_eq!(totals.bytes_printed, 494);
});
|
|
|
|
#[cfg(unix)]
|
|
rgtest!(notutf8, |dir: Dir, mut cmd: TestCommand| {
|
|
use std::ffi::OsStr;
|
|
use std::os::unix::ffi::OsStrExt;
|
|
|
|
// This test does not work with PCRE2 because PCRE2 does not support the
|
|
// `u` flag.
|
|
if dir.is_pcre2() {
|
|
return;
|
|
}
|
|
// macOS doesn't like this either... sigh.
|
|
if cfg!(target_os = "macos") {
|
|
return;
|
|
}
|
|
|
|
let name = &b"foo\xFFbar"[..];
|
|
let contents = &b"quux\xFFbaz"[..];
|
|
|
|
// APFS does not support creating files with invalid UTF-8 bytes, so just
|
|
// skip the test if we can't create our file. Presumably we don't need this
|
|
// check if we're already skipping it on macOS, but maybe other file
|
|
// systems won't like this test either?
|
|
if !dir.try_create_bytes(OsStr::from_bytes(name), contents).is_ok() {
|
|
return;
|
|
}
|
|
cmd.arg("--json").arg(r"(?-u)\xFF");
|
|
|
|
let msgs = json_decode(&cmd.stdout());
|
|
|
|
assert_eq!(
|
|
msgs[0].unwrap_begin(),
|
|
Begin { path: Some(Data::bytes("Zm9v/2Jhcg==")) }
|
|
);
|
|
assert_eq!(
|
|
msgs[1].unwrap_match(),
|
|
Match {
|
|
path: Some(Data::bytes("Zm9v/2Jhcg==")),
|
|
lines: Data::bytes("cXV1eP9iYXo="),
|
|
line_number: Some(1),
|
|
absolute_offset: 0,
|
|
submatches: vec![SubMatch {
|
|
m: Data::bytes("/w=="),
|
|
start: 4,
|
|
end: 5,
|
|
},],
|
|
}
|
|
);
|
|
});
|
|
|
|
// Like `notutf8`, but only the contents are invalid UTF-8: the path is
// expected as `text` while the lines and the match are expected as `bytes`.
rgtest!(notutf8_file, |dir: Dir, mut cmd: TestCommand| {
    use std::ffi::OsStr;

    // PCRE2 does not support the `u` flag used by the pattern below, so this
    // test cannot run with it.
    if dir.is_pcre2() {
        return;
    }

    let name = OsStr::new("foo");
    let contents: &[u8] = b"quux\xFFbaz";

    // APFS does not support creating files with invalid UTF-8 bytes, so just
    // skip the test if we can't create our file.
    // NOTE(review): the name here is valid UTF-8 (only the contents are not),
    // so this guard looks carried over from `notutf8`; confirm it is needed.
    if dir.try_create_bytes(name, contents).is_err() {
        return;
    }
    cmd.arg("--json").arg(r"(?-u)\xFF");

    let msgs = json_decode(&cmd.stdout());
    let path = Some(Data::text("foo"));

    let want_begin = Begin { path: path.clone() };
    assert_eq!(msgs[0].unwrap_begin(), want_begin);

    let want_match = Match {
        path,
        // base64 of the raw contents `quux\xFFbaz`.
        lines: Data::bytes("cXV1eP9iYXo="),
        line_number: Some(1),
        absolute_offset: 0,
        submatches: vec![SubMatch {
            // base64 of the single matched byte `\xFF`.
            m: Data::bytes("/w=="),
            start: 4,
            end: 5,
        }],
    };
    assert_eq!(msgs[1].unwrap_match(), want_match);
});
|
|
|
|
// See: https://github.com/BurntSushi/ripgrep/issues/416
|
|
//
|
|
// This test in particular checks that our match does _not_ include the `\r`
|
|
// even though the '$' may be rewritten as '(?:\r??$)' and could thus include
|
|
// `\r` in the match.
|
|
rgtest!(crlf, |dir: Dir, mut cmd: TestCommand| {
|
|
dir.create("sherlock", SHERLOCK_CRLF);
|
|
cmd.arg("--json").arg("--crlf").arg(r"Sherlock$").arg("sherlock");
|
|
|
|
let msgs = json_decode(&cmd.stdout());
|
|
|
|
assert_eq!(
|
|
msgs[1].unwrap_match().submatches[0].clone(),
|
|
SubMatch { m: Data::text("Sherlock"), start: 56, end: 64 },
|
|
);
|
|
});
|
|
|
|
// See: https://github.com/BurntSushi/ripgrep/issues/1095
|
|
//
|
|
// This test checks that we don't drop the \r\n in a matching line when --crlf
|
|
// mode is enabled.
|
|
rgtest!(r1095_missing_crlf, |dir: Dir, mut cmd: TestCommand| {
|
|
dir.create("foo", "test\r\n");
|
|
|
|
// Check without --crlf flag.
|
|
let msgs = json_decode(&cmd.arg("--json").arg("test").stdout());
|
|
assert_eq!(msgs.len(), 4);
|
|
assert_eq!(msgs[1].unwrap_match().lines, Data::text("test\r\n"));
|
|
|
|
// Now check with --crlf flag.
|
|
let msgs = json_decode(&cmd.arg("--crlf").stdout());
|
|
assert_eq!(msgs.len(), 4);
|
|
assert_eq!(msgs[1].unwrap_match().lines, Data::text("test\r\n"));
|
|
});
|
|
|
|
// See: https://github.com/BurntSushi/ripgrep/issues/1095
|
|
//
|
|
// This test checks that we don't return empty submatches when matching a `\n`
|
|
// in CRLF mode.
|
|
rgtest!(r1095_crlf_empty_match, |dir: Dir, mut cmd: TestCommand| {
|
|
dir.create("foo", "test\r\n\n");
|
|
|
|
// Check without --crlf flag.
|
|
let msgs = json_decode(&cmd.arg("-U").arg("--json").arg("\n").stdout());
|
|
assert_eq!(msgs.len(), 4);
|
|
|
|
let m = msgs[1].unwrap_match();
|
|
assert_eq!(m.lines, Data::text("test\r\n\n"));
|
|
assert_eq!(m.submatches[0].m, Data::text("\n"));
|
|
assert_eq!(m.submatches[1].m, Data::text("\n"));
|
|
|
|
// Now check with --crlf flag.
|
|
let msgs = json_decode(&cmd.arg("--crlf").stdout());
|
|
assert_eq!(msgs.len(), 4);
|
|
|
|
let m = msgs[1].unwrap_match();
|
|
assert_eq!(m.lines, Data::text("test\r\n\n"));
|
|
assert_eq!(m.submatches[0].m, Data::text("\n"));
|
|
assert_eq!(m.submatches[1].m, Data::text("\n"));
|
|
});
|