ripgrep/tests/json.rs

use std::time;

use serde_derive::Deserialize;
use serde_json as json;

use crate::hay::{SHERLOCK, SHERLOCK_CRLF};
use crate::util::{Dir, TestCommand};

/// A single message decoded from ripgrep's `--json` output.
///
/// Each message is an adjacently tagged JSON object: the `type` key selects
/// the variant (spelled in snake_case, e.g. `begin`) and the `data` key
/// carries that variant's payload.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
#[serde(tag = "type", content = "data")]
#[serde(rename_all = "snake_case")]
enum Message {
    /// Decoded when `type` is `begin`.
    Begin(Begin),
    /// Decoded when `type` is `end`.
    End(End),
    /// Decoded when `type` is `match`.
    Match(Match),
    /// Decoded when `type` is `context`.
    Context(Context),
    /// Decoded when `type` is `summary`.
    Summary(Summary),
}

/// Accessors that pull the payload out of a `Message` of a known kind.
///
/// Every accessor returns a clone of the payload and panics, printing the
/// message that was actually found, when the variant is not the one asked
/// for. That makes an unexpected message ordering fail a test loudly.
impl Message {
    /// Returns the `Begin` payload, panicking on any other variant.
    fn unwrap_begin(&self) -> Begin {
        if let Message::Begin(payload) = self {
            return payload.clone();
        }
        panic!("expected Message::Begin but got {:?}", self)
    }

    /// Returns the `End` payload, panicking on any other variant.
    fn unwrap_end(&self) -> End {
        if let Message::End(payload) = self {
            return payload.clone();
        }
        panic!("expected Message::End but got {:?}", self)
    }

    /// Returns the `Match` payload, panicking on any other variant.
    fn unwrap_match(&self) -> Match {
        if let Message::Match(payload) = self {
            return payload.clone();
        }
        panic!("expected Message::Match but got {:?}", self)
    }

    /// Returns the `Context` payload, panicking on any other variant.
    fn unwrap_context(&self) -> Context {
        if let Message::Context(payload) = self {
            return payload.clone();
        }
        panic!("expected Message::Context but got {:?}", self)
    }

    /// Returns the `Summary` payload, panicking on any other variant.
    fn unwrap_summary(&self) -> Summary {
        if let Message::Summary(payload) = self {
            return payload.clone();
        }
        panic!("expected Message::Summary but got {:?}", self)
    }
}

/// Payload of a `begin` message.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
struct Begin {
    /// Path attached to the message; optional in the JSON. The tests in this
    /// file see it as text for valid UTF-8 names and as bytes otherwise.
    path: Option<Data>,
}

/// Payload of an `end` message.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
struct End {
    /// Path attached to the message; optional in the JSON.
    path: Option<Data>,
    /// Optional offset; the `basic` test expects `None` for a plain text
    /// file. NOTE(review): presumably the offset at which binary data was
    /// detected -- confirm against the JSON printer documentation.
    binary_offset: Option<u64>,
    /// Statistics reported alongside this message.
    stats: Stats,
}

/// Payload of a `summary` message.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
struct Summary {
    /// Duration reported under the `elapsed_total` key.
    elapsed_total: Duration,
    /// Statistics reported alongside this message.
    stats: Stats,
}

/// Payload of a `match` message.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
struct Match {
    /// Path attached to the message; optional in the JSON.
    path: Option<Data>,
    /// The matching line(s), including the line terminator, as text or as
    /// bytes when not valid UTF-8.
    lines: Data,
    /// Line number of the match, when reported (the tests here see 1-based
    /// values).
    line_number: Option<u64>,
    /// Offset of the start of `lines` within the searched data; 0 for a
    /// match on the first line in the tests below.
    absolute_offset: u64,
    /// The individual matches found inside `lines`.
    submatches: Vec<SubMatch>,
}

/// Payload of a `context` message. It has the same fields as `Match`.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
struct Context {
    /// Path attached to the message; optional in the JSON.
    path: Option<Data>,
    /// The context line(s), including the line terminator.
    lines: Data,
    /// Line number of the context line, when reported.
    line_number: Option<u64>,
    /// Offset of the start of `lines` within the searched data.
    absolute_offset: u64,
    /// Submatches within `lines`; the `basic` test expects this to be empty
    /// for a `-B1` context line.
    submatches: Vec<SubMatch>,
}

/// One match located inside the `lines` of a `Match` or `Context` message.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
struct SubMatch {
    /// The matched data. Read from the JSON key `match`, which cannot be
    /// used as a field name because `match` is a Rust keyword.
    #[serde(rename = "match")]
    m: Data,
    /// Byte offset within `lines` at which the match begins.
    start: usize,
    /// Byte offset within `lines` just past the end of the match (the tests
    /// below use an exclusive end, e.g. 4..5 for a one-byte match).
    end: usize,
}

/// Arbitrary data in the JSON output, either text or base64-encoded bytes.
///
/// The enum is untagged, so the variant is chosen by shape: an object with
/// a `text` key decodes as `Text` and one with a `bytes` key as `Bytes`.
/// Variants are tried in declaration order.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
#[serde(untagged)]
enum Data {
    /// Data carried as a plain string under the `text` key.
    Text { text: String },
    // This variant is used when the data isn't valid UTF-8. The bytes are
    // base64 encoded, so using a String here is OK.
    Bytes { bytes: String },
}

impl Data {
    fn text(s: &str) -> Data { Data::Text { text: s.to_string() } }
    fn bytes(s: &str) -> Data { Data::Bytes { bytes: s.to_string() } }
}

/// The `stats` object found in `end` and `summary` messages.
///
/// NOTE(review): the per-field descriptions below are inferred from the
/// field names; confirm them against the JSON printer documentation.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
struct Stats {
    /// Duration reported under the `elapsed` key.
    elapsed: Duration,
    /// Presumably the number of searches performed.
    searches: u64,
    /// Presumably the number of searches that produced at least one match;
    /// the `basic` test expects 1 for its single matching file.
    searches_with_match: u64,
    /// Presumably the number of bytes searched.
    bytes_searched: u64,
    /// Presumably the number of bytes written as output; the `basic` test
    /// pins an exact value for this.
    bytes_printed: u64,
    /// Presumably the number of lines that matched.
    matched_lines: u64,
    /// Presumably the number of individual matches.
    matches: u64,
}

/// A duration as it appears in the JSON output.
///
/// The keys of a serialized `std::time::Duration` sit directly in this
/// object (hence `flatten`), next to an additional `human` string.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]
struct Duration {
    /// The duration itself, read from this object's own keys rather than
    /// from a nested object.
    #[serde(flatten)]
    duration: time::Duration,
    /// NOTE(review): presumably a human-readable rendering of `duration` --
    /// confirm against the JSON printer documentation.
    human: String,
}

/// Parse a stream of concatenated JSON values (one per line in practice)
/// into the `Message`s they encode, preserving their order.
///
/// # Panics
///
/// Panics on the first value that fails to decode as a `Message`.
fn json_decode(jsonlines: &str) -> Vec<Message> {
    let stream = json::Deserializer::from_str(jsonlines).into_iter::<Message>();

    let mut decoded = Vec::new();
    for item in stream {
        // Any decode error is a test failure, so stop right here.
        decoded.push(item.unwrap());
    }
    decoded
}

// Searches a single file with one line of leading context (-B1) and checks
// every message emitted, in order: begin, context, match, end, summary.
rgtest!(basic, |dir: Dir, mut cmd: TestCommand| {
    dir.create("sherlock", SHERLOCK);
    cmd.arg("--json");
    cmd.arg("-B1");
    cmd.arg("Sherlock Holmes");
    cmd.arg("sherlock");

    let output = cmd.stdout();
    let msgs = json_decode(&output);

    // Every per-file message should carry the same path.
    let path = Some(Data::text("sherlock"));

    let expected_begin = Begin { path: path.clone() };
    assert_eq!(msgs[0].unwrap_begin(), expected_begin);

    // The context line carries no submatches.
    let expected_context = Context {
        path: path.clone(),
        lines: Data::text(
            "Holmeses, success in the province of \
             detective work must always\n",
        ),
        line_number: Some(2),
        absolute_offset: 65,
        submatches: vec![],
    };
    assert_eq!(msgs[1].unwrap_context(), expected_context);

    let expected_match = Match {
        path: path.clone(),
        lines: Data::text(
            "be, to a very large extent, the result of luck. \
             Sherlock Holmes\n",
        ),
        line_number: Some(3),
        absolute_offset: 129,
        submatches: vec![SubMatch {
            m: Data::text("Sherlock Holmes"),
            start: 48,
            end: 63,
        }],
    };
    assert_eq!(msgs[2].unwrap_match(), expected_match);

    let end = msgs[3].unwrap_end();
    assert_eq!(end.path, path);
    assert_eq!(end.binary_offset, None);

    // Only a couple of the summary statistics are pinned down.
    let summary = msgs[4].unwrap_summary();
    assert_eq!(summary.stats.searches_with_match, 1);
    assert_eq!(summary.stats.bytes_printed, 494);
});

// Checks that a file name and file contents which are both invalid UTF-8
// are reported as base64-encoded bytes rather than as text.
#[cfg(unix)]
rgtest!(notutf8, |dir: Dir, mut cmd: TestCommand| {
    use std::ffi::OsStr;
    use std::os::unix::ffi::OsStrExt;

    // Two environments are skipped outright: PCRE2, because it does not
    // support the `u` flag used by the pattern below, and macOS, which
    // doesn't like this test either.
    if dir.is_pcre2() || cfg!(target_os = "macos") {
        return;
    }

    let name: &[u8] = b"foo\xFFbar";
    let contents: &[u8] = b"quux\xFFbaz";

    // APFS refuses file names containing invalid UTF-8. The macOS skip above
    // presumably covers that already, but other file systems might object
    // too, so give up quietly whenever the file cannot be created.
    if dir.try_create_bytes(OsStr::from_bytes(name), contents).is_err() {
        return;
    }
    cmd.arg("--json");
    cmd.arg(r"(?-u)\xFF");

    let output = cmd.stdout();
    let msgs = json_decode(&output);

    // The path is not valid UTF-8, so it shows up as bytes.
    let encoded_path = Some(Data::bytes("Zm9v/2Jhcg=="));

    let expected_begin = Begin { path: encoded_path.clone() };
    assert_eq!(msgs[0].unwrap_begin(), expected_begin);

    let expected_match = Match {
        path: encoded_path,
        lines: Data::bytes("cXV1eP9iYXo="),
        line_number: Some(1),
        absolute_offset: 0,
        submatches: vec![SubMatch {
            m: Data::bytes("/w=="),
            start: 4,
            end: 5,
        }],
    };
    assert_eq!(msgs[1].unwrap_match(), expected_match);
});

// Like `notutf8`, but only the file contents are invalid UTF-8: the path is
// expected as text while the line and the submatch are expected as bytes.
rgtest!(notutf8_file, |dir: Dir, mut cmd: TestCommand| {
    use std::ffi::OsStr;

    // PCRE2 does not support the `u` flag used by the pattern below, so
    // there is nothing to test in that configuration.
    if dir.is_pcre2() {
        return;
    }

    let name = "foo";
    let contents: &[u8] = b"quux\xFFbaz";

    // Give up quietly if the file cannot be created.
    if dir.try_create_bytes(OsStr::new(name), contents).is_err() {
        return;
    }
    cmd.arg("--json");
    cmd.arg(r"(?-u)\xFF");

    let output = cmd.stdout();
    let msgs = json_decode(&output);

    let text_path = Some(Data::text("foo"));

    let expected_begin = Begin { path: text_path.clone() };
    assert_eq!(msgs[0].unwrap_begin(), expected_begin);

    let expected_match = Match {
        path: text_path,
        lines: Data::bytes("cXV1eP9iYXo="),
        line_number: Some(1),
        absolute_offset: 0,
        submatches: vec![SubMatch {
            m: Data::bytes("/w=="),
            start: 4,
            end: 5,
        }],
    };
    assert_eq!(msgs[1].unwrap_match(), expected_match);
});

// See: https://github.com/BurntSushi/ripgrep/issues/416
//
// With --crlf, '$' may be rewritten as '(?:\r??$)', which means the raw match
// could swallow the `\r`. This test makes sure the reported submatch stops
// short of it.
rgtest!(crlf, |dir: Dir, mut cmd: TestCommand| {
    dir.create("sherlock", SHERLOCK_CRLF);
    cmd.arg("--json");
    cmd.arg("--crlf");
    cmd.arg(r"Sherlock$");
    cmd.arg("sherlock");

    let output = cmd.stdout();
    let msgs = json_decode(&output);

    let found = msgs[1].unwrap_match();
    let expected = SubMatch {
        m: Data::text("Sherlock"),
        start: 56,
        end: 64,
    };
    assert_eq!(found.submatches[0], expected);
});

// See: https://github.com/BurntSushi/ripgrep/issues/1095
//
// The matching line must keep its trailing \r\n whether or not --crlf mode
// is enabled.
rgtest!(r1095_missing_crlf, |dir: Dir, mut cmd: TestCommand| {
    dir.create("foo", "test\r\n");
    let expected_line = Data::text("test\r\n");

    // First run: no --crlf flag.
    cmd.arg("--json").arg("test");
    let plain = json_decode(&cmd.stdout());
    assert_eq!(plain.len(), 4);
    assert_eq!(plain[1].unwrap_match().lines, expected_line);

    // Second run: the same command with --crlf appended.
    cmd.arg("--crlf");
    let with_crlf = json_decode(&cmd.stdout());
    assert_eq!(with_crlf.len(), 4);
    assert_eq!(with_crlf[1].unwrap_match().lines, expected_line);
});

// See: https://github.com/BurntSushi/ripgrep/issues/1095
//
// Matching a `\n` must never yield an empty submatch, with or without CRLF
// mode.
rgtest!(r1095_crlf_empty_match, |dir: Dir, mut cmd: TestCommand| {
    dir.create("foo", "test\r\n\n");

    // Both runs are expected to report the same two matching lines, each
    // with a `\n` as its first submatch.
    let check_matches = |msgs: &[Message]| {
        let first = msgs[1].unwrap_match();
        assert_eq!(first.lines, Data::text("test\r\n"));
        assert_eq!(first.submatches[0].m, Data::text("\n"));

        let second = msgs[2].unwrap_match();
        assert_eq!(second.lines, Data::text("\n"));
        assert_eq!(second.submatches[0].m, Data::text("\n"));
    };

    // First run: no --crlf flag.
    cmd.arg("-U").arg("--json").arg("\n");
    let plain = json_decode(&cmd.stdout());
    assert_eq!(plain.len(), 5);
    check_matches(&plain);

    // Second run: the same command with --crlf appended.
    cmd.arg("--crlf");
    let with_crlf = json_decode(&cmd.stdout());
    check_matches(&with_crlf);
});
tests: re-tool integration tests This basically rewrites every integration test. We reduce the amount of magic involved here in terms of which arguments are being passed to ripgrep processes. To make up for the boiler plate saved by the magic, we make the Dir (formerly WorkDir) type a bit nicer to use, along with a new TestCommand that wraps a std::process::Command. In exchange, we get tests that are easier to read and write. We also run every test with the `--pcre2` flag to make sure that works, when PCRE2 is available. 2018-08-06 20:11:58 -04:00			`use std::time;`

edition: move core ripgrep to Rust 2018 2019-01-19 10:15:56 -05:00			`use serde_derive::Deserialize;`
tests: re-tool integration tests This basically rewrites every integration test. We reduce the amount of magic involved here in terms of which arguments are being passed to ripgrep processes. To make up for the boiler plate saved by the magic, we make the Dir (formerly WorkDir) type a bit nicer to use, along with a new TestCommand that wraps a std::process::Command. In exchange, we get tests that are easier to read and write. We also run every test with the `--pcre2` flag to make sure that works, when PCRE2 is available. 2018-08-06 20:11:58 -04:00			`use serde_json as json;`

edition: move core ripgrep to Rust 2018 2019-01-19 10:15:56 -05:00			`use crate::hay::{SHERLOCK, SHERLOCK_CRLF};`
			`use crate::util::{Dir, TestCommand};`
tests: re-tool integration tests This basically rewrites every integration test. We reduce the amount of magic involved here in terms of which arguments are being passed to ripgrep processes. To make up for the boiler plate saved by the magic, we make the Dir (formerly WorkDir) type a bit nicer to use, along with a new TestCommand that wraps a std::process::Command. In exchange, we get tests that are easier to read and write. We also run every test with the `--pcre2` flag to make sure that works, when PCRE2 is available. 2018-08-06 20:11:58 -04:00
			`#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]`
			`#[serde(tag = "type", content = "data")]`
			`#[serde(rename_all = "snake_case")]`
			`enum Message {`
			`Begin(Begin),`
			`End(End),`
			`Match(Match),`
			`Context(Context),`
			`Summary(Summary),`
			`}`

			`impl Message {`
			`fn unwrap_begin(&self) -> Begin {`
			`match *self {`
			`Message::Begin(ref x) => x.clone(),`
			`ref x => panic!("expected Message::Begin but got {:?}", x),`
			`}`
			`}`

			`fn unwrap_end(&self) -> End {`
			`match *self {`
			`Message::End(ref x) => x.clone(),`
			`ref x => panic!("expected Message::End but got {:?}", x),`
			`}`
			`}`

			`fn unwrap_match(&self) -> Match {`
			`match *self {`
			`Message::Match(ref x) => x.clone(),`
			`ref x => panic!("expected Message::Match but got {:?}", x),`
			`}`
			`}`

			`fn unwrap_context(&self) -> Context {`
			`match *self {`
			`Message::Context(ref x) => x.clone(),`
			`ref x => panic!("expected Message::Context but got {:?}", x),`
			`}`
			`}`

			`fn unwrap_summary(&self) -> Summary {`
			`match *self {`
			`Message::Summary(ref x) => x.clone(),`
			`ref x => panic!("expected Message::Summary but got {:?}", x),`
			`}`
			`}`
			`}`

			`#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]`
			`struct Begin {`
			`path: Option<Data>,`
			`}`

			`#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]`
			`struct End {`
			`path: Option<Data>,`
			`binary_offset: Option<u64>,`
			`stats: Stats,`
			`}`

			`#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]`
			`struct Summary {`
			`elapsed_total: Duration,`
			`stats: Stats,`
			`}`

			`#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]`
			`struct Match {`
			`path: Option<Data>,`
			`lines: Data,`
			`line_number: Option<u64>,`
			`absolute_offset: u64,`
			`submatches: Vec<SubMatch>,`
			`}`

			`#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]`
			`struct Context {`
			`path: Option<Data>,`
			`lines: Data,`
			`line_number: Option<u64>,`
			`absolute_offset: u64,`
			`submatches: Vec<SubMatch>,`
			`}`

			`#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]`
			`struct SubMatch {`
			`#[serde(rename = "match")]`
			`m: Data,`
			`start: usize,`
			`end: usize,`
			`}`

			`#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]`
			`#[serde(untagged)]`
			`enum Data {`
			`Text { text: String },`
			`// This variant is used when the data isn't valid UTF-8. The bytes are`
			`// base64 encoded, so using a String here is OK.`
			`Bytes { bytes: String },`
			`}`

			`impl Data {`
			`fn text(s: &str) -> Data { Data::Text { text: s.to_string() } }`
			`fn bytes(s: &str) -> Data { Data::Bytes { bytes: s.to_string() } }`
			`}`

			`#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]`
			`struct Stats {`
			`elapsed: Duration,`
			`searches: u64,`
			`searches_with_match: u64,`
			`bytes_searched: u64,`
			`bytes_printed: u64,`
			`matched_lines: u64,`
			`matches: u64,`
			`}`

			`#[derive(Clone, Debug, Deserialize, PartialEq, Eq)]`
			`struct Duration {`
			`#[serde(flatten)]`
			`duration: time::Duration,`
			`human: String,`
			`}`

			`/// Decode JSON Lines into a Vec<Message>. If there was an error decoding,`
			`/// this function panics.`
			`fn json_decode(jsonlines: &str) -> Vec<Message> {`
			`json::Deserializer::from_str(jsonlines)`
			`.into_iter()`
			`.collect::<Result<Vec<Message>, _>>()`
			`.unwrap()`
			`}`

			`rgtest!(basic, \|dir: Dir, mut cmd: TestCommand\| {`
			`dir.create("sherlock", SHERLOCK);`
			`cmd.arg("--json").arg("-B1").arg("Sherlock Holmes").arg("sherlock");`

			`let msgs = json_decode(&cmd.stdout());`

			`assert_eq!(`
			`msgs[0].unwrap_begin(),`
			`Begin { path: Some(Data::text("sherlock")) }`
			`);`
			`assert_eq!(`
			`msgs[1].unwrap_context(),`
			`Context {`
			`path: Some(Data::text("sherlock")),`
regex: make CRLF hack more robust This commit improves the CRLF hack to be more robust. In particular, in addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end of a match if and only if the regex has an ending line anchor required for a match. This doesn't quite make the hack 100% correct, but should fix most use cases in practice. An example of a regex that will still be incorrect is `foo\|bar$`, since the analysis isn't quite sophisticated enough to determine that a `\r` can be safely stripped from any match. Even if we fix that, regexes like `foo\r\|bar$` still won't be handled correctly. Alas, more work on this front should really be focused on enabling this in the regex engine itself. The specific cause of this bug was that grep-searcher was sneakily stripping CRLF from matching lines when it really shouldn't have. We remove that code now, and instead rely on better match semantics provided at a lower level. Fixes #1095 2019-01-26 12:25:21 -05:00			`lines: Data::text(`
			`"Holmeses, success in the province of \`
			`detective work must always\n",`
			`),`
tests: re-tool integration tests This basically rewrites every integration test. We reduce the amount of magic involved here in terms of which arguments are being passed to ripgrep processes. To make up for the boiler plate saved by the magic, we make the Dir (formerly WorkDir) type a bit nicer to use, along with a new TestCommand that wraps a std::process::Command. In exchange, we get tests that are easier to read and write. We also run every test with the `--pcre2` flag to make sure that works, when PCRE2 is available. 2018-08-06 20:11:58 -04:00			`line_number: Some(2),`
			`absolute_offset: 65,`
			`submatches: vec![],`
			`}`
			`);`
			`assert_eq!(`
			`msgs[2].unwrap_match(),`
			`Match {`
			`path: Some(Data::text("sherlock")),`
regex: make CRLF hack more robust This commit improves the CRLF hack to be more robust. In particular, in addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end of a match if and only if the regex has an ending line anchor required for a match. This doesn't quite make the hack 100% correct, but should fix most use cases in practice. An example of a regex that will still be incorrect is `foo\|bar$`, since the analysis isn't quite sophisticated enough to determine that a `\r` can be safely stripped from any match. Even if we fix that, regexes like `foo\r\|bar$` still won't be handled correctly. Alas, more work on this front should really be focused on enabling this in the regex engine itself. The specific cause of this bug was that grep-searcher was sneakily stripping CRLF from matching lines when it really shouldn't have. We remove that code now, and instead rely on better match semantics provided at a lower level. Fixes #1095 2019-01-26 12:25:21 -05:00			`lines: Data::text(`
			`"be, to a very large extent, the result of luck. \`
			`Sherlock Holmes\n",`
			`),`
tests: re-tool integration tests This basically rewrites every integration test. We reduce the amount of magic involved here in terms of which arguments are being passed to ripgrep processes. To make up for the boiler plate saved by the magic, we make the Dir (formerly WorkDir) type a bit nicer to use, along with a new TestCommand that wraps a std::process::Command. In exchange, we get tests that are easier to read and write. We also run every test with the `--pcre2` flag to make sure that works, when PCRE2 is available. 2018-08-06 20:11:58 -04:00			`line_number: Some(3),`
			`absolute_offset: 129,`
			`submatches: vec![`
			`SubMatch {`
			`m: Data::text("Sherlock Holmes"),`
			`start: 48,`
			`end: 63,`
			`},`
			`],`
			`}`
			`);`
			`assert_eq!(`
			`msgs[3].unwrap_end().path,`
			`Some(Data::text("sherlock"))`
			`);`
			`assert_eq!(`
			`msgs[3].unwrap_end().binary_offset,`
			`None`
			`);`
			`assert_eq!(`
			`msgs[4].unwrap_summary().stats.searches_with_match,`
			`1`
			`);`
			`assert_eq!(`
			`msgs[4].unwrap_summary().stats.bytes_printed,`
			`494`
			`);`
			`});`

			`#[cfg(unix)]`
			`rgtest!(notutf8, \|dir: Dir, mut cmd: TestCommand\| {`
			`use std::ffi::OsStr;`
			`use std::os::unix::ffi::OsStrExt;`

			`// This test does not work with PCRE2 because PCRE2 does not support the`
			// `u` flag.
			`if dir.is_pcre2() {`
			`return;`
			`}`
			`// macOS doesn't like this either... sigh.`
			`if cfg!(target_os = "macos") {`
			`return;`
			`}`

			`let name = &b"foo\xFFbar"[..];`
			`let contents = &b"quux\xFFbaz"[..];`

			`// APFS does not support creating files with invalid UTF-8 bytes, so just`
regex: make CRLF hack more robust This commit improves the CRLF hack to be more robust. In particular, in addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end of a match if and only if the regex has an ending line anchor required for a match. This doesn't quite make the hack 100% correct, but should fix most use cases in practice. An example of a regex that will still be incorrect is `foo\|bar$`, since the analysis isn't quite sophisticated enough to determine that a `\r` can be safely stripped from any match. Even if we fix that, regexes like `foo\r\|bar$` still won't be handled correctly. Alas, more work on this front should really be focused on enabling this in the regex engine itself. The specific cause of this bug was that grep-searcher was sneakily stripping CRLF from matching lines when it really shouldn't have. We remove that code now, and instead rely on better match semantics provided at a lower level. Fixes #1095 2019-01-26 12:25:21 -05:00			`// skip the test if we can't create our file. Presumably we don't need this`
			`// check if we're already skipping it on macOS, but maybe other file`
			`// systems won't like this test either?`
tests: re-tool integration tests This basically rewrites every integration test. We reduce the amount of magic involved here in terms of which arguments are being passed to ripgrep processes. To make up for the boiler plate saved by the magic, we make the Dir (formerly WorkDir) type a bit nicer to use, along with a new TestCommand that wraps a std::process::Command. In exchange, we get tests that are easier to read and write. We also run every test with the `--pcre2` flag to make sure that works, when PCRE2 is available. 2018-08-06 20:11:58 -04:00			`if !dir.try_create_bytes(OsStr::from_bytes(name), contents).is_ok() {`
			`return;`
			`}`
			`cmd.arg("--json").arg(r"(?-u)\xFF");`

			`let msgs = json_decode(&cmd.stdout());`

			`assert_eq!(`
			`msgs[0].unwrap_begin(),`
			`Begin { path: Some(Data::bytes("Zm9v/2Jhcg==")) }`
			`);`
			`assert_eq!(`
			`msgs[1].unwrap_match(),`
			`Match {`
			`path: Some(Data::bytes("Zm9v/2Jhcg==")),`
			`lines: Data::bytes("cXV1eP9iYXo="),`
			`line_number: Some(1),`
			`absolute_offset: 0,`
			`submatches: vec![`
			`SubMatch {`
			`m: Data::bytes("/w=="),`
			`start: 4,`
			`end: 5,`
			`},`
			`],`
			`}`
			`);`
			`});`

tests: touch up tests on Windows This fixes warnings and adds an additional invalid UTF-8 test that will run on Windows. 2018-08-21 20:26:33 -04:00			`rgtest!(notutf8_file, \|dir: Dir, mut cmd: TestCommand\| {`
			`use std::ffi::OsStr;`

			`// This test does not work with PCRE2 because PCRE2 does not support the`
			// `u` flag.
			`if dir.is_pcre2() {`
			`return;`
			`}`

			`let name = "foo";`
			`let contents = &b"quux\xFFbaz"[..];`

			`// APFS does not support creating files with invalid UTF-8 bytes, so just`
			`// skip the test if we can't create our file.`
			`if !dir.try_create_bytes(OsStr::new(name), contents).is_ok() {`
			`return;`
			`}`
			`cmd.arg("--json").arg(r"(?-u)\xFF");`

			`let msgs = json_decode(&cmd.stdout());`

			`assert_eq!(`
			`msgs[0].unwrap_begin(),`
			`Begin { path: Some(Data::text("foo")) }`
			`);`
			`assert_eq!(`
			`msgs[1].unwrap_match(),`
			`Match {`
			`path: Some(Data::text("foo")),`
			`lines: Data::bytes("cXV1eP9iYXo="),`
			`line_number: Some(1),`
			`absolute_offset: 0,`
			`submatches: vec![`
			`SubMatch {`
			`m: Data::bytes("/w=="),`
			`start: 4,`
			`end: 5,`
			`},`
			`],`
			`}`
			`);`
			`});`

tests: re-tool integration tests This basically rewrites every integration test. We reduce the amount of magic involved here in terms of which arguments are being passed to ripgrep processes. To make up for the boiler plate saved by the magic, we make the Dir (formerly WorkDir) type a bit nicer to use, along with a new TestCommand that wraps a std::process::Command. In exchange, we get tests that are easier to read and write. We also run every test with the `--pcre2` flag to make sure that works, when PCRE2 is available. 2018-08-06 20:11:58 -04:00			`// See: https://github.com/BurntSushi/ripgrep/issues/416`
			`//`
			// This test in particular checks that our match does _not_ include the `\r`
			`// even though the '$' may be rewritten as '(?:\r??$)' and could thus include`
			// `\r` in the match.
			`rgtest!(crlf, \|dir: Dir, mut cmd: TestCommand\| {`
			`dir.create("sherlock", SHERLOCK_CRLF);`
			`cmd.arg("--json").arg("--crlf").arg(r"Sherlock$").arg("sherlock");`

			`let msgs = json_decode(&cmd.stdout());`

			`assert_eq!(`
			`msgs[1].unwrap_match().submatches[0].clone(),`
			`SubMatch {`
			`m: Data::text("Sherlock"),`
			`start: 56,`
			`end: 64,`
			`},`
			`);`
			`});`
regex: make CRLF hack more robust This commit improves the CRLF hack to be more robust. In particular, in addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end of a match if and only if the regex has an ending line anchor required for a match. This doesn't quite make the hack 100% correct, but should fix most use cases in practice. An example of a regex that will still be incorrect is `foo\|bar$`, since the analysis isn't quite sophisticated enough to determine that a `\r` can be safely stripped from any match. Even if we fix that, regexes like `foo\r\|bar$` still won't be handled correctly. Alas, more work on this front should really be focused on enabling this in the regex engine itself. The specific cause of this bug was that grep-searcher was sneakily stripping CRLF from matching lines when it really shouldn't have. We remove that code now, and instead rely on better match semantics provided at a lower level. Fixes #1095 2019-01-26 12:25:21 -05:00
			`// See: https://github.com/BurntSushi/ripgrep/issues/1095`
			`//`
			`// This test checks that we don't drop the \r\n in a matching line when --crlf`
			`// mode is enabled.`
			`rgtest!(r1095_missing_crlf, \|dir: Dir, mut cmd: TestCommand\| {`
			`dir.create("foo", "test\r\n");`

			`// Check without --crlf flag.`
			`let msgs = json_decode(&cmd.arg("--json").arg("test").stdout());`
			`assert_eq!(msgs.len(), 4);`
			`assert_eq!(msgs[1].unwrap_match().lines, Data::text("test\r\n"));`

			`// Now check with --crlf flag.`
			`let msgs = json_decode(&cmd.arg("--crlf").stdout());`
			`assert_eq!(msgs.len(), 4);`
			`assert_eq!(msgs[1].unwrap_match().lines, Data::text("test\r\n"));`
			`});`

			`// See: https://github.com/BurntSushi/ripgrep/issues/1095`
			`//`
			// This test checks that we don't return empty submatches when matching a `\n`
			`// in CRLF mode.`
			`rgtest!(r1095_crlf_empty_match, \|dir: Dir, mut cmd: TestCommand\| {`
			`dir.create("foo", "test\r\n\n");`

			`// Check without --crlf flag.`
			`let msgs = json_decode(&cmd.arg("-U").arg("--json").arg("\n").stdout());`
			`assert_eq!(msgs.len(), 5);`

			`let m = msgs[1].unwrap_match();`
			`assert_eq!(m.lines, Data::text("test\r\n"));`
			`assert_eq!(m.submatches[0].m, Data::text("\n"));`

			`let m = msgs[2].unwrap_match();`
			`assert_eq!(m.lines, Data::text("\n"));`
			`assert_eq!(m.submatches[0].m, Data::text("\n"));`

			`// Now check with --crlf flag.`
			`let msgs = json_decode(&cmd.arg("--crlf").stdout());`

			`let m = msgs[1].unwrap_match();`
			`assert_eq!(m.lines, Data::text("test\r\n"));`
			`assert_eq!(m.submatches[0].m, Data::text("\n"));`

			`let m = msgs[2].unwrap_match();`
			`assert_eq!(m.lines, Data::text("\n"));`
			`assert_eq!(m.submatches[0].m, Data::text("\n"));`
			`});`