diff --git a/CHANGELOG.md b/CHANGELOG.md index c0fb04b9..73406762 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,8 @@ Bug fixes: Context from the `--passthru` flag should not impact process exit status. * [BUG #984](https://github.com/BurntSushi/ripgrep/issues/984): Fixes bug in `ignore` crate where first path was always treated as a symlink. +* [BUG #990](https://github.com/BurntSushi/ripgrep/issues/990): + Read stderr asynchronously when running a process. * [BUG #1013](https://github.com/BurntSushi/ripgrep/issues/1013): Add compile time and runtime CPU features to `--version` output. * [BUG #1028](https://github.com/BurntSushi/ripgrep/pull/1028): diff --git a/Cargo.lock b/Cargo.lock index 7ddb3f2b..b1ee1723 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -168,6 +168,7 @@ name = "grep" version = "0.2.0" dependencies = [ "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "grep-cli 0.1.0", "grep-matcher 0.1.0", "grep-pcre2 0.1.0", "grep-printer 0.1.0", @@ -177,6 +178,20 @@ dependencies = [ "walkdir 2.2.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "grep-cli" +version = "0.1.0" +dependencies = [ + "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "globset 0.4.1", + "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "same-file 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "termcolor 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "grep-matcher" version = "0.1.0" @@ -464,21 +479,17 @@ dependencies = [ name = "ripgrep" version = "0.9.0" dependencies = [ - "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.32.0 
(registry+https://github.com/rust-lang/crates.io-index)", - "globset 0.4.1", "grep 0.2.0", "ignore 0.4.3", "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "same-file 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.75 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.75 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)", "termcolor 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 3ff769c6..0c489c46 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ path = "tests/tests.rs" members = [ "globset", "grep", + "grep-cli", "grep-matcher", "grep-pcre2", "grep-printer", @@ -44,15 +45,12 @@ members = [ ] [dependencies] -atty = "0.2.11" -globset = { version = "0.4.0", path = "globset" } grep = { version = "0.2.0", path = "grep" } ignore = { version = "0.4.0", path = "ignore" } lazy_static = "1" log = "0.4" num_cpus = "1" regex = "1" -same-file = "1" serde_json = "1" termcolor = "1" @@ -61,9 +59,6 @@ version = "2.32.0" default-features = false features = ["suggestions"] -[target.'cfg(windows)'.dependencies.winapi-util] -version = "0.1.1" - [build-dependencies] lazy_static = "1" diff --git a/grep-cli/Cargo.toml b/grep-cli/Cargo.toml new file mode 100644 index 00000000..1d5fda22 --- /dev/null +++ b/grep-cli/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "grep-cli" +version = "0.1.0" #:version +authors = ["Andrew Gallant "] +description = """ +Utilities for search oriented command line applications. 
+""" +documentation = "https://docs.rs/grep-cli" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["regex", "grep", "cli", "utility", "util"] +license = "Unlicense/MIT" + +[dependencies] +atty = "0.2.11" +globset = { version = "0.4.1", path = "../globset" } +lazy_static = "1.1" +log = "0.4" +regex = "1" +same-file = "1" +termcolor = "1" + +[target.'cfg(windows)'.dependencies.winapi-util] +version = "0.1.1" diff --git a/grep-cli/LICENSE-MIT b/grep-cli/LICENSE-MIT new file mode 100644 index 00000000..3b0a5dc0 --- /dev/null +++ b/grep-cli/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/grep-cli/README.md b/grep-cli/README.md new file mode 100644 index 00000000..e78317c6 --- /dev/null +++ b/grep-cli/README.md @@ -0,0 +1,38 @@ +grep-cli +-------- +A utility library that provides common routines desired in search oriented +command line applications. This includes, but is not limited to, parsing hex +escapes, detecting whether stdin is readable and more. To the extent possible, +this crate strives for compatibility across Windows, macOS and Linux. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/grep-cli.svg)](https://crates.io/crates/grep-cli) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + + +### Documentation + +[https://docs.rs/grep-cli](https://docs.rs/grep-cli) + +**NOTE:** You probably don't want to use this crate directly. Instead, you +should prefer the facade defined in the +[`grep`](https://docs.rs/grep) +crate. + + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +grep-cli = "0.1" +``` + +and this to your crate root: + +```rust +extern crate grep_cli; +``` diff --git a/grep-cli/UNLICENSE b/grep-cli/UNLICENSE new file mode 100644 index 00000000..68a49daa --- /dev/null +++ b/grep-cli/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. 
We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <http://unlicense.org/> diff --git a/grep-cli/src/decompress.rs b/grep-cli/src/decompress.rs new file mode 100644 index 00000000..ad108ea0 --- /dev/null +++ b/grep-cli/src/decompress.rs @@ -0,0 +1,381 @@ +use std::ffi::{OsStr, OsString}; +use std::fs::File; +use std::io; +use std::path::Path; +use std::process::Command; + +use globset::{Glob, GlobSet, GlobSetBuilder}; + +use process::{CommandError, CommandReader, CommandReaderBuilder}; + +/// A builder for a matcher that determines which files get decompressed. +#[derive(Clone, Debug)] +pub struct DecompressionMatcherBuilder { + /// The commands for each matching glob. + commands: Vec, + /// Whether to include the default matching rules. + defaults: bool, +} + +/// A representation of a single command for decompressing data +/// out-of-process. +#[derive(Clone, Debug)] +struct DecompressionCommand { + /// The glob that matches this command. + glob: String, + /// The command or binary name. + bin: OsString, + /// The arguments to invoke with the command. 
+ args: Vec, +} + +impl Default for DecompressionMatcherBuilder { + fn default() -> DecompressionMatcherBuilder { + DecompressionMatcherBuilder::new() + } +} + +impl DecompressionMatcherBuilder { + /// Create a new builder for configuring a decompression matcher. + pub fn new() -> DecompressionMatcherBuilder { + DecompressionMatcherBuilder { + commands: vec![], + defaults: true, + } + } + + /// Build a matcher for determining how to decompress files. + /// + /// If there was a problem compiling the matcher, then an error is + /// returned. + pub fn build(&self) -> Result { + let defaults = + if !self.defaults { + vec![] + } else { + default_decompression_commands() + }; + let mut glob_builder = GlobSetBuilder::new(); + let mut commands = vec![]; + for decomp_cmd in defaults.iter().chain(&self.commands) { + let glob = Glob::new(&decomp_cmd.glob).map_err(|err| { + CommandError::io(io::Error::new(io::ErrorKind::Other, err)) + })?; + glob_builder.add(glob); + commands.push(decomp_cmd.clone()); + } + let globs = glob_builder.build().map_err(|err| { + CommandError::io(io::Error::new(io::ErrorKind::Other, err)) + })?; + Ok(DecompressionMatcher { globs, commands }) + } + + /// When enabled, the default matching rules will be compiled into this + /// matcher before any other associations. When disabled, only the + /// rules explicitly given to this builder will be used. + /// + /// This is enabled by default. + pub fn defaults(&mut self, yes: bool) -> &mut DecompressionMatcherBuilder { + self.defaults = yes; + self + } + + /// Associates a glob with a command to decompress files matching the glob. + /// + /// If multiple globs match the same file, then the most recently added + /// glob takes precedence. + /// + /// The syntax for the glob is documented in the + /// [`globset` crate](https://docs.rs/globset/#syntax). 
+ pub fn associate( + &mut self, + glob: &str, + program: P, + args: I, + ) -> &mut DecompressionMatcherBuilder + where P: AsRef, + I: IntoIterator, + A: AsRef, + { + + let glob = glob.to_string(); + let bin = program.as_ref().to_os_string(); + let args = args + .into_iter() + .map(|a| a.as_ref().to_os_string()) + .collect(); + self.commands.push(DecompressionCommand { glob, bin, args }); + self + } +} + +/// A matcher for determining how to decompress files. +#[derive(Clone, Debug)] +pub struct DecompressionMatcher { + /// The set of globs to match. Each glob has a corresponding entry in + /// `commands`. When a glob matches, the corresponding command should be + /// used to perform out-of-process decompression. + globs: GlobSet, + /// The commands for each matching glob. + commands: Vec, +} + +impl Default for DecompressionMatcher { + fn default() -> DecompressionMatcher { + DecompressionMatcher::new() + } +} + +impl DecompressionMatcher { + /// Create a new matcher with default rules. + /// + /// To add more matching rules, build a matcher with + /// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html). + pub fn new() -> DecompressionMatcher { + DecompressionMatcherBuilder::new() + .build() + .expect("built-in matching rules should always compile") + } + + /// Return a pre-built command based on the given file path that can + /// decompress its contents. If no such decompressor is known, then this + /// returns `None`. + /// + /// If there are multiple possible commands matching the given path, then + /// the command added last takes precedence. + pub fn command>(&self, path: P) -> Option { + for i in self.globs.matches(path).into_iter().rev() { + let decomp_cmd = &self.commands[i]; + let mut cmd = Command::new(&decomp_cmd.bin); + cmd.args(&decomp_cmd.args); + return Some(cmd); + } + None + } + + /// Returns true if and only if the given file path has at least one + /// matching command to perform decompression on. 
+ pub fn has_command>(&self, path: P) -> bool { + self.globs.is_match(path) + } +} + +/// Configures and builds a streaming reader for decompressing data. +#[derive(Clone, Debug, Default)] +pub struct DecompressionReaderBuilder { + matcher: DecompressionMatcher, + command_builder: CommandReaderBuilder, +} + +impl DecompressionReaderBuilder { + /// Create a new builder with the default configuration. + pub fn new() -> DecompressionReaderBuilder { + DecompressionReaderBuilder::default() + } + + /// Build a new streaming reader for decompressing data. + /// + /// If decompression is done out-of-process and if there was a problem + /// spawning the process, then its error is logged at the debug level and a + /// passthru reader is returned that does no decompression. This behavior + /// typically occurs when the given file path matches a decompression + /// command, but is executing in an environment where the decompression + /// command is not available. + /// + /// If the given file path could not be matched with a decompression + /// strategy, then a passthru reader is returned that does no + /// decompression. + pub fn build>( + &self, + path: P, + ) -> Result { + let path = path.as_ref(); + let mut cmd = match self.matcher.command(path) { + None => return DecompressionReader::new_passthru(path), + Some(cmd) => cmd, + }; + cmd.arg(path); + + match self.command_builder.build(&mut cmd) { + Ok(cmd_reader) => Ok(DecompressionReader { rdr: Ok(cmd_reader) }), + Err(err) => { + debug!( + "{}: error spawning command '{:?}': {} \ + (falling back to uncompressed reader)", + path.display(), + cmd, + err, + ); + DecompressionReader::new_passthru(path) + } + } + } + + /// Set the matcher to use to look up the decompression command for each + /// file path. + /// + /// A set of sensible rules is enabled by default. Setting this will + /// completely replace the current rules. 
+ pub fn matcher( + &mut self, + matcher: DecompressionMatcher, + ) -> &mut DecompressionReaderBuilder { + self.matcher = matcher; + self + } + + /// Get the underlying matcher currently used by this builder. + pub fn get_matcher(&self) -> &DecompressionMatcher { + &self.matcher + } + + /// When enabled, the reader will asynchronously read the contents of the + /// command's stderr output. When disabled, stderr is only read after the + /// stdout stream has been exhausted (or if the process quits with an error + /// code). + /// + /// Note that when enabled, this may require launching an additional + /// thread in order to read stderr. This is done so that the process being + /// executed is never blocked from writing to stdout or stderr. If this is + /// disabled, then it is possible for the process to fill up the stderr + /// buffer and deadlock. + /// + /// This is enabled by default. + pub fn async_stderr( + &mut self, + yes: bool, + ) -> &mut DecompressionReaderBuilder { + self.command_builder.async_stderr(yes); + self + } +} + +/// A streaming reader for decompressing the contents of a file. +/// +/// The purpose of this reader is to provide a seamless way to decompress the +/// contents of file using existing tools in the current environment. This is +/// meant to be an alternative to using decompression libraries in favor of the +/// simplicity and portability of using external commands such as `gzip` and +/// `xz`. This does impose the overhead of spawning a process, so other means +/// for performing decompression should be sought if this overhead isn't +/// acceptable. +/// +/// A decompression reader comes with a default set of matching rules that are +/// meant to associate file paths with the corresponding command to use to +/// decompress them. For example, a glob like `*.gz` matches gzip compressed +/// files with the command `gzip -d -c`. 
If a file path does not match any +/// existing rules, or if it matches a rule whose command does not exist in the +/// current environment, then the decompression reader passes through the +/// contents of the underlying file without doing any decompression. +/// +/// The default matching rules are probably good enough for most cases, and if +/// they require revision, pull requests are welcome. In cases where they must +/// be changed or extended, they can be customized through the use of +/// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html) +/// and +/// [`DecompressionReaderBuilder`](struct.DecompressionReaderBuilder.html). +/// +/// By default, this reader will asynchronously read the processes' stderr. +/// This prevents subtle deadlocking bugs for noisy processes that write a lot +/// to stderr. Currently, the entire contents of stderr is read on to the heap. +/// +/// # Example +/// +/// This example shows how to read the decompressed contents of a file without +/// needing to explicitly choose the decompression command to run. +/// +/// Note that if you need to decompress multiple files, it is better to use +/// `DecompressionReaderBuilder`, which will amortize the cost of compiling the +/// matcher. +/// +/// ```no_run +/// use std::io::Read; +/// use std::process::Command; +/// use grep_cli::DecompressionReader; +/// +/// # fn example() -> Result<(), Box<::std::error::Error>> { +/// let mut rdr = DecompressionReader::new("/usr/share/man/man1/ls.1.gz")?; +/// let mut contents = vec![]; +/// rdr.read_to_end(&mut contents)?; +/// # Ok(()) } +/// ``` +#[derive(Debug)] +pub struct DecompressionReader { + rdr: Result, +} + +impl DecompressionReader { + /// Build a new streaming reader for decompressing data. + /// + /// If decompression is done out-of-process and if there was a problem + /// spawning the process, then its error is returned. 
+ /// + /// If the given file path could not be matched with a decompression + /// strategy, then a passthru reader is returned that does no + /// decompression. + /// + /// This uses the default matching rules for determining how to decompress + /// the given file. To change those matching rules, use + /// [`DecompressionReaderBuilder`](struct.DecompressionReaderBuilder.html) + /// and + /// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html). + /// + /// When creating readers for many paths, it is better to use the builder + /// since it will amortize the cost of constructing the matcher. + pub fn new>( + path: P, + ) -> Result { + DecompressionReaderBuilder::new().build(path) + } + + /// Creates a new "passthru" decompression reader that reads from the file + /// corresponding to the given path without doing decompression and without + /// executing another process. + fn new_passthru(path: &Path) -> Result { + let file = File::open(path)?; + Ok(DecompressionReader { rdr: Err(file) }) + } +} + +impl io::Read for DecompressionReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self.rdr { + Ok(ref mut rdr) => rdr.read(buf), + Err(ref mut rdr) => rdr.read(buf), + } + } +} + +fn default_decompression_commands() -> Vec { + const ARGS_GZIP: &[&str] = &["gzip", "-d", "-c"]; + const ARGS_BZIP: &[&str] = &["bzip2", "-d", "-c"]; + const ARGS_XZ: &[&str] = &["xz", "-d", "-c"]; + const ARGS_LZ4: &[&str] = &["lz4", "-d", "-c"]; + const ARGS_LZMA: &[&str] = &["xz", "--format=lzma", "-d", "-c"]; + + fn cmd(glob: &str, args: &[&str]) -> DecompressionCommand { + DecompressionCommand { + glob: glob.to_string(), + bin: OsStr::new(&args[0]).to_os_string(), + args: args + .iter() + .skip(1) + .map(|s| OsStr::new(s).to_os_string()) + .collect(), + } + } + vec![ + cmd("*.gz", ARGS_GZIP), + cmd("*.tgz", ARGS_GZIP), + + cmd("*.bz2", ARGS_BZIP), + cmd("*.tbz2", ARGS_BZIP), + + cmd("*.xz", ARGS_XZ), + cmd("*.txz", ARGS_XZ), + + cmd("*.lz4", ARGS_LZ4), 
+ + cmd("*.lzma", ARGS_LZMA), + ] +} diff --git a/grep-cli/src/escape.rs b/grep-cli/src/escape.rs new file mode 100644 index 00000000..9b350a93 --- /dev/null +++ b/grep-cli/src/escape.rs @@ -0,0 +1,315 @@ +use std::ffi::OsStr; +use std::str; + +/// A single state in the state machine used by `unescape`. +#[derive(Clone, Copy, Eq, PartialEq)] +enum State { + /// The state after seeing a `\`. + Escape, + /// The state after seeing a `\x`. + HexFirst, + /// The state after seeing a `\x[0-9A-Fa-f]`. + HexSecond(char), + /// Default state. + Literal, +} + +/// Escapes arbitrary bytes into a human readable string. +/// +/// This converts `\t`, `\r` and `\n` into their escaped forms. It also +/// converts the non-printable subset of ASCII in addition to invalid UTF-8 +/// bytes to hexadecimal escape sequences. Everything else is left as is. +/// +/// The dual of this routine is [`unescape`](fn.unescape.html). +/// +/// # Example +/// +/// This example shows how to convert a byte string that contains a `\n` and +/// invalid UTF-8 bytes into a `String`. +/// +/// Pay special attention to the use of raw strings. That is, `r"\n"` is +/// equivalent to `"\\n"`. +/// +/// ``` +/// use grep_cli::escape; +/// +/// assert_eq!(r"foo\nbar\xFFbaz", escape(b"foo\nbar\xFFbaz")); +/// ``` +pub fn escape(mut bytes: &[u8]) -> String { + let mut escaped = String::new(); + while let Some(result) = decode_utf8(bytes) { + match result { + Ok(cp) => { + escape_char(cp, &mut escaped); + bytes = &bytes[cp.len_utf8()..]; + } + Err(byte) => { + escape_byte(byte, &mut escaped); + bytes = &bytes[1..]; + } + } + } + escaped +} + +/// Escapes an OS string into a human readable string. +/// +/// This is like [`escape`](fn.escape.html), but accepts an OS string. 
+pub fn escape_os(string: &OsStr) -> String { + #[cfg(unix)] + fn imp(string: &OsStr) -> String { + use std::os::unix::ffi::OsStrExt; + + escape(string.as_bytes()) + } + + #[cfg(not(unix))] + fn imp(string: &OsStr) -> String { + escape(string.to_string_lossy().as_bytes()) + } + + imp(string) +} + +/// Unescapes a string. +/// +/// It supports a limited set of escape sequences: +/// +/// * `\t`, `\r` and `\n` are mapped to their corresponding ASCII bytes. +/// * `\xZZ` hexadecimal escapes are mapped to their byte. +/// +/// Everything else is left as is, including non-hexadecimal escapes like +/// `\xGG`. +/// +/// This is useful when it is desirable for a command line argument to be +/// capable of specifying arbitrary bytes or otherwise make it easier to +/// specify non-printable characters. +/// +/// The dual of this routine is [`escape`](fn.escape.html). +/// +/// # Example +/// +/// This example shows how to convert an escaped string (which is valid UTF-8) +/// into a corresponding sequence of bytes. Each escape sequence is mapped to +/// its bytes, which may include invalid UTF-8. +/// +/// Pay special attention to the use of raw strings. That is, `r"\n"` is +/// equivalent to `"\\n"`. 
+/// +/// ``` +/// use grep_cli::unescape; +/// +/// assert_eq!(&b"foo\nbar\xFFbaz"[..], &*unescape(r"foo\nbar\xFFbaz")); +/// ``` +pub fn unescape(s: &str) -> Vec { + use self::State::*; + + let mut bytes = vec![]; + let mut state = Literal; + for c in s.chars() { + match state { + Escape => { + match c { + '\\' => { bytes.push(b'\\'); state = Literal; } + 'n' => { bytes.push(b'\n'); state = Literal; } + 'r' => { bytes.push(b'\r'); state = Literal; } + 't' => { bytes.push(b'\t'); state = Literal; } + 'x' => { state = HexFirst; } + c => { + bytes.extend(format!(r"\{}", c).into_bytes()); + state = Literal; + } + } + } + HexFirst => { + match c { + '0'...'9' | 'A'...'F' | 'a'...'f' => { + state = HexSecond(c); + } + c => { + bytes.extend(format!(r"\x{}", c).into_bytes()); + state = Literal; + } + } + } + HexSecond(first) => { + match c { + '0'...'9' | 'A'...'F' | 'a'...'f' => { + let ordinal = format!("{}{}", first, c); + let byte = u8::from_str_radix(&ordinal, 16).unwrap(); + bytes.push(byte); + state = Literal; + } + c => { + let original = format!(r"\x{}{}", first, c); + bytes.extend(original.into_bytes()); + state = Literal; + } + } + } + Literal => { + match c { + '\\' => { state = Escape; } + c => { bytes.extend(c.to_string().as_bytes()); } + } + } + } + } + match state { + Escape => bytes.push(b'\\'), + HexFirst => bytes.extend(b"\\x"), + HexSecond(c) => bytes.extend(format!("\\x{}", c).into_bytes()), + Literal => {} + } + bytes +} + +/// Unescapes an OS string. +/// +/// This is like [`unescape`](fn.unescape.html), but accepts an OS string. +/// +/// Note that this first lossily decodes the given OS string as UTF-8. That +/// is, an escaped string (the thing given) should be valid UTF-8. +pub fn unescape_os(string: &OsStr) -> Vec { + unescape(&string.to_string_lossy()) +} + +/// Adds the given codepoint to the given string, escaping it if necessary. 
+fn escape_char(cp: char, into: &mut String) { + if cp.is_ascii() { + escape_byte(cp as u8, into); + } else { + into.push(cp); + } +} + +/// Adds the given byte to the given string, escaping it if necessary. +fn escape_byte(byte: u8, into: &mut String) { + match byte { + 0x21...0x5B | 0x5D...0x7D => into.push(byte as char), + b'\n' => into.push_str(r"\n"), + b'\r' => into.push_str(r"\r"), + b'\t' => into.push_str(r"\t"), + b'\\' => into.push_str(r"\\"), + _ => into.push_str(&format!(r"\x{:02X}", byte)), + } +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +fn decode_utf8(bytes: &[u8]) -> Option> { + if bytes.is_empty() { + return None; + } + let len = match utf8_len(bytes[0]) { + None => return Some(Err(bytes[0])), + Some(len) if len > bytes.len() => return Some(Err(bytes[0])), + Some(len) => len, + }; + match str::from_utf8(&bytes[..len]) { + Ok(s) => Some(Ok(s.chars().next().unwrap())), + Err(_) => Some(Err(bytes[0])), + } +} + +/// Given a UTF-8 leading byte, this returns the total number of code units +/// in the following encoded codepoint. +/// +/// If the given byte is not a valid UTF-8 leading byte, then this returns +/// `None`. 
+fn utf8_len(byte: u8) -> Option { + if byte <= 0x7F { + Some(1) + } else if byte <= 0b110_11111 { + Some(2) + } else if byte <= 0b1110_1111 { + Some(3) + } else if byte <= 0b1111_0111 { + Some(4) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::{escape, unescape}; + + fn b(bytes: &'static [u8]) -> Vec { + bytes.to_vec() + } + + #[test] + fn empty() { + assert_eq!(b(b""), unescape(r"")); + assert_eq!(r"", escape(b"")); + } + + #[test] + fn backslash() { + assert_eq!(b(b"\\"), unescape(r"\\")); + assert_eq!(r"\\", escape(b"\\")); + } + + #[test] + fn nul() { + assert_eq!(b(b"\x00"), unescape(r"\x00")); + assert_eq!(r"\x00", escape(b"\x00")); + } + + #[test] + fn nl() { + assert_eq!(b(b"\n"), unescape(r"\n")); + assert_eq!(r"\n", escape(b"\n")); + } + + #[test] + fn tab() { + assert_eq!(b(b"\t"), unescape(r"\t")); + assert_eq!(r"\t", escape(b"\t")); + } + + #[test] + fn carriage() { + assert_eq!(b(b"\r"), unescape(r"\r")); + assert_eq!(r"\r", escape(b"\r")); + } + + #[test] + fn nothing_simple() { + assert_eq!(b(b"\\a"), unescape(r"\a")); + assert_eq!(b(b"\\a"), unescape(r"\\a")); + assert_eq!(r"\\a", escape(b"\\a")); + } + + #[test] + fn nothing_hex0() { + assert_eq!(b(b"\\x"), unescape(r"\x")); + assert_eq!(b(b"\\x"), unescape(r"\\x")); + assert_eq!(r"\\x", escape(b"\\x")); + } + + #[test] + fn nothing_hex1() { + assert_eq!(b(b"\\xz"), unescape(r"\xz")); + assert_eq!(b(b"\\xz"), unescape(r"\\xz")); + assert_eq!(r"\\xz", escape(b"\\xz")); + } + + #[test] + fn nothing_hex2() { + assert_eq!(b(b"\\xzz"), unescape(r"\xzz")); + assert_eq!(b(b"\\xzz"), unescape(r"\\xzz")); + assert_eq!(r"\\xzz", escape(b"\\xzz")); + } + + #[test] + fn invalid_utf8() { + assert_eq!(r"\xFF", escape(b"\xFF")); + assert_eq!(r"a\xFFb", escape(b"a\xFFb")); + } +} diff --git a/grep-cli/src/human.rs b/grep-cli/src/human.rs new file mode 100644 index 00000000..a69fd376 --- /dev/null +++ b/grep-cli/src/human.rs @@ -0,0 +1,171 @@ +use std::error; +use std::fmt; +use std::io; +use 
std::num::ParseIntError; + +use regex::Regex; + +/// An error that occurs when parsing a human readable size description. +/// +/// This error provides an end user friendly message describing why the +/// description couldn't be parsed and what the expected format is. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ParseSizeError { + original: String, + kind: ParseSizeErrorKind, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +enum ParseSizeErrorKind { + InvalidFormat, + InvalidInt(ParseIntError), + Overflow, +} + +impl ParseSizeError { + fn format(original: &str) -> ParseSizeError { + ParseSizeError { + original: original.to_string(), + kind: ParseSizeErrorKind::InvalidFormat, + } + } + + fn int(original: &str, err: ParseIntError) -> ParseSizeError { + ParseSizeError { + original: original.to_string(), + kind: ParseSizeErrorKind::InvalidInt(err), + } + } + + fn overflow(original: &str) -> ParseSizeError { + ParseSizeError { + original: original.to_string(), + kind: ParseSizeErrorKind::Overflow, + } + } +} + +impl error::Error for ParseSizeError { + fn description(&self) -> &str { "invalid size" } +} + +impl fmt::Display for ParseSizeError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::ParseSizeErrorKind::*; + + match self.kind { + InvalidFormat => { + write!( + f, + "invalid format for size '{}', which should be a sequence \ + of digits followed by an optional 'K', 'M' or 'G' \ + suffix", + self.original + ) + } + InvalidInt(ref err) => { + write!( + f, + "invalid integer found in size '{}': {}", + self.original, + err + ) + } + Overflow => { + write!(f, "size too big in '{}'", self.original) + } + } + } +} + +impl From for io::Error { + fn from(size_err: ParseSizeError) -> io::Error { + io::Error::new(io::ErrorKind::Other, size_err) + } +} + +/// Parse a human readable size like `2M` into a corresponding number of bytes. +/// +/// Supported size suffixes are `K` (for kilobyte), `M` (for megabyte) and `G` +/// (for gigabyte). 
If a size suffix is missing, then the size is interpreted +/// as bytes. If the size is too big to fit into a `u64`, then this returns an +/// error. +/// +/// Additional suffixes may be added over time. +pub fn parse_human_readable_size(size: &str) -> Result { + lazy_static! { + // Normally I'd just parse something this simple by hand to avoid the + // regex dep, but we bring regex in anyway for glob matching, so might + // as well use it. + static ref RE: Regex = Regex::new(r"^([0-9]+)([KMG])?$").unwrap(); + } + + let caps = match RE.captures(size) { + Some(caps) => caps, + None => return Err(ParseSizeError::format(size)), + }; + let value: u64 = caps[1].parse().map_err(|err| { + ParseSizeError::int(size, err) + })?; + let suffix = match caps.get(2) { + None => return Ok(value), + Some(cap) => cap.as_str(), + }; + let bytes = match suffix { + "K" => value.checked_mul(1<<10), + "M" => value.checked_mul(1<<20), + "G" => value.checked_mul(1<<30), + // Because if the regex matches this group, it must be [KMG]. 
+ _ => unreachable!(), + }; + bytes.ok_or_else(|| ParseSizeError::overflow(size)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn suffix_none() { + let x = parse_human_readable_size("123").unwrap(); + assert_eq!(123, x); + } + + #[test] + fn suffix_k() { + let x = parse_human_readable_size("123K").unwrap(); + assert_eq!(123 * (1<<10), x); + } + + #[test] + fn suffix_m() { + let x = parse_human_readable_size("123M").unwrap(); + assert_eq!(123 * (1<<20), x); + } + + #[test] + fn suffix_g() { + let x = parse_human_readable_size("123G").unwrap(); + assert_eq!(123 * (1<<30), x); + } + + #[test] + fn invalid_empty() { + assert!(parse_human_readable_size("").is_err()); + } + + #[test] + fn invalid_non_digit() { + assert!(parse_human_readable_size("a").is_err()); + } + + #[test] + fn invalid_overflow() { + assert!(parse_human_readable_size("9999999999999999G").is_err()); + } + + #[test] + fn invalid_suffix() { + assert!(parse_human_readable_size("123T").is_err()); + } +} diff --git a/grep-cli/src/lib.rs b/grep-cli/src/lib.rs new file mode 100644 index 00000000..b9909c20 --- /dev/null +++ b/grep-cli/src/lib.rs @@ -0,0 +1,251 @@ +/*! +This crate provides common routines used in command line applications, with a +focus on routines useful for search oriented applications. As a utility +library, there is no central type or function. However, a key focus of this +crate is to improve failure modes and provide user friendly error messages +when things go wrong. + +To the best extent possible, everything in this crate works on Windows, macOS +and Linux. + + +# Standard I/O + +The +[`is_readable_stdin`](fn.is_readable_stdin.html), +[`is_tty_stderr`](fn.is_tty_stderr.html), +[`is_tty_stdin`](fn.is_tty_stdin.html) +and +[`is_tty_stdout`](fn.is_tty_stdout.html) +routines query aspects of standard I/O. `is_readable_stdin` determines whether +stdin can be usefully read from, while the `tty` methods determine whether a +tty is attached to stdin/stdout/stderr. 
+ +`is_readable_stdin` is useful when writing an application that changes behavior +based on whether the application was invoked with data on stdin. For example, +`rg foo` might recursively search the current working directory for +occurrences of `foo`, but `rg foo < file` might only search the contents of +`file`. + +The `tty` methods are useful for similar reasons. Namely, commands like `ls` +will change their output depending on whether they are printing to a terminal +or not. For example, `ls` shows a file on each line when stdout is redirected +to a file or a pipe, but condenses the output to show possibly many files on +each line when stdout is connected to a tty. + + +# Coloring and buffering + +The +[`stdout`](fn.stdout.html), +[`stdout_buffered_block`](fn.stdout_buffered_block.html) +and +[`stdout_buffered_line`](fn.stdout_buffered_line.html) +routines are alternative constructors for +[`StandardStream`](struct.StandardStream.html). +A `StandardStream` implements `termcolor::WriteColor`, which provides a way +to emit colors to terminals. Its key use is the encapsulation of buffering +style. Namely, `stdout` will return a line buffered `StandardStream` if and +only if stdout is connected to a tty, and will otherwise return a block +buffered `StandardStream`. Line buffering is important for use with a tty +because it typically decreases the latency at which the end user sees output. +Block buffering is used otherwise because it is faster, and redirecting stdout +to a file typically doesn't benefit from the decreased latency that line +buffering provides. + +The `stdout_buffered_block` and `stdout_buffered_line` can be used to +explicitly set the buffering strategy regardless of whether stdout is connected +to a tty or not. 
+ + +# Escaping + +The +[`escape`](fn.escape.html), +[`escape_os`](fn.escape_os.html), +[`unescape`](fn.unescape.html) +and +[`unescape_os`](fn.unescape_os.html) +routines provide a user friendly way of dealing with UTF-8 encoded strings that +can express arbitrary bytes. For example, you might want to accept a string +containing arbitrary bytes as a command line argument, but most interactive +shells make such strings difficult to type. Instead, we can ask users to use +escape sequences. + +For example, `a\xFFz` is itself a valid UTF-8 string corresponding to the +following bytes: + +```ignore +[b'a', b'\\', b'x', b'F', b'F', b'z'] +``` + +However, we can +interpret `\xFF` as an escape sequence with the `unescape`/`unescape_os` +routines, which will yield + +```ignore +[b'a', b'\xFF', b'z'] +``` + +instead. For example: + +``` +use grep_cli::unescape; + +// Note the use of a raw string! +assert_eq!(vec![b'a', b'\xFF', b'z'], unescape(r"a\xFFz")); +``` + +The `escape`/`escape_os` routines provide the reverse transformation, which +makes it easy to show user friendly error messages involving arbitrary bytes. + + +# Building patterns + +Typically, regular expression patterns must be valid UTF-8. However, command +line arguments aren't guaranteed to be valid UTF-8. Unfortunately, the +standard library's UTF-8 conversion functions from `OsStr`s do not provide +good error messages. However, the +[`pattern_from_bytes`](fn.pattern_from_bytes.html) +and +[`pattern_from_os`](fn.pattern_from_os.html) +do, including reporting exactly where the first invalid UTF-8 byte is seen. + +Additionally, it can be useful to read patterns from a file while reporting +good error messages that include line numbers. The +[`patterns_from_path`](fn.patterns_from_path.html), +[`patterns_from_reader`](fn.patterns_from_reader.html) +and +[`patterns_from_stdin`](fn.patterns_from_stdin.html) +routines do just that. 
If any pattern is found that is invalid UTF-8, then the +error includes the file path (if available) along with the line number and the +byte offset at which the first invalid UTF-8 byte was observed. + + +# Read process output + +Sometimes a command line application needs to execute other processes and read +its stdout in a streaming fashion. The +[`CommandReader`](struct.CommandReader.html) +provides this functionality with an explicit goal of improving failure modes. +In particular, if the process exits with an error code, then stderr is read +and converted into a normal Rust error to show to end users. This makes the +underlying failure modes explicit and gives more information to end users for +debugging the problem. + +As a special case, +[`DecompressionReader`](struct.DecompressionReader.html) +provides a way to decompress arbitrary files by matching their file extensions +up with corresponding decompression programs (such as `gzip` and `xz`). This +is useful as a means of performing simplistic decompression in a portable +manner without binding to specific compression libraries. This does come with +some overhead though, so if you need to decompress lots of small files, this +may not be an appropriate convenience to use. + +Each reader has a corresponding builder for additional configuration, such as +whether to read stderr asynchronously in order to avoid deadlock (which is +enabled by default). + + +# Miscellaneous parsing + +The +[`parse_human_readable_size`](fn.parse_human_readable_size.html) +routine parses strings like `2M` and converts them to the corresponding number +of bytes (`2 * 1<<20` in this case). If an invalid size is found, then a good +error message is crafted that typically tells the user how to fix the problem. 
+*/ + +#![deny(missing_docs)] + +extern crate atty; +extern crate globset; +#[macro_use] +extern crate lazy_static; +#[macro_use] +extern crate log; +extern crate regex; +extern crate same_file; +extern crate termcolor; +#[cfg(windows)] +extern crate winapi_util; + +mod decompress; +mod escape; +mod human; +mod pattern; +mod process; +mod wtr; + +pub use decompress::{ + DecompressionMatcher, DecompressionMatcherBuilder, + DecompressionReader, DecompressionReaderBuilder, +}; +pub use escape::{escape, escape_os, unescape, unescape_os}; +pub use human::{ParseSizeError, parse_human_readable_size}; +pub use pattern::{ + InvalidPatternError, + pattern_from_os, pattern_from_bytes, + patterns_from_path, patterns_from_reader, patterns_from_stdin, +}; +pub use process::{CommandError, CommandReader, CommandReaderBuilder}; +pub use wtr::{ + StandardStream, + stdout, stdout_buffered_line, stdout_buffered_block, +}; + +/// Returns true if and only if stdin is believed to be readable. +/// +/// When stdin is readable, command line programs may choose to behave +/// differently than when stdin is not readable. For example, `command foo` +/// might search the current directory for occurrences of `foo` whereas +/// `command foo < some-file` or `cat some-file | command foo` might instead +/// only search stdin for occurrences of `foo`. +pub fn is_readable_stdin() -> bool { + #[cfg(unix)] + fn imp() -> bool { + use std::os::unix::fs::FileTypeExt; + use same_file::Handle; + + let ft = match Handle::stdin().and_then(|h| h.as_file().metadata()) { + Err(_) => return false, + Ok(md) => md.file_type(), + }; + ft.is_file() || ft.is_fifo() + } + + #[cfg(windows)] + fn imp() -> bool { + use winapi_util as winutil; + + winutil::file::typ(winutil::HandleRef::stdin()) + .map(|t| t.is_disk() || t.is_pipe()) + .unwrap_or(false) + } + + !is_tty_stdin() && imp() +} + +/// Returns true if and only if stdin is believed to be connected to a tty +/// or a console. 
+pub fn is_tty_stdin() -> bool { + atty::is(atty::Stream::Stdin) +} + +/// Returns true if and only if stdout is believed to be connected to a tty +/// or a console. +/// +/// This is useful for when you want your command line program to produce +/// different output depending on whether it's printing directly to a user's +/// terminal or whether it's being redirected somewhere else. For example, +/// implementations of `ls` will often show one item per line when stdout is +/// redirected, but will condense output when printing to a tty. +pub fn is_tty_stdout() -> bool { + atty::is(atty::Stream::Stdout) +} + +/// Returns true if and only if stderr is believed to be connected to a tty +/// or a console. +pub fn is_tty_stderr() -> bool { + atty::is(atty::Stream::Stderr) +} diff --git a/grep-cli/src/pattern.rs b/grep-cli/src/pattern.rs new file mode 100644 index 00000000..ed1d95a5 --- /dev/null +++ b/grep-cli/src/pattern.rs @@ -0,0 +1,205 @@ +use std::error; +use std::ffi::OsStr; +use std::fmt; +use std::fs::File; +use std::io::{self, BufRead}; +use std::path::Path; +use std::str; + +use escape::{escape, escape_os}; + +/// An error that occurs when a pattern could not be converted to valid UTF-8. +/// +/// The purpose of this error is to give a more targeted failure mode for +/// patterns written by end users that are not valid UTF-8. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct InvalidPatternError { + original: String, + valid_up_to: usize, +} + +impl InvalidPatternError { + /// Returns the index in the given string up to which valid UTF-8 was + /// verified. 
+ pub fn valid_up_to(&self) -> usize { + self.valid_up_to + } +} + +impl error::Error for InvalidPatternError { + fn description(&self) -> &str { "invalid pattern" } +} + +impl fmt::Display for InvalidPatternError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "found invalid UTF-8 in pattern at byte offset {} \ + (use hex escape sequences to match arbitrary bytes \ + in a pattern, e.g., \\xFF): '{}'", + self.valid_up_to, + self.original, + ) + } +} + +impl From for io::Error { + fn from(paterr: InvalidPatternError) -> io::Error { + io::Error::new(io::ErrorKind::Other, paterr) + } +} + +/// Convert an OS string into a regular expression pattern. +/// +/// This conversion fails if the given pattern is not valid UTF-8, in which +/// case, a targeted error with more information about where the invalid UTF-8 +/// occurs is given. The error also suggests the use of hex escape sequences, +/// which are supported by many regex engines. +pub fn pattern_from_os(pattern: &OsStr) -> Result<&str, InvalidPatternError> { + pattern.to_str().ok_or_else(|| { + let valid_up_to = pattern + .to_string_lossy() + .find('\u{FFFD}') + .expect("a Unicode replacement codepoint for invalid UTF-8"); + InvalidPatternError { + original: escape_os(pattern), + valid_up_to: valid_up_to, + } + }) +} + +/// Convert arbitrary bytes into a regular expression pattern. +/// +/// This conversion fails if the given pattern is not valid UTF-8, in which +/// case, a targeted error with more information about where the invalid UTF-8 +/// occurs is given. The error also suggests the use of hex escape sequences, +/// which are supported by many regex engines. +pub fn pattern_from_bytes( + pattern: &[u8], +) -> Result<&str, InvalidPatternError> { + str::from_utf8(pattern).map_err(|err| { + InvalidPatternError { + original: escape(pattern), + valid_up_to: err.valid_up_to(), + } + }) +} + +/// Read patterns from a file path, one per line. 
+/// +/// If there was a problem reading or if any of the patterns contain invalid +/// UTF-8, then an error is returned. If there was a problem with a specific +/// pattern, then the error message will include the line number and the file +/// path. +pub fn patterns_from_path>(path: P) -> io::Result> { + let path = path.as_ref(); + let file = File::open(path).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("{}: {}", path.display(), err), + ) + })?; + patterns_from_reader(file).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("{}:{}", path.display(), err), + ) + }) +} + +/// Read patterns from stdin, one per line. +/// +/// If there was a problem reading or if any of the patterns contain invalid +/// UTF-8, then an error is returned. If there was a problem with a specific +/// pattern, then the error message will include the line number and the fact +/// that it came from stdin. +pub fn patterns_from_stdin() -> io::Result> { + let stdin = io::stdin(); + let locked = stdin.lock(); + patterns_from_reader(locked).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!(":{}", err), + ) + }) +} + +/// Read patterns from any reader, one per line. +/// +/// If there was a problem reading or if any of the patterns contain invalid +/// UTF-8, then an error is returned. If there was a problem with a specific +/// pattern, then the error message will include the line number. +/// +/// Note that this routine uses its own internal buffer, so the caller should +/// not provide their own buffered reader if possible. +/// +/// # Example +/// +/// This shows how to parse patterns, one per line. 
+/// +/// ``` +/// use grep_cli::patterns_from_reader; +/// +/// # fn example() -> Result<(), Box<::std::error::Error>> { +/// let patterns = "\ +/// foo +/// bar\\s+foo +/// [a-z]{3} +/// "; +/// +/// assert_eq!(patterns_from_reader(patterns.as_bytes())?, vec![ +/// r"foo", +/// r"bar\s+foo", +/// r"[a-z]{3}", +/// ]); +/// # Ok(()) } +/// ``` +pub fn patterns_from_reader(rdr: R) -> io::Result> { + let mut patterns = vec![]; + let mut bufrdr = io::BufReader::new(rdr); + let mut line = vec![]; + let mut line_number = 0; + while { + line.clear(); + line_number += 1; + bufrdr.read_until(b'\n', &mut line)? > 0 + } { + line.pop().unwrap(); // remove trailing '\n' + if line.last() == Some(&b'\r') { + line.pop().unwrap(); + } + match pattern_from_bytes(&line) { + Ok(pattern) => patterns.push(pattern.to_string()), + Err(err) => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("{}: {}", line_number, err), + )); + } + } + } + Ok(patterns) +} + +#[cfg(test)] +mod tests { + use super::{pattern_from_bytes, pattern_from_os}; + + #[test] + fn bytes() { + let pat = b"abc\xFFxyz"; + let err = pattern_from_bytes(pat).unwrap_err(); + assert_eq!(3, err.valid_up_to()); + } + + #[test] + #[cfg(unix)] + fn os() { + use std::os::unix::ffi::OsStrExt; + use std::ffi::OsStr; + + let pat = OsStr::from_bytes(b"abc\xFFxyz"); + let err = pattern_from_os(pat).unwrap_err(); + assert_eq!(3, err.valid_up_to()); + } +} diff --git a/grep-cli/src/process.rs b/grep-cli/src/process.rs new file mode 100644 index 00000000..017dd0c3 --- /dev/null +++ b/grep-cli/src/process.rs @@ -0,0 +1,267 @@ +use std::error; +use std::fmt; +use std::io::{self, Read}; +use std::iter; +use std::process; +use std::thread::{self, JoinHandle}; + +/// An error that can occur while running a command and reading its output. +/// +/// This error can be seamlessly converted to an `io::Error` via a `From` +/// implementation. 
+#[derive(Debug)] +pub struct CommandError { + kind: CommandErrorKind, +} + +#[derive(Debug)] +enum CommandErrorKind { + Io(io::Error), + Stderr(Vec), +} + +impl CommandError { + /// Create an error from an I/O error. + pub(crate) fn io(ioerr: io::Error) -> CommandError { + CommandError { kind: CommandErrorKind::Io(ioerr) } + } + + /// Create an error from the contents of stderr (which may be empty). + pub(crate) fn stderr(bytes: Vec) -> CommandError { + CommandError { kind: CommandErrorKind::Stderr(bytes) } + } +} + +impl error::Error for CommandError { + fn description(&self) -> &str { "command error" } +} + +impl fmt::Display for CommandError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.kind { + CommandErrorKind::Io(ref e) => e.fmt(f), + CommandErrorKind::Stderr(ref bytes) => { + let msg = String::from_utf8_lossy(bytes); + if msg.trim().is_empty() { + write!(f, "") + } else { + let div = iter::repeat('-').take(79).collect::(); + write!(f, "\n{div}\n{msg}\n{div}", div=div, msg=msg.trim()) + } + } + } + } +} + +impl From for CommandError { + fn from(ioerr: io::Error) -> CommandError { + CommandError { kind: CommandErrorKind::Io(ioerr) } + } +} + +impl From for io::Error { + fn from(cmderr: CommandError) -> io::Error { + match cmderr.kind { + CommandErrorKind::Io(ioerr) => ioerr, + CommandErrorKind::Stderr(_) => { + io::Error::new(io::ErrorKind::Other, cmderr) + } + } + } +} + +/// Configures and builds a streaming reader for process output. +#[derive(Clone, Debug, Default)] +pub struct CommandReaderBuilder { + async_stderr: bool, +} + +impl CommandReaderBuilder { + /// Create a new builder with the default configuration. + pub fn new() -> CommandReaderBuilder { + CommandReaderBuilder::default() + } + + /// Build a new streaming reader for the given command's output. 
+ /// + /// The caller should set everything that's required on the given command + /// before building a reader, such as its arguments, environment and + /// current working directory. Settings such as the stdout and stderr (but + /// not stdin) pipes will be overridden so that they can be controlled by + /// the reader. + /// + /// If there was a problem spawning the given command, then its error is + /// returned. + pub fn build( + &self, + command: &mut process::Command, + ) -> Result { + let mut child = command + .stdout(process::Stdio::piped()) + .stderr(process::Stdio::piped()) + .spawn()?; + let stdout = child.stdout.take().unwrap(); + let stderr = + if self.async_stderr { + StderrReader::async(child.stderr.take().unwrap()) + } else { + StderrReader::sync(child.stderr.take().unwrap()) + }; + Ok(CommandReader { + child: child, + stdout: stdout, + stderr: stderr, + done: false, + }) + } + + /// When enabled, the reader will asynchronously read the contents of the + /// command's stderr output. When disabled, stderr is only read after the + /// stdout stream has been exhausted (or if the process quits with an error + /// code). + /// + /// Note that when enabled, this may require launching an additional + /// thread in order to read stderr. This is done so that the process being + /// executed is never blocked from writing to stdout or stderr. If this is + /// disabled, then it is possible for the process to fill up the stderr + /// buffer and deadlock. + /// + /// This is enabled by default. + pub fn async_stderr(&mut self, yes: bool) -> &mut CommandReaderBuilder { + self.async_stderr = yes; + self + } +} + +/// A streaming reader for a command's output. +/// +/// The purpose of this reader is to provide an easy way to execute processes +/// whose stdout is read in a streaming way while also making the processes' +/// stderr available when the process fails with an exit code. 
This makes it +/// possible to execute processes while surfacing the underlying failure mode +/// in the case of an error. +/// +/// Moreover, by default, this reader will asynchronously read the processes' +/// stderr. This prevents subtle deadlocking bugs for noisy processes that +/// write a lot to stderr. Currently, the entire contents of stderr is read +/// on to the heap. +/// +/// # Example +/// +/// This example shows how to invoke `gzip` to decompress the contents of a +/// file. If the `gzip` command reports a failing exit status, then its stderr +/// is returned as an error. +/// +/// ```no_run +/// use std::io::Read; +/// use std::process::Command; +/// use grep_cli::CommandReader; +/// +/// # fn example() -> Result<(), Box<::std::error::Error>> { +/// let mut cmd = Command::new("gzip"); +/// cmd.arg("-d").arg("-c").arg("/usr/share/man/man1/ls.1.gz"); +/// +/// let mut rdr = CommandReader::new(&mut cmd)?; +/// let mut contents = vec![]; +/// rdr.read_to_end(&mut contents)?; +/// # Ok(()) } +/// ``` +#[derive(Debug)] +pub struct CommandReader { + child: process::Child, + stdout: process::ChildStdout, + stderr: StderrReader, + done: bool, +} + +impl CommandReader { + /// Create a new streaming reader for the given command using the default + /// configuration. + /// + /// The caller should set everything that's required on the given command + /// before building a reader, such as its arguments, environment and + /// current working directory. Settings such as the stdout and stderr (but + /// not stdin) pipes will be overridden so that they can be controlled by + /// the reader. + /// + /// If there was a problem spawning the given command, then its error is + /// returned. + /// + /// If the caller requires additional configuration for the reader + /// returned, then use + /// [`CommandReaderBuilder`](struct.CommandReaderBuilder.html). 
+ pub fn new( + cmd: &mut process::Command, + ) -> Result { + CommandReaderBuilder::new().build(cmd) + } +} + +impl io::Read for CommandReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.done { + return Ok(0); + } + let nread = self.stdout.read(buf)?; + if nread == 0 { + self.done = true; + // Reap the child now that we're done reading. If the command + // failed, report stderr as an error. + if !self.child.wait()?.success() { + return Err(io::Error::from(self.stderr.read_to_end())); + } + } + Ok(nread) + } +} + +/// A reader that encapsulates the asynchronous or synchronous reading of +/// stderr. +#[derive(Debug)] +enum StderrReader { + Async(Option>), + Sync(process::ChildStderr), +} + +impl StderrReader { + /// Create a reader for stderr that reads contents asynchronously. + fn async(mut stderr: process::ChildStderr) -> StderrReader { + let handle = thread::spawn(move || { + stderr_to_command_error(&mut stderr) + }); + StderrReader::Async(Some(handle)) + } + + /// Create a reader for stderr that reads contents synchronously. + fn sync(stderr: process::ChildStderr) -> StderrReader { + StderrReader::Sync(stderr) + } + + /// Consumes all of stderr on to the heap and returns it as an error. + /// + /// If there was a problem reading stderr itself, then this returns an I/O + /// command error. 
+ fn read_to_end(&mut self) -> CommandError { + match *self { + StderrReader::Async(ref mut handle) => { + let handle = handle + .take() + .expect("read_to_end cannot be called more than once"); + handle + .join() + .expect("stderr reading thread does not panic") + } + StderrReader::Sync(ref mut stderr) => { + stderr_to_command_error(stderr) + } + } + } +} + +fn stderr_to_command_error(stderr: &mut process::ChildStderr) -> CommandError { + let mut bytes = vec![]; + match stderr.read_to_end(&mut bytes) { + Ok(_) => CommandError::stderr(bytes), + Err(err) => CommandError::io(err), + } +} diff --git a/grep-cli/src/wtr.rs b/grep-cli/src/wtr.rs new file mode 100644 index 00000000..f7722ce8 --- /dev/null +++ b/grep-cli/src/wtr.rs @@ -0,0 +1,133 @@ +use std::io; + +use termcolor; + +use is_tty_stdout; + +/// A writer that supports coloring with either line or block buffering. +pub struct StandardStream(StandardStreamKind); + +/// Returns a possibly buffered writer to stdout for the given color choice. +/// +/// The writer returned is either line buffered or block buffered. The decision +/// between these two is made automatically based on whether a tty is attached +/// to stdout or not. If a tty is attached, then line buffering is used. +/// Otherwise, block buffering is used. In general, block buffering is more +/// efficient, but may increase the time it takes for the end user to see the +/// first bits of output. +/// +/// If you need more fine grained control over the buffering mode, then use one +/// of `stdout_buffered_line` or `stdout_buffered_block`. +/// +/// The color choice given is passed along to the underlying writer. To +/// completely disable colors in all cases, use `ColorChoice::Never`. +pub fn stdout(color_choice: termcolor::ColorChoice) -> StandardStream { + if is_tty_stdout() { + stdout_buffered_line(color_choice) + } else { + stdout_buffered_block(color_choice) + } +} + +/// Returns a line buffered writer to stdout for the given color choice. 
+/// +/// This writer is useful when printing results directly to a tty such that +/// users see output as soon as it's written. The downside of this approach +/// is that it can be slower, especially when there is a lot of output. +/// +/// You might consider using +/// [`stdout`](fn.stdout.html) +/// instead, which chooses the buffering strategy automatically based on +/// whether stdout is connected to a tty. +pub fn stdout_buffered_line( + color_choice: termcolor::ColorChoice, +) -> StandardStream { + let out = termcolor::StandardStream::stdout(color_choice); + StandardStream(StandardStreamKind::LineBuffered(out)) +} + +/// Returns a block buffered writer to stdout for the given color choice. +/// +/// This writer is useful when printing results to a file since it amortizes +/// the cost of writing data. The downside of this approach is that it can +/// increase the latency of display output when writing to a tty. +/// +/// You might consider using +/// [`stdout`](fn.stdout.html) +/// instead, which chooses the buffering strategy automatically based on +/// whether stdout is connected to a tty. 
+pub fn stdout_buffered_block( + color_choice: termcolor::ColorChoice, +) -> StandardStream { + let out = termcolor::BufferedStandardStream::stdout(color_choice); + StandardStream(StandardStreamKind::BlockBuffered(out)) +} + +enum StandardStreamKind { + LineBuffered(termcolor::StandardStream), + BlockBuffered(termcolor::BufferedStandardStream), +} + +impl io::Write for StandardStream { + #[inline] + fn write(&mut self, buf: &[u8]) -> io::Result { + use self::StandardStreamKind::*; + + match self.0 { + LineBuffered(ref mut w) => w.write(buf), + BlockBuffered(ref mut w) => w.write(buf), + } + } + + #[inline] + fn flush(&mut self) -> io::Result<()> { + use self::StandardStreamKind::*; + + match self.0 { + LineBuffered(ref mut w) => w.flush(), + BlockBuffered(ref mut w) => w.flush(), + } + } +} + +impl termcolor::WriteColor for StandardStream { + #[inline] + fn supports_color(&self) -> bool { + use self::StandardStreamKind::*; + + match self.0 { + LineBuffered(ref w) => w.supports_color(), + BlockBuffered(ref w) => w.supports_color(), + } + } + + #[inline] + fn set_color(&mut self, spec: &termcolor::ColorSpec) -> io::Result<()> { + use self::StandardStreamKind::*; + + match self.0 { + LineBuffered(ref mut w) => w.set_color(spec), + BlockBuffered(ref mut w) => w.set_color(spec), + } + } + + #[inline] + fn reset(&mut self) -> io::Result<()> { + use self::StandardStreamKind::*; + + match self.0 { + LineBuffered(ref mut w) => w.reset(), + BlockBuffered(ref mut w) => w.reset(), + } + } + + #[inline] + fn is_synchronous(&self) -> bool { + use self::StandardStreamKind::*; + + match self.0 { + LineBuffered(ref w) => w.is_synchronous(), + BlockBuffered(ref w) => w.is_synchronous(), + } + } +} diff --git a/grep-printer/src/color.rs b/grep-printer/src/color.rs index dcaca59d..394f5ccf 100644 --- a/grep-printer/src/color.rs +++ b/grep-printer/src/color.rs @@ -4,6 +4,25 @@ use std::str::FromStr; use termcolor::{Color, ColorSpec, ParseColorError}; +/// Returns a default set of color 
specifications. +/// +/// This may change over time, but the color choices are meant to be fairly +/// conservative choices that work across terminal themes. +/// +/// Additional color specifications can be added to the list returned. More +/// recently added specifications override previously added specifications. +pub fn default_color_specs() -> Vec { + vec![ + #[cfg(unix)] + "path:fg:magenta".parse().unwrap(), + #[cfg(windows)] + "path:fg:cyan".parse().unwrap(), + "line:fg:green".parse().unwrap(), + "match:fg:red".parse().unwrap(), + "match:style:bold".parse().unwrap(), + ] +} + /// An error that can occur when parsing color specifications. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ColorError { @@ -227,6 +246,15 @@ impl ColorSpecs { merged } + /// Create a default set of specifications that have color. + /// + /// This is distinct from `ColorSpecs`'s `Default` implementation in that + /// this provides a set of default color choices, whereas the `Default` + /// implementation provides no color choices. + pub fn default_with_color() -> ColorSpecs { + ColorSpecs::new(&default_color_specs()) + } + /// Return the color specification for coloring file paths. 
pub fn path(&self) -> &ColorSpec { &self.path diff --git a/grep-printer/src/lib.rs b/grep-printer/src/lib.rs index 128b0bdf..6ef11c72 100644 --- a/grep-printer/src/lib.rs +++ b/grep-printer/src/lib.rs @@ -83,7 +83,7 @@ extern crate serde_derive; extern crate serde_json; extern crate termcolor; -pub use color::{ColorError, ColorSpecs, UserColorSpec}; +pub use color::{ColorError, ColorSpecs, UserColorSpec, default_color_specs}; #[cfg(feature = "serde1")] pub use json::{JSON, JSONBuilder, JSONSink}; pub use standard::{Standard, StandardBuilder, StandardSink}; diff --git a/grep/Cargo.toml b/grep/Cargo.toml index 3ffaeae3..64ae2273 100644 --- a/grep/Cargo.toml +++ b/grep/Cargo.toml @@ -13,6 +13,7 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"] license = "Unlicense/MIT" [dependencies] +grep-cli = { version = "0.1.0", path = "../grep-cli" } grep-matcher = { version = "0.1.0", path = "../grep-matcher" } grep-pcre2 = { version = "0.1.0", path = "../grep-pcre2", optional = true } grep-printer = { version = "0.1.0", path = "../grep-printer" } diff --git a/grep/examples/simplegrep.rs b/grep/examples/simplegrep.rs index fb2d4001..d4bdef48 100644 --- a/grep/examples/simplegrep.rs +++ b/grep/examples/simplegrep.rs @@ -1,19 +1,18 @@ -extern crate atty; extern crate grep; extern crate termcolor; extern crate walkdir; use std::env; -use std::error; use std::ffi::OsString; use std::path::Path; use std::process; use std::result; +use grep::cli; use grep::printer::{ColorSpecs, StandardBuilder}; use grep::regex::RegexMatcher; use grep::searcher::{BinaryDetection, SearcherBuilder}; -use termcolor::{ColorChoice, StandardStream}; +use termcolor::ColorChoice; use walkdir::WalkDir; macro_rules! fail { @@ -22,7 +21,7 @@ macro_rules! 
fail { } } -type Result = result::Result>; +type Result = result::Result>; fn main() { if let Err(err) = try_main() { @@ -39,26 +38,18 @@ fn try_main() -> Result<()> { if args.len() == 2 { args.push(OsString::from("./")); } - let pattern = match args[1].clone().into_string() { - Ok(pattern) => pattern, - Err(_) => { - fail!( - "pattern is not valid UTF-8: '{:?}'", - args[1].to_string_lossy() - ); - } - }; - search(&pattern, &args[2..]) + search(cli::pattern_from_os(&args[1])?, &args[2..]) } fn search(pattern: &str, paths: &[OsString]) -> Result<()> { let matcher = RegexMatcher::new_line_matcher(&pattern)?; let mut searcher = SearcherBuilder::new() .binary_detection(BinaryDetection::quit(b'\x00')) + .line_number(false) .build(); let mut printer = StandardBuilder::new() - .color_specs(colors()) - .build(StandardStream::stdout(color_choice())); + .color_specs(ColorSpecs::default_with_color()) + .build(cli::stdout(color_choice())); for path in paths { for result in WalkDir::new(path) { @@ -90,18 +81,9 @@ fn search(pattern: &str, paths: &[OsString]) -> Result<()> { } fn color_choice() -> ColorChoice { - if atty::is(atty::Stream::Stdout) { + if cli::is_tty_stdout() { ColorChoice::Auto } else { ColorChoice::Never } } - -fn colors() -> ColorSpecs { - ColorSpecs::new(&[ - "path:fg:magenta".parse().unwrap(), - "line:fg:green".parse().unwrap(), - "match:fg:red".parse().unwrap(), - "match:style:bold".parse().unwrap(), - ]) -} diff --git a/grep/src/lib.rs b/grep/src/lib.rs index ab0d78eb..13eaee25 100644 --- a/grep/src/lib.rs +++ b/grep/src/lib.rs @@ -14,6 +14,7 @@ A cookbook and a guide are planned. #![deny(missing_docs)] +pub extern crate grep_cli as cli; pub extern crate grep_matcher as matcher; #[cfg(feature = "pcre2")] pub extern crate grep_pcre2 as pcre2; diff --git a/src/app.rs b/src/app.rs index 059effbd..039b6980 100644 --- a/src/app.rs +++ b/src/app.rs @@ -80,7 +80,7 @@ pub fn app() -> App<'static, 'static> { /// Return the "long" format of ripgrep's version string. 
/// /// If a revision hash is given, then it is used. If one isn't given, then -/// the RIPGREP_BUILD_GIT_HASH env var is inspect for it. If that isn't set, +/// the RIPGREP_BUILD_GIT_HASH env var is inspected for it. If that isn't set, /// then a revision hash is not included in the version string returned. pub fn long_version(revision_hash: Option<&str>) -> String { // Do we have a git hash? @@ -537,7 +537,11 @@ pub fn all_args_and_flags() -> Vec { // The positional arguments must be defined first and in order. arg_pattern(&mut args); arg_path(&mut args); - // Flags can be defined in any order, but we do it alphabetically. + // Flags can be defined in any order, but we do it alphabetically. Note + // that each function may define multiple flags. For example, + // `flag_encoding` defines `--encoding` and `--no-encoding`. Most `--no` + // flags are hidden and merely mentioned in the docs of the corresponding + // "positive" flag. flag_after_context(&mut args); flag_before_context(&mut args); flag_byte_offset(&mut args); diff --git a/src/args.rs b/src/args.rs index de84f094..2decefe7 100644 --- a/src/args.rs +++ b/src/args.rs @@ -1,14 +1,14 @@ use std::cmp; use std::env; use std::ffi::OsStr; -use std::fs::{self, File}; -use std::io::{self, BufRead}; +use std::fs; +use std::io; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::SystemTime; -use atty; use clap; +use grep::cli; use grep::matcher::LineTerminator; #[cfg(feature = "pcre2")] use grep::pcre2::{ @@ -20,6 +20,7 @@ use grep::printer::{ JSON, JSONBuilder, Standard, StandardBuilder, Summary, SummaryBuilder, SummaryKind, + default_color_specs, }; use grep::regex::{ RegexMatcher as RustRegexMatcher, @@ -34,11 +35,10 @@ use ignore::{Walk, WalkBuilder, WalkParallel}; use log; use num_cpus; use path_printer::{PathPrinter, PathPrinterBuilder}; -use regex::{self, Regex}; -use same_file::Handle; +use regex; use termcolor::{ WriteColor, - BufferedStandardStream, BufferWriter, ColorChoice, StandardStream, 
+ BufferWriter, ColorChoice, }; use app; @@ -47,7 +47,6 @@ use logger::Logger; use messages::{set_messages, set_ignore_messages}; use search::{PatternMatcher, Printer, SearchWorker, SearchWorkerBuilder}; use subject::SubjectBuilder; -use unescape::{escape, unescape}; use Result; /// The command that ripgrep should execute based on the command line @@ -314,13 +313,8 @@ impl Args { /// Execute the given function with a writer to stdout that enables color /// support based on the command line configuration. - pub fn stdout(&self) -> Box { - let color_choice = self.matches().color_choice(); - if atty::is(atty::Stream::Stdout) { - Box::new(StandardStream::stdout(color_choice)) - } else { - Box::new(BufferedStandardStream::stdout(color_choice)) - } + pub fn stdout(&self) -> cli::StandardStream { + cli::stdout(self.matches().color_choice()) } /// Return the type definitions compiled into ripgrep. @@ -628,8 +622,8 @@ impl ArgMatches { .caseless(self.case_insensitive()) .multi_line(true) .word(self.is_present("word-regexp")); - // For whatever reason, the JIT craps out during compilation with a - // "no more memory" error on 32 bit systems. So don't use it there. + // For whatever reason, the JIT craps out during regex compilation with + // a "no more memory" error on 32 bit systems. So don't use it there. if !cfg!(target_pointer_width = "32") { builder.jit(true); } @@ -638,7 +632,7 @@ impl ArgMatches { if self.encoding()?.is_some() { // SAFETY: If an encoding was specified, then we're guaranteed // to get valid UTF-8, so we can disable PCRE2's UTF checking. - // (Feeding invalid UTF-8 to PCRE2 is UB.) + // (Feeding invalid UTF-8 to PCRE2 is undefined behavior.) 
unsafe { builder.disable_utf_check(); } @@ -853,7 +847,7 @@ impl ArgMatches { } else if preference == "ansi" { ColorChoice::AlwaysAnsi } else if preference == "auto" { - if atty::is(atty::Stream::Stdout) || self.is_present("pretty") { + if cli::is_tty_stdout() || self.is_present("pretty") { ColorChoice::Auto } else { ColorChoice::Never @@ -869,15 +863,7 @@ impl ArgMatches { /// is returned. fn color_specs(&self) -> Result { // Start with a default set of color specs. - let mut specs = vec![ - #[cfg(unix)] - "path:fg:magenta".parse().unwrap(), - #[cfg(windows)] - "path:fg:cyan".parse().unwrap(), - "line:fg:green".parse().unwrap(), - "match:fg:red".parse().unwrap(), - "match:style:bold".parse().unwrap(), - ]; + let mut specs = default_color_specs(); for spec_str in self.values_of_lossy_vec("colors") { specs.push(spec_str.parse()?); } @@ -913,9 +899,9 @@ impl ArgMatches { /// /// If one was not provided, the default `--` is returned. fn context_separator(&self) -> Vec { - match self.value_of_lossy("context-separator") { + match self.value_of_os("context-separator") { None => b"--".to_vec(), - Some(sep) => unescape(&sep), + Some(sep) => cli::unescape_os(&sep), } } @@ -990,7 +976,7 @@ impl ArgMatches { if self.is_present("no-heading") || self.is_present("vimgrep") { false } else { - atty::is(atty::Stream::Stdout) + cli::is_tty_stdout() || self.is_present("heading") || self.is_present("pretty") } @@ -1042,7 +1028,7 @@ impl ArgMatches { // generally want to show line numbers by default when printing to a // tty for human consumption, except for one interesting case: when // we're only searching stdin. This makes pipelines work as expected. 
- (atty::is(atty::Stream::Stdout) && !self.is_only_stdin(paths)) + (cli::is_tty_stdout() && !self.is_only_stdin(paths)) || self.is_present("line-number") || self.is_present("column") || self.is_present("pretty") @@ -1177,8 +1163,7 @@ impl ArgMatches { let file_is_stdin = self.values_of_os("file") .map_or(false, |mut files| files.any(|f| f == "-")); let search_cwd = - atty::is(atty::Stream::Stdin) - || !stdin_is_readable() + !cli::is_readable_stdin() || (self.is_present("file") && file_is_stdin) || self.is_present("files") || self.is_present("type-list"); @@ -1194,9 +1179,9 @@ impl ArgMatches { /// If the provided path separator is more than a single byte, then an /// error is returned. fn path_separator(&self) -> Result> { - let sep = match self.value_of_lossy("path-separator") { + let sep = match self.value_of_os("path-separator") { None => return Ok(None), - Some(sep) => unescape(&sep), + Some(sep) => cli::unescape_os(&sep), }; if sep.is_empty() { Ok(None) @@ -1207,7 +1192,7 @@ impl ArgMatches { In some shells on Windows '/' is automatically \ expanded. Use '//' instead.", sep.len(), - escape(&sep), + cli::escape(&sep), ))) } else { Ok(Some(sep[0])) @@ -1254,18 +1239,12 @@ impl ArgMatches { } } } - if let Some(files) = self.values_of_os("file") { - for file in files { - if file == "-" { - let stdin = io::stdin(); - for line in stdin.lock().lines() { - pats.push(self.pattern_from_str(&line?)); - } + if let Some(paths) = self.values_of_os("file") { + for path in paths { + if path == "-" { + pats.extend(cli::patterns_from_stdin()?); } else { - let f = File::open(file)?; - for line in io::BufReader::new(f).lines() { - pats.push(self.pattern_from_str(&line?)); - } + pats.extend(cli::patterns_from_path(path)?); } } } @@ -1287,7 +1266,7 @@ impl ArgMatches { /// /// If the pattern is not valid UTF-8, then an error is returned. 
fn pattern_from_os_str(&self, pat: &OsStr) -> Result { - let s = pattern_to_str(pat)?; + let s = cli::pattern_from_os(pat)?; Ok(self.pattern_from_str(s)) } @@ -1495,40 +1474,11 @@ impl ArgMatches { &self, arg_name: &str, ) -> Result> { - lazy_static! { - static ref RE: Regex = Regex::new(r"^([0-9]+)([KMG])?$").unwrap(); - } - - let arg_value = match self.value_of_lossy(arg_name) { - Some(x) => x, - None => return Ok(None) + let size = match self.value_of_lossy(arg_name) { + None => return Ok(None), + Some(size) => size, }; - let caps = RE - .captures(&arg_value) - .ok_or_else(|| { - format!("invalid format for {}", arg_name) - })?; - - let value = caps[1].parse::()?; - let suffix = caps.get(2).map(|x| x.as_str()); - - let v_10 = value.checked_mul(1024); - let v_20 = v_10.and_then(|x| x.checked_mul(1024)); - let v_30 = v_20.and_then(|x| x.checked_mul(1024)); - let try_suffix = |x: Option| { - if x.is_some() { - Ok(x) - } else { - Err(From::from(format!("number too large for {}", arg_name))) - } - }; - match suffix { - None => Ok(Some(value)), - Some("K") => try_suffix(v_10), - Some("M") => try_suffix(v_20), - Some("G") => try_suffix(v_30), - _ => Err(From::from(format!("invalid suffix for {}", arg_name))) - } + Ok(Some(cli::parse_human_readable_size(&size)?)) } } @@ -1562,21 +1512,6 @@ impl ArgMatches { } } -/// Convert an OsStr to a Unicode string. -/// -/// Patterns _must_ be valid UTF-8, so if the given OsStr isn't valid UTF-8, -/// this returns an error. -fn pattern_to_str(s: &OsStr) -> Result<&str> { - s.to_str().ok_or_else(|| { - From::from(format!( - "Argument '{}' is not valid UTF-8. \ - Use hex escape sequences to match arbitrary \ - bytes in a pattern (e.g., \\xFF).", - s.to_string_lossy() - )) - }) -} - /// Inspect an error resulting from building a Rust regex matcher, and if it's /// believed to correspond to a syntax error that PCRE2 could handle, then /// add a message to suggest the use of -P/--pcre2. 
@@ -1638,25 +1573,3 @@ where G: Fn(&fs::Metadata) -> io::Result t1.cmp(&t2) } } - -/// Returns true if and only if stdin is deemed searchable. -#[cfg(unix)] -fn stdin_is_readable() -> bool { - use std::os::unix::fs::FileTypeExt; - - let ft = match Handle::stdin().and_then(|h| h.as_file().metadata()) { - Err(_) => return false, - Ok(md) => md.file_type(), - }; - ft.is_file() || ft.is_fifo() -} - -/// Returns true if and only if stdin is deemed searchable. -#[cfg(windows)] -fn stdin_is_readable() -> bool { - use winapi_util as winutil; - - winutil::file::typ(winutil::HandleRef::stdin()) - .map(|t| t.is_disk() || t.is_pipe()) - .unwrap_or(false) -} diff --git a/src/decompressor.rs b/src/decompressor.rs deleted file mode 100644 index d25c2f56..00000000 --- a/src/decompressor.rs +++ /dev/null @@ -1,190 +0,0 @@ -use std::collections::HashMap; -use std::ffi::OsStr; -use std::fmt; -use std::io::{self, Read}; -use std::path::Path; -use std::process::{self, Stdio}; - -use globset::{Glob, GlobSet, GlobSetBuilder}; - -/// A decompression command, contains the command to be spawned as well as any -/// necessary CLI args. -#[derive(Clone, Copy, Debug)] -struct DecompressionCommand { - cmd: &'static str, - args: &'static [&'static str], -} - -impl DecompressionCommand { - /// Create a new decompress command - fn new( - cmd: &'static str, - args: &'static [&'static str], - ) -> DecompressionCommand { - DecompressionCommand { - cmd, args - } - } -} - -impl fmt::Display for DecompressionCommand { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{} {}", self.cmd, self.args.join(" ")) - } -} - -lazy_static! 
{ - static ref DECOMPRESSION_COMMANDS: HashMap< - &'static str, - DecompressionCommand, - > = { - let mut m = HashMap::new(); - - const ARGS: &[&str] = &["-d", "-c"]; - m.insert("gz", DecompressionCommand::new("gzip", ARGS)); - m.insert("bz2", DecompressionCommand::new("bzip2", ARGS)); - m.insert("xz", DecompressionCommand::new("xz", ARGS)); - m.insert("lz4", DecompressionCommand::new("lz4", ARGS)); - - const LZMA_ARGS: &[&str] = &["--format=lzma", "-d", "-c"]; - m.insert("lzma", DecompressionCommand::new("xz", LZMA_ARGS)); - - m - }; - static ref SUPPORTED_COMPRESSION_FORMATS: GlobSet = { - let mut builder = GlobSetBuilder::new(); - builder.add(Glob::new("*.gz").unwrap()); - builder.add(Glob::new("*.bz2").unwrap()); - builder.add(Glob::new("*.xz").unwrap()); - builder.add(Glob::new("*.lz4").unwrap()); - builder.add(Glob::new("*.lzma").unwrap()); - builder.build().unwrap() - }; - static ref TAR_ARCHIVE_FORMATS: GlobSet = { - let mut builder = GlobSetBuilder::new(); - builder.add(Glob::new("*.tar.gz").unwrap()); - builder.add(Glob::new("*.tar.xz").unwrap()); - builder.add(Glob::new("*.tar.bz2").unwrap()); - builder.add(Glob::new("*.tar.lz4").unwrap()); - builder.add(Glob::new("*.tgz").unwrap()); - builder.add(Glob::new("*.txz").unwrap()); - builder.add(Glob::new("*.tbz2").unwrap()); - builder.build().unwrap() - }; -} - -/// DecompressionReader provides an `io::Read` implementation for a limited -/// set of compression formats. -#[derive(Debug)] -pub struct DecompressionReader { - cmd: DecompressionCommand, - child: process::Child, - done: bool, -} - -impl DecompressionReader { - /// Returns a handle to the stdout of the spawned decompression process for - /// `path`, which can be directly searched in the worker. When the returned - /// value is exhausted, the underlying process is reaped. If the underlying - /// process fails, then its stderr is read and converted into a normal - /// io::Error. 
- /// - /// If there is any error in spawning the decompression command, then - /// return `None`, after outputting any necessary debug or error messages. - pub fn from_path(path: &Path) -> Option { - let extension = match path.extension().and_then(OsStr::to_str) { - Some(extension) => extension, - None => { - debug!( - "{}: failed to get compresson extension", path.display()); - return None; - } - }; - let decompression_cmd = match DECOMPRESSION_COMMANDS.get(extension) { - Some(cmd) => cmd, - None => { - debug!( - "{}: failed to get decompression command", path.display()); - return None; - } - }; - let cmd = process::Command::new(decompression_cmd.cmd) - .args(decompression_cmd.args) - .arg(path) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn(); - let child = match cmd { - Ok(process) => process, - Err(_) => { - debug!( - "{}: decompression command '{}' not found", - path.display(), decompression_cmd.cmd); - return None; - } - }; - Some(DecompressionReader::new(*decompression_cmd, child)) - } - - fn new( - cmd: DecompressionCommand, - child: process::Child, - ) -> DecompressionReader { - DecompressionReader { - cmd: cmd, - child: child, - done: false, - } - } - - fn read_error(&mut self) -> io::Result { - let mut errbytes = vec![]; - self.child.stderr.as_mut().unwrap().read_to_end(&mut errbytes)?; - let errstr = String::from_utf8_lossy(&errbytes); - let errstr = errstr.trim(); - - Ok(if errstr.is_empty() { - let msg = format!("decompression command failed: '{}'", self.cmd); - io::Error::new(io::ErrorKind::Other, msg) - } else { - let msg = format!( - "decompression command '{}' failed: {}", self.cmd, errstr); - io::Error::new(io::ErrorKind::Other, msg) - }) - } -} - -impl io::Read for DecompressionReader { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - if self.done { - return Ok(0); - } - let nread = self.child.stdout.as_mut().unwrap().read(buf)?; - if nread == 0 { - self.done = true; - // Reap the child now that we're done reading. 
- // If the command failed, report stderr as an error. - if !self.child.wait()?.success() { - return Err(self.read_error()?); - } - } - Ok(nread) - } -} - -/// Returns true if the given path contains a supported compression format or -/// is a TAR archive. -pub fn is_compressed(path: &Path) -> bool { - is_supported_compression_format(path) || is_tar_archive(path) -} - -/// Returns true if the given path matches any one of the supported compression -/// formats -fn is_supported_compression_format(path: &Path) -> bool { - SUPPORTED_COMPRESSION_FORMATS.is_match(path) -} - -/// Returns true if the given path matches any of the known TAR file formats. -fn is_tar_archive(path: &Path) -> bool { - TAR_ARCHIVE_FORMATS.is_match(path) -} diff --git a/src/main.rs b/src/main.rs index 4a4ac5f0..bcadc8a1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,5 @@ -extern crate atty; #[macro_use] extern crate clap; -extern crate globset; extern crate grep; extern crate ignore; #[macro_use] @@ -10,14 +8,11 @@ extern crate lazy_static; extern crate log; extern crate num_cpus; extern crate regex; -extern crate same_file; #[macro_use] extern crate serde_json; extern crate termcolor; -#[cfg(windows)] -extern crate winapi_util; -use std::io; +use std::io::{self, Write}; use std::process; use std::sync::{Arc, Mutex}; use std::time::Instant; @@ -33,13 +28,10 @@ mod messages; mod app; mod args; mod config; -mod decompressor; -mod preprocessor; mod logger; mod path_printer; mod search; mod subject; -mod unescape; type Result = ::std::result::Result>; diff --git a/src/preprocessor.rs b/src/preprocessor.rs deleted file mode 100644 index 07f66e2d..00000000 --- a/src/preprocessor.rs +++ /dev/null @@ -1,93 +0,0 @@ -use std::fs::File; -use std::io::{self, Read}; -use std::path::{Path, PathBuf}; -use std::process::{self, Stdio}; - -/// PreprocessorReader provides an `io::Read` impl to read kids output. 
-#[derive(Debug)] -pub struct PreprocessorReader { - cmd: PathBuf, - path: PathBuf, - child: process::Child, - done: bool, -} - -impl PreprocessorReader { - /// Returns a handle to the stdout of the spawned preprocessor process for - /// `path`, which can be directly searched in the worker. When the returned - /// value is exhausted, the underlying process is reaped. If the underlying - /// process fails, then its stderr is read and converted into a normal - /// io::Error. - /// - /// If there is any error in spawning the preprocessor command, then - /// return the corresponding error. - pub fn from_cmd_path( - cmd: PathBuf, - path: &Path, - ) -> io::Result { - let child = process::Command::new(&cmd) - .arg(path) - .stdin(Stdio::from(File::open(path)?)) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .map_err(|err| { - io::Error::new( - io::ErrorKind::Other, - format!( - "error running preprocessor command '{}': {}", - cmd.display(), - err, - ), - ) - })?; - Ok(PreprocessorReader { - cmd: cmd, - path: path.to_path_buf(), - child: child, - done: false, - }) - } - - fn read_error(&mut self) -> io::Result { - let mut errbytes = vec![]; - self.child.stderr.as_mut().unwrap().read_to_end(&mut errbytes)?; - let errstr = String::from_utf8_lossy(&errbytes); - let errstr = errstr.trim(); - - Ok(if errstr.is_empty() { - let msg = format!( - "preprocessor command failed: '{} {}'", - self.cmd.display(), - self.path.display(), - ); - io::Error::new(io::ErrorKind::Other, msg) - } else { - let msg = format!( - "preprocessor command failed: '{} {}': {}", - self.cmd.display(), - self.path.display(), - errstr, - ); - io::Error::new(io::ErrorKind::Other, msg) - }) - } -} - -impl io::Read for PreprocessorReader { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - if self.done { - return Ok(0); - } - let nread = self.child.stdout.as_mut().unwrap().read(buf)?; - if nread == 0 { - self.done = true; - // Reap the child now that we're done reading. 
- // If the command failed, report stderr as an error. - if !self.child.wait()?.success() { - return Err(self.read_error()?); - } - } - Ok(nread) - } -} diff --git a/src/search.rs b/src/search.rs index 45f7cf87..457f8f7a 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,7 +1,10 @@ +use std::fs::File; use std::io; use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; use std::time::Duration; +use grep::cli; use grep::matcher::Matcher; #[cfg(feature = "pcre2")] use grep::pcre2::{RegexMatcher as PCRE2RegexMatcher}; @@ -11,8 +14,6 @@ use grep::searcher::Searcher; use serde_json as json; use termcolor::WriteColor; -use decompressor::{DecompressionReader, is_compressed}; -use preprocessor::PreprocessorReader; use subject::Subject; /// The configuration for the search worker. Among a few other things, the @@ -39,6 +40,8 @@ impl Default for Config { #[derive(Clone, Debug)] pub struct SearchWorkerBuilder { config: Config, + command_builder: cli::CommandReaderBuilder, + decomp_builder: cli::DecompressionReaderBuilder, } impl Default for SearchWorkerBuilder { @@ -50,7 +53,17 @@ impl Default for SearchWorkerBuilder { impl SearchWorkerBuilder { /// Create a new builder for configuring and constructing a search worker. 
pub fn new() -> SearchWorkerBuilder { - SearchWorkerBuilder { config: Config::default() } + let mut cmd_builder = cli::CommandReaderBuilder::new(); + cmd_builder.async_stderr(true); + + let mut decomp_builder = cli::DecompressionReaderBuilder::new(); + decomp_builder.async_stderr(true); + + SearchWorkerBuilder { + config: Config::default(), + command_builder: cmd_builder, + decomp_builder: decomp_builder, + } } /// Create a new search worker using the given searcher, matcher and @@ -62,7 +75,12 @@ impl SearchWorkerBuilder { printer: Printer, ) -> SearchWorker { let config = self.config.clone(); - SearchWorker { config, matcher, searcher, printer } + let command_builder = self.command_builder.clone(); + let decomp_builder = self.decomp_builder.clone(); + SearchWorker { + config, command_builder, decomp_builder, + matcher, searcher, printer, + } } /// Forcefully use JSON to emit statistics, even if the underlying printer @@ -237,6 +255,8 @@ impl Printer { #[derive(Debug)] pub struct SearchWorker { config: Config, + command_builder: cli::CommandReaderBuilder, + decomp_builder: cli::DecompressionReaderBuilder, matcher: PatternMatcher, searcher: Searcher, printer: Printer, @@ -279,19 +299,48 @@ impl SearchWorker { // A `return` here appeases the borrow checker. NLL will fix this. return self.search_reader(path, stdin.lock()); } else if self.config.preprocessor.is_some() { - let cmd = self.config.preprocessor.clone().unwrap(); - let rdr = PreprocessorReader::from_cmd_path(cmd, path)?; - self.search_reader(path, rdr) - } else if self.config.search_zip && is_compressed(path) { - match DecompressionReader::from_path(path) { - None => Ok(SearchResult::default()), - Some(rdr) => self.search_reader(path, rdr), - } + self.search_preprocessor(path) + } else if self.should_decompress(path) { + self.search_decompress(path) } else { self.search_path(path) } } + /// Returns true if and only if the given file path should be + /// decompressed before searching. 
+ fn should_decompress(&self, path: &Path) -> bool { + if !self.config.search_zip { + return false; + } + self.decomp_builder.get_matcher().has_command(path) + } + + fn search_preprocessor( + &mut self, + path: &Path, + ) -> io::Result { + let bin = self.config.preprocessor.clone().unwrap(); + let mut cmd = Command::new(&bin); + cmd.arg(path).stdin(Stdio::from(File::open(path)?)); + + let rdr = self.command_builder.build(&mut cmd)?; + self.search_reader(path, rdr).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("preprocessor command failed: '{:?}': {}", cmd, err), + ) + }) + } + + fn search_decompress( + &mut self, + path: &Path, + ) -> io::Result { + let rdr = self.decomp_builder.build(path)?; + self.search_reader(path, rdr) + } + /// Search the contents of the given file path. fn search_path(&mut self, path: &Path) -> io::Result { use self::PatternMatcher::*; diff --git a/src/unescape.rs b/src/unescape.rs deleted file mode 100644 index 0c7f1c8d..00000000 --- a/src/unescape.rs +++ /dev/null @@ -1,137 +0,0 @@ -/// A single state in the state machine used by `unescape`. -#[derive(Clone, Copy, Eq, PartialEq)] -enum State { - /// The state after seeing a `\`. - Escape, - /// The state after seeing a `\x`. - HexFirst, - /// The state after seeing a `\x[0-9A-Fa-f]`. - HexSecond(char), - /// Default state. - Literal, -} - -/// Escapes an arbitrary byte slice such that it can be presented as a human -/// readable string. -pub fn escape(bytes: &[u8]) -> String { - use std::ascii::escape_default; - - let escaped = bytes.iter().flat_map(|&b| escape_default(b)).collect(); - String::from_utf8(escaped).unwrap() -} - -/// Unescapes a string given on the command line. It supports a limited set of -/// escape sequences: -/// -/// * `\t`, `\r` and `\n` are mapped to their corresponding ASCII bytes. -/// * `\xZZ` hexadecimal escapes are mapped to their byte. 
-pub fn unescape(s: &str) -> Vec { - use self::State::*; - - let mut bytes = vec![]; - let mut state = Literal; - for c in s.chars() { - match state { - Escape => { - match c { - 'n' => { bytes.push(b'\n'); state = Literal; } - 'r' => { bytes.push(b'\r'); state = Literal; } - 't' => { bytes.push(b'\t'); state = Literal; } - 'x' => { state = HexFirst; } - c => { - bytes.extend(format!(r"\{}", c).into_bytes()); - state = Literal; - } - } - } - HexFirst => { - match c { - '0'...'9' | 'A'...'F' | 'a'...'f' => { - state = HexSecond(c); - } - c => { - bytes.extend(format!(r"\x{}", c).into_bytes()); - state = Literal; - } - } - } - HexSecond(first) => { - match c { - '0'...'9' | 'A'...'F' | 'a'...'f' => { - let ordinal = format!("{}{}", first, c); - let byte = u8::from_str_radix(&ordinal, 16).unwrap(); - bytes.push(byte); - state = Literal; - } - c => { - let original = format!(r"\x{}{}", first, c); - bytes.extend(original.into_bytes()); - state = Literal; - } - } - } - Literal => { - match c { - '\\' => { state = Escape; } - c => { bytes.extend(c.to_string().as_bytes()); } - } - } - } - } - match state { - Escape => bytes.push(b'\\'), - HexFirst => bytes.extend(b"\\x"), - HexSecond(c) => bytes.extend(format!("\\x{}", c).into_bytes()), - Literal => {} - } - bytes -} - -#[cfg(test)] -mod tests { - use super::unescape; - - fn b(bytes: &'static [u8]) -> Vec { - bytes.to_vec() - } - - #[test] - fn unescape_nul() { - assert_eq!(b(b"\x00"), unescape(r"\x00")); - } - - #[test] - fn unescape_nl() { - assert_eq!(b(b"\n"), unescape(r"\n")); - } - - #[test] - fn unescape_tab() { - assert_eq!(b(b"\t"), unescape(r"\t")); - } - - #[test] - fn unescape_carriage() { - assert_eq!(b(b"\r"), unescape(r"\r")); - } - - #[test] - fn unescape_nothing_simple() { - assert_eq!(b(b"\\a"), unescape(r"\a")); - } - - #[test] - fn unescape_nothing_hex0() { - assert_eq!(b(b"\\x"), unescape(r"\x")); - } - - #[test] - fn unescape_nothing_hex1() { - assert_eq!(b(b"\\xz"), unescape(r"\xz")); - } - - #[test] 
- fn unescape_nothing_hex2() { - assert_eq!(b(b"\\xzz"), unescape(r"\xzz")); - } -}