1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-03-28 12:42:13 +02:00

search: add support for searching compressed files

This commit adds opt-in support for searching compressed files during
recursive search. This behavior is only enabled when the
`-z/--search-zip` flag is passed to ripgrep. When enabled, a limited set
of common compression formats are recognized via file extension, and a
new process is spawned to perform the decompression. ripgrep then
searches the stdout of that spawned process.

Closes #539
This commit is contained in:
Balaji Sivaraman 2018-01-07 21:35:58 +05:30 committed by Andrew Gallant
parent a8543f798d
commit f007f940c5
18 changed files with 373 additions and 24 deletions

View File

@ -9,8 +9,10 @@ env:
addons:
apt:
packages:
# Needed for completion-function test
# Needed for completion-function test.
- zsh
# Needed for testing decompression search.
- xz-utils
matrix:
fast_finish: true

1
Cargo.lock generated
View File

@ -237,6 +237,7 @@ dependencies = [
"clap 2.29.0 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
"globset 0.2.1",
"grep 0.1.7",
"ignore 0.3.1",
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",

View File

@ -49,6 +49,7 @@ num_cpus = "1"
regex = "0.2.4"
same-file = "1"
termcolor = { version = "0.3.3", path = "termcolor" }
globset = { version = "0.2.1", path = "globset" }
[build-dependencies]
clap = "2.26"

View File

@ -91,6 +91,8 @@ increases the times to `2.640s` for ripgrep and `10.277s` for GNU grep.
as UTF-16, latin-1, GBK, EUC-JP, Shift_JIS and more. (Some support for
automatically detecting UTF-16 is provided. Other text encodings must be
specifically specified with the `-E/--encoding` flag.)
* `ripgrep` supports searching files compressed in a common format (currently
gzip, xz, lzma or bzip2) with the `-z/--search-zip` flag.
In other words, use `ripgrep` if you like speed, filtering by default, fewer
bugs, and Unicode support.
@ -109,12 +111,10 @@ give you a glimpse at some important downsides or missing features of
support for Unicode categories (e.g., `\p{Sc}` to match currency symbols or
`\p{Lu}` to match any uppercase letter). (Fancier regexes will never be
supported.)
* `ripgrep` doesn't yet support searching compressed files. (Likely to be
supported in the future.)
* `ripgrep` doesn't have multiline search. (Unlikely to ever be supported.)
In other words, if you like fancy regexes, searching compressed files or
multiline search, then `ripgrep` may not quite meet your needs (yet).
In other words, if you like fancy regexes or multiline search, then `ripgrep`
may not quite meet your needs (yet).
### Feature comparison

View File

@ -87,6 +87,7 @@ _rg() {
'(-w -x --line-regexp --word-regexp)'{-w,--word-regexp}'[only show matches surrounded by word boundaries]'
'(-e -f --file --files --regexp --type-list)1: :_rg_pattern'
'(--type-list)*:file:_files'
'(-z --search-zip)'{-z,--search-zip}'[search in compressed files]'
)
[[ ${_RG_COMPLETE_LIST_ARGS:-} == (1|t*|y*) ]] && {

View File

@ -184,6 +184,15 @@ Only show matches surrounded by line boundaries.
This is equivalent to putting ^...$ around the search pattern.
.RS
.RE
.TP
.B \-z, \-\-search\-zip
Search in compressed files.
Currently gz, bz2, xz and lzma formats are supported.
.RS
.PP
Note that ripgrep expects to find the decompression binaries for the
respective formats in your system\[aq]s PATH for use with this flag.
.RE
.SH LESS COMMON OPTIONS
.TP
.B \-A, \-\-after\-context \f[I]NUM\f[]
@ -437,9 +446,7 @@ such part on a separate output line.
.TP
.B \-\-passthru, \-\-passthrough
Show both matching and non\-matching lines.
This is equivalent to adding ^ to the list of search patterns.
This option overrides \-\-count and cannot be used with
\-\-only\-matching or \-\-replace.
This option cannot be used with \-\-only\-matching or \-\-replace.
.RS
.RE
.TP

View File

@ -125,6 +125,13 @@ Project home page: https://github.com/BurntSushi/ripgrep
: Only show matches surrounded by line boundaries. This is equivalent to
putting ^...$ around the search pattern.
-z, --search-zip
: Search in compressed files. Currently gz, bz2, xz and lzma
formats are supported.
Note that ripgrep expects to find the decompression binaries for the
respective formats in your system's PATH for use with this flag.
# LESS COMMON OPTIONS
-A, --after-context *NUM*

View File

@ -103,6 +103,7 @@ const DEFAULT_TYPES: &'static [(&'static str, &'static [&'static str])] = &[
("avro", &["*.avdl", "*.avpr", "*.avsc"]),
("awk", &["*.awk"]),
("bitbake", &["*.bb", "*.bbappend", "*.bbclass", "*.conf", "*.inc"]),
("bzip2", &["*.bz2"]),
("c", &["*.c", "*.h", "*.H"]),
("cabal", &["*.cabal"]),
("cbor", &["*.cbor"]),
@ -137,6 +138,7 @@ const DEFAULT_TYPES: &'static [(&'static str, &'static [&'static str])] = &[
("fsharp", &["*.fs", "*.fsx", "*.fsi"]),
("gn", &["*.gn", "*.gni"]),
("go", &["*.go"]),
("gzip", &["*.gz"]),
("groovy", &["*.groovy", "*.gradle"]),
("h", &["*.h", "*.hpp"]),
("hbs", &["*.hbs"]),
@ -184,6 +186,7 @@ const DEFAULT_TYPES: &'static [(&'static str, &'static [&'static str])] = &[
("lisp", &["*.el", "*.jl", "*.lisp", "*.lsp", "*.sc", "*.scm"]),
("log", &["*.log"]),
("lua", &["*.lua"]),
("lzma", &["*.lzma"]),
("m4", &["*.ac", "*.m4"]),
("make", &[
"gnumakefile", "Gnumakefile", "GNUmakefile",
@ -276,6 +279,7 @@ const DEFAULT_TYPES: &'static [(&'static str, &'static [&'static str])] = &[
("wiki", &["*.mediawiki", "*.wiki"]),
("webidl", &["*.idl", "*.webidl", "*.widl"]),
("xml", &["*.xml", "*.xml.dist"]),
("xz", &["*.xz"]),
("yacc", &["*.y"]),
("yaml", &["*.yaml", "*.yml"]),
("zsh", &[

View File

@ -191,6 +191,7 @@ pub fn app() -> App<'static, 'static> {
.arg(flag("type-clear")
.value_name("TYPE").takes_value(true)
.multiple(true).number_of_values(1))
.arg(flag("search-zip").short("z"))
}
struct Usage {
@ -450,7 +451,8 @@ lazy_static! {
can be specified by using the --ignore-file flag several times. \
When specifying multiple ignore files, earlier files have lower \
precedence than later files. If you are looking for a way to \
include or exclude files and directories directly used -g instead.");
include or exclude files and directories directly used -g \
instead.");
doc!(h, "follow",
"Follow symbolic links.");
doc!(h, "max-count",
@ -592,6 +594,11 @@ lazy_static! {
only clears the default type definitions that are found inside \
of ripgrep.\n\nNote that this MUST be passed to every \
invocation of ripgrep. Type settings are NOT persisted.");
doc!(h, "search-zip",
"Search in compressed files.",
"Search in compressed files. Currently gz, bz2, xz, and \
lzma files are supported. This option expects the decompression \
binaries to be available in the system PATH.");
h
};
@ -599,8 +606,9 @@ lazy_static! {
fn validate_line_number_width(s: String) -> Result<(), String> {
if s.starts_with("0") {
Err(String::from("Custom padding characters are currently not supported. \
Please enter only a numeric value."))
Err(String::from(
"Custom padding characters are currently not supported. \
Please enter only a numeric value."))
} else {
validate_number(s)
}

View File

@ -77,6 +77,7 @@ pub struct Args {
type_list: bool,
types: Types,
with_filename: bool,
search_zip_files: bool
}
impl Args {
@ -229,6 +230,7 @@ impl Args {
.no_messages(self.no_messages)
.quiet(self.quiet)
.text(self.text)
.search_zip_files(self.search_zip_files)
.build()
}
@ -365,6 +367,7 @@ impl<'a> ArgMatches<'a> {
type_list: self.is_present("type-list"),
types: self.types()?,
with_filename: with_filename,
search_zip_files: self.is_present("search-zip")
};
if args.mmap {
debug!("will try to use memory maps");

191
src/decompressor.rs Normal file
View File

@ -0,0 +1,191 @@
use std::collections::HashMap;
use std::ffi::OsStr;
use std::fmt;
use std::io::{self, Read};
use std::path::Path;
use std::process::{self, Stdio};
use globset::{Glob, GlobSet, GlobSetBuilder};
/// A decompression command: the program to spawn together with the fixed
/// command line arguments that precede the path of the file to decompress.
#[derive(Clone, Copy, Debug)]
struct DecompressionCommand {
    /// Name of the program to spawn.
    cmd: &'static str,
    /// Arguments passed to `cmd` ahead of the file path.
    args: &'static [&'static str],
}

impl DecompressionCommand {
    /// Create a new decompression command from a program name and its fixed
    /// arguments.
    fn new(
        cmd: &'static str,
        args: &'static [&'static str],
    ) -> DecompressionCommand {
        DecompressionCommand { cmd, args }
    }
}

impl fmt::Display for DecompressionCommand {
    /// Writes the program name followed by each argument, separated by single
    /// spaces.
    ///
    /// Each argument is written with its own leading space, so a command
    /// without arguments is rendered as just the program name with no
    /// trailing whitespace. This matters because the rendered command is
    /// embedded inside quotes in error messages.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(self.cmd)?;
        for arg in self.args {
            write!(f, " {}", arg)?;
        }
        Ok(())
    }
}
lazy_static! {
    /// Maps a file extension (without the leading dot) to the external
    /// command used to decompress files carrying that extension.
    static ref DECOMPRESSION_COMMANDS: HashMap<
        &'static str,
        DecompressionCommand,
    > = {
        // Arguments shared by gzip, bzip2 and xz.
        const COMMON_ARGS: &[&str] = &["-d", "-c"];
        // `.lzma` files are handed to `xz` with an explicit format flag.
        const LZMA_ARGS: &[&str] = &["--format=lzma", "-d", "-c"];

        let table = [
            ("gz", DecompressionCommand::new("gzip", COMMON_ARGS)),
            ("bz2", DecompressionCommand::new("bzip2", COMMON_ARGS)),
            ("xz", DecompressionCommand::new("xz", COMMON_ARGS)),
            ("lzma", DecompressionCommand::new("xz", LZMA_ARGS)),
        ];
        table.iter().cloned().collect()
    };

    /// Globs matching the file names that have an entry in
    /// `DECOMPRESSION_COMMANDS`.
    static ref SUPPORTED_COMPRESSION_FORMATS: GlobSet = {
        let mut set = GlobSetBuilder::new();
        for pattern in &["*.gz", "*.bz2", "*.xz", "*.lzma"] {
            set.add(Glob::new(pattern).unwrap());
        }
        set.build().unwrap()
    };

    /// Globs matching compressed TAR archives, which are recognized but not
    /// searched.
    static ref TAR_ARCHIVE_FORMATS: GlobSet = {
        let mut set = GlobSetBuilder::new();
        for pattern in &[
            "*.tar.gz", "*.tar.xz", "*.tar.bz2", "*.tgz", "*.txz", "*.tbz2",
        ] {
            set.add(Glob::new(pattern).unwrap());
        }
        set.build().unwrap()
    };
}
/// DecompressionReader provides an `io::Read` implementation for a limited
/// set of compression formats.
///
/// Decompression is delegated to a child process; reading from this type
/// reads the child's stdout.
#[derive(Debug)]
pub struct DecompressionReader {
// The command that was spawned; used when formatting error messages.
cmd: DecompressionCommand,
// The spawned decompression process. Its stdout supplies the data returned
// by `read`, and its stderr is read when the process exits unsuccessfully.
child: process::Child,
// Set once the child's stdout has reported end of file. After that, `read`
// returns `Ok(0)` without touching the child again.
done: bool,
}
impl DecompressionReader {
    /// Returns a handle to the stdout of the spawned decompression process for
    /// `path`, which can be directly searched in the worker. When the returned
    /// value is exhausted, the underlying process is reaped. If the underlying
    /// process fails, then its stderr is read and converted into a normal
    /// io::Error.
    ///
    /// The decompression command is chosen from the extension of `path`.
    /// `None` is returned, after emitting a debug message, when:
    ///
    /// * `path` looks like a TAR archive,
    /// * `path` has no extension or the extension is not valid UTF-8,
    /// * no decompression command is registered for the extension, or
    /// * the decompression command could not be spawned.
    pub fn from_path(path: &Path) -> Option<DecompressionReader> {
        if is_tar_archive(path) {
            debug!("{}: skipping tar archive", path.display());
            return None;
        }
        let extension = match path.extension().and_then(OsStr::to_str) {
            Some(extension) => extension,
            None => {
                debug!(
                    "{}: failed to get compression extension",
                    path.display());
                return None;
            }
        };
        let decompression_cmd = match DECOMPRESSION_COMMANDS.get(extension) {
            Some(cmd) => *cmd,
            None => {
                debug!(
                    "{}: failed to get decompression command",
                    path.display());
                return None;
            }
        };
        // Both output streams are piped: stdout carries the decompressed
        // data, stderr is kept so that a failure can be reported later.
        let spawned = process::Command::new(decompression_cmd.cmd)
            .args(decompression_cmd.args)
            .arg(path)
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .spawn();
        let child = match spawned {
            Ok(child) => child,
            Err(err) => {
                // Only claim the binary is missing when that is what the OS
                // reported; otherwise surface the real cause.
                if err.kind() == io::ErrorKind::NotFound {
                    debug!(
                        "{}: decompression command '{}' not found",
                        path.display(), decompression_cmd.cmd);
                } else {
                    debug!(
                        "{}: failed to spawn decompression command '{}': {}",
                        path.display(), decompression_cmd.cmd, err);
                }
                return None;
            }
        };
        Some(DecompressionReader::new(decompression_cmd, child))
    }

    /// Wraps an already spawned `child` that was started from `cmd`.
    fn new(
        cmd: DecompressionCommand,
        child: process::Child,
    ) -> DecompressionReader {
        DecompressionReader {
            cmd: cmd,
            child: child,
            done: false,
        }
    }

    /// Builds the `io::Error` describing a failed decompression command.
    ///
    /// The child's stderr is read to the end and, if it contains anything
    /// other than whitespace, appended to the message. The outer `Result`
    /// is an error only if reading stderr itself fails.
    fn read_error(&mut self) -> io::Result<io::Error> {
        let mut errbytes = vec![];
        // A missing stderr handle is treated like empty stderr rather than
        // panicking while we are already reporting a failure.
        if let Some(stderr) = self.child.stderr.as_mut() {
            stderr.read_to_end(&mut errbytes)?;
        }
        let errstr = String::from_utf8_lossy(&errbytes);
        let errstr = errstr.trim();
        let msg = if errstr.is_empty() {
            format!("decompression command failed: '{}'", self.cmd)
        } else {
            format!(
                "decompression command '{}' failed: {}", self.cmd, errstr)
        };
        Ok(io::Error::new(io::ErrorKind::Other, msg))
    }
}
impl io::Read for DecompressionReader {
    /// Reads decompressed bytes from the child's stdout into `buf`.
    ///
    /// When stdout reports end of file, the child is waited on. If it exited
    /// unsuccessfully, an error built from its stderr is returned instead of
    /// `Ok(0)`. Every later call returns `Ok(0)`.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // A zero-length read always yields 0 bytes, which must not be
        // mistaken for end of file: doing so would mark the reader as done
        // and wait on a child that may still be producing output.
        if self.done || buf.is_empty() {
            return Ok(0);
        }
        let nread = self.child.stdout.as_mut().unwrap().read(buf)?;
        if nread == 0 {
            self.done = true;
            // Reap the child now that we're done reading.
            // If the command failed, report stderr as an error.
            // NOTE(review): stderr is only read at this point, after stdout
            // is exhausted. A child that writes a lot to stderr before
            // closing stdout might stall; confirm whether that can happen
            // with the supported tools.
            if !self.child.wait()?.success() {
                return Err(self.read_error()?);
            }
        }
        Ok(nread)
    }
}
/// Reports whether `path` names a file in one of the supported compression
/// formats or a compressed TAR archive.
pub fn is_compressed(path: &Path) -> bool {
    if is_supported_compression_format(path) {
        return true;
    }
    is_tar_archive(path)
}
/// Returns true if the given path matches any one of the supported compression
/// formats
fn is_supported_compression_format(path: &Path) -> bool {
SUPPORTED_COMPRESSION_FORMATS.is_match(path)
}
/// Returns true if the given path matches any of the known TAR file formats.
fn is_tar_archive(path: &Path) -> bool {
TAR_ARCHIVE_FORMATS.is_match(path)
}

View File

@ -4,6 +4,7 @@ extern crate bytecount;
extern crate clap;
extern crate encoding_rs;
extern crate env_logger;
extern crate globset;
extern crate grep;
extern crate ignore;
#[macro_use]
@ -44,6 +45,7 @@ macro_rules! eprintln {
mod app;
mod args;
mod decoder;
mod decompressor;
mod pathutil;
mod printer;
mod search_buffer;

View File

@ -9,6 +9,7 @@ use memmap::Mmap;
use termcolor::WriteColor;
use decoder::DecodeReader;
use decompressor::{self, DecompressionReader};
use pathutil::strip_prefix;
use printer::Printer;
use search_buffer::BufferSearcher;
@ -42,6 +43,7 @@ struct Options {
no_messages: bool,
quiet: bool,
text: bool,
search_zip_files: bool
}
impl Default for Options {
@ -61,6 +63,7 @@ impl Default for Options {
no_messages: false,
quiet: false,
text: false,
search_zip_files: false,
}
}
}
@ -190,6 +193,12 @@ impl WorkerBuilder {
self.opts.text = yes;
self
}
/// If enabled, search through compressed files as well.
///
/// Stores `yes` in the builder's `search_zip_files` option and returns the
/// builder so that calls can be chained. The option is `false` by default.
pub fn search_zip_files(mut self, yes: bool) -> Self {
self.opts.search_zip_files = yes;
self
}
}
/// Worker is responsible for executing searches on file paths, while choosing
@ -218,22 +227,33 @@ impl Worker {
}
Work::DirEntry(dent) => {
let mut path = dent.path();
let file = match File::open(path) {
Ok(file) => file,
Err(err) => {
if !self.opts.no_messages {
eprintln!("{}: {}", path.display(), err);
if self.opts.search_zip_files
&& decompressor::is_compressed(path)
{
match DecompressionReader::from_path(path) {
Some(reader) => self.search(printer, path, reader),
None => {
return 0;
}
return 0;
}
};
if let Some(p) = strip_prefix("./", path) {
path = p;
}
if self.opts.mmap {
self.search_mmap(printer, path, &file)
} else {
self.search(printer, path, file)
let file = match File::open(path) {
Ok(file) => file,
Err(err) => {
if !self.opts.no_messages {
eprintln!("{}: {}", path.display(), err);
}
return 0;
}
};
if let Some(p) = strip_prefix("./", path) {
path = p;
}
if self.opts.mmap {
self.search_mmap(printer, path, &file)
} else {
self.search(printer, path, file)
}
}
}
};

BIN
tests/data/sherlock.bz2 Normal file

Binary file not shown.

BIN
tests/data/sherlock.gz Normal file

Binary file not shown.

BIN
tests/data/sherlock.lzma Normal file

Binary file not shown.

BIN
tests/data/sherlock.xz Normal file

Binary file not shown.

View File

@ -75,6 +75,10 @@ fn sort_lines(lines: &str) -> String {
format!("{}\n", lines.join("\n"))
}
/// Reports whether a program called `name` can be launched.
///
/// The program is run with `--help`; only the ability to start it matters,
/// its exit status and output are ignored.
fn cmd_exists(name: &str) -> bool {
    let mut probe = Command::new(name);
    probe.arg("--help");
    match probe.output() {
        Ok(_) => true,
        Err(_) => false,
    }
}
sherlock!(single_file, |wd: WorkDir, mut cmd| {
let lines: String = wd.stdout(&mut cmd);
let expected = "\
@ -1609,6 +1613,104 @@ clean!(suggest_fixed_strings_for_invalid_regex, "foo(", ".",
assert_eq!(err.contains("--fixed-strings"), true);
});
#[test]
fn compressed_gzip() {
    // Nothing to verify on machines without the gzip binary.
    if !cmd_exists("gzip") {
        return;
    }

    // Place the gzip-compressed fixture in a fresh working directory.
    let wd = WorkDir::new("feature_search_compressed");
    wd.create_bytes("sherlock.gz", include_bytes!("./data/sherlock.gz"));

    let mut rg = wd.command();
    rg.arg("-z");
    rg.arg("Sherlock");
    rg.arg("sherlock.gz");

    let actual: String = wd.stdout(&mut rg);
    assert_eq!(
        actual,
        "\
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
"
    );
}
#[test]
fn compressed_bzip2() {
    // Nothing to verify on machines without the bzip2 binary.
    if !cmd_exists("bzip2") {
        return;
    }

    // Place the bzip2-compressed fixture in a fresh working directory.
    let wd = WorkDir::new("feature_search_compressed");
    wd.create_bytes("sherlock.bz2", include_bytes!("./data/sherlock.bz2"));

    let mut rg = wd.command();
    rg.arg("-z");
    rg.arg("Sherlock");
    rg.arg("sherlock.bz2");

    let actual: String = wd.stdout(&mut rg);
    assert_eq!(
        actual,
        "\
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
"
    );
}
#[test]
fn compressed_xz() {
    // Nothing to verify on machines without the xz binary.
    if !cmd_exists("xz") {
        return;
    }

    // Place the xz-compressed fixture in a fresh working directory.
    let wd = WorkDir::new("feature_search_compressed");
    wd.create_bytes("sherlock.xz", include_bytes!("./data/sherlock.xz"));

    let mut rg = wd.command();
    rg.arg("-z");
    rg.arg("Sherlock");
    rg.arg("sherlock.xz");

    let actual: String = wd.stdout(&mut rg);
    assert_eq!(
        actual,
        "\
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
"
    );
}
#[test]
fn compressed_lzma() {
    // `.lzma` files are decompressed with the xz binary, so that is the
    // program whose presence is checked here.
    if !cmd_exists("xz") {
        return;
    }

    // Place the lzma-compressed fixture in a fresh working directory.
    let wd = WorkDir::new("feature_search_compressed");
    wd.create_bytes("sherlock.lzma", include_bytes!("./data/sherlock.lzma"));

    let mut rg = wd.command();
    rg.arg("-z");
    rg.arg("Sherlock");
    rg.arg("sherlock.lzma");

    let actual: String = wd.stdout(&mut rg);
    assert_eq!(
        actual,
        "\
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
"
    );
}
#[test]
fn compressed_failing_gzip() {
    // Nothing to verify on machines without the gzip binary.
    if !cmd_exists("gzip") {
        return;
    }
    let wd = WorkDir::new("feature_search_compressed");
    // Plain text stored under a `.gz` name, so decompression is attempted
    // on data that is not gzip-compressed.
    wd.create("sherlock.gz", hay::SHERLOCK);
    let mut cmd = wd.command();
    cmd.arg("-z").arg("Sherlock").arg("sherlock.gz");
    wd.assert_non_empty_stderr(&mut cmd);
    let output = cmd.output().unwrap();
    let err = String::from_utf8_lossy(&output.stderr);
    // Include the captured stderr in the failure message so that a mismatch
    // (for example a differently worded gzip diagnostic) is easy to diagnose.
    assert!(
        err.contains("not in gzip format"),
        "expected stderr to mention 'not in gzip format', got: {}",
        err
    );
}
#[test]
fn feature_740_passthru() {
let wd = WorkDir::new("feature_740");