1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2024-12-12 19:18:24 +02:00

Add support for memory maps.

I thought plain `read` had usurped them, but when searching a very small
number of files, mmaps can be around 20% faster on Linux. It'd be really
unfortunate to leave that on the table.

Mmap searching doesn't support contexts yet, but we probably don't really
care. And duplicating that logic doesn't sound fun. Without contexts, mmap
searching is delightfully simple.
This commit is contained in:
Andrew Gallant 2016-09-06 21:47:33 -04:00
parent af3b56a623
commit ca058d7584
4 changed files with 384 additions and 29 deletions

View File

@ -16,6 +16,7 @@ use ignore::Ignore;
use out::Out; use out::Out;
use printer::Printer; use printer::Printer;
use search::{InputBuffer, Searcher}; use search::{InputBuffer, Searcher};
use search_buffer::BufferSearcher;
use sys; use sys;
use types::{FileTypeDef, Types, TypesBuilder}; use types::{FileTypeDef, Types, TypesBuilder};
use walk; use walk;
@ -111,6 +112,14 @@ Less common options:
The byte to use for a line terminator. Escape sequences may be used. The byte to use for a line terminator. Escape sequences may be used.
[default: \\n] [default: \\n]
--mmap
Search using memory maps when possible. This is enabled by default
when xrep thinks it will be faster. (Note that mmap searching doesn't
current support the various context related options.)
--no-mmap
Never use memory maps, even when they might be faster.
--no-ignore --no-ignore
Don't respect ignore files (.gitignore, .xrepignore, etc.) Don't respect ignore files (.gitignore, .xrepignore, etc.)
@ -166,10 +175,12 @@ pub struct RawArgs {
flag_line_number: bool, flag_line_number: bool,
flag_line_terminator: String, flag_line_terminator: String,
flag_literal: bool, flag_literal: bool,
flag_mmap: bool,
flag_no_heading: bool, flag_no_heading: bool,
flag_no_ignore: bool, flag_no_ignore: bool,
flag_no_ignore_parent: bool, flag_no_ignore_parent: bool,
flag_no_line_number: bool, flag_no_line_number: bool,
flag_no_mmap: bool,
flag_pretty: bool, flag_pretty: bool,
flag_quiet: bool, flag_quiet: bool,
flag_replace: Option<String>, flag_replace: Option<String>,
@ -205,6 +216,7 @@ pub struct Args {
ignore_case: bool, ignore_case: bool,
invert_match: bool, invert_match: bool,
line_number: bool, line_number: bool,
mmap: bool,
no_ignore: bool, no_ignore: bool,
no_ignore_parent: bool, no_ignore_parent: bool,
quiet: bool, quiet: bool,
@ -251,6 +263,19 @@ impl RawArgs {
} else { } else {
(self.flag_after_context, self.flag_before_context) (self.flag_after_context, self.flag_before_context)
}; };
let mmap =
if before_context > 0 || after_context > 0 || self.flag_no_mmap {
false
} else if self.flag_mmap {
true
} else {
// If we're only searching a few paths and all of them are
// files, then memory maps are probably faster.
paths.len() <= 10 && paths.iter().all(|p| p.is_file())
};
if mmap {
debug!("will try to use memory maps");
}
let eol = { let eol = {
let eol = unescape(&self.flag_line_terminator); let eol = unescape(&self.flag_line_terminator);
if eol.is_empty() { if eol.is_empty() {
@ -316,6 +341,7 @@ impl RawArgs {
ignore_case: self.flag_ignore_case, ignore_case: self.flag_ignore_case,
invert_match: self.flag_invert_match, invert_match: self.flag_invert_match,
line_number: !self.flag_no_line_number && self.flag_line_number, line_number: !self.flag_no_line_number && self.flag_line_number,
mmap: mmap,
no_ignore: self.flag_no_ignore, no_ignore: self.flag_no_ignore,
no_ignore_parent: self.flag_no_ignore_parent, no_ignore_parent: self.flag_no_ignore_parent,
quiet: self.flag_quiet, quiet: self.flag_quiet,
@ -405,6 +431,11 @@ impl Args {
inp inp
} }
/// Whether we should prefer memory maps for searching or not.
///
/// This simply returns the `mmap` setting stored on these arguments. When
/// the arguments are built from the raw command line flags, that setting is
/// forced off if any before/after context is requested or `--no-mmap` is
/// given, forced on by `--mmap`, and otherwise enabled only when at most 10
/// paths were given and every one of them is a file.
pub fn mmap(&self) -> bool {
self.mmap
}
/// Create a new printer of individual search results that writes to the /// Create a new printer of individual search results that writes to the
/// writer given. /// writer given.
pub fn printer<W: Send + io::Write>(&self, wtr: W) -> Printer<W> { pub fn printer<W: Send + io::Write>(&self, wtr: W) -> Printer<W> {
@ -459,6 +490,24 @@ impl Args {
.text(self.text) .text(self.text)
} }
/// Create a new line based searcher, configured from the command line,
/// that operates on an entire buffer all at once. The buffer is typically
/// the full contents of a file (which may have been memory mapped).
///
/// Matches found in `buf` using `grep` are reported for `path` through
/// `printer`. Only the count, line terminator, line number, invert-match
/// and text options are forwarded to the searcher; no context options are
/// passed along.
pub fn searcher_buffer<'a, W: Send + io::Write>(
    &self,
    printer: &'a mut Printer<W>,
    grep: &'a Grep,
    path: &'a Path,
    buf: &'a [u8],
) -> BufferSearcher<'a, W> {
    // Start from the searcher's defaults, then apply each command line
    // option in turn.
    let searcher = BufferSearcher::new(printer, grep, path, buf);
    let searcher = searcher.count(self.count);
    let searcher = searcher.eol(self.eol);
    let searcher = searcher.line_number(self.line_number);
    let searcher = searcher.invert_match(self.invert_match);
    searcher.text(self.text)
}
/// Returns the number of worker search threads that should be used. /// Returns the number of worker search threads that should be used.
pub fn threads(&self) -> usize { pub fn threads(&self) -> usize {
self.threads self.threads

View File

@ -34,6 +34,7 @@ use std::thread;
use crossbeam::sync::chase_lev::{self, Steal, Stealer}; use crossbeam::sync::chase_lev::{self, Steal, Stealer};
use grep::Grep; use grep::Grep;
use memmap::{Mmap, Protection};
use walkdir::DirEntry; use walkdir::DirEntry;
use args::Args; use args::Args;
@ -61,6 +62,7 @@ mod ignore;
mod out; mod out;
mod printer; mod printer;
mod search; mod search;
mod search_buffer;
mod sys; mod sys;
mod terminal; mod terminal;
mod types; mod types;
@ -221,8 +223,12 @@ impl Worker {
if let Ok(p) = path.strip_prefix("./") { if let Ok(p) = path.strip_prefix("./") {
path = p; path = p;
} }
if self.args.mmap() {
self.search_mmap(printer, path, &file)
} else {
self.search(printer, path, file) self.search(printer, path, file)
} }
}
}; };
match result { match result {
Ok(count) => { Ok(count) => {
@ -248,4 +254,23 @@ impl Worker {
rdr, rdr,
).run().map_err(From::from) ).run().map_err(From::from)
} }
/// Search `file` by memory mapping it and scanning the whole mapping as a
/// single buffer, reporting results for `path` through `printer`.
///
/// Returns the number of matches reported by the buffer searcher. An empty
/// file is never mapped and always yields zero matches. Errors from reading
/// the file's metadata or from creating the map are propagated.
fn search_mmap<W>(
    &mut self,
    printer: &mut Printer<W>,
    path: &Path,
    file: &File,
) -> Result<u64>
where
    W: Send + io::Write,
{
    let metadata = try!(file.metadata());
    if metadata.len() == 0 {
        // Trying to memory map an empty file fails with an error, and
        // there is nothing to search in it anyway.
        return Ok(0);
    }
    let mmap = try!(Mmap::open(file, Protection::Read));
    // SAFETY: the map is opened read-only and `mmap` outlives `bytes`, so
    // the slice never dangles within this function.
    // NOTE(review): `as_slice` is presumably unsafe because the underlying
    // file may be modified or truncated by someone else while it is mapped;
    // nothing here guards against that -- confirm this is acceptable.
    let bytes = unsafe { mmap.as_slice() };
    let match_count =
        self.args.searcher_buffer(printer, &self.grep, path, bytes).run();
    Ok(match_count)
}
} }

View File

@ -74,14 +74,14 @@ pub struct Searcher<'a, R, W: 'a> {
/// Options for configuring search. /// Options for configuring search.
#[derive(Clone)] #[derive(Clone)]
struct Options { pub struct Options {
after_context: usize, pub after_context: usize,
before_context: usize, pub before_context: usize,
count: bool, pub count: bool,
eol: u8, pub eol: u8,
invert_match: bool, pub invert_match: bool,
line_number: bool, pub line_number: bool,
text: bool, pub text: bool,
} }
impl Default for Options { impl Default for Options {
@ -219,15 +219,12 @@ impl<'a, R: io::Read, W: Send + io::Write> Searcher<'a, R, W> {
self.print_inverted_matches(upto); self.print_inverted_matches(upto);
} }
} else if matched { } else if matched {
self.match_count += 1;
if !self.opts.count {
let start = self.last_match.start(); let start = self.last_match.start();
let end = self.last_match.end(); let end = self.last_match.end();
self.print_after_context(start); self.print_after_context(start);
self.print_before_context(start); self.print_before_context(start);
self.print_match(start, end); self.print_match(start, end);
} }
}
if matched { if matched {
self.inp.pos = self.last_match.end(); self.inp.pos = self.last_match.end();
} else { } else {
@ -275,11 +272,8 @@ impl<'a, R: io::Read, W: Send + io::Write> Searcher<'a, R, W> {
debug_assert!(self.opts.invert_match); debug_assert!(self.opts.invert_match);
let mut it = IterLines::new(self.opts.eol, self.inp.pos); let mut it = IterLines::new(self.opts.eol, self.inp.pos);
while let Some((start, end)) = it.next(&self.inp.buf[..upto]) { while let Some((start, end)) = it.next(&self.inp.buf[..upto]) {
if !self.opts.count {
self.print_match(start, end); self.print_match(start, end);
}
self.inp.pos = end; self.inp.pos = end;
self.match_count += 1;
} }
} }
@ -325,11 +319,15 @@ impl<'a, R: io::Read, W: Send + io::Write> Searcher<'a, R, W> {
#[inline(always)] #[inline(always)]
fn print_match(&mut self, start: usize, end: usize) { fn print_match(&mut self, start: usize, end: usize) {
self.match_count += 1;
if self.opts.count {
return;
}
self.print_separator(start); self.print_separator(start);
self.count_lines(start); self.count_lines(start);
self.add_line(end); self.add_line(end);
self.printer.matched( self.printer.matched(
self.grep.regex(), &self.path, self.grep.regex(), self.path,
&self.inp.buf, start, end, self.line_count); &self.inp.buf, start, end, self.line_count);
self.last_printed = end; self.last_printed = end;
self.after_context_remaining = self.opts.after_context; self.after_context_remaining = self.opts.after_context;
@ -535,7 +533,7 @@ impl InputBuffer {
/// ///
/// Note that this may return both false positives and false negatives. /// Note that this may return both false positives and false negatives.
#[inline(always)] #[inline(always)]
fn is_binary(buf: &[u8]) -> bool { pub fn is_binary(buf: &[u8]) -> bool {
if buf.len() >= 4 && &buf[0..4] == b"%PDF" { if buf.len() >= 4 && &buf[0..4] == b"%PDF" {
return true; return true;
} }
@ -544,7 +542,7 @@ fn is_binary(buf: &[u8]) -> bool {
/// Count the number of lines in the given buffer. /// Count the number of lines in the given buffer.
#[inline(always)] #[inline(always)]
fn count_lines(mut buf: &[u8], eol: u8) -> u64 { pub fn count_lines(mut buf: &[u8], eol: u8) -> u64 {
let mut count = 0; let mut count = 0;
while let Some(pos) = memchr(eol, buf) { while let Some(pos) = memchr(eol, buf) {
count += 1; count += 1;
@ -575,7 +573,7 @@ fn replace_buf(buf: &mut [u8], a: u8, b: u8) {
/// advance over the positions of each line. We neglect that approach to avoid /// advance over the positions of each line. We neglect that approach to avoid
/// the borrow in the search code. (Because the borrow prevents composition /// the borrow in the search code. (Because the borrow prevents composition
/// through other mutable methods.) /// through other mutable methods.)
struct IterLines { pub struct IterLines {
eol: u8, eol: u8,
pos: usize, pos: usize,
} }
@ -585,7 +583,7 @@ impl IterLines {
/// ///
/// The buffer is passed to the `next` method. /// The buffer is passed to the `next` method.
#[inline(always)] #[inline(always)]
fn new(eol: u8, start: usize) -> IterLines { pub fn new(eol: u8, start: usize) -> IterLines {
IterLines { IterLines {
eol: eol, eol: eol,
pos: start, pos: start,
@ -597,7 +595,7 @@ impl IterLines {
/// ///
/// The range returned includes the new line. /// The range returned includes the new line.
#[inline(always)] #[inline(always)]
fn next(&mut self, buf: &[u8]) -> Option<(usize, usize)> { pub fn next(&mut self, buf: &[u8]) -> Option<(usize, usize)> {
match memchr(self.eol, &buf[self.pos..]) { match memchr(self.eol, &buf[self.pos..]) {
None => { None => {
if self.pos < buf.len() { if self.pos < buf.len() {
@ -870,7 +868,7 @@ fn main() {
} }
#[test] #[test]
fn basic_search() { fn basic_search1() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s|s); let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s|s);
assert_eq!(2, count); assert_eq!(2, count);
assert_eq!(out, "\ assert_eq!(out, "\
@ -887,7 +885,6 @@ fn main() {
assert_eq!(out, ""); assert_eq!(out, "");
} }
#[test] #[test]
fn binary_text() { fn binary_text() {
let text = "Sherlock\n\x00Holmes\n"; let text = "Sherlock\n\x00Holmes\n";

284
src/search_buffer.rs Normal file
View File

@ -0,0 +1,284 @@
use std::cmp;
use std::io;
use std::path::Path;
use grep::Grep;
use printer::Printer;
use search::{IterLines, Options, count_lines, is_binary};
/// A searcher that scans one complete in-memory buffer for matching lines
/// and reports them through a `Printer`.
///
/// A searcher is created with `new`, configured through the builder style
/// option methods, and then consumed by `run`.
pub struct BufferSearcher<'a, W: 'a> {
// Search options (count, eol, invert_match, line_number, text), initially
// `Options::default()` and adjusted through the builder methods.
opts: Options,
// Destination for matched lines and per-path counts.
printer: &'a mut Printer<W>,
// The matcher whose `iter` yields matches over `buf`.
grep: &'a Grep,
// The path handed to the printer alongside every match and count.
path: &'a Path,
// The entire contents being searched.
buf: &'a [u8],
// Number of matches recorded so far; this is the value `run` returns.
match_count: u64,
// Running line count, tracked only when line numbers are enabled
// (`None` otherwise).
line_count: Option<u64>,
// Offset into `buf` up to which lines have already been counted.
last_line: usize,
}
impl<'a, W: Send + io::Write> BufferSearcher<'a, W> {
    /// Create a searcher over `buf` that finds matches with `grep` and
    /// reports them for `path` through `printer`.
    ///
    /// Every option starts at its `Options::default()` value; use the
    /// builder methods to change them before calling `run`.
    pub fn new(
        printer: &'a mut Printer<W>,
        grep: &'a Grep,
        path: &'a Path,
        buf: &'a [u8],
    ) -> BufferSearcher<'a, W> {
        BufferSearcher {
            buf: buf,
            path: path,
            grep: grep,
            printer: printer,
            opts: Options::default(),
            last_line: 0,
            line_count: None,
            match_count: 0,
        }
    }

    /// When enabled, only a count of matches is printed for the path
    /// instead of the matching lines themselves.
    ///
    /// Disabled by default.
    pub fn count(self, yes: bool) -> Self {
        let mut searcher = self;
        searcher.opts.count = yes;
        searcher
    }

    /// Set the byte that terminates a line for this searcher.
    pub fn eol(self, eol: u8) -> Self {
        let mut searcher = self;
        searcher.opts.eol = eol;
        searcher
    }

    /// When enabled, the sense of matching is inverted: lines that do
    /// *not* match the pattern are the ones reported as matches.
    pub fn invert_match(self, yes: bool) -> Self {
        let mut searcher = self;
        searcher.opts.invert_match = yes;
        searcher
    }

    /// When enabled, line numbers are computed and passed to the printer
    /// with every reported line.
    pub fn line_number(self, yes: bool) -> Self {
        let mut searcher = self;
        searcher.opts.line_number = yes;
        searcher
    }

    /// When enabled, binary detection is skipped and the buffer is always
    /// searched as if it were text.
    pub fn text(self, yes: bool) -> Self {
        let mut searcher = self;
        searcher.opts.text = yes;
        searcher
    }

    /// Execute the search over the whole buffer and return the number of
    /// matches found.
    ///
    /// Unless the `text` option is set, the first 4096 bytes (or the whole
    /// buffer, if shorter) are checked with `is_binary`; if that check
    /// fires, nothing is searched or printed and 0 is returned. In count
    /// mode, a single per-path count is printed at the end, and only when
    /// at least one match was found.
    #[inline(never)]
    pub fn run(mut self) -> u64 {
        if !self.opts.text {
            let sniff_len = cmp::min(4096, self.buf.len());
            if is_binary(&self.buf[..sniff_len]) {
                return 0;
            }
        }

        self.match_count = 0;
        self.line_count =
            if self.opts.line_number { Some(0) } else { None };

        // The shared references are copied out so that iterating over the
        // matches does not hold a borrow of `self`.
        let (grep, haystack) = (self.grep, self.buf);
        let inverted = self.opts.invert_match;
        let mut prev_end = 0;
        for m in grep.iter(haystack) {
            let (start, end) = (m.start(), m.end());
            if inverted {
                // Everything between the previous match and this one is a
                // run of non-matching lines.
                self.print_inverted_matches(prev_end, start);
            } else {
                self.print_match(start, end);
            }
            prev_end = end;
        }
        if inverted {
            // Lines after the final match (or the entire buffer, if there
            // were no matches at all) are non-matching too.
            let tail = haystack.len();
            self.print_inverted_matches(prev_end, tail);
        }
        if self.opts.count && self.match_count > 0 {
            self.printer.path_count(self.path, self.match_count);
        }
        self.match_count
    }

    /// Record a match spanning `start..end` in the buffer and, unless only
    /// counts were requested, hand it to the printer.
    #[inline(always)]
    pub fn print_match(&mut self, start: usize, end: usize) {
        self.match_count += 1;
        if !self.opts.count {
            // Bring the line count up to the start of this match, then
            // account for the matched line itself.
            self.count_lines(start);
            self.add_line(end);
            self.printer.matched(
                self.grep.regex(), self.path, self.buf,
                start, end, self.line_count);
        }
    }

    /// Report every line that begins at or after `start` and lies before
    /// `end` as a match. Only meaningful when matching is inverted.
    #[inline(always)]
    fn print_inverted_matches(&mut self, start: usize, end: usize) {
        debug_assert!(self.opts.invert_match);
        let haystack = self.buf;
        let region = &haystack[..end];
        let mut lines = IterLines::new(self.opts.eol, start);
        loop {
            match lines.next(region) {
                Some((line_start, line_end)) => {
                    self.print_match(line_start, line_end);
                }
                None => break,
            }
        }
    }

    /// If line numbers are being tracked, add the number of line
    /// terminators between the last counted position and `upto`, and
    /// remember `upto` as the new counted position.
    #[inline(always)]
    fn count_lines(&mut self, upto: usize) {
        if let Some(seen) = self.line_count {
            let newly_seen = count_lines(
                &self.buf[self.last_line..upto], self.opts.eol);
            self.line_count = Some(seen + newly_seen);
            self.last_line = upto;
        }
    }

    /// If line numbers are being tracked, count exactly one more line and
    /// mark everything up to `line_end` as already counted.
    #[inline(always)]
    fn add_line(&mut self, line_end: usize) {
        if let Some(seen) = self.line_count {
            self.line_count = Some(seen + 1);
            self.last_line = line_end;
        }
    }
}
#[cfg(test)]
mod tests {
    use std::path::Path;

    use grep::{Grep, GrepBuilder};
    use printer::Printer;

    use super::BufferSearcher;

    lazy_static! {
        // Six lines of prose; "Sherlock" occurs on lines 1 and 3. The text
        // has no trailing line terminator.
        static ref SHERLOCK: &'static str = "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";
        // A small source code fixture. It is not referenced by any test
        // visible in this module.
        static ref CODE: &'static str = "\
extern crate snap;
use std::io;
fn main() {
let stdin = io::stdin();
let stdout = io::stdout();
// Wrap the stdin reader in a Snappy reader.
let mut rdr = snap::Reader::new(stdin.lock());
let mut wtr = stdout.lock();
io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\");
}
";
    }

    /// Build a matcher for `pat`, panicking if the pattern is invalid.
    fn matcher(pat: &str) -> Grep {
        GrepBuilder::new(pat).build().unwrap()
    }

    /// The fixed path reported in the output of every test search.
    fn test_path() -> &'static Path {
        Path::new("/baz.rs")
    }

    type TestSearcher<'a> = BufferSearcher<'a, Vec<u8>>;

    /// Search `haystack` for `pat` and return the match count together with
    /// everything the printer wrote.
    ///
    /// `map` receives the freshly built searcher and returns it with any
    /// options the test wants applied before the search runs.
    fn search<F: FnMut(TestSearcher) -> TestSearcher>(
        pat: &str,
        haystack: &str,
        mut map: F,
    ) -> (u64, String) {
        let mut pp = Printer::new(vec![], false).with_filename(true);
        // Reuse the shared helper instead of building the matcher inline.
        let grep = matcher(pat);
        let count = {
            // Scoped so the mutable borrow of the printer ends before its
            // output is extracted below.
            let searcher = BufferSearcher::new(
                &mut pp, &grep, test_path(), haystack.as_bytes());
            map(searcher).run()
        };
        (count, String::from_utf8(pp.into_inner()).unwrap())
    }

    /// Default options print each matching line prefixed by the path.
    #[test]
    fn basic_search() {
        let (count, out) = search("Sherlock", &*SHERLOCK, |s|s);
        assert_eq!(2, count);
        assert_eq!(out, "\
/baz.rs:For the Doctor Watsons of this world, as opposed to the Sherlock
/baz.rs:be, to a very large extent, the result of luck. Sherlock Holmes
");
    }

    /// A buffer containing a NUL byte is treated as binary and skipped.
    #[test]
    fn binary() {
        let text = "Sherlock\n\x00Holmes\n";
        let (count, out) = search("Sherlock|Holmes", text, |s|s);
        assert_eq!(0, count);
        assert_eq!(out, "");
    }

    /// With the text option, the same binary buffer is searched normally.
    #[test]
    fn binary_text() {
        let text = "Sherlock\n\x00Holmes\n";
        let (count, out) = search("Sherlock|Holmes", text, |s| s.text(true));
        assert_eq!(2, count);
        assert_eq!(out, "/baz.rs:Sherlock\n/baz.rs:\x00Holmes\n");
    }

    /// Line numbers are 1-based and follow the path in the output.
    #[test]
    fn line_numbers() {
        let (count, out) = search(
            "Sherlock", &*SHERLOCK, |s| s.line_number(true));
        assert_eq!(2, count);
        assert_eq!(out, "\
/baz.rs:1:For the Doctor Watsons of this world, as opposed to the Sherlock
/baz.rs:3:be, to a very large extent, the result of luck. Sherlock Holmes
");
    }

    /// Count mode prints a single `path:count` line and no matched lines.
    #[test]
    fn count() {
        let (count, out) = search(
            "Sherlock", &*SHERLOCK, |s| s.count(true));
        assert_eq!(2, count);
        assert_eq!(out, "/baz.rs:2\n");
    }

    /// Inverted matching reports exactly the lines without a match.
    #[test]
    fn invert_match() {
        let (count, out) = search(
            "Sherlock", &*SHERLOCK, |s| s.invert_match(true));
        assert_eq!(4, count);
        assert_eq!(out, "\
/baz.rs:Holmeses, success in the province of detective work must always
/baz.rs:can extract a clew from a wisp of straw or a flake of cigar ash;
/baz.rs:but Doctor Watson has to have it taken out for him and dusted,
/baz.rs:and exhibited clearly, with a label attached.
");
    }

    /// Line numbers stay correct across the skipped (matching) lines when
    /// matching is inverted.
    #[test]
    fn invert_match_line_numbers() {
        let (count, out) = search("Sherlock", &*SHERLOCK, |s| {
            s.invert_match(true).line_number(true)
        });
        assert_eq!(4, count);
        assert_eq!(out, "\
/baz.rs:2:Holmeses, success in the province of detective work must always
/baz.rs:4:can extract a clew from a wisp of straw or a flake of cigar ash;
/baz.rs:5:but Doctor Watson has to have it taken out for him and dusted,
/baz.rs:6:and exhibited clearly, with a label attached.
");
    }

    /// Count mode combined with inverted matching counts the non-matching
    /// lines.
    #[test]
    fn invert_match_count() {
        let (count, out) = search("Sherlock", &*SHERLOCK, |s| {
            s.invert_match(true).count(true)
        });
        assert_eq!(4, count);
        assert_eq!(out, "/baz.rs:4\n");
    }
}