From ca058d7584ddc16d88a8dbff90b89cae8fca6e90 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 6 Sep 2016 21:47:33 -0400 Subject: [PATCH] Add support for memory maps. I thought plain `read` had usurped them, but when searching a very small number of files, mmaps can be around 20% faster on Linux. It'd be really unfortunate to leave that on the table. Mmap searching doesn't support contexts yet, but we probably don't really care. And duplicating that logic doesn't sound fun. Without contexts, mmap searching is delightfully simple. --- src/args.rs | 49 ++++++++ src/main.rs | 27 +++- src/search.rs | 53 ++++---- src/search_buffer.rs | 284 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 384 insertions(+), 29 deletions(-) create mode 100644 src/search_buffer.rs diff --git a/src/args.rs b/src/args.rs index e5320110..de9ddcb8 100644 --- a/src/args.rs +++ b/src/args.rs @@ -16,6 +16,7 @@ use ignore::Ignore; use out::Out; use printer::Printer; use search::{InputBuffer, Searcher}; +use search_buffer::BufferSearcher; use sys; use types::{FileTypeDef, Types, TypesBuilder}; use walk; @@ -111,6 +112,14 @@ Less common options: The byte to use for a line terminator. Escape sequences may be used. [default: \\n] + --mmap + Search using memory maps when possible. This is enabled by default + when xrep thinks it will be faster. (Note that mmap searching doesn't + current support the various context related options.) + + --no-mmap + Never use memory maps, even when they might be faster. + --no-ignore Don't respect ignore files (.gitignore, .xrepignore, etc.) 
@@ -166,10 +175,12 @@ pub struct RawArgs { flag_line_number: bool, flag_line_terminator: String, flag_literal: bool, + flag_mmap: bool, flag_no_heading: bool, flag_no_ignore: bool, flag_no_ignore_parent: bool, flag_no_line_number: bool, + flag_no_mmap: bool, flag_pretty: bool, flag_quiet: bool, flag_replace: Option, @@ -205,6 +216,7 @@ pub struct Args { ignore_case: bool, invert_match: bool, line_number: bool, + mmap: bool, no_ignore: bool, no_ignore_parent: bool, quiet: bool, @@ -251,6 +263,19 @@ impl RawArgs { } else { (self.flag_after_context, self.flag_before_context) }; + let mmap = + if before_context > 0 || after_context > 0 || self.flag_no_mmap { + false + } else if self.flag_mmap { + true + } else { + // If we're only searching a few paths and all of them are + // files, then memory maps are probably faster. + paths.len() <= 10 && paths.iter().all(|p| p.is_file()) + }; + if mmap { + debug!("will try to use memory maps"); + } let eol = { let eol = unescape(&self.flag_line_terminator); if eol.is_empty() { @@ -316,6 +341,7 @@ impl RawArgs { ignore_case: self.flag_ignore_case, invert_match: self.flag_invert_match, line_number: !self.flag_no_line_number && self.flag_line_number, + mmap: mmap, no_ignore: self.flag_no_ignore, no_ignore_parent: self.flag_no_ignore_parent, quiet: self.flag_quiet, @@ -405,6 +431,11 @@ impl Args { inp } + /// Whether we should prefer memory maps for searching or not. + pub fn mmap(&self) -> bool { + self.mmap + } + /// Create a new printer of individual search results that writes to the /// writer given. pub fn printer(&self, wtr: W) -> Printer { @@ -459,6 +490,24 @@ impl Args { .text(self.text) } + /// Create a new line based searcher whose configuration is taken from the + /// command line. This search operates on an entire file all at once (which + /// may have been memory mapped). 
+ pub fn searcher_buffer<'a, W: Send + io::Write>( + &self, + printer: &'a mut Printer, + grep: &'a Grep, + path: &'a Path, + buf: &'a [u8], + ) -> BufferSearcher<'a, W> { + BufferSearcher::new(printer, grep, path, buf) + .count(self.count) + .eol(self.eol) + .line_number(self.line_number) + .invert_match(self.invert_match) + .text(self.text) + } + /// Returns the number of worker search threads that should be used. pub fn threads(&self) -> usize { self.threads diff --git a/src/main.rs b/src/main.rs index 10611907..15b250a6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -34,6 +34,7 @@ use std::thread; use crossbeam::sync::chase_lev::{self, Steal, Stealer}; use grep::Grep; +use memmap::{Mmap, Protection}; use walkdir::DirEntry; use args::Args; @@ -61,6 +62,7 @@ mod ignore; mod out; mod printer; mod search; +mod search_buffer; mod sys; mod terminal; mod types; @@ -221,7 +223,11 @@ impl Worker { if let Ok(p) = path.strip_prefix("./") { path = p; } - self.search(printer, path, file) + if self.args.mmap() { + self.search_mmap(printer, path, &file) + } else { + self.search(printer, path, file) + } } }; match result { @@ -248,4 +254,23 @@ impl Worker { rdr, ).run().map_err(From::from) } + + fn search_mmap( + &mut self, + printer: &mut Printer, + path: &Path, + file: &File, + ) -> Result { + if try!(file.metadata()).len() == 0 { + // Opening a memory map with an empty file results in an error. + return Ok(0); + } + let mmap = try!(Mmap::open(file, Protection::Read)); + Ok(self.args.searcher_buffer( + printer, + &self.grep, + path, + unsafe { mmap.as_slice() }, + ).run()) + } } diff --git a/src/search.rs b/src/search.rs index 9739d51f..4f8ae10d 100644 --- a/src/search.rs +++ b/src/search.rs @@ -74,14 +74,14 @@ pub struct Searcher<'a, R, W: 'a> { /// Options for configuring search. 
#[derive(Clone)] -struct Options { - after_context: usize, - before_context: usize, - count: bool, - eol: u8, - invert_match: bool, - line_number: bool, - text: bool, +pub struct Options { + pub after_context: usize, + pub before_context: usize, + pub count: bool, + pub eol: u8, + pub invert_match: bool, + pub line_number: bool, + pub text: bool, } impl Default for Options { @@ -219,14 +219,11 @@ impl<'a, R: io::Read, W: Send + io::Write> Searcher<'a, R, W> { self.print_inverted_matches(upto); } } else if matched { - self.match_count += 1; - if !self.opts.count { - let start = self.last_match.start(); - let end = self.last_match.end(); - self.print_after_context(start); - self.print_before_context(start); - self.print_match(start, end); - } + let start = self.last_match.start(); + let end = self.last_match.end(); + self.print_after_context(start); + self.print_before_context(start); + self.print_match(start, end); } if matched { self.inp.pos = self.last_match.end(); @@ -275,11 +272,8 @@ impl<'a, R: io::Read, W: Send + io::Write> Searcher<'a, R, W> { debug_assert!(self.opts.invert_match); let mut it = IterLines::new(self.opts.eol, self.inp.pos); while let Some((start, end)) = it.next(&self.inp.buf[..upto]) { - if !self.opts.count { - self.print_match(start, end); - } + self.print_match(start, end); self.inp.pos = end; - self.match_count += 1; } } @@ -325,11 +319,15 @@ impl<'a, R: io::Read, W: Send + io::Write> Searcher<'a, R, W> { #[inline(always)] fn print_match(&mut self, start: usize, end: usize) { + self.match_count += 1; + if self.opts.count { + return; + } self.print_separator(start); self.count_lines(start); self.add_line(end); self.printer.matched( - self.grep.regex(), &self.path, + self.grep.regex(), self.path, &self.inp.buf, start, end, self.line_count); self.last_printed = end; self.after_context_remaining = self.opts.after_context; @@ -535,7 +533,7 @@ impl InputBuffer { /// /// Note that this may return both false positives and false negatives. 
#[inline(always)] -fn is_binary(buf: &[u8]) -> bool { +pub fn is_binary(buf: &[u8]) -> bool { if buf.len() >= 4 && &buf[0..4] == b"%PDF" { return true; } @@ -544,7 +542,7 @@ fn is_binary(buf: &[u8]) -> bool { /// Count the number of lines in the given buffer. #[inline(always)] -fn count_lines(mut buf: &[u8], eol: u8) -> u64 { +pub fn count_lines(mut buf: &[u8], eol: u8) -> u64 { let mut count = 0; while let Some(pos) = memchr(eol, buf) { count += 1; @@ -575,7 +573,7 @@ fn replace_buf(buf: &mut [u8], a: u8, b: u8) { /// advance over the positions of each line. We neglect that approach to avoid /// the borrow in the search code. (Because the borrow prevents composition /// through other mutable methods.) -struct IterLines { +pub struct IterLines { eol: u8, pos: usize, } @@ -585,7 +583,7 @@ impl IterLines { /// /// The buffer is passed to the `next` method. #[inline(always)] - fn new(eol: u8, start: usize) -> IterLines { + pub fn new(eol: u8, start: usize) -> IterLines { IterLines { eol: eol, pos: start, @@ -597,7 +595,7 @@ impl IterLines { /// /// The range returned includes the new line. 
#[inline(always)] - fn next(&mut self, buf: &[u8]) -> Option<(usize, usize)> { + pub fn next(&mut self, buf: &[u8]) -> Option<(usize, usize)> { match memchr(self.eol, &buf[self.pos..]) { None => { if self.pos < buf.len() { @@ -870,7 +868,7 @@ fn main() { } #[test] - fn basic_search() { + fn basic_search1() { let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s|s); assert_eq!(2, count); assert_eq!(out, "\ @@ -887,7 +885,6 @@ fn main() { assert_eq!(out, ""); } - #[test] fn binary_text() { let text = "Sherlock\n\x00Holmes\n"; diff --git a/src/search_buffer.rs b/src/search_buffer.rs new file mode 100644 index 00000000..48ff1ba0 --- /dev/null +++ b/src/search_buffer.rs @@ -0,0 +1,284 @@ +use std::cmp; +use std::io; +use std::path::Path; + +use grep::Grep; + +use printer::Printer; +use search::{IterLines, Options, count_lines, is_binary}; + +pub struct BufferSearcher<'a, W: 'a> { + opts: Options, + printer: &'a mut Printer, + grep: &'a Grep, + path: &'a Path, + buf: &'a [u8], + match_count: u64, + line_count: Option, + last_line: usize, +} + +impl<'a, W: Send + io::Write> BufferSearcher<'a, W> { + pub fn new( + printer: &'a mut Printer, + grep: &'a Grep, + path: &'a Path, + buf: &'a [u8], + ) -> BufferSearcher<'a, W> { + BufferSearcher { + opts: Options::default(), + printer: printer, + grep: grep, + path: path, + buf: buf, + match_count: 0, + line_count: None, + last_line: 0, + } + } + + /// If enabled, searching will print a count instead of each match. + /// + /// Disabled by default. + pub fn count(mut self, yes: bool) -> Self { + self.opts.count = yes; + self + } + + /// Set the end-of-line byte used by this searcher. + pub fn eol(mut self, eol: u8) -> Self { + self.opts.eol = eol; + self + } + + /// If enabled, matching is inverted so that lines that *don't* match the + /// given pattern are treated as matches. 
+ pub fn invert_match(mut self, yes: bool) -> Self { + self.opts.invert_match = yes; + self + } + + /// If enabled, compute line numbers and prefix each line of output with + /// them. + pub fn line_number(mut self, yes: bool) -> Self { + self.opts.line_number = yes; + self + } + + /// If enabled, search binary files as if they were text. + pub fn text(mut self, yes: bool) -> Self { + self.opts.text = yes; + self + } + + #[inline(never)] + pub fn run(mut self) -> u64 { + let binary_upto = cmp::min(4096, self.buf.len()); + if !self.opts.text && is_binary(&self.buf[..binary_upto]) { + return 0; + } + + self.match_count = 0; + self.line_count = if self.opts.line_number { Some(0) } else { None }; + let mut last_end = 0; + for m in self.grep.iter(self.buf) { + if self.opts.invert_match { + self.print_inverted_matches(last_end, m.start()); + } else { + self.print_match(m.start(), m.end()); + } + last_end = m.end(); + } + if self.opts.invert_match { + let upto = self.buf.len(); + self.print_inverted_matches(last_end, upto); + } + if self.opts.count && self.match_count > 0 { + self.printer.path_count(self.path, self.match_count); + } + self.match_count + } + + #[inline(always)] + pub fn print_match(&mut self, start: usize, end: usize) { + self.match_count += 1; + if self.opts.count { + return; + } + self.count_lines(start); + self.add_line(end); + self.printer.matched( + self.grep.regex(), self.path, self.buf, + start, end, self.line_count); + } + + #[inline(always)] + fn print_inverted_matches(&mut self, start: usize, end: usize) { + debug_assert!(self.opts.invert_match); + let mut it = IterLines::new(self.opts.eol, start); + while let Some((s, e)) = it.next(&self.buf[..end]) { + self.print_match(s, e); + } + } + + #[inline(always)] + fn count_lines(&mut self, upto: usize) { + if let Some(ref mut line_count) = self.line_count { + *line_count += count_lines( + &self.buf[self.last_line..upto], self.opts.eol); + self.last_line = upto; + } + } + + #[inline(always)] + fn 
add_line(&mut self, line_end: usize) { + if let Some(ref mut line_count) = self.line_count { + *line_count += 1; + self.last_line = line_end; + } + } +} + +#[cfg(test)] +mod tests { + use std::path::Path; + + use grep::{Grep, GrepBuilder}; + + use printer::Printer; + + use super::BufferSearcher; + + lazy_static! { + static ref SHERLOCK: &'static str = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +be, to a very large extent, the result of luck. Sherlock Holmes +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached.\ +"; + static ref CODE: &'static str = "\ +extern crate snap; + +use std::io; + +fn main() { + let stdin = io::stdin(); + let stdout = io::stdout(); + + // Wrap the stdin reader in a Snappy reader. + let mut rdr = snap::Reader::new(stdin.lock()); + let mut wtr = stdout.lock(); + io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); +} +"; + } + + fn matcher(pat: &str) -> Grep { + GrepBuilder::new(pat).build().unwrap() + } + + fn test_path() -> &'static Path { + &Path::new("/baz.rs") + } + + type TestSearcher<'a> = BufferSearcher<'a, Vec>; + + fn search TestSearcher>( + pat: &str, + haystack: &str, + mut map: F, + ) -> (u64, String) { + let mut pp = Printer::new(vec![], false).with_filename(true); + let grep = GrepBuilder::new(pat).build().unwrap(); + let count = { + let searcher = BufferSearcher::new( + &mut pp, &grep, test_path(), haystack.as_bytes()); + map(searcher).run() + }; + (count, String::from_utf8(pp.into_inner()).unwrap()) + } + + #[test] + fn basic_search() { + let (count, out) = search("Sherlock", &*SHERLOCK, |s|s); + assert_eq!(2, count); + assert_eq!(out, "\ +/baz.rs:For the Doctor Watsons of this world, as opposed to the Sherlock +/baz.rs:be, to a very large extent, the result of luck. 
Sherlock Holmes +"); + } + + #[test] + fn binary() { + let text = "Sherlock\n\x00Holmes\n"; + let (count, out) = search("Sherlock|Holmes", text, |s|s); + assert_eq!(0, count); + assert_eq!(out, ""); + } + + + #[test] + fn binary_text() { + let text = "Sherlock\n\x00Holmes\n"; + let (count, out) = search("Sherlock|Holmes", text, |s| s.text(true)); + assert_eq!(2, count); + assert_eq!(out, "/baz.rs:Sherlock\n/baz.rs:\x00Holmes\n"); + } + + #[test] + fn line_numbers() { + let (count, out) = search( + "Sherlock", &*SHERLOCK, |s| s.line_number(true)); + assert_eq!(2, count); + assert_eq!(out, "\ +/baz.rs:1:For the Doctor Watsons of this world, as opposed to the Sherlock +/baz.rs:3:be, to a very large extent, the result of luck. Sherlock Holmes +"); + } + + #[test] + fn count() { + let (count, out) = search( + "Sherlock", &*SHERLOCK, |s| s.count(true)); + assert_eq!(2, count); + assert_eq!(out, "/baz.rs:2\n"); + } + + #[test] + fn invert_match() { + let (count, out) = search( + "Sherlock", &*SHERLOCK, |s| s.invert_match(true)); + assert_eq!(4, count); + assert_eq!(out, "\ +/baz.rs:Holmeses, success in the province of detective work must always +/baz.rs:can extract a clew from a wisp of straw or a flake of cigar ash; +/baz.rs:but Doctor Watson has to have it taken out for him and dusted, +/baz.rs:and exhibited clearly, with a label attached. +"); + } + + #[test] + fn invert_match_line_numbers() { + let (count, out) = search("Sherlock", &*SHERLOCK, |s| { + s.invert_match(true).line_number(true) + }); + assert_eq!(4, count); + assert_eq!(out, "\ +/baz.rs:2:Holmeses, success in the province of detective work must always +/baz.rs:4:can extract a clew from a wisp of straw or a flake of cigar ash; +/baz.rs:5:but Doctor Watson has to have it taken out for him and dusted, +/baz.rs:6:and exhibited clearly, with a label attached. 
+"); + } + + #[test] + fn invert_match_count() { + let (count, out) = search("Sherlock", &*SHERLOCK, |s| { + s.invert_match(true).count(true) + }); + assert_eq!(4, count); + assert_eq!(out, "/baz.rs:4\n"); + } +}