1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-02-04 06:08:39 +02:00

Rework glob sets.

We try to reduce the pressure on regexes and offload some of it to
Aho-Corasick or exact lookups.
This commit is contained in:
Andrew Gallant 2016-09-15 22:06:04 -04:00
parent f5c85827ce
commit 0e46171e3b
9 changed files with 519 additions and 91 deletions

7
Cargo.lock generated
View File

@ -5,6 +5,7 @@ dependencies = [
"deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"fnv 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"grep 0.1.0",
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
@ -57,6 +58,11 @@ dependencies = [
"regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "fnv"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "fs2"
version = "0.2.5"
@ -230,6 +236,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1614659040e711785ed8ea24219140654da1729f3ec8a47a9719d041112fe7bf"
"checksum docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)" = "fc42c6077823a361410c37d47c2535b73a190cbe10838dc4f400fe87c10c8c3b"
"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
"checksum fnv 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8e8af7b5408ab0c4910cad114c8f9eb454bf75df7afe8964307eeafb68a13a5e"
"checksum fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bcd414e5a1a979b931bb92f41b7a54106d3f6d2e6c253e9ce943b7cd468251ef"
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"

View File

@ -26,6 +26,7 @@ path = "tests/tests.rs"
deque = "0.3"
docopt = "0.6"
env_logger = "0.3"
fnv = "1.0"
grep = { version = "0.1", path = "grep" }
lazy_static = "0.2"
libc = "0.2"

View File

@ -64,7 +64,9 @@ def bench_linux_literal_default(suite_dir):
# doesn't read gitignore files. Instead, it has a file whitelist
# that happens to match up exactly with the gitignores for this search.
mkcmd('ucg', ['ucg', pat]),
mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'C'}),
# I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the
# default, but I'd guess it to be on most desktop systems.
mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}),
mkcmd('pt', ['pt', pat]),
# sift reports an extra line here for a binary file matched.
mkcmd('sift', ['sift', pat]),
@ -89,11 +91,10 @@ def bench_linux_literal(suite_dir):
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]),
mkcmd('ag', ['ag', '-s', pat]),
mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
mkcmd('rg (mmap)', ['rg', '-n', '--mmap', pat]),
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
mkcmd('ag (mmap)', ['ag', '-s', pat]),
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
mkcmd('git grep', [
'git', 'grep', '-I', '-n', pat,
], env={'LC_ALL': 'C'}),
@ -121,13 +122,16 @@ def bench_linux_literal_casei(suite_dir):
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-i', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]),
mkcmd('rg-novcs-mmap', [
'rg', '--mmap', '--no-ignore', '-n', '-i', pat,
mkcmd('rg (mmap)', ['rg', '-n', '-i', pat]),
mkcmd('rg (whitelist)', [
'rg', '-n', '-i', '--no-ignore', '-tall', pat,
]),
mkcmd('ag', ['ag', '-i', pat]),
mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-i', pat]),
mkcmd('ag (mmap)', ['ag', '-i', pat]),
mkcmd('ucg', ['ucg', '-i', pat]),
# It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
# since that is certainly what ripgrep is doing, but this is for an
# ASCII literal, so we should give `git grep` all the opportunity to
# do its best.
mkcmd('git grep', [
'git', 'grep', '-I', '-n', '-i', pat,
], env={'LC_ALL': 'C'}),

View File

@ -21,6 +21,7 @@ additional rules such as whitelists (prefix of `!`) or directory-only globs
// TODO(burntsushi): Implement something similar, but for Mercurial. We can't
// use this exact implementation because hgignore files are different.
use std::cell::RefCell;
use std::error::Error as StdError;
use std::fmt;
use std::fs::File;
@ -30,6 +31,7 @@ use std::path::{Path, PathBuf};
use regex;
use glob;
use pathutil::strip_prefix;
/// Represents an error that can occur when parsing a gitignore file.
#[derive(Debug)]
@ -110,37 +112,35 @@ impl Gitignore {
/// same directory as this gitignore file.
pub fn matched<P: AsRef<Path>>(&self, path: P, is_dir: bool) -> Match {
let mut path = path.as_ref();
if let Ok(p) = path.strip_prefix(&self.root) {
if let Some(p) = strip_prefix("./", path) {
path = p;
}
self.matched_utf8(&*path.to_string_lossy(), is_dir)
if let Some(p) = strip_prefix(&self.root, path) {
path = p;
}
self.matched_stripped(path, is_dir)
}
/// Like matched, but takes a path that has already been stripped and
/// converted to UTF-8.
pub fn matched_utf8(&self, path: &str, is_dir: bool) -> Match {
// A single regex with a bunch of alternations of glob patterns is
// unfortunately typically faster than a regex, so we use it as a
// first pass filter. We still need to run the RegexSet to get the most
// recently defined glob that matched.
if !self.set.is_match(path) {
return Match::None;
/// Like matched, but takes a path that has already been stripped.
pub fn matched_stripped(&self, path: &Path, is_dir: bool) -> Match {
thread_local! {
static MATCHES: RefCell<Vec<usize>> = RefCell::new(vec![]);
}
// The regex set can't actually pick the right glob that matched all
// on its own. In particular, some globs require that only directories
// can match. Thus, only accept a match from the regex set if the given
// path satisfies the corresponding glob's directory criteria.
for i in self.set.matches(path).iter().rev() {
let pat = &self.patterns[i];
if !pat.only_dir || is_dir {
return if pat.whitelist {
Match::Whitelist(pat)
} else {
Match::Ignored(pat)
};
MATCHES.with(|matches| {
let mut matches = matches.borrow_mut();
self.set.matches_into(path, &mut *matches);
for &i in matches.iter().rev() {
let pat = &self.patterns[i];
if !pat.only_dir || is_dir {
return if pat.whitelist {
Match::Whitelist(pat)
} else {
Match::Ignored(pat)
};
}
}
}
Match::None
Match::None
})
}
/// Returns the total number of ignore patterns.
@ -390,6 +390,7 @@ mod tests {
ignored!(ig23, ROOT, "foo", "./foo");
ignored!(ig24, ROOT, "target", "grep/target");
ignored!(ig25, ROOT, "Cargo.lock", "./tabwriter-bin/Cargo.lock");
ignored!(ig26, ROOT, "/foo/bar/baz", "./foo/bar/baz");
not_ignored!(ignot1, ROOT, "amonths", "months");
not_ignored!(ignot2, ROOT, "monthsa", "months");

View File

@ -26,13 +26,22 @@ to make its way into `glob` proper.
// at the .gitignore for the chromium repo---just about every pattern satisfies
// that assumption.)
use std::borrow::Cow;
use std::collections::HashMap;
use std::error::Error as StdError;
use std::ffi::{OsStr, OsString};
use std::fmt;
use std::hash;
use std::iter;
use std::path::Path;
use std::str;
use fnv;
use regex;
use regex::bytes::{Regex, RegexSet, SetMatches};
use regex::bytes::Regex;
use regex::bytes::RegexSet;
use pathutil::file_name;
/// Represents an error that can occur when parsing a glob pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
@ -71,33 +80,165 @@ impl fmt::Display for Error {
}
}
/// SetYesNo represents a group of globs that can be matched together in a
/// single pass. SetYesNo can only determine whether a particular path matched
/// any pattern in the set.
#[derive(Clone, Debug)]
pub struct SetYesNo {
re: Regex,
}
impl SetYesNo {
/// Returns true if and only if the given path matches at least one glob
/// in this set.
pub fn is_match<T: AsRef<Path>>(&self, path: T) -> bool {
self.re.is_match(&*path_bytes(path.as_ref()))
}
fn new(
pats: &[(Pattern, MatchOptions)],
) -> Result<SetYesNo, regex::Error> {
let mut joined = String::new();
for &(ref p, ref o) in pats {
let part = format!("(?:{})", p.to_regex_with(o));
if !joined.is_empty() {
joined.push('|');
}
joined.push_str(&part);
}
Ok(SetYesNo { re: try!(Regex::new(&joined)) })
}
}
type Fnv = hash::BuildHasherDefault<fnv::FnvHasher>;
/// Set represents a group of globs that can be matched together in a single
/// pass.
#[derive(Clone, Debug)]
pub struct Set {
re: Regex,
set: RegexSet,
yesno: SetYesNo,
exts: HashMap<OsString, Vec<usize>, Fnv>,
literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
base_literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
base_prefixes: Vec<Vec<u8>>,
base_prefixes_map: Vec<usize>,
base_suffixes: Vec<Vec<u8>>,
base_suffixes_map: Vec<usize>,
regexes: RegexSet,
regexes_map: Vec<usize>,
}
impl Set {
/// Returns true if and only if the given path matches at least one glob
/// in this set.
pub fn is_match<T: AsRef<[u8]>>(&self, path: T) -> bool {
self.re.is_match(path.as_ref())
}
/// Returns every glob pattern (by sequence number) that matches the given
/// path.
pub fn matches<T: AsRef<[u8]>>(&self, path: T) -> SetMatches {
// TODO(burntsushi): If we split this out into a separate crate, don't
// expose the regex::SetMatches type in the public API.
self.set.matches(path.as_ref())
}
/// Returns the number of glob patterns in this set.
/// Returns the sequence number of every glob pattern that matches the
/// given path.
#[allow(dead_code)]
pub fn len(&self) -> usize {
self.set.len()
pub fn matches<T: AsRef<Path>>(&self, path: T) -> Vec<usize> {
let mut into = vec![];
self.matches_into(path, &mut into);
into
}
/// Adds the sequence number of every glob pattern that matches the given
/// path to the vec given.
///
/// `into` is cleared first, so after this call it contains exactly the
/// original sequence numbers of the matching patterns, sorted ascending.
pub fn matches_into<T: AsRef<Path>>(
    &self,
    path: T,
    into: &mut Vec<usize>,
) {
    into.clear();
    let path = path.as_ref();
    // Cheap yes/no pre-filter: if nothing in the set matches at all, we
    // can skip extracting bytes/basename and all the lookups below.
    if !self.yesno.is_match(path) {
        return;
    }
    let path_bytes = &*path_bytes(path);
    let basename = file_name(path).map(|b| os_str_bytes(b));
    // Extension lookup (patterns recognized by `Pattern::ext`).
    if !self.exts.is_empty() {
        if let Some(ext) = path.extension() {
            if let Some(matches) = self.exts.get(ext) {
                into.extend(matches.as_slice());
            }
        }
    }
    // Whole-path literal lookup.
    if !self.literals.is_empty() {
        if let Some(matches) = self.literals.get(path_bytes) {
            into.extend(matches.as_slice());
        }
    }
    // Basename literal lookup.
    if !self.base_literals.is_empty() {
        if let Some(ref basename) = basename {
            if let Some(matches) = self.base_literals.get(&**basename) {
                into.extend(matches.as_slice());
            }
        }
    }
    // Basename literal prefix scan; indices are translated back to
    // original sequence numbers via `base_prefixes_map`.
    if !self.base_prefixes.is_empty() {
        if let Some(ref basename) = basename {
            let basename = &**basename;
            for (i, pre) in self.base_prefixes.iter().enumerate() {
                if pre.len() <= basename.len()
                    && &**pre == &basename[0..pre.len()] {
                    into.push(self.base_prefixes_map[i]);
                }
            }
        }
    }
    // Basename literal suffix scan; translated via `base_suffixes_map`.
    if !self.base_suffixes.is_empty() {
        if let Some(ref basename) = basename {
            let basename = &**basename;
            for (i, suf) in self.base_suffixes.iter().enumerate() {
                if suf.len() > basename.len() {
                    continue;
                }
                let (s, e) = (basename.len() - suf.len(), basename.len());
                if &**suf == &basename[s..e] {
                    into.push(self.base_suffixes_map[i]);
                }
            }
        }
    }
    // BUG FIX: the regex set reports indices into the *subset* of
    // patterns that were compiled as regexes, so they must be mapped
    // back to original sequence numbers via `regexes_map`, exactly like
    // every other strategy above. Previously the raw subset indices were
    // extended into `into`, yielding wrong sequence numbers.
    for i in self.regexes.matches(path_bytes).iter() {
        into.push(self.regexes_map[i]);
    }
    // Restore global sequence order across all strategies.
    into.sort();
}
/// Builds a `Set` by partitioning the given patterns into specialized
/// matching strategies: extension lookup, whole-path literal lookup,
/// basename literal lookup, basename prefix/suffix scans, and finally a
/// regex set for everything else.
///
/// Each `*_map` vec translates an index within its strategy back to the
/// pattern's original sequence number in `pats`.
fn new(pats: &[(Pattern, MatchOptions)]) -> Result<Set, regex::Error> {
    let fnv = Fnv::default();
    let mut exts = HashMap::with_hasher(fnv.clone());
    let mut literals = HashMap::with_hasher(fnv.clone());
    let mut base_literals = HashMap::with_hasher(fnv.clone());
    let (mut base_prefixes, mut base_prefixes_map) = (vec![], vec![]);
    let (mut base_suffixes, mut base_suffixes_map) = (vec![], vec![]);
    let (mut regexes, mut regexes_map) = (vec![], vec![]);
    for (i, &(ref p, ref o)) in pats.iter().enumerate() {
        // Try the cheapest strategy that exactly captures this pattern;
        // only fall back to the regex set when nothing else applies.
        if let Some(ext) = p.ext() {
            exts.entry(ext).or_insert(vec![]).push(i);
        } else if let Some(literal) = p.literal() {
            literals.entry(literal.into_bytes()).or_insert(vec![]).push(i);
        } else if let Some(literal) = p.base_literal() {
            base_literals
                .entry(literal.into_bytes()).or_insert(vec![]).push(i);
        } else if let Some(literal) = p.base_literal_prefix() {
            base_prefixes.push(literal.into_bytes());
            base_prefixes_map.push(i);
        } else if let Some(literal) = p.base_literal_suffix() {
            base_suffixes.push(literal.into_bytes());
            base_suffixes_map.push(i);
        } else {
            let part = format!("(?:{})", p.to_regex_with(o));
            regexes.push(part);
            regexes_map.push(i);
        }
    }
    Ok(Set {
        // The yes/no matcher over *all* patterns is used as a fast
        // pre-filter before the per-strategy lookups.
        yesno: try!(SetYesNo::new(pats)),
        exts: exts,
        literals: literals,
        base_literals: base_literals,
        base_prefixes: base_prefixes,
        base_prefixes_map: base_prefixes_map,
        base_suffixes: base_suffixes,
        base_suffixes_map: base_suffixes_map,
        regexes: try!(RegexSet::new(regexes)),
        regexes_map: regexes_map,
    })
}
}
@ -119,19 +260,12 @@ impl SetBuilder {
///
/// Once a matcher is built, no new patterns can be added to it.
pub fn build(&self) -> Result<Set, regex::Error> {
let it = self.pats.iter().map(|&(ref p, ref o)| p.to_regex_with(o));
let set = try!(RegexSet::new(it));
Set::new(&self.pats)
}
let mut joined = String::new();
for &(ref p, ref o) in &self.pats {
let part = format!("(?:{})", p.to_regex_with(o));
if !joined.is_empty() {
joined.push('|');
}
joined.push_str(&part);
}
let re = try!(Regex::new(&joined));
Ok(Set { re: re, set: set })
/// Like `build`, but returns a matcher that can only answer yes/no.
pub fn build_yesno(&self) -> Result<SetYesNo, regex::Error> {
SetYesNo::new(&self.pats)
}
/// Add a new pattern to this set.
@ -149,8 +283,21 @@ impl SetBuilder {
pat: &str,
opts: &MatchOptions,
) -> Result<(), Error> {
let pat = try!(Pattern::new(pat));
self.pats.push((pat, opts.clone()));
let parsed = try!(Pattern::new(pat));
// if let Some(ext) = parsed.ext() {
// eprintln!("ext :: {:?} :: {:?}", ext, pat);
// } else if let Some(lit) = parsed.literal() {
// eprintln!("literal :: {:?} :: {:?}", lit, pat);
// } else if let Some(lit) = parsed.base_literal() {
// eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
// } else if let Some(lit) = parsed.base_literal_prefix() {
// eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
// } else if let Some(lit) = parsed.base_literal_suffix() {
// eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
// } else {
// eprintln!("regex :: {:?} :: {:?}", pat, parsed);
// }
self.pats.push((parsed, opts.clone()));
Ok(())
}
}
@ -204,6 +351,114 @@ impl Pattern {
Ok(p.p)
}
/// Returns an extension if this pattern exclusively matches it.
///
/// Concretely, the token stream must be `RecursivePrefix`, `ZeroOrMore`,
/// `Literal('.')`, followed by one or more literal characters that are
/// not `/`, `\` or `.`.
pub fn ext(&self) -> Option<OsString> {
    if self.tokens.len() <= 3 {
        return None;
    }
    let head_ok = match (self.tokens.get(0),
                         self.tokens.get(1),
                         self.tokens.get(2)) {
        (Some(&Token::RecursivePrefix),
         Some(&Token::ZeroOrMore),
         Some(&Token::Literal('.'))) => true,
        _ => false,
    };
    if !head_ok {
        return None;
    }
    let mut lit = OsString::new();
    for t in &self.tokens[3..] {
        match *t {
            Token::Literal(c) => {
                if c == '/' || c == '\\' || c == '.' {
                    return None;
                }
                lit.push(c.to_string());
            }
            _ => return None,
        }
    }
    Some(lit)
}
/// Returns the pattern as a literal if and only if the pattern exclusively
/// matches the basename of a file path *and* is a literal.
///
/// The basic format of these patterns is `**/{literal}`, where `{literal}`
/// does not contain a path separator.
pub fn base_literal(&self) -> Option<String> {
    if let Some(&Token::RecursivePrefix) = self.tokens.get(0) {
        // Leading `**` present; keep going.
    } else {
        return None;
    }
    let mut lit = String::new();
    for t in self.tokens[1..].iter() {
        if let Token::Literal(c) = *t {
            if c == '/' || c == '\\' {
                return None;
            }
            lit.push(c);
        } else {
            return None;
        }
    }
    Some(lit)
}
/// Returns the pattern as a literal if and only if the pattern must match
/// an entire path exactly.
///
/// The basic format of these patterns is `{literal}`: every token must be
/// a literal character.
pub fn literal(&self) -> Option<String> {
    let mut lit = String::new();
    for t in self.tokens.iter() {
        if let Token::Literal(c) = *t {
            lit.push(c);
        } else {
            return None;
        }
    }
    Some(lit)
}
/// Returns a basename literal prefix of this pattern.
///
/// The token stream must be a leading `RecursivePrefix`, then literal
/// characters with no path separator, then a trailing `ZeroOrMore`.
pub fn base_literal_prefix(&self) -> Option<String> {
    let starts_recursive = match self.tokens.first() {
        Some(&Token::RecursivePrefix) => true,
        _ => false,
    };
    let ends_star = match self.tokens.last() {
        Some(&Token::ZeroOrMore) => true,
        _ => false,
    };
    if !starts_recursive || !ends_star {
        return None;
    }
    let mut lit = String::new();
    for t in &self.tokens[1..self.tokens.len() - 1] {
        match *t {
            Token::Literal(c) if c != '/' && c != '\\' => lit.push(c),
            _ => return None,
        }
    }
    Some(lit)
}
/// Returns a basename literal suffix of this pattern.
///
/// The token stream must be a leading `RecursivePrefix`, then a
/// `ZeroOrMore`, then literal characters with no path separator.
pub fn base_literal_suffix(&self) -> Option<String> {
    match (self.tokens.get(0), self.tokens.get(1)) {
        (Some(&Token::RecursivePrefix), Some(&Token::ZeroOrMore)) => {}
        _ => return None,
    }
    let mut lit = String::new();
    for t in self.tokens[2..].iter() {
        match *t {
            Token::Literal(c) if c != '/' && c != '\\' => lit.push(c),
            _ => return None,
        }
    }
    Some(lit)
}
/// Convert this pattern to a string that is guaranteed to be a valid
/// regular expression and will represent the matching semantics of this
/// glob pattern. This uses a default set of options.
@ -415,13 +670,34 @@ impl<'a> Parser<'a> {
}
}
/// Returns the raw bytes of the given path's underlying OS string.
///
/// On Unix this borrows for free; on other platforms it may allocate
/// (see `os_str_bytes`).
fn path_bytes(path: &Path) -> Cow<[u8]> {
    os_str_bytes(path.as_os_str())
}
/// Returns the bytes of an OS string.
///
/// On Unix, this is a zero-cost borrow of the underlying bytes.
#[cfg(unix)]
fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
    use std::os::unix::ffi::OsStrExt;
    Cow::Borrowed(s.as_bytes())
}
/// Returns the bytes of an OS string, lossily converted to UTF-8 when a
/// direct borrow of valid UTF-8 is not possible.
#[cfg(not(unix))]
fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
    // TODO(burntsushi): On Windows, OS strings are probably UTF-16, so even
    // if we could get at the raw bytes, they wouldn't be useful. We *must*
    // convert to UTF-8 before doing path matching. Unfortunate, but necessary.
    match s.to_string_lossy() {
        // Conversion had to allocate: hand back the owned bytes.
        Cow::Owned(s) => Cow::Owned(s.into_bytes()),
        // Already valid UTF-8: borrow the original bytes.
        Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
    }
}
#[cfg(test)]
mod tests {
use std::path::Path;
use regex::bytes::Regex;
use super::{Error, Pattern, MatchOptions, SetBuilder, Token};
use super::{Error, Pattern, MatchOptions, Set, SetBuilder, Token};
use super::Token::*;
macro_rules! syntax {
@ -491,6 +767,37 @@ mod tests {
};
}
macro_rules! ext {
($name:ident, $pat:expr, $ext:expr) => {
#[test]
fn $name() {
let pat = Pattern::new($pat).unwrap();
let ext = pat.ext().map(|e| e.to_string_lossy().into_owned());
assert_eq!($ext, ext.as_ref().map(|s| &**s));
}
};
}
macro_rules! baseliteral {
($name:ident, $pat:expr, $yes:expr) => {
#[test]
fn $name() {
let pat = Pattern::new($pat).unwrap();
assert_eq!($yes, pat.base_literal().is_some());
}
};
}
macro_rules! basesuffix {
($name:ident, $pat:expr, $yes:expr) => {
#[test]
fn $name() {
let pat = Pattern::new($pat).unwrap();
assert_eq!($yes, pat.is_literal_suffix());
}
};
}
fn class(s: char, e: char) -> Token {
Class { negated: false, ranges: vec![(s, e)] }
}
@ -585,6 +892,26 @@ mod tests {
toregex!(re10, "+", r"^\+$");
toregex!(re11, "**", r"^.*$");
ext!(ext1, "**/*.rs", Some("rs"));
baseliteral!(lit1, "**", true);
baseliteral!(lit2, "**/a", true);
baseliteral!(lit3, "**/ab", true);
baseliteral!(lit4, "**/a*b", false);
baseliteral!(lit5, "z/**/a*b", false);
baseliteral!(lit6, "[ab]", false);
baseliteral!(lit7, "?", false);
/*
issuffix!(suf1, "", false);
issuffix!(suf2, "a", true);
issuffix!(suf3, "ab", true);
issuffix!(suf4, "*ab", true);
issuffix!(suf5, "*.ab", true);
issuffix!(suf6, "?.ab", true);
issuffix!(suf7, "ab*", false);
*/
matches!(match1, "a", "a");
matches!(match2, "a*b", "a_b");
matches!(match3, "a*b*c", "abc");
@ -681,16 +1008,22 @@ mod tests {
builder.add("src/lib.rs").unwrap();
let set = builder.build().unwrap();
assert!(set.is_match("foo.c"));
assert!(set.is_match("src/foo.c"));
assert!(!set.is_match("foo.rs"));
assert!(!set.is_match("tests/foo.rs"));
assert!(set.is_match("src/foo.rs"));
assert!(set.is_match("src/grep/src/main.rs"));
fn is_match(set: &Set, s: &str) -> bool {
let mut matches = vec![];
set.matches_into(s, &mut matches);
!matches.is_empty()
}
assert_eq!(2, set.matches("src/lib.rs").iter().count());
assert!(set.matches("src/lib.rs").matched(0));
assert!(!set.matches("src/lib.rs").matched(1));
assert!(set.matches("src/lib.rs").matched(2));
assert!(is_match(&set, "foo.c"));
assert!(is_match(&set, "src/foo.c"));
assert!(!is_match(&set, "foo.rs"));
assert!(!is_match(&set, "tests/foo.rs"));
assert!(is_match(&set, "src/foo.rs"));
assert!(is_match(&set, "src/grep/src/main.rs"));
let matches = set.matches("src/lib.rs");
assert_eq!(2, matches.len());
assert_eq!(0, matches[0]);
assert_eq!(2, matches[1]);
}
}

View File

@ -365,8 +365,7 @@ impl Overrides {
let path = path.as_ref();
self.gi.as_ref()
.map(|gi| {
let path = &*path.to_string_lossy();
let mat = gi.matched_utf8(path, is_dir).invert();
let mat = gi.matched_stripped(path, is_dir).invert();
if mat.is_none() && !is_dir {
if gi.num_ignores() > 0 {
return Match::Ignored(&self.unmatched_pat);

View File

@ -1,6 +1,7 @@
extern crate deque;
extern crate docopt;
extern crate env_logger;
extern crate fnv;
extern crate grep;
#[cfg(windows)]
extern crate kernel32;
@ -36,6 +37,7 @@ use walkdir::DirEntry;
use args::Args;
use out::{ColoredTerminal, Out};
use pathutil::strip_prefix;
use printer::Printer;
use search_stream::InputBuffer;
#[cfg(windows)]
@ -60,6 +62,7 @@ mod gitignore;
mod glob;
mod ignore;
mod out;
mod pathutil;
mod printer;
mod search_buffer;
mod search_stream;
@ -257,7 +260,7 @@ impl Worker {
}
WorkReady::DirFile(ent, file) => {
let mut path = ent.path();
if let Ok(p) = path.strip_prefix("./") {
if let Some(p) = strip_prefix("./", path) {
path = p;
}
if self.args.mmap() {
@ -268,7 +271,7 @@ impl Worker {
}
WorkReady::PathFile(path, file) => {
let mut path = &*path;
if let Ok(p) = path.strip_prefix("./") {
if let Some(p) = strip_prefix("./", path) {
path = p;
}
if self.args.mmap() {

80
src/pathutil.rs Normal file
View File

@ -0,0 +1,80 @@
/*!
The pathutil module provides platform specific operations on paths that are
typically faster than the same operations as provided in std::path. In
particular, we really want to avoid the costly operation of parsing the path
into its constituent components. We give up on Windows, but on Unix, we deal
with the raw bytes directly.
On large repositories (like chromium), this can have a ~25% performance
improvement on just listing the files to search (!).
*/
use std::ffi::OsStr;
use std::path::Path;
/// Strip `prefix` from the `path` and return the remainder.
///
/// If `path` doesn't have a prefix `prefix`, then return `None`.
///
/// Note that this compares the raw OS-string bytes directly, so the
/// prefix does not need to end on a path-component boundary.
#[cfg(unix)]
pub fn strip_prefix<'a, P: AsRef<Path>>(
    prefix: P,
    path: &'a Path,
) -> Option<&'a Path> {
    use std::os::unix::ffi::OsStrExt;
    let prefix = prefix.as_ref().as_os_str().as_bytes();
    let path = path.as_os_str().as_bytes();
    if path.starts_with(prefix) {
        Some(Path::new(OsStr::from_bytes(&path[prefix.len()..])))
    } else {
        None
    }
}
/// Strip `prefix` from the `path` and return the remainder.
///
/// If `path` doesn't have a prefix `prefix`, then return `None`.
///
/// On non-Unix platforms this defers to `Path::strip_prefix`, which
/// strips component-wise.
#[cfg(not(unix))]
pub fn strip_prefix<'a>(prefix: &Path, path: &'a Path) -> Option<&'a Path> {
    match path.strip_prefix(prefix) {
        Ok(p) => Some(p),
        Err(_) => None,
    }
}
/// The final component of the path, if it is a normal file.
///
/// If the path terminates in ., .., or consists solely of a root of prefix,
/// file_name will return None.
#[cfg(unix)]
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
    path: &'a P,
) -> Option<&'a OsStr> {
    use std::os::unix::ffi::OsStrExt;
    let path = path.as_ref().as_os_str().as_bytes();
    if path.is_empty() {
        return None;
    } else if path.len() == 1 && path[0] == b'.' {
        // The path is exactly ".".
        return None;
    } else if path.last() == Some(&b'.') {
        // Any path ending in "." is rejected.
        // NOTE(review): std's Path::file_name returns Some("foo.") for
        // "foo." — confirm this stricter behavior is intended.
        return None;
    } else if path.len() >= 2 && &path[path.len() - 2..] == &b".."[..] {
        // The path ends with "..".
        return None;
    }
    // BUG FIX: the previous version initialized `last_slash` to 0 and then
    // sliced from `last_slash + 1`, which silently dropped the first byte
    // of any path containing no `/` at all (e.g. "foo" became "oo").
    // Compute the index just past the last slash, defaulting to the start
    // of the path when there is no slash.
    let start = path
        .iter()
        .rposition(|&b| b == b'/')
        .map(|i| i + 1)
        .unwrap_or(0);
    Some(OsStr::from_bytes(&path[start..]))
}
/// The final component of the path, if it is a normal file.
///
/// If the path terminates in ., .., or consists solely of a root of prefix,
/// file_name will return None.
///
/// On non-Unix platforms this defers to `std::path::Path::file_name`,
/// since the fast raw-byte handling above is Unix-only (see module docs).
#[cfg(not(unix))]
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
    path: &'a P,
) -> Option<&'a OsStr> {
    path.as_ref().file_name()
}

View File

@ -151,8 +151,8 @@ impl FileTypeDef {
/// Types is a file type matcher.
#[derive(Clone, Debug)]
pub struct Types {
selected: Option<glob::Set>,
negated: Option<glob::Set>,
selected: Option<glob::SetYesNo>,
negated: Option<glob::SetYesNo>,
has_selected: bool,
unmatched_pat: Pattern,
}
@ -165,8 +165,8 @@ impl Types {
/// If has_selected is true, then at least one file type was selected.
/// Therefore, any non-matches should be ignored.
fn new(
selected: Option<glob::Set>,
negated: Option<glob::Set>,
selected: Option<glob::SetYesNo>,
negated: Option<glob::SetYesNo>,
has_selected: bool,
) -> Types {
Types {
@ -268,7 +268,7 @@ impl TypesBuilder {
try!(bset.add_with(glob, &opts));
}
}
Some(try!(bset.build()))
Some(try!(bset.build_yesno()))
};
let negated_globs =
if self.negated.is_empty() {
@ -287,7 +287,7 @@ impl TypesBuilder {
try!(bset.add_with(glob, &opts));
}
}
Some(try!(bset.build()))
Some(try!(bset.build_yesno()))
};
Ok(Types::new(
selected_globs, negated_globs, !self.selected.is_empty()))