1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-02-04 06:08:39 +02:00

globset: use bstr

This simplifies the various path-related functions and pushes more
platform-dependent code down into bstr. This likely also makes things a bit
more efficient on Windows, since we now only do a single UTF-8 check for each
file path.
This commit is contained in:
Andrew Gallant 2019-04-04 18:33:41 -04:00
parent b1c064d5af
commit 8e1d40ed7d
No known key found for this signature in database
GPG Key ID: B2E3A4923F8B0D44
4 changed files with 60 additions and 94 deletions

View File

@ -20,10 +20,10 @@ bench = false
[dependencies] [dependencies]
aho-corasick = "0.7.3" aho-corasick = "0.7.3"
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
fnv = "1.0.6" fnv = "1.0.6"
log = "0.4.5" log = "0.4.5"
memchr = "2.1.0" regex = "1.1.5"
regex = "1.1.0"
[dev-dependencies] [dev-dependencies]
glob = "0.2.11" glob = "0.2.11"

View File

@ -120,7 +120,7 @@ impl GlobMatcher {
/// Tests whether the given path matches this pattern or not. /// Tests whether the given path matches this pattern or not.
pub fn is_match_candidate(&self, path: &Candidate) -> bool { pub fn is_match_candidate(&self, path: &Candidate) -> bool {
self.re.is_match(&path.path) self.re.is_match(path.path.as_bytes())
} }
} }
@ -145,7 +145,7 @@ impl GlobStrategic {
/// Tests whether the given path matches this pattern or not. /// Tests whether the given path matches this pattern or not.
fn is_match_candidate(&self, candidate: &Candidate) -> bool { fn is_match_candidate(&self, candidate: &Candidate) -> bool {
let byte_path = &*candidate.path; let byte_path = candidate.path.as_bytes();
match self.strategy { match self.strategy {
MatchStrategy::Literal(ref lit) => lit.as_bytes() == byte_path, MatchStrategy::Literal(ref lit) => lit.as_bytes() == byte_path,

View File

@ -104,27 +104,25 @@ or to enable case insensitive matching.
#![deny(missing_docs)] #![deny(missing_docs)]
extern crate aho_corasick; extern crate aho_corasick;
extern crate bstr;
extern crate fnv; extern crate fnv;
#[macro_use] #[macro_use]
extern crate log; extern crate log;
extern crate memchr;
extern crate regex; extern crate regex;
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap}; use std::collections::{BTreeMap, HashMap};
use std::error::Error as StdError; use std::error::Error as StdError;
use std::ffi::OsStr;
use std::fmt; use std::fmt;
use std::hash; use std::hash;
use std::path::Path; use std::path::Path;
use std::str; use std::str;
use aho_corasick::AhoCorasick; use aho_corasick::AhoCorasick;
use bstr::{B, BStr, BString};
use regex::bytes::{Regex, RegexBuilder, RegexSet}; use regex::bytes::{Regex, RegexBuilder, RegexSet};
use pathutil::{ use pathutil::{file_name, file_name_ext, normalize_path};
file_name, file_name_ext, normalize_path, os_str_bytes, path_bytes,
};
use glob::MatchStrategy; use glob::MatchStrategy;
pub use glob::{Glob, GlobBuilder, GlobMatcher}; pub use glob::{Glob, GlobBuilder, GlobMatcher};
@ -489,24 +487,25 @@ impl GlobSetBuilder {
/// path against multiple globs or sets of globs. /// path against multiple globs or sets of globs.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct Candidate<'a> { pub struct Candidate<'a> {
path: Cow<'a, [u8]>, path: Cow<'a, BStr>,
basename: Cow<'a, [u8]>, basename: Cow<'a, BStr>,
ext: Cow<'a, [u8]>, ext: Cow<'a, BStr>,
} }
impl<'a> Candidate<'a> { impl<'a> Candidate<'a> {
/// Create a new candidate for matching from the given path. /// Create a new candidate for matching from the given path.
pub fn new<P: AsRef<Path> + ?Sized>(path: &'a P) -> Candidate<'a> { pub fn new<P: AsRef<Path> + ?Sized>(path: &'a P) -> Candidate<'a> {
let path = path.as_ref(); let path = normalize_path(BString::from_path_lossy(path.as_ref()));
let basename = file_name(path).unwrap_or(OsStr::new("")); let basename = file_name(&path).unwrap_or(Cow::Borrowed(B("")));
let ext = file_name_ext(&basename).unwrap_or(Cow::Borrowed(B("")));
Candidate { Candidate {
path: normalize_path(path_bytes(path)), path: path,
basename: os_str_bytes(basename), basename: basename,
ext: file_name_ext(basename).unwrap_or(Cow::Borrowed(b"")), ext: ext,
} }
} }
fn path_prefix(&self, max: usize) -> &[u8] { fn path_prefix(&self, max: usize) -> &BStr {
if self.path.len() <= max { if self.path.len() <= max {
&*self.path &*self.path
} else { } else {
@ -514,7 +513,7 @@ impl<'a> Candidate<'a> {
} }
} }
fn path_suffix(&self, max: usize) -> &[u8] { fn path_suffix(&self, max: usize) -> &BStr {
if self.path.len() <= max { if self.path.len() <= max {
&*self.path &*self.path
} else { } else {
@ -575,12 +574,12 @@ impl LiteralStrategy {
} }
fn is_match(&self, candidate: &Candidate) -> bool { fn is_match(&self, candidate: &Candidate) -> bool {
self.0.contains_key(&*candidate.path) self.0.contains_key(candidate.path.as_bytes())
} }
#[inline(never)] #[inline(never)]
fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) { fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) {
if let Some(hits) = self.0.get(&*candidate.path) { if let Some(hits) = self.0.get(candidate.path.as_bytes()) {
matches.extend(hits); matches.extend(hits);
} }
} }
@ -602,7 +601,7 @@ impl BasenameLiteralStrategy {
if candidate.basename.is_empty() { if candidate.basename.is_empty() {
return false; return false;
} }
self.0.contains_key(&*candidate.basename) self.0.contains_key(candidate.basename.as_bytes())
} }
#[inline(never)] #[inline(never)]
@ -610,7 +609,7 @@ impl BasenameLiteralStrategy {
if candidate.basename.is_empty() { if candidate.basename.is_empty() {
return; return;
} }
if let Some(hits) = self.0.get(&*candidate.basename) { if let Some(hits) = self.0.get(candidate.basename.as_bytes()) {
matches.extend(hits); matches.extend(hits);
} }
} }
@ -632,7 +631,7 @@ impl ExtensionStrategy {
if candidate.ext.is_empty() { if candidate.ext.is_empty() {
return false; return false;
} }
self.0.contains_key(&*candidate.ext) self.0.contains_key(candidate.ext.as_bytes())
} }
#[inline(never)] #[inline(never)]
@ -640,7 +639,7 @@ impl ExtensionStrategy {
if candidate.ext.is_empty() { if candidate.ext.is_empty() {
return; return;
} }
if let Some(hits) = self.0.get(&*candidate.ext) { if let Some(hits) = self.0.get(candidate.ext.as_bytes()) {
matches.extend(hits); matches.extend(hits);
} }
} }
@ -710,11 +709,11 @@ impl RequiredExtensionStrategy {
if candidate.ext.is_empty() { if candidate.ext.is_empty() {
return false; return false;
} }
match self.0.get(&*candidate.ext) { match self.0.get(candidate.ext.as_bytes()) {
None => false, None => false,
Some(regexes) => { Some(regexes) => {
for &(_, ref re) in regexes { for &(_, ref re) in regexes {
if re.is_match(&*candidate.path) { if re.is_match(candidate.path.as_bytes()) {
return true; return true;
} }
} }
@ -728,9 +727,9 @@ impl RequiredExtensionStrategy {
if candidate.ext.is_empty() { if candidate.ext.is_empty() {
return; return;
} }
if let Some(regexes) = self.0.get(&*candidate.ext) { if let Some(regexes) = self.0.get(candidate.ext.as_bytes()) {
for &(global_index, ref re) in regexes { for &(global_index, ref re) in regexes {
if re.is_match(&*candidate.path) { if re.is_match(candidate.path.as_bytes()) {
matches.push(global_index); matches.push(global_index);
} }
} }
@ -746,11 +745,11 @@ struct RegexSetStrategy {
impl RegexSetStrategy { impl RegexSetStrategy {
fn is_match(&self, candidate: &Candidate) -> bool { fn is_match(&self, candidate: &Candidate) -> bool {
self.matcher.is_match(&*candidate.path) self.matcher.is_match(candidate.path.as_bytes())
} }
fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) { fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) {
for i in self.matcher.matches(&*candidate.path) { for i in self.matcher.matches(candidate.path.as_bytes()) {
matches.push(self.map[i]); matches.push(self.map[i]);
} }
} }

View File

@ -1,41 +1,30 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::ffi::OsStr;
use std::path::Path; use bstr::BStr;
/// The final component of the path, if it is a normal file. /// The final component of the path, if it is a normal file.
/// ///
/// If the path terminates in ., .., or consists solely of a root or prefix, /// If the path terminates in ., .., or consists solely of a root or prefix,
/// file_name will return None. /// file_name will return None.
#[cfg(unix)] pub fn file_name<'a>(path: &Cow<'a, BStr>) -> Option<Cow<'a, BStr>> {
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
path: &'a P,
) -> Option<&'a OsStr> {
use std::os::unix::ffi::OsStrExt;
use memchr::memrchr;
let path = path.as_ref().as_os_str().as_bytes();
if path.is_empty() { if path.is_empty() {
return None; return None;
} else if path.len() == 1 && path[0] == b'.' { } else if path.len() == 1 && path[0] == b'.' {
return None; return None;
} else if path.last() == Some(&b'.') { } else if path.last() == Some(b'.') {
return None; return None;
} else if path.len() >= 2 && &path[path.len() - 2..] == &b".."[..] { } else if path.len() >= 2 && &path[path.len() - 2..] == ".." {
return None; return None;
} }
let last_slash = memrchr(b'/', path).map(|i| i + 1).unwrap_or(0); let last_slash = path.rfind_byte(b'/').map(|i| i + 1).unwrap_or(0);
Some(OsStr::from_bytes(&path[last_slash..])) Some(match *path {
} Cow::Borrowed(path) => Cow::Borrowed(&path[last_slash..]),
Cow::Owned(ref path) => {
/// The final component of the path, if it is a normal file. let mut path = path.clone();
/// path.drain_bytes(..last_slash);
/// If the path terminates in ., .., or consists solely of a root of prefix, Cow::Owned(path)
/// file_name will return None. }
#[cfg(not(unix))] })
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
path: &'a P,
) -> Option<&'a OsStr> {
path.as_ref().file_name()
} }
/// Return a file extension given a path's file name. /// Return a file extension given a path's file name.
@ -54,59 +43,34 @@ pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
/// a pattern like `*.rs` is obviously trying to match files with a `rs` /// a pattern like `*.rs` is obviously trying to match files with a `rs`
/// extension, but it also matches files like `.rs`, which doesn't have an /// extension, but it also matches files like `.rs`, which doesn't have an
/// extension according to std::path::Path::extension. /// extension according to std::path::Path::extension.
pub fn file_name_ext(name: &OsStr) -> Option<Cow<[u8]>> { pub fn file_name_ext<'a>(name: &Cow<'a, BStr>) -> Option<Cow<'a, BStr>> {
if name.is_empty() { if name.is_empty() {
return None; return None;
} }
let name = os_str_bytes(name);
let last_dot_at = { let last_dot_at = {
let result = name let result = name
.iter().enumerate().rev() .bytes().enumerate().rev()
.find(|&(_, &b)| b == b'.') .find(|&(_, b)| b == b'.')
.map(|(i, _)| i); .map(|(i, _)| i);
match result { match result {
None => return None, None => return None,
Some(i) => i, Some(i) => i,
} }
}; };
Some(match name { Some(match *name {
Cow::Borrowed(name) => Cow::Borrowed(&name[last_dot_at..]), Cow::Borrowed(name) => Cow::Borrowed(&name[last_dot_at..]),
Cow::Owned(mut name) => { Cow::Owned(ref name) => {
name.drain(..last_dot_at); let mut name = name.clone();
name.drain_bytes(..last_dot_at);
Cow::Owned(name) Cow::Owned(name)
} }
}) })
} }
/// Return raw bytes of a path, transcoded to UTF-8 if necessary.
pub fn path_bytes(path: &Path) -> Cow<[u8]> {
os_str_bytes(path.as_os_str())
}
/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8.
#[cfg(unix)]
pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
use std::os::unix::ffi::OsStrExt;
Cow::Borrowed(s.as_bytes())
}
/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8.
#[cfg(not(unix))]
pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
// TODO(burntsushi): On Windows, OS strings are WTF-8, which is a superset
// of UTF-8, so even if we could get at the raw bytes, they wouldn't
// be useful. We *must* convert to UTF-8 before doing path matching.
// Unfortunate, but necessary.
match s.to_string_lossy() {
Cow::Owned(s) => Cow::Owned(s.into_bytes()),
Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
}
}
/// Normalizes a path to use `/` as a separator everywhere, even on platforms /// Normalizes a path to use `/` as a separator everywhere, even on platforms
/// that recognize other characters as separators. /// that recognize other characters as separators.
#[cfg(unix)] #[cfg(unix)]
pub fn normalize_path(path: Cow<[u8]>) -> Cow<[u8]> { pub fn normalize_path(path: Cow<BStr>) -> Cow<BStr> {
// UNIX only uses /, so we're good. // UNIX only uses /, so we're good.
path path
} }
@ -114,7 +78,7 @@ pub fn normalize_path(path: Cow<[u8]>) -> Cow<[u8]> {
/// Normalizes a path to use `/` as a separator everywhere, even on platforms /// Normalizes a path to use `/` as a separator everywhere, even on platforms
/// that recognize other characters as separators. /// that recognize other characters as separators.
#[cfg(not(unix))] #[cfg(not(unix))]
pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> { pub fn normalize_path(mut path: Cow<BStr>) -> Cow<BStr> {
use std::path::is_separator; use std::path::is_separator;
for i in 0..path.len() { for i in 0..path.len() {
@ -129,7 +93,8 @@ pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::borrow::Cow; use std::borrow::Cow;
use std::ffi::OsStr;
use bstr::{B, BString};
use super::{file_name_ext, normalize_path}; use super::{file_name_ext, normalize_path};
@ -137,8 +102,9 @@ mod tests {
($name:ident, $file_name:expr, $ext:expr) => { ($name:ident, $file_name:expr, $ext:expr) => {
#[test] #[test]
fn $name() { fn $name() {
let got = file_name_ext(OsStr::new($file_name)); let bs = BString::from($file_name);
assert_eq!($ext.map(|s| Cow::Borrowed(s.as_bytes())), got); let got = file_name_ext(&Cow::Owned(bs));
assert_eq!($ext.map(|s| Cow::Borrowed(B(s))), got);
} }
}; };
} }
@ -153,7 +119,8 @@ mod tests {
($name:ident, $path:expr, $expected:expr) => { ($name:ident, $path:expr, $expected:expr) => {
#[test] #[test]
fn $name() { fn $name() {
let got = normalize_path(Cow::Owned($path.to_vec())); let bs = BString::from_slice($path);
let got = normalize_path(Cow::Owned(bs));
assert_eq!($expected.to_vec(), got.into_owned()); assert_eq!($expected.to_vec(), got.into_owned());
} }
}; };