mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-01-19 05:49:14 +02:00
globset: remove use of unsafe
This commit removes, in retrospect, a silly use of `unsafe`. In particular, to extract a file name extension (distinct from how `std` implements it), we were transmuting an OsStr to its underlying WTF-8 byte representation and then searching that. This required `unsafe` and relied on an undocumented std API, so it was a bad choice to make, but everything gets sacrificed at the Alter of Performance. The thing I didn't seem to realize at the time was that: 1. On Unix, you can already get the raw byte representation in a manner that has zero cost. 2. On Windows, paths are already being encoded and copied every which way. So doing a UTF-8 check and, in rare cases (for invalid UTF-8), an extra copy, doesn't seem like that much more of an added expense. Thus, rewrite the extension extraction using safe APIs. On Unix, this should have identical performance characteristics as the previous implementation. On Windows, we do pay a higher cost in the UTF-8 check, but Windows is already paying a similar cost a few times over anyway.
This commit is contained in:
parent
3effea0b7c
commit
96ee4482cd
@ -1,4 +1,3 @@
|
|||||||
use std::ffi::{OsStr, OsString};
|
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::hash;
|
use std::hash;
|
||||||
use std::iter;
|
use std::iter;
|
||||||
@ -28,7 +27,7 @@ pub enum MatchStrategy {
|
|||||||
BasenameLiteral(String),
|
BasenameLiteral(String),
|
||||||
/// A pattern matches if and only if the file path's extension matches this
|
/// A pattern matches if and only if the file path's extension matches this
|
||||||
/// literal string.
|
/// literal string.
|
||||||
Extension(OsString),
|
Extension(String),
|
||||||
/// A pattern matches if and only if this prefix literal is a prefix of the
|
/// A pattern matches if and only if this prefix literal is a prefix of the
|
||||||
/// candidate file path.
|
/// candidate file path.
|
||||||
Prefix(String),
|
Prefix(String),
|
||||||
@ -47,7 +46,7 @@ pub enum MatchStrategy {
|
|||||||
/// extension. Note that this is a necessary but NOT sufficient criterion.
|
/// extension. Note that this is a necessary but NOT sufficient criterion.
|
||||||
/// Namely, if the extension matches, then a full regex search is still
|
/// Namely, if the extension matches, then a full regex search is still
|
||||||
/// required.
|
/// required.
|
||||||
RequiredExtension(OsString),
|
RequiredExtension(String),
|
||||||
/// A regex needs to be used for matching.
|
/// A regex needs to be used for matching.
|
||||||
Regex,
|
Regex,
|
||||||
}
|
}
|
||||||
@ -154,7 +153,7 @@ impl GlobStrategic {
|
|||||||
lit.as_bytes() == &*candidate.basename
|
lit.as_bytes() == &*candidate.basename
|
||||||
}
|
}
|
||||||
MatchStrategy::Extension(ref ext) => {
|
MatchStrategy::Extension(ref ext) => {
|
||||||
candidate.ext == ext
|
ext.as_bytes() == &*candidate.ext
|
||||||
}
|
}
|
||||||
MatchStrategy::Prefix(ref pre) => {
|
MatchStrategy::Prefix(ref pre) => {
|
||||||
starts_with(pre.as_bytes(), byte_path)
|
starts_with(pre.as_bytes(), byte_path)
|
||||||
@ -166,7 +165,8 @@ impl GlobStrategic {
|
|||||||
ends_with(suffix.as_bytes(), byte_path)
|
ends_with(suffix.as_bytes(), byte_path)
|
||||||
}
|
}
|
||||||
MatchStrategy::RequiredExtension(ref ext) => {
|
MatchStrategy::RequiredExtension(ref ext) => {
|
||||||
candidate.ext == ext && self.re.is_match(byte_path)
|
let ext = ext.as_bytes();
|
||||||
|
&*candidate.ext == ext && self.re.is_match(byte_path)
|
||||||
}
|
}
|
||||||
MatchStrategy::Regex => self.re.is_match(byte_path),
|
MatchStrategy::Regex => self.re.is_match(byte_path),
|
||||||
}
|
}
|
||||||
@ -295,7 +295,7 @@ impl Glob {
|
|||||||
/// std::path::Path::extension returns. Namely, this extension includes
|
/// std::path::Path::extension returns. Namely, this extension includes
|
||||||
/// the '.'. Also, paths like `.rs` are considered to have an extension
|
/// the '.'. Also, paths like `.rs` are considered to have an extension
|
||||||
/// of `.rs`.
|
/// of `.rs`.
|
||||||
fn ext(&self) -> Option<OsString> {
|
fn ext(&self) -> Option<String> {
|
||||||
if self.opts.case_insensitive {
|
if self.opts.case_insensitive {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
@ -319,11 +319,11 @@ impl Glob {
|
|||||||
Some(&Token::Literal('.')) => {}
|
Some(&Token::Literal('.')) => {}
|
||||||
_ => return None,
|
_ => return None,
|
||||||
}
|
}
|
||||||
let mut lit = OsStr::new(".").to_os_string();
|
let mut lit = ".".to_string();
|
||||||
for t in self.tokens[start + 2..].iter() {
|
for t in self.tokens[start + 2..].iter() {
|
||||||
match *t {
|
match *t {
|
||||||
Token::Literal('.') | Token::Literal('/') => return None,
|
Token::Literal('.') | Token::Literal('/') => return None,
|
||||||
Token::Literal(c) => lit.push(c.to_string()),
|
Token::Literal(c) => lit.push(c),
|
||||||
_ => return None,
|
_ => return None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -337,7 +337,7 @@ impl Glob {
|
|||||||
/// This is like `ext`, but returns an extension even if it isn't sufficent
|
/// This is like `ext`, but returns an extension even if it isn't sufficent
|
||||||
/// to imply a match. Namely, if an extension is returned, then it is
|
/// to imply a match. Namely, if an extension is returned, then it is
|
||||||
/// necessary but not sufficient for a match.
|
/// necessary but not sufficient for a match.
|
||||||
fn required_ext(&self) -> Option<OsString> {
|
fn required_ext(&self) -> Option<String> {
|
||||||
if self.opts.case_insensitive {
|
if self.opts.case_insensitive {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
@ -360,7 +360,7 @@ impl Glob {
|
|||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
ext.reverse();
|
ext.reverse();
|
||||||
Some(OsString::from(ext.into_iter().collect::<String>()))
|
Some(ext.into_iter().collect())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -927,8 +927,6 @@ fn ends_with(needle: &[u8], haystack: &[u8]) -> bool {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use std::ffi::{OsStr, OsString};
|
|
||||||
|
|
||||||
use {GlobSetBuilder, ErrorKind};
|
use {GlobSetBuilder, ErrorKind};
|
||||||
use super::{Glob, GlobBuilder, Token};
|
use super::{Glob, GlobBuilder, Token};
|
||||||
use super::Token::*;
|
use super::Token::*;
|
||||||
@ -1021,7 +1019,6 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn s(string: &str) -> String { string.to_string() }
|
fn s(string: &str) -> String { string.to_string() }
|
||||||
fn os(string: &str) -> OsString { OsStr::new(string).to_os_string() }
|
|
||||||
|
|
||||||
fn class(s: char, e: char) -> Token {
|
fn class(s: char, e: char) -> Token {
|
||||||
Class { negated: false, ranges: vec![(s, e)] }
|
Class { negated: false, ranges: vec![(s, e)] }
|
||||||
@ -1319,19 +1316,19 @@ mod tests {
|
|||||||
Literal('f'), Literal('o'), ZeroOrMore, Literal('o'),
|
Literal('f'), Literal('o'), ZeroOrMore, Literal('o'),
|
||||||
]), SLASHLIT);
|
]), SLASHLIT);
|
||||||
|
|
||||||
ext!(extract_ext1, "**/*.rs", Some(os(".rs")));
|
ext!(extract_ext1, "**/*.rs", Some(s(".rs")));
|
||||||
ext!(extract_ext2, "**/*.rs.bak", None);
|
ext!(extract_ext2, "**/*.rs.bak", None);
|
||||||
ext!(extract_ext3, "*.rs", Some(os(".rs")));
|
ext!(extract_ext3, "*.rs", Some(s(".rs")));
|
||||||
ext!(extract_ext4, "a*.rs", None);
|
ext!(extract_ext4, "a*.rs", None);
|
||||||
ext!(extract_ext5, "/*.c", None);
|
ext!(extract_ext5, "/*.c", None);
|
||||||
ext!(extract_ext6, "*.c", None, SLASHLIT);
|
ext!(extract_ext6, "*.c", None, SLASHLIT);
|
||||||
ext!(extract_ext7, "*.c", Some(os(".c")));
|
ext!(extract_ext7, "*.c", Some(s(".c")));
|
||||||
|
|
||||||
required_ext!(extract_req_ext1, "*.rs", Some(os(".rs")));
|
required_ext!(extract_req_ext1, "*.rs", Some(s(".rs")));
|
||||||
required_ext!(extract_req_ext2, "/foo/bar/*.rs", Some(os(".rs")));
|
required_ext!(extract_req_ext2, "/foo/bar/*.rs", Some(s(".rs")));
|
||||||
required_ext!(extract_req_ext3, "/foo/bar/*.rs", Some(os(".rs")));
|
required_ext!(extract_req_ext3, "/foo/bar/*.rs", Some(s(".rs")));
|
||||||
required_ext!(extract_req_ext4, "/foo/bar/.rs", Some(os(".rs")));
|
required_ext!(extract_req_ext4, "/foo/bar/.rs", Some(s(".rs")));
|
||||||
required_ext!(extract_req_ext5, ".rs", Some(os(".rs")));
|
required_ext!(extract_req_ext5, ".rs", Some(s(".rs")));
|
||||||
required_ext!(extract_req_ext6, "./rs", None);
|
required_ext!(extract_req_ext6, "./rs", None);
|
||||||
required_ext!(extract_req_ext7, "foo", None);
|
required_ext!(extract_req_ext7, "foo", None);
|
||||||
required_ext!(extract_req_ext8, ".foo/", None);
|
required_ext!(extract_req_ext8, ".foo/", None);
|
||||||
|
@ -108,7 +108,7 @@ extern crate regex;
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::{BTreeMap, HashMap};
|
use std::collections::{BTreeMap, HashMap};
|
||||||
use std::error::Error as StdError;
|
use std::error::Error as StdError;
|
||||||
use std::ffi::{OsStr, OsString};
|
use std::ffi::OsStr;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::hash;
|
use std::hash;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
@ -458,7 +458,7 @@ impl GlobSetBuilder {
|
|||||||
pub struct Candidate<'a> {
|
pub struct Candidate<'a> {
|
||||||
path: Cow<'a, [u8]>,
|
path: Cow<'a, [u8]>,
|
||||||
basename: Cow<'a, [u8]>,
|
basename: Cow<'a, [u8]>,
|
||||||
ext: &'a OsStr,
|
ext: Cow<'a, [u8]>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Candidate<'a> {
|
impl<'a> Candidate<'a> {
|
||||||
@ -469,7 +469,7 @@ impl<'a> Candidate<'a> {
|
|||||||
Candidate {
|
Candidate {
|
||||||
path: normalize_path(path_bytes(path)),
|
path: normalize_path(path_bytes(path)),
|
||||||
basename: os_str_bytes(basename),
|
basename: os_str_bytes(basename),
|
||||||
ext: file_name_ext(basename).unwrap_or(OsStr::new("")),
|
ext: file_name_ext(basename).unwrap_or(Cow::Borrowed(b"")),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -584,22 +584,22 @@ impl BasenameLiteralStrategy {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
struct ExtensionStrategy(HashMap<OsString, Vec<usize>, Fnv>);
|
struct ExtensionStrategy(HashMap<Vec<u8>, Vec<usize>, Fnv>);
|
||||||
|
|
||||||
impl ExtensionStrategy {
|
impl ExtensionStrategy {
|
||||||
fn new() -> ExtensionStrategy {
|
fn new() -> ExtensionStrategy {
|
||||||
ExtensionStrategy(HashMap::with_hasher(Fnv::default()))
|
ExtensionStrategy(HashMap::with_hasher(Fnv::default()))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn add(&mut self, global_index: usize, ext: OsString) {
|
fn add(&mut self, global_index: usize, ext: String) {
|
||||||
self.0.entry(ext).or_insert(vec![]).push(global_index);
|
self.0.entry(ext.into_bytes()).or_insert(vec![]).push(global_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_match(&self, candidate: &Candidate) -> bool {
|
fn is_match(&self, candidate: &Candidate) -> bool {
|
||||||
if candidate.ext.is_empty() {
|
if candidate.ext.is_empty() {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
self.0.contains_key(candidate.ext)
|
self.0.contains_key(&*candidate.ext)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(never)]
|
#[inline(never)]
|
||||||
@ -607,7 +607,7 @@ impl ExtensionStrategy {
|
|||||||
if candidate.ext.is_empty() {
|
if candidate.ext.is_empty() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if let Some(hits) = self.0.get(candidate.ext) {
|
if let Some(hits) = self.0.get(&*candidate.ext) {
|
||||||
matches.extend(hits);
|
matches.extend(hits);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -670,14 +670,14 @@ impl SuffixStrategy {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
struct RequiredExtensionStrategy(HashMap<OsString, Vec<(usize, Regex)>, Fnv>);
|
struct RequiredExtensionStrategy(HashMap<Vec<u8>, Vec<(usize, Regex)>, Fnv>);
|
||||||
|
|
||||||
impl RequiredExtensionStrategy {
|
impl RequiredExtensionStrategy {
|
||||||
fn is_match(&self, candidate: &Candidate) -> bool {
|
fn is_match(&self, candidate: &Candidate) -> bool {
|
||||||
if candidate.ext.is_empty() {
|
if candidate.ext.is_empty() {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
match self.0.get(candidate.ext) {
|
match self.0.get(&*candidate.ext) {
|
||||||
None => false,
|
None => false,
|
||||||
Some(regexes) => {
|
Some(regexes) => {
|
||||||
for &(_, ref re) in regexes {
|
for &(_, ref re) in regexes {
|
||||||
@ -695,7 +695,7 @@ impl RequiredExtensionStrategy {
|
|||||||
if candidate.ext.is_empty() {
|
if candidate.ext.is_empty() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if let Some(regexes) = self.0.get(candidate.ext) {
|
if let Some(regexes) = self.0.get(&*candidate.ext) {
|
||||||
for &(global_index, ref re) in regexes {
|
for &(global_index, ref re) in regexes {
|
||||||
if re.is_match(&*candidate.path) {
|
if re.is_match(&*candidate.path) {
|
||||||
matches.push(global_index);
|
matches.push(global_index);
|
||||||
@ -775,7 +775,7 @@ impl MultiStrategyBuilder {
|
|||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
struct RequiredExtensionStrategyBuilder(
|
struct RequiredExtensionStrategyBuilder(
|
||||||
HashMap<OsString, Vec<(usize, String)>>,
|
HashMap<Vec<u8>, Vec<(usize, String)>>,
|
||||||
);
|
);
|
||||||
|
|
||||||
impl RequiredExtensionStrategyBuilder {
|
impl RequiredExtensionStrategyBuilder {
|
||||||
@ -783,8 +783,11 @@ impl RequiredExtensionStrategyBuilder {
|
|||||||
RequiredExtensionStrategyBuilder(HashMap::new())
|
RequiredExtensionStrategyBuilder(HashMap::new())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn add(&mut self, global_index: usize, ext: OsString, regex: String) {
|
fn add(&mut self, global_index: usize, ext: String, regex: String) {
|
||||||
self.0.entry(ext).or_insert(vec![]).push((global_index, regex));
|
self.0
|
||||||
|
.entry(ext.into_bytes())
|
||||||
|
.or_insert(vec![])
|
||||||
|
.push((global_index, regex));
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build(self) -> Result<RequiredExtensionStrategy, Error> {
|
fn build(self) -> Result<RequiredExtensionStrategy, Error> {
|
||||||
|
@ -54,34 +54,28 @@ pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
|
|||||||
/// a pattern like `*.rs` is obviously trying to match files with a `rs`
|
/// a pattern like `*.rs` is obviously trying to match files with a `rs`
|
||||||
/// extension, but it also matches files like `.rs`, which doesn't have an
|
/// extension, but it also matches files like `.rs`, which doesn't have an
|
||||||
/// extension according to std::path::Path::extension.
|
/// extension according to std::path::Path::extension.
|
||||||
pub fn file_name_ext(name: &OsStr) -> Option<&OsStr> {
|
pub fn file_name_ext(name: &OsStr) -> Option<Cow<[u8]>> {
|
||||||
// Yes, these functions are awful, and yes, we are completely violating
|
|
||||||
// the abstraction barrier of std::ffi. The barrier we're violating is
|
|
||||||
// that an OsStr's encoding is *ASCII compatible*. While this is obviously
|
|
||||||
// true on Unix systems, it's also true on Windows because an OsStr uses
|
|
||||||
// WTF-8 internally: https://simonsapin.github.io/wtf-8/
|
|
||||||
//
|
|
||||||
// We should consider doing the same for the other path utility functions.
|
|
||||||
// Right now, we don't break any barriers, but Windows users are paying
|
|
||||||
// for it.
|
|
||||||
//
|
|
||||||
// Got any better ideas that don't cost anything? Hit me up. ---AG
|
|
||||||
unsafe fn os_str_as_u8_slice(s: &OsStr) -> &[u8] {
|
|
||||||
::std::mem::transmute(s)
|
|
||||||
}
|
|
||||||
unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr {
|
|
||||||
::std::mem::transmute(s)
|
|
||||||
}
|
|
||||||
if name.is_empty() {
|
if name.is_empty() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let name = unsafe { os_str_as_u8_slice(name) };
|
let name = os_str_bytes(name);
|
||||||
for (i, &b) in name.iter().enumerate().rev() {
|
let last_dot_at = {
|
||||||
if b == b'.' {
|
let result = name
|
||||||
return Some(unsafe { u8_slice_as_os_str(&name[i..]) });
|
.iter().enumerate().rev()
|
||||||
|
.find(|&(_, &b)| b == b'.')
|
||||||
|
.map(|(i, _)| i);
|
||||||
|
match result {
|
||||||
|
None => return None,
|
||||||
|
Some(i) => i,
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
Some(match name {
|
||||||
|
Cow::Borrowed(name) => Cow::Borrowed(&name[last_dot_at..]),
|
||||||
|
Cow::Owned(mut name) => {
|
||||||
|
name.drain(..last_dot_at);
|
||||||
|
Cow::Owned(name)
|
||||||
}
|
}
|
||||||
None
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return raw bytes of a path, transcoded to UTF-8 if necessary.
|
/// Return raw bytes of a path, transcoded to UTF-8 if necessary.
|
||||||
@ -144,7 +138,7 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn $name() {
|
fn $name() {
|
||||||
let got = file_name_ext(OsStr::new($file_name));
|
let got = file_name_ext(OsStr::new($file_name));
|
||||||
assert_eq!($ext.map(OsStr::new), got);
|
assert_eq!($ext.map(|s| Cow::Borrowed(s.as_bytes())), got);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user