1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-03-17 20:28:03 +02:00

globset: polishing

This brings the code in line with my current style. It also inlines the
dozen or so lines of code for FNV hashing instead of bringing in a
micro-crate for it. Finally, it drops the dependency on regex in favor
of using regex-syntax and regex-automata directly.
This commit is contained in:
Andrew Gallant 2023-09-26 15:01:20 -04:00
parent 0951820f63
commit 7f45640401
6 changed files with 179 additions and 152 deletions

10
Cargo.lock generated
View File

@ -130,12 +130,6 @@ dependencies = [
"encoding_rs", "encoding_rs",
] ]
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]] [[package]]
name = "glob" name = "glob"
version = "0.3.1" version = "0.3.1"
@ -148,11 +142,11 @@ version = "0.4.13"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"bstr", "bstr",
"fnv",
"glob", "glob",
"lazy_static", "lazy_static",
"log", "log",
"regex", "regex-automata",
"regex-syntax",
"serde", "serde",
"serde_json", "serde_json",
] ]

View File

@ -13,24 +13,32 @@ repository = "https://github.com/BurntSushi/ripgrep/tree/master/crates/globset"
readme = "README.md" readme = "README.md"
keywords = ["regex", "glob", "multiple", "set", "pattern"] keywords = ["regex", "glob", "multiple", "set", "pattern"]
license = "Unlicense OR MIT" license = "Unlicense OR MIT"
edition = "2018" edition = "2021"
[lib] [lib]
name = "globset" name = "globset"
bench = false bench = false
[dependencies] [dependencies]
aho-corasick = "1.0.2" aho-corasick = "1.1.1"
bstr = { version = "1.6.0", default-features = false, features = ["std"] } bstr = { version = "1.6.2", default-features = false, features = ["std"] }
fnv = "1.0.6" log = { version = "0.4.20", optional = true }
log = { version = "0.4.5", optional = true } serde = { version = "1.0.188", optional = true }
regex = { version = "1.8.3", default-features = false, features = ["perf", "std"] }
serde = { version = "1.0.104", optional = true } [dependencies.regex-syntax]
version = "0.7.5"
default-features = false
features = ["std"]
[dependencies.regex-automata]
version = "0.3.8"
default-features = false
features = ["std", "perf", "syntax", "meta", "nfa", "hybrid"]
[dev-dependencies] [dev-dependencies]
glob = "0.3.0" glob = "0.3.1"
lazy_static = "1" lazy_static = "1"
serde_json = "1.0.45" serde_json = "1.0.107"
[features] [features]
default = ["log"] default = ["log"]

30
crates/globset/src/fnv.rs Normal file
View File

@ -0,0 +1,30 @@
/// A `HashMap` alias whose hashing is done with the FNV hasher below.
pub(crate) type HashMap<K, V> =
    std::collections::HashMap<K, V, std::hash::BuildHasherDefault<Hasher>>;

/// A 64-bit Fowler–Noll–Vo (FNV-1a) hasher.
///
/// The state starts at the FNV offset basis; each input byte is folded in
/// by XOR followed by a wrapping multiply with the FNV prime.
pub(crate) struct Hasher(u64);

impl Hasher {
    /// The 64-bit FNV offset basis (initial state).
    const OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    /// The 64-bit FNV prime.
    const PRIME: u64 = 0x100000001b3;
}

impl Default for Hasher {
    fn default() -> Hasher {
        Hasher(Self::OFFSET_BASIS)
    }
}

impl std::hash::Hasher for Hasher {
    fn finish(&self) -> u64 {
        self.0
    }

    fn write(&mut self, bytes: &[u8]) {
        for &b in bytes {
            // FNV-1a: XOR the byte in first, then multiply by the prime.
            self.0 ^= u64::from(b);
            self.0 = self.0.wrapping_mul(Self::PRIME);
        }
    }
}

View File

@ -1,12 +1,6 @@
use std::fmt;
use std::hash;
use std::iter;
use std::ops::{Deref, DerefMut};
use std::path::{is_separator, Path}; use std::path::{is_separator, Path};
use std::str;
use regex; use regex_automata::meta::Regex;
use regex::bytes::Regex;
use crate::{new_regex, Candidate, Error, ErrorKind}; use crate::{new_regex, Candidate, Error, ErrorKind};
@ -18,7 +12,7 @@ use crate::{new_regex, Candidate, Error, ErrorKind};
/// possible to test whether any of those patterns matches by looking up a /// possible to test whether any of those patterns matches by looking up a
/// file path's extension in a hash table. /// file path's extension in a hash table.
#[derive(Clone, Debug, Eq, PartialEq)] #[derive(Clone, Debug, Eq, PartialEq)]
pub enum MatchStrategy { pub(crate) enum MatchStrategy {
/// A pattern matches if and only if the entire file path matches this /// A pattern matches if and only if the entire file path matches this
/// literal string. /// literal string.
Literal(String), Literal(String),
@ -53,7 +47,7 @@ pub enum MatchStrategy {
impl MatchStrategy { impl MatchStrategy {
/// Returns a matching strategy for the given pattern. /// Returns a matching strategy for the given pattern.
pub fn new(pat: &Glob) -> MatchStrategy { pub(crate) fn new(pat: &Glob) -> MatchStrategy {
if let Some(lit) = pat.basename_literal() { if let Some(lit) = pat.basename_literal() {
MatchStrategy::BasenameLiteral(lit) MatchStrategy::BasenameLiteral(lit)
} else if let Some(lit) = pat.literal() { } else if let Some(lit) = pat.literal() {
@ -63,7 +57,7 @@ impl MatchStrategy {
} else if let Some(prefix) = pat.prefix() { } else if let Some(prefix) = pat.prefix() {
MatchStrategy::Prefix(prefix) MatchStrategy::Prefix(prefix)
} else if let Some((suffix, component)) = pat.suffix() { } else if let Some((suffix, component)) = pat.suffix() {
MatchStrategy::Suffix { suffix: suffix, component: component } MatchStrategy::Suffix { suffix, component }
} else if let Some(ext) = pat.required_ext() { } else if let Some(ext) = pat.required_ext() {
MatchStrategy::RequiredExtension(ext) MatchStrategy::RequiredExtension(ext)
} else { } else {
@ -90,20 +84,20 @@ impl PartialEq for Glob {
} }
} }
impl hash::Hash for Glob { impl std::hash::Hash for Glob {
fn hash<H: hash::Hasher>(&self, state: &mut H) { fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.glob.hash(state); self.glob.hash(state);
self.opts.hash(state); self.opts.hash(state);
} }
} }
impl fmt::Display for Glob { impl std::fmt::Display for Glob {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.glob.fmt(f) self.glob.fmt(f)
} }
} }
impl str::FromStr for Glob { impl std::str::FromStr for Glob {
type Err = Error; type Err = Error;
fn from_str(glob: &str) -> Result<Self, Self::Err> { fn from_str(glob: &str) -> Result<Self, Self::Err> {
@ -227,14 +221,14 @@ impl GlobOptions {
#[derive(Clone, Debug, Default, Eq, PartialEq)] #[derive(Clone, Debug, Default, Eq, PartialEq)]
struct Tokens(Vec<Token>); struct Tokens(Vec<Token>);
impl Deref for Tokens { impl std::ops::Deref for Tokens {
type Target = Vec<Token>; type Target = Vec<Token>;
fn deref(&self) -> &Vec<Token> { fn deref(&self) -> &Vec<Token> {
&self.0 &self.0
} }
} }
impl DerefMut for Tokens { impl std::ops::DerefMut for Tokens {
fn deref_mut(&mut self) -> &mut Vec<Token> { fn deref_mut(&mut self) -> &mut Vec<Token> {
&mut self.0 &mut self.0
} }
@ -262,7 +256,7 @@ impl Glob {
pub fn compile_matcher(&self) -> GlobMatcher { pub fn compile_matcher(&self) -> GlobMatcher {
let re = let re =
new_regex(&self.re).expect("regex compilation shouldn't fail"); new_regex(&self.re).expect("regex compilation shouldn't fail");
GlobMatcher { pat: self.clone(), re: re } GlobMatcher { pat: self.clone(), re }
} }
/// Returns a strategic matcher. /// Returns a strategic matcher.
@ -275,7 +269,7 @@ impl Glob {
let strategy = MatchStrategy::new(self); let strategy = MatchStrategy::new(self);
let re = let re =
new_regex(&self.re).expect("regex compilation shouldn't fail"); new_regex(&self.re).expect("regex compilation shouldn't fail");
GlobStrategic { strategy: strategy, re: re } GlobStrategic { strategy, re }
} }
/// Returns the original glob pattern used to build this pattern. /// Returns the original glob pattern used to build this pattern.
@ -311,10 +305,8 @@ impl Glob {
} }
let mut lit = String::new(); let mut lit = String::new();
for t in &*self.tokens { for t in &*self.tokens {
match *t { let Token::Literal(c) = *t else { return None };
Token::Literal(c) => lit.push(c), lit.push(c);
_ => return None,
}
} }
if lit.is_empty() { if lit.is_empty() {
None None
@ -334,13 +326,12 @@ impl Glob {
if self.opts.case_insensitive { if self.opts.case_insensitive {
return None; return None;
} }
let start = match self.tokens.get(0) { let start = match *self.tokens.get(0)? {
Some(&Token::RecursivePrefix) => 1, Token::RecursivePrefix => 1,
Some(_) => 0, _ => 0,
_ => return None,
}; };
match self.tokens.get(start) { match *self.tokens.get(start)? {
Some(&Token::ZeroOrMore) => { Token::ZeroOrMore => {
// If there was no recursive prefix, then we only permit // If there was no recursive prefix, then we only permit
// `*` if `*` can match a `/`. For example, if `*` can't // `*` if `*` can match a `/`. For example, if `*` can't
// match `/`, then `*.c` doesn't match `foo/bar.c`. // match `/`, then `*.c` doesn't match `foo/bar.c`.
@ -350,8 +341,8 @@ impl Glob {
} }
_ => return None, _ => return None,
} }
match self.tokens.get(start + 1) { match *self.tokens.get(start + 1)? {
Some(&Token::Literal('.')) => {} Token::Literal('.') => {}
_ => return None, _ => return None,
} }
let mut lit = ".".to_string(); let mut lit = ".".to_string();
@ -405,8 +396,8 @@ impl Glob {
if self.opts.case_insensitive { if self.opts.case_insensitive {
return None; return None;
} }
let (end, need_sep) = match self.tokens.last() { let (end, need_sep) = match *self.tokens.last()? {
Some(&Token::ZeroOrMore) => { Token::ZeroOrMore => {
if self.opts.literal_separator { if self.opts.literal_separator {
// If a trailing `*` can't match a `/`, then we can't // If a trailing `*` can't match a `/`, then we can't
// assume a match of the prefix corresponds to a match // assume a match of the prefix corresponds to a match
@ -418,15 +409,13 @@ impl Glob {
} }
(self.tokens.len() - 1, false) (self.tokens.len() - 1, false)
} }
Some(&Token::RecursiveSuffix) => (self.tokens.len() - 1, true), Token::RecursiveSuffix => (self.tokens.len() - 1, true),
_ => (self.tokens.len(), false), _ => (self.tokens.len(), false),
}; };
let mut lit = String::new(); let mut lit = String::new();
for t in &self.tokens[0..end] { for t in &self.tokens[0..end] {
match *t { let Token::Literal(c) = *t else { return None };
Token::Literal(c) => lit.push(c), lit.push(c);
_ => return None,
}
} }
if need_sep { if need_sep {
lit.push('/'); lit.push('/');
@ -455,8 +444,8 @@ impl Glob {
return None; return None;
} }
let mut lit = String::new(); let mut lit = String::new();
let (start, entire) = match self.tokens.get(0) { let (start, entire) = match *self.tokens.get(0)? {
Some(&Token::RecursivePrefix) => { Token::RecursivePrefix => {
// We only care if this follows a path component if the next // We only care if this follows a path component if the next
// token is a literal. // token is a literal.
if let Some(&Token::Literal(_)) = self.tokens.get(1) { if let Some(&Token::Literal(_)) = self.tokens.get(1) {
@ -468,8 +457,8 @@ impl Glob {
} }
_ => (0, false), _ => (0, false),
}; };
let start = match self.tokens.get(start) { let start = match *self.tokens.get(start)? {
Some(&Token::ZeroOrMore) => { Token::ZeroOrMore => {
// If literal_separator is enabled, then a `*` can't // If literal_separator is enabled, then a `*` can't
// necessarily match everything, so reporting a suffix match // necessarily match everything, so reporting a suffix match
// as a match of the pattern would be a false positive. // as a match of the pattern would be a false positive.
@ -481,10 +470,8 @@ impl Glob {
_ => start, _ => start,
}; };
for t in &self.tokens[start..] { for t in &self.tokens[start..] {
match *t { let Token::Literal(c) = *t else { return None };
Token::Literal(c) => lit.push(c), lit.push(c);
_ => return None,
}
} }
if lit.is_empty() || lit == "/" { if lit.is_empty() || lit == "/" {
None None
@ -508,8 +495,8 @@ impl Glob {
if self.opts.case_insensitive { if self.opts.case_insensitive {
return None; return None;
} }
let start = match self.tokens.get(0) { let start = match *self.tokens.get(0)? {
Some(&Token::RecursivePrefix) => 1, Token::RecursivePrefix => 1,
_ => { _ => {
// With nothing to gobble up the parent portion of a path, // With nothing to gobble up the parent portion of a path,
// we can't assume that matching on only the basename is // we can't assume that matching on only the basename is
@ -520,7 +507,7 @@ impl Glob {
if self.tokens[start..].is_empty() { if self.tokens[start..].is_empty() {
return None; return None;
} }
for t in &self.tokens[start..] { for t in self.tokens[start..].iter() {
match *t { match *t {
Token::Literal('/') => return None, Token::Literal('/') => return None,
Token::Literal(_) => {} // OK Token::Literal(_) => {} // OK
@ -554,16 +541,11 @@ impl Glob {
/// The basic format of these patterns is `**/{literal}`, where `{literal}` /// The basic format of these patterns is `**/{literal}`, where `{literal}`
/// does not contain a path separator. /// does not contain a path separator.
fn basename_literal(&self) -> Option<String> { fn basename_literal(&self) -> Option<String> {
let tokens = match self.basename_tokens() { let tokens = self.basename_tokens()?;
None => return None,
Some(tokens) => tokens,
};
let mut lit = String::new(); let mut lit = String::new();
for t in tokens { for t in tokens {
match *t { let Token::Literal(c) = *t else { return None };
Token::Literal(c) => lit.push(c), lit.push(c);
_ => return None,
}
} }
Some(lit) Some(lit)
} }
@ -574,7 +556,7 @@ impl<'a> GlobBuilder<'a> {
/// ///
/// The pattern is not compiled until `build` is called. /// The pattern is not compiled until `build` is called.
pub fn new(glob: &'a str) -> GlobBuilder<'a> { pub fn new(glob: &'a str) -> GlobBuilder<'a> {
GlobBuilder { glob: glob, opts: GlobOptions::default() } GlobBuilder { glob, opts: GlobOptions::default() }
} }
/// Parses and builds the pattern. /// Parses and builds the pattern.
@ -604,7 +586,7 @@ impl<'a> GlobBuilder<'a> {
glob: self.glob.to_string(), glob: self.glob.to_string(),
re: tokens.to_regex_with(&self.opts), re: tokens.to_regex_with(&self.opts),
opts: self.opts, opts: self.opts,
tokens: tokens, tokens,
}) })
} }
} }
@ -640,7 +622,8 @@ impl<'a> GlobBuilder<'a> {
/// Toggle whether an empty pattern in a list of alternates is accepted. /// Toggle whether an empty pattern in a list of alternates is accepted.
/// ///
/// For example, if this is set then the glob `foo{,.txt}` will match both `foo` and `foo.txt`. /// For example, if this is set then the glob `foo{,.txt}` will match both
/// `foo` and `foo.txt`.
/// ///
/// By default this is false. /// By default this is false.
pub fn empty_alternates(&mut self, yes: bool) -> &mut GlobBuilder<'a> { pub fn empty_alternates(&mut self, yes: bool) -> &mut GlobBuilder<'a> {
@ -678,7 +661,7 @@ impl Tokens {
tokens: &[Token], tokens: &[Token],
re: &mut String, re: &mut String,
) { ) {
for tok in tokens { for tok in tokens.iter() {
match *tok { match *tok {
Token::Literal(c) => { Token::Literal(c) => {
re.push_str(&char_to_escaped_literal(c)); re.push_str(&char_to_escaped_literal(c));
@ -758,7 +741,9 @@ fn bytes_to_escaped_literal(bs: &[u8]) -> String {
let mut s = String::with_capacity(bs.len()); let mut s = String::with_capacity(bs.len());
for &b in bs { for &b in bs {
if b <= 0x7F { if b <= 0x7F {
s.push_str(&regex::escape(&(b as char).to_string())); s.push_str(&regex_syntax::escape(
char::from(b).encode_utf8(&mut [0; 4]),
));
} else { } else {
s.push_str(&format!("\\x{:02x}", b)); s.push_str(&format!("\\x{:02x}", b));
} }
@ -769,7 +754,7 @@ fn bytes_to_escaped_literal(bs: &[u8]) -> String {
struct Parser<'a> { struct Parser<'a> {
glob: &'a str, glob: &'a str,
stack: Vec<Tokens>, stack: Vec<Tokens>,
chars: iter::Peekable<str::Chars<'a>>, chars: std::iter::Peekable<std::str::Chars<'a>>,
prev: Option<char>, prev: Option<char>,
cur: Option<char>, cur: Option<char>,
opts: &'a GlobOptions, opts: &'a GlobOptions,
@ -777,7 +762,7 @@ struct Parser<'a> {
impl<'a> Parser<'a> { impl<'a> Parser<'a> {
fn error(&self, kind: ErrorKind) -> Error { fn error(&self, kind: ErrorKind) -> Error {
Error { glob: Some(self.glob.to_string()), kind: kind } Error { glob: Some(self.glob.to_string()), kind }
} }
fn parse(&mut self) -> Result<(), Error> { fn parse(&mut self) -> Result<(), Error> {
@ -996,7 +981,7 @@ impl<'a> Parser<'a> {
// it as a literal. // it as a literal.
ranges.push(('-', '-')); ranges.push(('-', '-'));
} }
self.push_token(Token::Class { negated: negated, ranges: ranges }) self.push_token(Token::Class { negated, ranges })
} }
fn bump(&mut self) -> Option<char> { fn bump(&mut self) -> Option<char> {

View File

@ -5,11 +5,9 @@ Glob set matching is the process of matching one or more glob patterns against
a single candidate path simultaneously, and returning all of the globs that a single candidate path simultaneously, and returning all of the globs that
matched. For example, given this set of globs: matched. For example, given this set of globs:
```ignore * `*.rs`
*.rs * `src/lib.rs`
src/lib.rs * `src/**/foo.rs`
src/**/foo.rs
```
and a path `src/bar/baz/foo.rs`, then the set would report the first and third and a path `src/bar/baz/foo.rs`, then the set would report the first and third
globs as matching. globs as matching.
@ -19,7 +17,6 @@ globs as matching.
This example shows how to match a single glob against a single file path. This example shows how to match a single glob against a single file path.
``` ```
# fn example() -> Result<(), globset::Error> {
use globset::Glob; use globset::Glob;
let glob = Glob::new("*.rs")?.compile_matcher(); let glob = Glob::new("*.rs")?.compile_matcher();
@ -27,7 +24,7 @@ let glob = Glob::new("*.rs")?.compile_matcher();
assert!(glob.is_match("foo.rs")); assert!(glob.is_match("foo.rs"));
assert!(glob.is_match("foo/bar.rs")); assert!(glob.is_match("foo/bar.rs"));
assert!(!glob.is_match("Cargo.toml")); assert!(!glob.is_match("Cargo.toml"));
# Ok(()) } example().unwrap(); # Ok::<(), Box<dyn std::error::Error>>(())
``` ```
# Example: configuring a glob matcher # Example: configuring a glob matcher
@ -36,7 +33,6 @@ This example shows how to use a `GlobBuilder` to configure aspects of match
semantics. In this example, we prevent wildcards from matching path separators. semantics. In this example, we prevent wildcards from matching path separators.
``` ```
# fn example() -> Result<(), globset::Error> {
use globset::GlobBuilder; use globset::GlobBuilder;
let glob = GlobBuilder::new("*.rs") let glob = GlobBuilder::new("*.rs")
@ -45,7 +41,7 @@ let glob = GlobBuilder::new("*.rs")
assert!(glob.is_match("foo.rs")); assert!(glob.is_match("foo.rs"));
assert!(!glob.is_match("foo/bar.rs")); // no longer matches assert!(!glob.is_match("foo/bar.rs")); // no longer matches
assert!(!glob.is_match("Cargo.toml")); assert!(!glob.is_match("Cargo.toml"));
# Ok(()) } example().unwrap(); # Ok::<(), Box<dyn std::error::Error>>(())
``` ```
# Example: match multiple globs at once # Example: match multiple globs at once
@ -53,7 +49,6 @@ assert!(!glob.is_match("Cargo.toml"));
This example shows how to match multiple glob patterns at once. This example shows how to match multiple glob patterns at once.
``` ```
# fn example() -> Result<(), globset::Error> {
use globset::{Glob, GlobSetBuilder}; use globset::{Glob, GlobSetBuilder};
let mut builder = GlobSetBuilder::new(); let mut builder = GlobSetBuilder::new();
@ -65,7 +60,7 @@ builder.add(Glob::new("src/**/foo.rs")?);
let set = builder.build()?; let set = builder.build()?;
assert_eq!(set.matches("src/bar/baz/foo.rs"), vec![0, 2]); assert_eq!(set.matches("src/bar/baz/foo.rs"), vec![0, 2]);
# Ok(()) } example().unwrap(); # Ok::<(), Box<dyn std::error::Error>>(())
``` ```
# Syntax # Syntax
@ -103,22 +98,22 @@ or to enable case insensitive matching.
#![deny(missing_docs)] #![deny(missing_docs)]
use std::borrow::Cow; use std::{borrow::Cow, path::Path};
use std::collections::{BTreeMap, HashMap};
use std::error::Error as StdError;
use std::fmt;
use std::hash;
use std::path::Path;
use std::str;
use aho_corasick::AhoCorasick; use {
use bstr::{ByteSlice, ByteVec, B}; aho_corasick::AhoCorasick,
use regex::bytes::{Regex, RegexBuilder, RegexSet}; bstr::{ByteSlice, ByteVec, B},
regex_automata::meta::Regex,
};
use crate::{
glob::MatchStrategy,
pathutil::{file_name, file_name_ext, normalize_path},
};
use crate::glob::MatchStrategy;
pub use crate::glob::{Glob, GlobBuilder, GlobMatcher}; pub use crate::glob::{Glob, GlobBuilder, GlobMatcher};
use crate::pathutil::{file_name, file_name_ext, normalize_path};
mod fnv;
mod glob; mod glob;
mod pathutil; mod pathutil;
@ -181,7 +176,7 @@ pub enum ErrorKind {
__Nonexhaustive, __Nonexhaustive,
} }
impl StdError for Error { impl std::error::Error for Error {
fn description(&self) -> &str { fn description(&self) -> &str {
self.kind.description() self.kind.description()
} }
@ -227,8 +222,8 @@ impl ErrorKind {
} }
} }
impl fmt::Display for Error { impl std::fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self.glob { match self.glob {
None => self.kind.fmt(f), None => self.kind.fmt(f),
Some(ref glob) => { Some(ref glob) => {
@ -238,8 +233,8 @@ impl fmt::Display for Error {
} }
} }
impl fmt::Display for ErrorKind { impl std::fmt::Display for ErrorKind {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match *self { match *self {
ErrorKind::InvalidRecursive ErrorKind::InvalidRecursive
| ErrorKind::UnclosedClass | ErrorKind::UnclosedClass
@ -257,30 +252,40 @@ impl fmt::Display for ErrorKind {
} }
fn new_regex(pat: &str) -> Result<Regex, Error> { fn new_regex(pat: &str) -> Result<Regex, Error> {
RegexBuilder::new(pat) let syntax = regex_automata::util::syntax::Config::new()
.dot_matches_new_line(true) .utf8(false)
.size_limit(10 * (1 << 20)) .dot_matches_new_line(true);
.dfa_size_limit(10 * (1 << 20)) let config = Regex::config()
.build() .utf8_empty(false)
.map_err(|err| Error { .nfa_size_limit(Some(10 * (1 << 20)))
.hybrid_cache_capacity(10 * (1 << 20));
Regex::builder().syntax(syntax).configure(config).build(pat).map_err(
|err| Error {
glob: Some(pat.to_string()), glob: Some(pat.to_string()),
kind: ErrorKind::Regex(err.to_string()), kind: ErrorKind::Regex(err.to_string()),
}) },
)
} }
fn new_regex_set<I, S>(pats: I) -> Result<RegexSet, Error> fn new_regex_set(pats: Vec<String>) -> Result<Regex, Error> {
where let syntax = regex_automata::util::syntax::Config::new()
S: AsRef<str>, .utf8(false)
I: IntoIterator<Item = S>, .dot_matches_new_line(true);
{ let config = Regex::config()
RegexSet::new(pats).map_err(|err| Error { .match_kind(regex_automata::MatchKind::All)
.utf8_empty(false)
.nfa_size_limit(Some(10 * (1 << 20)))
.hybrid_cache_capacity(10 * (1 << 20));
Regex::builder()
.syntax(syntax)
.configure(config)
.build_many(&pats)
.map_err(|err| Error {
glob: None, glob: None,
kind: ErrorKind::Regex(err.to_string()), kind: ErrorKind::Regex(err.to_string()),
}) })
} }
type Fnv = hash::BuildHasherDefault<fnv::FnvHasher>;
/// GlobSet represents a group of globs that can be matched together in a /// GlobSet represents a group of globs that can be matched together in a
/// single pass. /// single pass.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
@ -521,7 +526,7 @@ impl<'a> Candidate<'a> {
let path = normalize_path(Vec::from_path_lossy(path.as_ref())); let path = normalize_path(Vec::from_path_lossy(path.as_ref()));
let basename = file_name(&path).unwrap_or(Cow::Borrowed(B(""))); let basename = file_name(&path).unwrap_or(Cow::Borrowed(B("")));
let ext = file_name_ext(&basename).unwrap_or(Cow::Borrowed(B(""))); let ext = file_name_ext(&basename).unwrap_or(Cow::Borrowed(B("")));
Candidate { path: path, basename: basename, ext: ext } Candidate { path, basename, ext }
} }
fn path_prefix(&self, max: usize) -> &[u8] { fn path_prefix(&self, max: usize) -> &[u8] {
@ -585,11 +590,11 @@ impl GlobSetMatchStrategy {
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
struct LiteralStrategy(BTreeMap<Vec<u8>, Vec<usize>>); struct LiteralStrategy(fnv::HashMap<Vec<u8>, Vec<usize>>);
impl LiteralStrategy { impl LiteralStrategy {
fn new() -> LiteralStrategy { fn new() -> LiteralStrategy {
LiteralStrategy(BTreeMap::new()) LiteralStrategy(fnv::HashMap::default())
} }
fn add(&mut self, global_index: usize, lit: String) { fn add(&mut self, global_index: usize, lit: String) {
@ -613,11 +618,11 @@ impl LiteralStrategy {
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
struct BasenameLiteralStrategy(BTreeMap<Vec<u8>, Vec<usize>>); struct BasenameLiteralStrategy(fnv::HashMap<Vec<u8>, Vec<usize>>);
impl BasenameLiteralStrategy { impl BasenameLiteralStrategy {
fn new() -> BasenameLiteralStrategy { fn new() -> BasenameLiteralStrategy {
BasenameLiteralStrategy(BTreeMap::new()) BasenameLiteralStrategy(fnv::HashMap::default())
} }
fn add(&mut self, global_index: usize, lit: String) { fn add(&mut self, global_index: usize, lit: String) {
@ -647,11 +652,11 @@ impl BasenameLiteralStrategy {
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
struct ExtensionStrategy(HashMap<Vec<u8>, Vec<usize>, Fnv>); struct ExtensionStrategy(fnv::HashMap<Vec<u8>, Vec<usize>>);
impl ExtensionStrategy { impl ExtensionStrategy {
fn new() -> ExtensionStrategy { fn new() -> ExtensionStrategy {
ExtensionStrategy(HashMap::with_hasher(Fnv::default())) ExtensionStrategy(fnv::HashMap::default())
} }
fn add(&mut self, global_index: usize, ext: String) { fn add(&mut self, global_index: usize, ext: String) {
@ -745,7 +750,7 @@ impl SuffixStrategy {
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
struct RequiredExtensionStrategy(HashMap<Vec<u8>, Vec<(usize, Regex)>, Fnv>); struct RequiredExtensionStrategy(fnv::HashMap<Vec<u8>, Vec<(usize, Regex)>>);
impl RequiredExtensionStrategy { impl RequiredExtensionStrategy {
fn is_match(&self, candidate: &Candidate<'_>) -> bool { fn is_match(&self, candidate: &Candidate<'_>) -> bool {
@ -786,8 +791,9 @@ impl RequiredExtensionStrategy {
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
struct RegexSetStrategy { struct RegexSetStrategy {
matcher: RegexSet, matcher: Regex,
map: Vec<usize>, map: Vec<usize>,
// patset: regex_automata::PatternSet,
} }
impl RegexSetStrategy { impl RegexSetStrategy {
@ -800,7 +806,11 @@ impl RegexSetStrategy {
candidate: &Candidate<'_>, candidate: &Candidate<'_>,
matches: &mut Vec<usize>, matches: &mut Vec<usize>,
) { ) {
for i in self.matcher.matches(candidate.path.as_bytes()) { let input = regex_automata::Input::new(candidate.path.as_bytes());
let mut patset =
regex_automata::PatternSet::new(self.matcher.pattern_len());
self.matcher.which_overlapping_matches(&input, &mut patset);
for i in patset.iter() {
matches.push(self.map[i]); matches.push(self.map[i]);
} }
} }
@ -852,12 +862,12 @@ impl MultiStrategyBuilder {
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
struct RequiredExtensionStrategyBuilder( struct RequiredExtensionStrategyBuilder(
HashMap<Vec<u8>, Vec<(usize, String)>>, fnv::HashMap<Vec<u8>, Vec<(usize, String)>>,
); );
impl RequiredExtensionStrategyBuilder { impl RequiredExtensionStrategyBuilder {
fn new() -> RequiredExtensionStrategyBuilder { fn new() -> RequiredExtensionStrategyBuilder {
RequiredExtensionStrategyBuilder(HashMap::new()) RequiredExtensionStrategyBuilder(fnv::HashMap::default())
} }
fn add(&mut self, global_index: usize, ext: String, regex: String) { fn add(&mut self, global_index: usize, ext: String, regex: String) {
@ -868,7 +878,7 @@ impl RequiredExtensionStrategyBuilder {
} }
fn build(self) -> Result<RequiredExtensionStrategy, Error> { fn build(self) -> Result<RequiredExtensionStrategy, Error> {
let mut exts = HashMap::with_hasher(Fnv::default()); let mut exts = fnv::HashMap::default();
for (ext, regexes) in self.0.into_iter() { for (ext, regexes) in self.0.into_iter() {
exts.insert(ext.clone(), vec![]); exts.insert(ext.clone(), vec![]);
for (global_index, regex) in regexes { for (global_index, regex) in regexes {

View File

@ -4,12 +4,10 @@ use bstr::{ByteSlice, ByteVec};
/// The final component of the path, if it is a normal file. /// The final component of the path, if it is a normal file.
/// ///
/// If the path terminates in ., .., or consists solely of a root or prefix, /// If the path terminates in `.`, `..`, or consists solely of a root or
/// file_name will return None. /// prefix, file_name will return None.
pub fn file_name<'a>(path: &Cow<'a, [u8]>) -> Option<Cow<'a, [u8]>> { pub(crate) fn file_name<'a>(path: &Cow<'a, [u8]>) -> Option<Cow<'a, [u8]>> {
if path.is_empty() { if path.last_byte().map_or(true, |b| b == b'.') {
return None;
} else if path.last_byte() == Some(b'.') {
return None; return None;
} }
let last_slash = path.rfind_byte(b'/').map(|i| i + 1).unwrap_or(0); let last_slash = path.rfind_byte(b'/').map(|i| i + 1).unwrap_or(0);
@ -39,7 +37,9 @@ pub fn file_name<'a>(path: &Cow<'a, [u8]>) -> Option<Cow<'a, [u8]>> {
/// a pattern like `*.rs` is obviously trying to match files with a `rs` /// a pattern like `*.rs` is obviously trying to match files with a `rs`
/// extension, but it also matches files like `.rs`, which doesn't have an /// extension, but it also matches files like `.rs`, which doesn't have an
/// extension according to std::path::Path::extension. /// extension according to std::path::Path::extension.
pub fn file_name_ext<'a>(name: &Cow<'a, [u8]>) -> Option<Cow<'a, [u8]>> { pub(crate) fn file_name_ext<'a>(
name: &Cow<'a, [u8]>,
) -> Option<Cow<'a, [u8]>> {
if name.is_empty() { if name.is_empty() {
return None; return None;
} }
@ -60,7 +60,7 @@ pub fn file_name_ext<'a>(name: &Cow<'a, [u8]>) -> Option<Cow<'a, [u8]>> {
/// Normalizes a path to use `/` as a separator everywhere, even on platforms /// Normalizes a path to use `/` as a separator everywhere, even on platforms
/// that recognize other characters as separators. /// that recognize other characters as separators.
#[cfg(unix)] #[cfg(unix)]
pub fn normalize_path(path: Cow<'_, [u8]>) -> Cow<'_, [u8]> { pub(crate) fn normalize_path(path: Cow<'_, [u8]>) -> Cow<'_, [u8]> {
// UNIX only uses /, so we're good. // UNIX only uses /, so we're good.
path path
} }
@ -68,11 +68,11 @@ pub fn normalize_path(path: Cow<'_, [u8]>) -> Cow<'_, [u8]> {
/// Normalizes a path to use `/` as a separator everywhere, even on platforms /// Normalizes a path to use `/` as a separator everywhere, even on platforms
/// that recognize other characters as separators. /// that recognize other characters as separators.
#[cfg(not(unix))] #[cfg(not(unix))]
pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> { pub(crate) fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> {
use std::path::is_separator; use std::path::is_separator;
for i in 0..path.len() { for i in 0..path.len() {
if path[i] == b'/' || !is_separator(path[i] as char) { if path[i] == b'/' || !is_separator(char::from(path[i])) {
continue; continue;
} }
path.to_mut()[i] = b'/'; path.to_mut()[i] = b'/';