Refactor and test glob sets.

This commit goes a long way toward refactoring glob sets so that the code is easier to maintain going forward. In particular, it makes the literal optimizations that glob sets used a lot more structured and much easier to extend. Tests have also been modified to include glob sets. There's still a bit of polish work left to do before a release. This also fixes the immediate issue where large gitignore files were causing ripgrep to slow way down. While we don't technically fix it for good, we're a lot better about reducing the number of regexes we compile. In particular, if a gitignore file contains thousands of patterns that can't be matched more simply using literals, then ripgrep will slow down again. We could fix this for good by avoiding RegexSet if the number of regexes grows too large. Fixes #134.
2025-08-04 21:52:54 +02:00 · 2016-10-04 20:22:13 -04:00
parent 89811d43d4
commit 175406df01
8 changed files with 1941 additions and 978 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -82,8 +82,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 name = "globset"
 version = "0.1.0"
 dependencies = [
+ "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
 "fnv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)",
 "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
 "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
 "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
--- a/globset/Cargo.toml
+++ b/globset/Cargo.toml
@ -4,7 +4,9 @@ version = "0.1.0"
 authors = ["Andrew Gallant <jamslam@gmail.com>"]

 [dependencies]
+aho-corasick = "0.5.3"
 fnv = "1.0"
 lazy_static = "0.2"
+log = "0.3"
 memchr = "0.1"
 regex = "0.1.77"
--- a/globset/src/lib.rs
+++ b/globset/src/lib.rs
--- a/globset/src/pathutil.rs
+++ b/globset/src/pathutil.rs
@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::ffi::OsStr;
 use std::path::Path;

@ -36,3 +37,98 @@ pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
 ) -> Option<&'a OsStr> {
    path.as_ref().file_name()
 }
+
+/// Return a file extension given a path's file name.
+///
+/// Note that this does NOT match the semantics of std::path::Path::extension.
+/// Namely, the extension includes the `.` and matching is otherwise more
+/// liberal. Specifically, the extenion is:
+///
+/// * None, if the file name given is empty;
+/// * None, if there is no embedded `.`;
+/// * Otherwise, the portion of the file name starting with the final `.`.
+///
+/// e.g., A file name of `.rs` has an extension `.rs`.
+///
+/// N.B. This is done to make certain glob match optimizations easier. Namely,
+/// a pattern like `*.rs` is obviously trying to match files with a `rs`
+/// extension, but it also matches files like `.rs`, which doesn't have an
+/// extension according to std::path::Path::extension.
+pub fn file_name_ext(name: &OsStr) -> Option<&OsStr> {
+    // Yes, these functions are awful, and yes, we are completely violating
+    // the abstraction barrier of std::ffi. The barrier we're violating is
+    // that an OsStr's encoding is *ASCII compatible*. While this is obviously
+    // true on Unix systems, it's also true on Windows because an OsStr uses
+    // WTF-8 internally: https://simonsapin.github.io/wtf-8/
+    //
+    // We should consider doing the same for the other path utility functions.
+    // Right now, we don't break any barriers, but Windows users are paying
+    // for it.
+    //
+    // Got any better ideas that don't cost anything? Hit me up. ---AG
+    unsafe fn os_str_as_u8_slice(s: &OsStr) -> &[u8] {
+        ::std::mem::transmute(s)
+    }
+    unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr {
+        ::std::mem::transmute(s)
+    }
+    if name.is_empty() {
+        return None;
+    }
+    let name = unsafe { os_str_as_u8_slice(name) };
+    for (i, &b) in name.iter().enumerate().rev() {
+        if b == b'.' {
+            return Some(unsafe { u8_slice_as_os_str(&name[i..]) });
+        }
+    }
+    None
+}
+
+/// Return raw bytes of a path, transcoded to UTF-8 if necessary.
+pub fn path_bytes(path: &Path) -> Cow<[u8]> {
+    os_str_bytes(path.as_os_str())
+}
+
+/// Return the raw bytes of the given OS string, transcoded to UTF-8 if
+/// necessary.
+#[cfg(unix)]
+pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
+    use std::os::unix::ffi::OsStrExt;
+    Cow::Borrowed(s.as_bytes())
+}
+
+/// Return the raw bytes of the given OS string, transcoded to UTF-8 if
+/// necessary.
+#[cfg(not(unix))]
+pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
+    // TODO(burntsushi): On Windows, OS strings are probably UTF-16, so even
+    // if we could get at the raw bytes, they wouldn't be useful. We *must*
+    // convert to UTF-8 before doing path matching. Unfortunate, but necessary.
+    match s.to_string_lossy() {
+        Cow::Owned(s) => Cow::Owned(s.into_bytes()),
+        Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::ffi::OsStr;
+
+    use super::file_name_ext;
+
+    macro_rules! ext {
+        ($name:ident, $file_name:expr, $ext:expr) => {
+            #[test]
+            fn $name() {
+                let got = file_name_ext(OsStr::new($file_name));
+                assert_eq!($ext.map(OsStr::new), got);
+            }
+        };
+    }
+
+    ext!(ext1, "foo.rs", Some(".rs"));
+    ext!(ext2, ".rs", Some(".rs"));
+    ext!(ext3, "..rs", Some(".rs"));
+    ext!(ext4, "", None::<&str>);
+    ext!(ext5, "foo", None::<&str>);
+}
--- a/globset/src/pattern.rs
+++ b/globset/src/pattern.rs
--- a/src/gitignore.rs
+++ b/src/gitignore.rs
@ -28,7 +28,7 @@ use std::fs::File;
 use std::io::{self, BufRead};
 use std::path::{Path, PathBuf};

-use globset;
+use globset::{self, PatternBuilder, Set, SetBuilder};
 use regex;

 use pathutil::{is_file_name, strip_prefix};
@ -82,7 +82,7 @@ impl From<io::Error> for Error {
 /// Gitignore is a matcher for the glob patterns in a single gitignore file.
 #[derive(Clone, Debug)]
 pub struct Gitignore {
-    set: globset::Set,
+    set: Set,
    root: PathBuf,
    patterns: Vec<Pattern>,
    num_ignores: u64,
@ -207,7 +207,7 @@ impl<'a> Match<'a> {
 /// GitignoreBuilder constructs a matcher for a single set of globs from a
 /// .gitignore file.
 pub struct GitignoreBuilder {
-    builder: globset::SetBuilder,
+    builder: SetBuilder,
    root: PathBuf,
    patterns: Vec<Pattern>,
 }
@ -237,7 +237,7 @@ impl GitignoreBuilder {
    pub fn new<P: AsRef<Path>>(root: P) -> GitignoreBuilder {
        let root = strip_prefix("./", root.as_ref()).unwrap_or(root.as_ref());
        GitignoreBuilder {
-            builder: globset::SetBuilder::new(),
+            builder: SetBuilder::new(),
            root: root.to_path_buf(),
            patterns: vec![],
        }
@ -261,6 +261,7 @@ impl GitignoreBuilder {
    /// Add each pattern line from the file path given.
    pub fn add_path<P: AsRef<Path>>(&mut self, path: P) -> Result<(), Error> {
        let rdr = io::BufReader::new(try!(File::open(&path)));
+        debug!("gitignore: {}", path.as_ref().display());
        for line in rdr.lines() {
            try!(self.add(&path, &try!(line)));
        }
@ -299,7 +300,7 @@ impl GitignoreBuilder {
            whitelist: false,
            only_dir: false,
        };
-        let mut opts = globset::MatchOptions::default();
+        let mut literal_separator = false;
        let has_slash = line.chars().any(|c| c == '/');
        let is_absolute = line.chars().nth(0).unwrap() == '/';
        if line.starts_with("\\!") || line.starts_with("\\#") {
@ -314,7 +315,7 @@ impl GitignoreBuilder {
                // then the glob can only match the beginning of a path
                // (relative to the location of gitignore). We achieve this by
                // simply banning wildcards from matching /.
-                opts.require_literal_separator = true;
+                literal_separator = true;
                line = &line[1..];
            }
        }
@ -330,7 +331,7 @@ impl GitignoreBuilder {
        // doesn't let wildcards match slashes.
        pat.pat = line.to_string();
        if has_slash {
-            opts.require_literal_separator = true;
+            literal_separator = true;
        }
        // If there was a leading slash, then this is a pattern that must
        // match the entire path name. Otherwise, we should let it match
@ -347,7 +348,11 @@ impl GitignoreBuilder {
        if pat.pat.ends_with("/**") {
            pat.pat = format!("{}/*", pat.pat);
        }
-        try!(self.builder.add_with(&pat.pat, &opts));
+        let parsed = try!(
+            PatternBuilder::new(&pat.pat)
+                .literal_separator(literal_separator)
+                .build());
+        self.builder.add(parsed);
        self.patterns.push(pat);
        Ok(())
    }
@ -429,6 +434,9 @@ mod tests {
    not_ignored!(ignot11, ROOT, "#foo", "#foo");
    not_ignored!(ignot12, ROOT, "\n\n\n", "foo");
    not_ignored!(ignot13, ROOT, "foo/**", "foo", true);
+    not_ignored!(
+        ignot14, "./third_party/protobuf", "m4/ltoptions.m4",
+        "./third_party/protobuf/csharp/src/packages/repositories.config");

    // See: https://github.com/BurntSushi/ripgrep/issues/106
    #[test]
--- a/src/types.rs
+++ b/src/types.rs
@ -11,7 +11,7 @@ use std::path::Path;
 use regex;

 use gitignore::{Match, Pattern};
-use globset::{self, MatchOptions};
+use globset::{self, PatternBuilder, Set, SetBuilder};

 const TYPE_EXTENSIONS: &'static [(&'static str, &'static [&'static str])] = &[
    ("asm", &["*.asm", "*.s", "*.S"]),
@ -161,8 +161,8 @@ impl FileTypeDef {
 #[derive(Clone, Debug)]
 pub struct Types {
    defs: Vec<FileTypeDef>,
-    selected: Option<globset::SetYesNo>,
-    negated: Option<globset::SetYesNo>,
+    selected: Option<Set>,
+    negated: Option<Set>,
    has_selected: bool,
    unmatched_pat: Pattern,
 }
@ -175,8 +175,8 @@ impl Types {
    /// If has_selected is true, then at least one file type was selected.
    /// Therefore, any non-matches should be ignored.
    fn new(
-        selected: Option<globset::SetYesNo>,
-        negated: Option<globset::SetYesNo>,
+        selected: Option<Set>,
+        negated: Option<Set>,
        has_selected: bool,
        defs: Vec<FileTypeDef>,
    ) -> Types {
@ -265,14 +265,11 @@ impl TypesBuilder {
    /// Build the current set of file type definitions *and* selections into
    /// a file type matcher.
    pub fn build(&self) -> Result<Types, Error> {
-        let opts = MatchOptions {
-            require_literal_separator: true, ..MatchOptions::default()
-        };
        let selected_globs =
            if self.selected.is_empty() {
                None
            } else {
-                let mut bset = globset::SetBuilder::new();
+                let mut bset = SetBuilder::new();
                for name in &self.selected {
                    let globs = match self.types.get(name) {
                        Some(globs) => globs,
@ -282,16 +279,19 @@ impl TypesBuilder {
                        }
                    };
                    for glob in globs {
-                        try!(bset.add_with(glob, &opts));
+                        let pat = try!(
+                            PatternBuilder::new(glob)
+                                .literal_separator(true).build());
+                        bset.add(pat);
                    }
                }
-                Some(try!(bset.build_yesno()))
+                Some(try!(bset.build()))
            };
        let negated_globs =
            if self.negated.is_empty() {
                None
            } else {
-                let mut bset = globset::SetBuilder::new();
+                let mut bset = SetBuilder::new();
                for name in &self.negated {
                    let globs = match self.types.get(name) {
                        Some(globs) => globs,
@ -301,10 +301,13 @@ impl TypesBuilder {
                        }
                    };
                    for glob in globs {
-                        try!(bset.add_with(glob, &opts));
+                        let pat = try!(
+                            PatternBuilder::new(glob)
+                                .literal_separator(true).build());
+                        bset.add(pat);
                    }
                }
-                Some(try!(bset.build_yesno()))
+                Some(try!(bset.build()))
            };
        Ok(Types::new(
            selected_globs,
--- a/tests/tests.rs
+++ b/tests/tests.rs
@ -659,7 +659,6 @@ clean!(regression_30, "test", ".", |wd: WorkDir, mut cmd: Command| {
    }
    wd.create_dir("vendor");
    wd.create("vendor/manifest", "test");
-    cmd.arg("--debug");

    let lines: String = wd.stdout(&mut cmd);
    let expected = path("vendor/manifest:test\n");