diff --git a/src/args.rs b/src/args.rs new file mode 100644 index 00000000..8344b292 --- /dev/null +++ b/src/args.rs @@ -0,0 +1,551 @@ +use std::cmp; +use std::env; +use std::io; +use std::path::{Path, PathBuf}; + +use docopt::Docopt; +use env_logger; +use grep::{Grep, GrepBuilder}; +use log; +use num_cpus; +use regex; +use walkdir::WalkDir; + +use gitignore::{Gitignore, GitignoreBuilder}; +use ignore::Ignore; +use out::Out; +use printer::Printer; +use search::{InputBuffer, Searcher}; +use types::{FileTypeDef, Types, TypesBuilder}; +use walk; + +use Result; + +/// The Docopt usage string. +/// +/// If you've never heard of Docopt before, see: http://docopt.org +/// (TL;DR: The CLI parser is generated from the usage string below.) +const USAGE: &'static str = " +Usage: xrep [options] [ ...] + xrep [options] --files [ ...] + xrep [options] --type-list + xrep --help + xrep --version + +xrep is like the silver searcher and grep, but faster than both. + +Common options: + -a, --text Search binary files as if they were text. + -c, --count Only show count of line matches for each file. + -g, --glob GLOB ... Include or exclude files for searching that + match the given glob. This always overrides any + other ignore logic. Multiple glob flags may be + used. Globbing rules match .gitignore globs. + Precede a glob with a '!' to exclude it. + -h, --help Show this usage message. + -i, --ignore-case Case insensitive search. + -n, --line-number Show line numbers (1-based). + -q, --quiet Do not print anything to stdout. + -t, --type TYPE ... Only search files matching TYPE. Multiple type + flags may be provided. Use the --type-list flag + to list all available types. + -T, --type-not TYPE ... Do not search files matching TYPE. Multiple + not-type flags may be provided. + -v, --invert-match Invert matching. + -w, --word-regexp Only show matches surrounded by word boundaries. + This is equivalent to putting \\b before and + after the search pattern. + +Less common options: + -A, --after-context NUM + Show NUM lines after each match. + + -B, --before-context NUM + Show NUM lines before each match. + + -C, --context NUM + Show NUM lines before and after each match. + + --context-separator ARG + The string to use when separating non-continuous context lines. Escape + sequences may be used. [default: --] + + --debug + Show debug messages. + + --files + Print each file that would be searched (but don't search). + + -H, --with-filename + Prefix each match with the file name that contains it. This is the + default when more than one file is searched. + + --hidden + Search hidden directories and files. + + -L, --follow + Follow symlinks. + + --line-terminator ARG + The byte to use for a line terminator. Escape sequences may be used. + [default: \\n] + + --no-ignore + Don't respect ignore files (.gitignore, .xrepignore, etc.) + + -Q, --literal + Treat the pattern as a literal string instead of a regular expression. + + --threads ARG + The number of threads to use. Defaults to the number of logical CPUs + (capped at 6). [default: 0] + + --version + Show the version number of xrep and exit. + +File type management options: + --type-list + Show all supported file types and their associated globs. + + --type-add ARG ... + Add a new glob for a particular file type. + Example: --type-add html:*.html,*.htm + + --type-clear TYPE ... + Clear the file type globs for TYPE. +"; + +/// RawArgs are the args as they are parsed from Docopt. They aren't used +/// directly by the rest of xrep. +#[derive(Debug, RustcDecodable)] +pub struct RawArgs { + arg_pattern: String, + arg_path: Vec, + flag_after_context: usize, + flag_before_context: usize, + flag_context: usize, + flag_context_separator: String, + flag_count: bool, + flag_debug: bool, + flag_files: bool, + flag_follow: bool, + flag_glob: Vec, + flag_hidden: bool, + flag_ignore_case: bool, + flag_invert_match: bool, + flag_line_number: bool, + flag_line_terminator: String, + flag_literal: bool, + flag_no_ignore: bool, + flag_quiet: bool, + flag_text: bool, + flag_threads: usize, + flag_type: Vec, + flag_type_not: Vec, + flag_type_list: bool, + flag_type_add: Vec, + flag_type_clear: Vec, + flag_with_filename: bool, + flag_word_regexp: bool, +} + +/// Args are transformed/normalized from RawArgs. +#[derive(Debug)] +pub struct Args { + pattern: String, + paths: Vec, + after_context: usize, + before_context: usize, + context_separator: Vec, + count: bool, + eol: u8, + files: bool, + follow: bool, + glob_overrides: Option, + hidden: bool, + ignore_case: bool, + invert_match: bool, + line_number: bool, + no_ignore: bool, + quiet: bool, + text: bool, + threads: usize, + type_defs: Vec, + type_list: bool, + types: Types, + with_filename: bool, +} + +impl RawArgs { + /// Convert arguments parsed into a configuration used by xrep. + fn to_args(&self) -> Result { + let pattern = { + let pattern = + if self.flag_literal { + regex::quote(&self.arg_pattern) + } else { + self.arg_pattern.clone() + }; + if self.flag_word_regexp { + format!(r"\b{}\b", pattern) + } else { + pattern + } + }; + let paths = + if self.arg_path.is_empty() { + vec![Path::new("./").to_path_buf()] + } else { + self.arg_path.iter().map(|p| { + Path::new(p).to_path_buf() + }).collect() + }; + let (after_context, before_context) = + if self.flag_context > 0 { + (self.flag_context, self.flag_context) + } else { + (self.flag_after_context, self.flag_before_context) + }; + let eol = { + let eol = unescape(&self.flag_line_terminator); + if eol.is_empty() { + errored!("Empty line terminator is not allowed."); + } else if eol.len() > 1 { + errored!("Line terminators are limited to exactly 1 byte."); + } + eol[0] + }; + let glob_overrides = + if self.flag_glob.is_empty() { + None + } else { + let cwd = try!(env::current_dir()); + let mut bgi = GitignoreBuilder::new(cwd); + for pat in &self.flag_glob { + try!(bgi.add("", pat)); + } + Some(try!(bgi.build())) + }; + let threads = + if self.flag_threads == 0 { + cmp::min(6, num_cpus::get()) + } else { + self.flag_threads + }; + let mut with_filename = self.flag_with_filename; + if !with_filename { + with_filename = paths.len() > 1 || paths[0].is_dir(); + } + let mut btypes = TypesBuilder::new(); + btypes.add_defaults(); + try!(self.add_types(&mut btypes)); + let types = try!(btypes.build()); + Ok(Args { + pattern: pattern, + paths: paths, + after_context: after_context, + before_context: before_context, + context_separator: unescape(&self.flag_context_separator), + count: self.flag_count, + eol: eol, + files: self.flag_files, + follow: self.flag_follow, + glob_overrides: glob_overrides, + hidden: self.flag_hidden, + ignore_case: self.flag_ignore_case, + invert_match: self.flag_invert_match, + line_number: self.flag_line_number, + no_ignore: self.flag_no_ignore, + quiet: self.flag_quiet, + text: self.flag_text, + threads: threads, + type_defs: btypes.definitions(), + type_list: self.flag_type_list, + types: types, + with_filename: with_filename, + }) + } + + fn add_types(&self, types: &mut TypesBuilder) -> Result<()> { + for ty in &self.flag_type_clear { + types.clear(ty); + } + for def in &self.flag_type_add { + try!(types.add_def(def)); + } + for ty in &self.flag_type { + types.select(ty); + } + for ty in &self.flag_type_not { + types.select_not(ty); + } + Ok(()) + } +} + +impl Args { + /// Parse the command line arguments for this process. + /// + /// If a CLI usage error occurred, then exit the process and print a usage + /// or error message. Similarly, if the user requested the version of + /// xrep, then print the version and exit. + /// + /// Also, initialize a global logger. + pub fn parse() -> Result { + let raw: RawArgs = + Docopt::new(USAGE) + .and_then(|d| d.version(Some(version())).decode()) + .unwrap_or_else(|e| e.exit()); + + let mut logb = env_logger::LogBuilder::new(); + if raw.flag_debug { + logb.filter(None, log::LogLevelFilter::Debug); + } else { + logb.filter(None, log::LogLevelFilter::Warn); + } + if let Err(err) = logb.init() { + errored!("failed to initialize logger: {}", err); + } + + raw.to_args().map_err(From::from) + } + + /// Returns true if xrep should print the files it will search and exit + /// (but not do any actual searching). + pub fn files(&self) -> bool { + self.files + } + + /// Create a new line based matcher. The matcher returned can be used + /// across multiple threads simultaneously. This matcher only supports + /// basic searching of regular expressions in a single buffer. + /// + /// The pattern and other flags are taken from the command line. + pub fn grep(&self) -> Result { + GrepBuilder::new(&self.pattern) + .case_insensitive(self.ignore_case) + .line_terminator(self.eol) + .build() + .map_err(From::from) + } + + /// Creates a new input buffer that is used in searching. + pub fn input_buffer(&self) -> InputBuffer { + let mut inp = InputBuffer::new(); + inp.eol(self.eol); + inp + } + + /// Create a new printer of individual search results that writes to the + /// writer given. + pub fn printer(&self, wtr: W) -> Printer { + Printer::new(wtr) + .context_separator(self.context_separator.clone()) + .eol(self.eol) + .quiet(self.quiet) + .with_filename(self.with_filename) + } + + /// Create a new printer of search results for an entire file that writes + /// to the writer given. + pub fn out(&self, wtr: W) -> Out { + let mut out = Out::new(wtr); + if self.before_context > 0 || self.after_context > 0 { + out = out.file_separator(self.context_separator.clone()); + } + out + } + + /// Return the paths that should be searched. + pub fn paths(&self) -> &[PathBuf] { + &self.paths + } + + /// Create a new line based searcher whose configuration is taken from the + /// command line. This searcher supports a dizzying array of features: + /// inverted matching, line counting, context control and more. + pub fn searcher<'a, R: io::Read, W: io::Write>( + &self, + inp: &'a mut InputBuffer, + printer: &'a mut Printer, + grep: &'a Grep, + path: &'a Path, + rdr: R, + ) -> Searcher<'a, R, W> { + Searcher::new(inp, printer, grep, path, rdr) + .after_context(self.after_context) + .before_context(self.before_context) + .count(self.count) + .eol(self.eol) + .line_number(self.line_number) + .invert_match(self.invert_match) + .text(self.text) + } + + /// Returns the number of worker search threads that should be used. + pub fn threads(&self) -> usize { + self.threads + } + + /// Returns a list of type definitions currently loaded. + pub fn type_defs(&self) -> &[FileTypeDef] { + &self.type_defs + } + + /// Returns true if xrep should print the type definitions currently loaded + /// and then exit. + pub fn type_list(&self) -> bool { + self.type_list + } + + /// Create a new recursive directory iterator at the path given. + pub fn walker(&self, path: &Path) -> walk::Iter { + let wd = WalkDir::new(path).follow_links(self.follow); + let mut ig = Ignore::new(); + ig.ignore_hidden(!self.hidden); + ig.no_ignore(self.no_ignore); + ig.add_types(self.types.clone()); + if let Some(ref overrides) = self.glob_overrides { + ig.add_override(overrides.clone()); + } + walk::Iter::new(ig, wd) + } +} + +fn version() -> String { + let (maj, min, pat) = ( + option_env!("CARGO_PKG_VERSION_MAJOR"), + option_env!("CARGO_PKG_VERSION_MINOR"), + option_env!("CARGO_PKG_VERSION_PATCH"), + ); + match (maj, min, pat) { + (Some(maj), Some(min), Some(pat)) => + format!("{}.{}.{}", maj, min, pat), + _ => "".to_owned(), + } +} + +/// A single state in the state machine used by `unescape`. +#[derive(Clone, Copy, Eq, PartialEq)] +enum State { + Escape, + HexFirst, + HexSecond(char), + Literal, +} + +/// Unescapes a string given on the command line. It supports a limit set of +/// escape sequences: +/// +/// * \t, \r and \n are mapped to their corresponding ASCII bytes. +/// * \xZZ hexadecimal escapes are mapped to their byte. +fn unescape(s: &str) -> Vec { + use self::State::*; + + let mut bytes = vec![]; + let mut state = Literal; + for c in s.chars() { + match state { + Escape => { + match c { + 'n' => { bytes.push(b'\n'); state = Literal; } + 'r' => { bytes.push(b'\r'); state = Literal; } + 't' => { bytes.push(b'\t'); state = Literal; } + 'x' => { state = HexFirst; } + c => { + bytes.extend(&format!(r"\{}", c).into_bytes()); + state = Literal; + } + } + } + HexFirst => { + match c { + '0'...'9' | 'A'...'F' | 'a'...'f' => { + state = HexSecond(c); + } + c => { + bytes.extend(&format!(r"\x{}", c).into_bytes()); + state = Literal; + } + } + } + HexSecond(first) => { + match c { + '0'...'9' | 'A'...'F' | 'a'...'f' => { + let ordinal = format!("{}{}", first, c); + let byte = u8::from_str_radix(&ordinal, 16).unwrap(); + bytes.push(byte); + state = Literal; + } + c => { + let original = format!(r"\x{}{}", first, c); + bytes.extend(&original.into_bytes()); + state = Literal; + } + } + } + Literal => { + match c { + '\\' => { state = Escape; } + c => { bytes.extend(c.to_string().as_bytes()); } + } + } + } + } + match state { + Escape => bytes.push(b'\\'), + HexFirst => bytes.extend(b"\\x"), + HexSecond(c) => bytes.extend(&format!("\\x{}", c).into_bytes()), + Literal => {} + } + bytes +} + +#[cfg(test)] +mod tests { + use super::unescape; + + fn b(bytes: &'static [u8]) -> Vec { + bytes.to_vec() + } + + #[test] + fn unescape_nul() { + assert_eq!(b(b"\x00"), unescape(r"\x00")); + } + + #[test] + fn unescape_nl() { + assert_eq!(b(b"\n"), unescape(r"\n")); + } + + #[test] + fn unescape_tab() { + assert_eq!(b(b"\t"), unescape(r"\t")); + } + + #[test] + fn unescape_carriage() { + assert_eq!(b(b"\r"), unescape(r"\r")); + } + + #[test] + fn unescape_nothing_simple() { + assert_eq!(b(b"\\a"), unescape(r"\a")); + } + + #[test] + fn unescape_nothing_hex0() { + assert_eq!(b(b"\\x"), unescape(r"\x")); + } + + #[test] + fn unescape_nothing_hex1() { + assert_eq!(b(b"\\xz"), unescape(r"\xz")); + } + + #[test] + fn unescape_nothing_hex2() { + assert_eq!(b(b"\\xzz"), unescape(r"\xzz")); + } +} diff --git a/src/gitignore.rs b/src/gitignore.rs index ffaf5bb3..14229ca1 100644 --- a/src/gitignore.rs +++ b/src/gitignore.rs @@ -79,6 +79,7 @@ impl From for Error { } /// Gitignore is a matcher for the glob patterns in a single gitignore file. +#[derive(Clone, Debug)] pub struct Gitignore { set: glob::Set, root: PathBuf, @@ -136,22 +137,26 @@ impl Gitignore { pub fn matched_utf8(&self, path: &str, is_dir: bool) -> Match { // A single regex with a bunch of alternations of glob patterns is // unfortunately typically faster than a regex, so we use it as a - // first pass filter. We still need to run the RegexSet to most + // first pass filter. We still need to run the RegexSet to get the most // recently defined glob that matched. if !self.set.is_match(path) { return Match::None; } - let pat = match self.set.matches(path).iter().last() { - None => return Match::None, - Some(i) => &self.patterns[i], - }; - if pat.whitelist { - Match::Whitelist(&pat) - } else if !pat.only_dir || is_dir { - Match::Ignored(&pat) - } else { - Match::None + // The regex set can't actually pick the right glob that matched all + // on its own. In particular, some globs require that only directories + // can match. Thus, only accept a match from the regex set if the given + // path satisfies the corresponding glob's directory criteria. + for i in self.set.matches(path).iter().rev() { + let pat = &self.patterns[i]; + if !pat.only_dir || is_dir { + return if pat.whitelist { + Match::Whitelist(pat) + } else { + Match::Ignored(pat) + }; + } } + Match::None } } @@ -177,6 +182,24 @@ impl<'a> Match<'a> { Match::None | Match::Whitelist(_) => false, } } + + /// Returns true if the match result didn't match any globs. + pub fn is_none(&self) -> bool { + match *self { + Match::None => true, + Match::Ignored(_) | Match::Whitelist(_) => false, + } + } + + /// Inverts the match so that Ignored becomes Whitelisted and Whitelisted + /// becomes Ignored. A non-match remains the same. + pub fn invert(self) -> Match<'a> { + match self { + Match::None => Match::None, + Match::Ignored(pat) => Match::Whitelist(pat), + Match::Whitelist(pat) => Match::Ignored(pat), + } + } } /// GitignoreBuilder constructs a matcher for a single set of globs from a @@ -231,7 +254,6 @@ impl GitignoreBuilder { /// Add each pattern line from the file path given. pub fn add_path>(&mut self, path: P) -> Result<(), Error> { let rdr = io::BufReader::new(try!(File::open(&path))); - // println!("adding ignores from: {}", path.as_ref().display()); for line in rdr.lines() { try!(self.add(&path, &try!(line))); } diff --git a/src/glob.rs b/src/glob.rs index 8fc25f76..c560facb 100644 --- a/src/glob.rs +++ b/src/glob.rs @@ -77,6 +77,8 @@ impl Set { /// Returns every glob pattern (by sequence number) that matches the given /// path. pub fn matches>(&self, path: T) -> SetMatches { + // TODO(burntsushi): If we split this out into a separate crate, don't + // expose the regex::SetMatches type in the public API. self.set.matches(path.as_ref()) } diff --git a/src/ignore.rs b/src/ignore.rs index a91b75a2..f752fde2 100644 --- a/src/ignore.rs +++ b/src/ignore.rs @@ -18,6 +18,7 @@ use std::fmt; use std::path::{Path, PathBuf}; use gitignore::{self, Gitignore, GitignoreBuilder, Match}; +use types::Types; /// Represents an error that can occur when parsing a gitignore file. #[derive(Debug)] @@ -56,7 +57,13 @@ pub struct Ignore { /// A stack of ignore patterns at each directory level of traversal. /// A directory that contributes no ignore patterns is `None`. stack: Vec>, + /// A set of override globs that are always checked first. A match (whether + /// it's whitelist or blacklist) trumps anything in stack. + overrides: Option, + /// A file type matcher. + types: Option, ignore_hidden: bool, + no_ignore: bool, } impl Ignore { @@ -64,7 +71,10 @@ impl Ignore { pub fn new() -> Ignore { Ignore { stack: vec![], + overrides: None, + types: None, ignore_hidden: true, + no_ignore: false, } } @@ -74,11 +84,34 @@ impl Ignore { self } + /// When set, ignore files are ignored. + pub fn no_ignore(&mut self, yes: bool) -> &mut Ignore { + self.no_ignore = yes; + self + } + + /// Add a set of globs that overrides all other match logic. + pub fn add_override(&mut self, gi: Gitignore) -> &mut Ignore { + self.overrides = Some(gi); + self + } + + /// Add a file type matcher. The file type matcher has the lowest + /// precedence. + pub fn add_types(&mut self, types: Types) -> &mut Ignore { + self.types = Some(types); + self + } + /// Add a directory to the stack. /// /// Note that even if this returns an error, the directory is added to the /// stack (and therefore should be popped). pub fn push>(&mut self, path: P) -> Result<(), Error> { + if self.no_ignore { + self.stack.push(None); + return Ok(()); + } match IgnoreDir::new(path) { Ok(id) => { self.stack.push(id); @@ -102,24 +135,57 @@ impl Ignore { /// Returns true if and only if the given file path should be ignored. pub fn ignored>(&self, path: P, is_dir: bool) -> bool { let path = path.as_ref(); + if let Some(ref overrides) = self.overrides { + let mat = overrides.matched(path, is_dir).invert(); + if let Some(is_ignored) = self.ignore_match(path, mat) { + return is_ignored; + } + } if self.ignore_hidden && is_hidden(&path) { + debug!("{} ignored because it is hidden", path.display()); return true; } for id in self.stack.iter().rev().filter_map(|id| id.as_ref()) { - match id.matched(path, is_dir) { - Match::Whitelist(ref pat) => { - debug!("{} whitelisted by {:?}", path.display(), pat); - return false; - } - Match::Ignored(ref pat) => { - debug!("{} ignored by {:?}", path.display(), pat); + let mat = id.matched(path, is_dir); + if let Some(is_ignored) = self.ignore_match(path, mat) { + if is_ignored { return true; } - Match::None => {} + // If this path is whitelisted by an ignore, then fallthrough + // and let the file type matcher have a say. + break; + } + } + if let Some(ref types) = self.types { + let mat = types.matched(path, is_dir); + if let Some(is_ignored) = self.ignore_match(path, mat) { + return is_ignored; } } false } + + /// Returns true if the given match says the given pattern should be + /// ignored or false if the given pattern should be explicitly whitelisted. + /// Returns None otherwise. + pub fn ignore_match>( + &self, + path: P, + mat: Match, + ) -> Option { + let path = path.as_ref(); + match mat { + Match::Whitelist(ref pat) => { + debug!("{} whitelisted by {:?}", path.display(), pat); + Some(false) + } + Match::Ignored(ref pat) => { + debug!("{} ignored by {:?}", path.display(), pat); + Some(true) + } + Match::None => None, + } + } } /// IgnoreDir represents a set of ignore patterns retrieved from a single diff --git a/src/main.rs b/src/main.rs index 80bb2d5d..89b003db 100644 --- a/src/main.rs +++ b/src/main.rs @@ -19,7 +19,6 @@ extern crate rustc_serialize; extern crate thread_local; extern crate walkdir; -use std::cmp; use std::error::Error; use std::fs::File; use std::io::{self, Write}; @@ -30,14 +29,13 @@ use std::sync::Arc; use std::thread; use crossbeam::sync::chase_lev::{self, Steal, Stealer}; -use docopt::Docopt; -use grep::{Grep, GrepBuilder}; +use grep::Grep; use parking_lot::Mutex; -use walkdir::WalkDir; -use ignore::Ignore; +use args::Args; +use out::Out; use printer::Printer; -use search::{InputBuffer, Searcher}; +use search::InputBuffer; macro_rules! errored { ($($tt:tt)*) => { @@ -52,64 +50,22 @@ macro_rules! eprintln { }} } +mod args; mod gitignore; mod glob; mod ignore; +mod out; mod printer; mod search; +mod types; mod walk; -const USAGE: &'static str = " -Usage: xrep [options] [ ...] - xrep --files [ ...] - -xrep is like the silver searcher and grep, but faster than both. - -WARNING: Searching stdin isn't yet supported. - -Options: - -c, --count Suppress normal output and show count of line - matches. - -A, --after-context NUM Show NUM lines after each match. - -B, --before-context NUM Show NUM lines before each match. - -C, --context NUM Show NUM lines before and after each match. - --debug Show debug messages. - --files Print each file that would be searched - (but don't search). - --hidden Search hidden directories and files. - -i, --ignore-case Case insensitive search. - -L, --follow Follow symlinks. - -n, --line-number Show line numbers (1-based). - -t, --threads ARG The number of threads to use. Defaults to the - number of logical CPUs. [default: 0] - -v, --invert-match Invert matching. -"; - -#[derive(RustcDecodable)] -struct Args { - arg_pattern: String, - arg_path: Vec, - flag_after_context: usize, - flag_before_context: usize, - flag_context: usize, - flag_count: bool, - flag_debug: bool, - flag_files: bool, - flag_follow: bool, - flag_hidden: bool, - flag_ignore_case: bool, - flag_invert_match: bool, - flag_line_number: bool, - flag_threads: usize, -} - pub type Result = result::Result>; fn main() { - let args: Args = Docopt::new(USAGE).and_then(|d| d.decode()) - .unwrap_or_else(|e| e.exit()); - match run(args) { - Ok(_) => process::exit(0), + match Args::parse().and_then(run) { + Ok(count) if count == 0 => process::exit(1), + Ok(count) => process::exit(0), Err(err) => { let _ = writeln!(&mut io::stderr(), "{}", err); process::exit(1); @@ -117,194 +73,158 @@ fn main() { } } -fn run(mut args: Args) -> Result<()> { - let mut logb = env_logger::LogBuilder::new(); - if args.flag_debug { - logb.filter(None, log::LogLevelFilter::Debug); - } else { - logb.filter(None, log::LogLevelFilter::Warn); - } - if let Err(err) = logb.init() { - errored!("failed to initialize logger: {}", err); - } - - if args.arg_path.is_empty() { - args.arg_path.push("./".to_string()); - } - if args.arg_path.iter().any(|p| p == "-") { - errored!("searching isn't yet supported"); - } - if args.flag_files { +fn run(args: Args) -> Result { + if args.files() { return run_files(args); } + if args.type_list() { + return run_types(args); + } let args = Arc::new(args); + let out = Arc::new(Mutex::new(args.out(io::stdout()))); let mut workers = vec![]; - let out = Arc::new(Mutex::new(Out::new(args.clone(), io::stdout()))); - let mut chan_work_send = { - let (worker, stealer) = chase_lev::deque(); - for _ in 0..args.num_workers() { - let grepb = - GrepBuilder::new(&args.arg_pattern) - .case_insensitive(args.flag_ignore_case); + let mut workq = { + let (workq, stealer) = chase_lev::deque(); + for _ in 0..args.threads() { let worker = Worker { args: args.clone(), out: out.clone(), chan_work: stealer.clone(), - inpbuf: InputBuffer::new(), + inpbuf: args.input_buffer(), outbuf: Some(vec![]), - grep: try!(grepb.build()), + grep: try!(args.grep()), }; workers.push(thread::spawn(move || worker.run())); } - worker + workq }; - - for p in &args.arg_path { - for path in args.walker(p) { - chan_work_send.push(Message::Some(path)); + for p in args.paths() { + if p == Path::new("-") { + workq.push(Work::Stdin) + } else { + for path in args.walker(p) { + workq.push(Work::File(path)); + } } } for _ in 0..workers.len() { - chan_work_send.push(Message::Quit); + workq.push(Work::Quit); } + let mut match_count = 0; for worker in workers { - worker.join().unwrap(); + match_count += worker.join().unwrap(); } - Ok(()) + Ok(match_count) } -fn run_files(args: Args) -> Result<()> { +fn run_files(args: Args) -> Result { let mut printer = Printer::new(io::BufWriter::new(io::stdout())); - for p in &args.arg_path { - for path in args.walker(p) { - printer.path(path); + let mut file_count = 0; + for p in args.paths() { + if p == Path::new("-") { + printer.path(&Path::new("")); + file_count += 1; + } else { + for path in args.walker(p) { + printer.path(path); + file_count += 1; + } } } - Ok(()) + Ok(file_count) } -impl Args { - fn printer(&self, wtr: W) -> Printer { - Printer::new(wtr) - } - - fn num_workers(&self) -> usize { - let mut num = self.flag_threads; - if num == 0 { - num = cmp::min(8, num_cpus::get()); - } - num - } - - fn walker>(&self, path: P) -> walk::Iter { - let wd = WalkDir::new(path).follow_links(self.flag_follow); - let mut ig = Ignore::new(); - ig.ignore_hidden(!self.flag_hidden); - walk::Iter::new(ig, wd) - } - - fn before_context(&self) -> usize { - if self.flag_context > 0 { - self.flag_context - } else { - self.flag_before_context - } - } - - fn after_context(&self) -> usize { - if self.flag_context > 0 { - self.flag_context - } else { - self.flag_after_context - } - } - - fn has_context(&self) -> bool { - self.before_context() > 0 || self.after_context() > 0 +fn run_types(args: Args) -> Result { + let mut printer = Printer::new(io::BufWriter::new(io::stdout())); + let mut ty_count = 0; + for def in args.type_defs() { + printer.type_def(def); + ty_count += 1; } + Ok(ty_count) } -enum Message { - Some(T), +enum Work { + File(PathBuf), + Stdin, Quit, } struct Worker { args: Arc, out: Arc>>, - chan_work: Stealer>, + chan_work: Stealer, inpbuf: InputBuffer, outbuf: Option>, grep: Grep, } impl Worker { - fn run(mut self) { + fn run(mut self) -> u64 { + let mut match_count = 0; loop { - let path = match self.chan_work.steal() { + let (path, file) = match self.chan_work.steal() { Steal::Empty | Steal::Abort => continue, - Steal::Data(Message::Quit) => break, - Steal::Data(Message::Some(path)) => path, - }; - let file = match File::open(&path) { - Ok(file) => file, - Err(err) => { - eprintln!("{}: {}", path.display(), err); - continue; + Steal::Data(Work::Quit) => break, + Steal::Data(Work::File(path)) => { + match File::open(&path) { + Ok(file) => (path, Some(file)), + Err(err) => { + eprintln!("{}: {}", path.display(), err); + continue; + } + } + } + Steal::Data(Work::Stdin) => { + (Path::new("").to_path_buf(), None) } }; let mut outbuf = self.outbuf.take().unwrap(); outbuf.clear(); let mut printer = self.args.printer(outbuf); { - let mut searcher = Searcher::new( - &mut self.inpbuf, - &mut printer, - &self.grep, - &path, - file, - ); - searcher = searcher.count(self.args.flag_count); - searcher = searcher.line_number(self.args.flag_line_number); - searcher = searcher.invert_match(self.args.flag_invert_match); - searcher = searcher.after_context(self.args.after_context()); - searcher = searcher.before_context(self.args.before_context()); - if let Err(err) = searcher.run() { - eprintln!("{}", err); + let result = match file { + None => { + let stdin = io::stdin(); + let stdin = stdin.lock(); + self.search(&mut printer, &path, stdin) + } + Some(file) => { + self.search(&mut printer, &path, file) + } + }; + match result { + Ok(count) => { + match_count += count; + } + Err(err) => { + eprintln!("{}", err); + } } } let outbuf = printer.into_inner(); if !outbuf.is_empty() { let mut out = self.out.lock(); - out.write_file_matches(&outbuf); + out.write(&outbuf); } self.outbuf = Some(outbuf); } - } -} - -struct Out { - args: Arc, - wtr: io::BufWriter, - printed: bool, -} - -impl Out { - fn new(args: Arc, wtr: W) -> Out { - Out { - args: args, - wtr: io::BufWriter::new(wtr), - printed: false, - } + match_count } - fn write_file_matches(&mut self, buf: &[u8]) { - if self.printed && self.args.has_context() { - let _ = self.wtr.write_all(b"--\n"); - } - let _ = self.wtr.write_all(buf); - let _ = self.wtr.flush(); - self.printed = true; + fn search( + &mut self, + printer: &mut Printer, + path: &Path, + rdr: R, + ) -> Result { + self.args.searcher( + &mut self.inpbuf, + printer, + &self.grep, + path, + rdr, + ).run().map_err(From::from) } } diff --git a/src/out.rs b/src/out.rs new file mode 100644 index 00000000..cb9d1181 --- /dev/null +++ b/src/out.rs @@ -0,0 +1,45 @@ +use std::io::{self, Write}; + +/// Out controls the actual output of all search results for a particular file +/// to the end user. +/// +/// (The difference between Out and Printer is that a Printer works with +/// individual search results where as Out works with search results for each +/// file as a whole. For example, it knows when to print a file separator.) +pub struct Out { + wtr: io::BufWriter, + printed: bool, + file_separator: Vec, +} + +impl Out { + /// Create a new Out that writes to the wtr given. + pub fn new(wtr: W) -> Out { + Out { + wtr: io::BufWriter::new(wtr), + printed: false, + file_separator: vec![], + } + } + + /// If set, the separator is printed between matches from different files. + /// By default, no separator is printed. + /// + /// If sep is empty, then no file separator is printed. + pub fn file_separator(mut self, sep: Vec) -> Out { + self.file_separator = sep; + self + } + + /// Write the search results of a single file to the underlying wtr and + /// flush wtr. + pub fn write(&mut self, buf: &[u8]) { + if self.printed && !self.file_separator.is_empty() { + let _ = self.wtr.write_all(&self.file_separator); + let _ = self.wtr.write_all(b"\n"); + } + let _ = self.wtr.write_all(buf); + let _ = self.wtr.flush(); + self.printed = true; + } +} diff --git a/src/printer.rs b/src/printer.rs index 57867f88..8c53c051 100644 --- a/src/printer.rs +++ b/src/printer.rs @@ -1,53 +1,121 @@ use std::io; use std::path::Path; -macro_rules! wln { - ($($tt:tt)*) => { - let _ = writeln!($($tt)*); - } -} - -macro_rules! w { - ($($tt:tt)*) => { - let _ = write!($($tt)*); - } -} +use types::FileTypeDef; +/// Printer encapsulates all output logic for searching. +/// +/// Note that we currently ignore all write errors. It's probably worthwhile +/// to fix this, but printers are only ever used for writes to stdout or +/// writes to memory, neither of which commonly fail. pub struct Printer { + /// The underlying writer. wtr: W, + /// Whether anything has been printed to wtr yet. has_printed: bool, + /// The string to use to separate non-contiguous runs of context lines. + context_separator: Vec, + /// The end-of-line terminator used by the printer. In general, eols are + /// printed via the match directly, but occasionally we need to insert them + /// ourselves (for example, to print a context separator). + eol: u8, + /// Whether to suppress all output. + quiet: bool, + /// Whether to prefix each match with the corresponding file name. + with_filename: bool, } impl Printer { + /// Create a new printer that writes to wtr. pub fn new(wtr: W) -> Printer { Printer { wtr: wtr, has_printed: false, + context_separator: "--".to_string().into_bytes(), + eol: b'\n', + quiet: false, + with_filename: false, } } + /// Set the context separator. The default is `--`. + pub fn context_separator(mut self, sep: Vec) -> Printer { + self.context_separator = sep; + self + } + + /// Set the end-of-line terminator. The default is `\n`. + pub fn eol(mut self, eol: u8) -> Printer { + self.eol = eol; + self + } + + /// When set, all output is suppressed. + pub fn quiet(mut self, yes: bool) -> Printer { + self.quiet = yes; + self + } + + /// When set, each match is prefixed with the file name that it came from. + pub fn with_filename(mut self, yes: bool) -> Printer { + self.with_filename = yes; + self + } + + /// Returns true if and only if something has been printed. pub fn has_printed(&self) -> bool { self.has_printed } - pub fn into_inner(self) -> W { + /// Flushes the underlying writer and returns it. + pub fn into_inner(mut self) -> W { + let _ = self.wtr.flush(); self.wtr } + /// Prints a type definition. + pub fn type_def(&mut self, def: &FileTypeDef) { + self.write(def.name().as_bytes()); + self.write(b": "); + let mut first = true; + for pat in def.patterns() { + if !first { + self.write(b", "); + } + self.write(pat.as_bytes()); + first = false; + } + self.write_eol(); + } + + /// Prints the given path. pub fn path>(&mut self, path: P) { - wln!(&mut self.wtr, "{}", path.as_ref().display()); + self.write(path.as_ref().to_string_lossy().as_bytes()); + self.write_eol(); } + /// Prints the given path and a count of the number of matches found. pub fn path_count>(&mut self, path: P, count: u64) { - wln!(&mut self.wtr, "{}:{}", path.as_ref().display(), count); + if self.with_filename { + self.write(path.as_ref().to_string_lossy().as_bytes()); + self.write(b":"); + } + self.write(count.to_string().as_bytes()); + self.write_eol(); } - pub fn count(&mut self, count: u64) { - wln!(&mut self.wtr, "{}", count); - } - - pub fn context_separator(&mut self) { - wln!(&mut self.wtr, "--"); + /// Prints the context separator. + pub fn context_separate(&mut self) { + // N.B. We can't use `write` here because of borrowing restrictions. + if self.quiet { + return; + } + if self.context_separator.is_empty() { + return; + } + self.has_printed = true; + let _ = self.wtr.write_all(&self.context_separator); + let _ = self.wtr.write_all(&[self.eol]); } pub fn matched>( @@ -58,15 +126,17 @@ impl Printer { end: usize, line_number: Option, ) { - self.write(path.as_ref().to_string_lossy().as_bytes()); - self.write(b":"); + if self.with_filename { + self.write(path.as_ref().to_string_lossy().as_bytes()); + self.write(b":"); + } if let Some(line_number) = line_number { self.write(line_number.to_string().as_bytes()); self.write(b":"); } self.write(&buf[start..end]); - if buf[start..end].last() != Some(&b'\n') { - self.write(b"\n"); + if buf[start..end].last() != Some(&self.eol) { + self.write_eol(); } } @@ -78,24 +148,30 @@ impl Printer { end: usize, line_number: Option, ) { - self.write(path.as_ref().to_string_lossy().as_bytes()); - self.write(b"-"); + if self.with_filename { + self.write(path.as_ref().to_string_lossy().as_bytes()); + self.write(b"-"); + } if let Some(line_number) = line_number { self.write(line_number.to_string().as_bytes()); self.write(b"-"); } self.write(&buf[start..end]); - if buf[start..end].last() != Some(&b'\n') { - self.write(b"\n"); + if buf[start..end].last() != Some(&self.eol) { + self.write_eol(); } } - pub fn binary_matched>(&mut self, path: P) { - wln!(&mut self.wtr, "Binary file {} matches", path.as_ref().display()); - } - fn write(&mut self, buf: &[u8]) { + if self.quiet { + return; + } self.has_printed = true; let _ = self.wtr.write_all(buf); } + + fn write_eol(&mut self) { + let eol = self.eol; + self.write(&[eol]); + } } diff --git a/src/search.rs b/src/search.rs index 7e178ff5..ea5602d1 100644 --- a/src/search.rs +++ b/src/search.rs @@ -20,6 +20,7 @@ const READ_SIZE: usize = 8 * (1<<10); /// Error describes errors that can occur while searching. #[derive(Debug)] pub enum Error { + /// A standard I/O error attached to a particular file path. Io { err: io::Error, path: PathBuf, @@ -57,6 +58,7 @@ impl fmt::Display for Error { } pub struct Searcher<'a, R, W: 'a> { + opts: Options, inp: &'a mut InputBuffer, printer: &'a mut Printer, grep: &'a Grep, @@ -68,11 +70,32 @@ pub struct Searcher<'a, R, W: 'a> { last_printed: usize, last_line: usize, after_context_remaining: usize, +} + +/// Options for configuring search. +#[derive(Clone)] +struct Options { + after_context: usize, + before_context: usize, count: bool, + eol: u8, invert_match: bool, line_number: bool, - before_context: usize, - after_context: usize, + text: bool, +} + +impl Default for Options { + fn default() -> Options { + Options { + after_context: 0, + before_context: 0, + count: false, + eol: b'\n', + invert_match: false, + line_number: false, + text: false, + } + } } impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { @@ -96,6 +119,7 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { haystack: R, ) -> Searcher<'a, R, W> { Searcher { + opts: Options::default(), inp: inp, printer: printer, grep: grep, @@ -107,47 +131,54 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { last_printed: 0, last_line: 0, after_context_remaining: 0, - count: false, - invert_match: false, - line_number: false, - before_context: 0, - after_context: 0, } } - /// If enabled, searching will print a count instead of each match. - /// - /// Disabled by default. - pub fn count(mut self, yes: bool) -> Self { - self.count = yes; - self - } - - /// If enabled, matching is inverted so that lines that *don't* match the - /// given pattern are treated as matches. - pub fn invert_match(mut self, yes: bool) -> Self { - self.invert_match = yes; - self - } - - /// If enabled, compute line numbers and prefix each line of output with - /// them. - pub fn line_number(mut self, yes: bool) -> Self { - self.line_number = yes; + /// The number of contextual lines to show after each match. The default + /// is zero. + pub fn after_context(mut self, count: usize) -> Self { + self.opts.after_context = count; self } /// The number of contextual lines to show before each match. The default /// is zero. pub fn before_context(mut self, count: usize) -> Self { - self.before_context = count; + self.opts.before_context = count; self } - /// The number of contextual lines to show after each match. The default - /// is zero. - pub fn after_context(mut self, count: usize) -> Self { - self.after_context = count; + /// If enabled, searching will print a count instead of each match. + /// + /// Disabled by default. + pub fn count(mut self, yes: bool) -> Self { + self.opts.count = yes; + self + } + + /// Set the end-of-line byte used by this searcher. + pub fn eol(mut self, eol: u8) -> Self { + self.opts.eol = eol; + self + } + + /// If enabled, matching is inverted so that lines that *don't* match the + /// given pattern are treated as matches. + pub fn invert_match(mut self, yes: bool) -> Self { + self.opts.invert_match = yes; + self + } + + /// If enabled, compute line numbers and prefix each line of output with + /// them. + pub fn line_number(mut self, yes: bool) -> Self { + self.opts.line_number = yes; + self + } + + /// If enabled, search binary files as if they were text. + pub fn text(mut self, yes: bool) -> Self { + self.opts.text = yes; self } @@ -157,16 +188,16 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { pub fn run(mut self) -> Result { self.inp.reset(); self.match_count = 0; - self.line_count = if self.line_number { Some(0) } else { None }; + self.line_count = if self.opts.line_number { Some(0) } else { None }; self.last_match = Match::default(); self.after_context_remaining = 0; loop { let upto = self.inp.lastnl; self.print_after_context(upto); if !try!(self.fill()) { - if self.inp.is_binary { - self.printer.binary_matched(self.path); - } + break; + } + if !self.opts.text && self.inp.is_binary { break; } while self.inp.pos < self.inp.lastnl { @@ -174,7 +205,7 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { &mut self.last_match, &mut self.inp.buf[..self.inp.lastnl], self.inp.pos); - if self.invert_match { + if self.opts.invert_match { let upto = if matched { self.last_match.start() @@ -189,7 +220,7 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { } } else if matched { self.match_count += 1; - if !self.count { + if !self.opts.count { let start = self.last_match.start(); let end = self.last_match.end(); self.print_after_context(start); @@ -204,32 +235,36 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { } } } - if self.count && self.match_count > 0 { + if self.opts.count && self.match_count > 0 { self.printer.path_count(self.path, self.match_count); } Ok(self.match_count) } + #[inline(always)] fn fill(&mut self) -> Result { - let mut keep_from = self.inp.lastnl; - if self.before_context > 0 || self.after_context > 0 { - keep_from = start_of_previous_lines( + let mut keep = self.inp.lastnl; + if self.opts.before_context > 0 || self.opts.after_context > 0 { + let lines = 1 + cmp::max( + self.opts.before_context, self.opts.after_context); + keep = start_of_previous_lines( + self.opts.eol, &self.inp.buf, self.inp.lastnl.saturating_sub(1), - cmp::max(self.before_context, self.after_context) + 1); + lines); } - if keep_from < self.last_printed { - self.last_printed = self.last_printed - keep_from; + if keep < self.last_printed { + self.last_printed = self.last_printed - keep; } else { self.last_printed = 0; } - if keep_from <= self.last_line { - self.last_line = self.last_line - keep_from; + if keep <= self.last_line { + self.last_line = self.last_line - keep; } else { - self.count_lines(keep_from); + self.count_lines(keep); self.last_line = 0; } - let ok = try!(self.inp.fill(&mut self.haystack, keep_from).map_err(|err| { + let ok = try!(self.inp.fill(&mut self.haystack, keep).map_err(|err| { Error::from_io(err, &self.path) })); Ok(ok) @@ -237,10 +272,10 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { #[inline(always)] fn print_inverted_matches(&mut self, upto: usize) { - debug_assert!(self.invert_match); - let mut it = IterLines::new(self.inp.pos); + debug_assert!(self.opts.invert_match); + let mut it = IterLines::new(self.opts.eol, self.inp.pos); while let Some((start, end)) = it.next(&self.inp.buf[..upto]) { - if !self.count { + if !self.opts.count { self.print_match(start, end); } self.inp.pos = end; @@ -250,7 +285,7 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { #[inline(always)] fn print_before_context(&mut self, upto: usize) { - if self.count || self.before_context == 0 { + if self.opts.count || self.opts.before_context == 0 { return; } let start = self.last_printed; @@ -260,10 +295,11 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { } let before_context_start = start + start_of_previous_lines( + self.opts.eol, &self.inp.buf[start..], end - start - 1, - self.before_context); - let mut it = IterLines::new(before_context_start); + self.opts.before_context); + let mut it = IterLines::new(self.opts.eol, before_context_start); while let Some((s, e)) = it.next(&self.inp.buf[..end]) { self.print_separator(s); self.print_context(s, e); @@ -272,12 +308,12 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { #[inline(always)] fn print_after_context(&mut self, upto: usize) { - if self.count || self.after_context_remaining == 0 { + if self.opts.count || self.after_context_remaining == 0 { return; } let start = self.last_printed; let end = upto; - let mut it = IterLines::new(start); + let mut it = IterLines::new(self.opts.eol, start); while let Some((s, e)) = it.next(&self.inp.buf[..end]) { self.print_context(s, e); self.after_context_remaining -= 1; @@ -295,7 +331,7 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { self.printer.matched( &self.path, &self.inp.buf, start, end, self.line_count); self.last_printed = end; - self.after_context_remaining = self.after_context; + self.after_context_remaining = self.opts.after_context; } #[inline(always)] @@ -309,21 +345,23 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { #[inline(always)] fn print_separator(&mut self, before: usize) { - if self.before_context == 0 && self.after_context == 0 { + if self.opts.before_context == 0 && self.opts.after_context == 0 { return; } if !self.printer.has_printed() { return; } - if (self.last_printed == 0 && before > 0) || self.last_printed < before { - self.printer.context_separator(); + if (self.last_printed == 0 && before > 0) + || self.last_printed < before { + self.printer.context_separate(); } } #[inline(always)] fn count_lines(&mut self, upto: usize) { if let Some(ref mut line_count) = self.line_count { - *line_count += count_lines(&self.inp.buf[self.last_line..upto]); + *line_count += count_lines( + &self.inp.buf[self.last_line..upto], self.opts.eol); self.last_line = upto; } } @@ -337,15 +375,53 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { } } +/// InputBuffer encapsulates the logic of maintaining a ~fixed sized buffer +/// on which to search. There are three key pieces of complexity: +/// +/// 1. We must be able to handle lines that are longer than the size of the +/// buffer. For this reason, the buffer is allowed to expand (and is +/// therefore not technically fixed). Note that once a buffer expands, it +/// will never contract. +/// 2. The contents of the buffer may end with a partial line, so we must keep +/// track of where the last complete line ends. Namely, the partial line +/// is only completed on subsequent reads *after* searching up through +/// the last complete line is done. +/// 3. When printing the context of a match, the last N lines of the buffer +/// may need to be rolled over into the next buffer. For example, a match +/// may occur at the beginning of a buffer, in which case, lines at the end +/// of the previous contents of the buffer need to be printed. +/// +/// An InputBuffer is designed to be reused and isn't tied to any particular +/// reader. pub struct InputBuffer { + /// The number of bytes to attempt to read at a time. Once set, this is + /// never changed. read_size: usize, + /// The end-of-line terminator used in this buffer. + eol: u8, + /// A scratch buffer. + tmp: Vec, + /// A buffer to read bytes into. All searches are executed directly against + /// this buffer and pos/lastnl/end point into it. buf: Vec, - tmp1: Vec, - tmp2: Vec, + /// The current position in buf. The current position represents where the + /// next search should start. pos: usize, + /// The position immediately following the last line terminator in buf. + /// This may be equal to end. + /// + /// Searching should never cross this boundary. In particular, the contents + /// of the buffer following this position may correspond to *partial* line. + /// All contents before this position are complete lines. lastnl: usize, + /// The end position of the buffer. Data after this position is not + /// specified. end: usize, + /// Set to true if and only if no reads have occurred yet. first: bool, + /// Set to true if and only if the contents of buf are determined to be + /// "binary" (i.e., not searchable text). Note that its value may be + /// falsely negative *or* falsely positive. It is only a heuristic. is_binary: bool, } @@ -367,9 +443,9 @@ impl InputBuffer { } InputBuffer { read_size: cap, + eol: b'\n', buf: vec![0; cap], - tmp1: vec![], - tmp2: vec![], + tmp: vec![], pos: 0, lastnl: 0, end: 0, @@ -378,6 +454,12 @@ impl InputBuffer { } } + /// Set the end-of-line terminator used by this input buffer. + pub fn eol(&mut self, eol: u8) { + self.eol = eol; + } + + /// Resets this buffer so that it may be reused with a new reader. fn reset(&mut self) { self.pos = 0; self.lastnl = 0; @@ -386,36 +468,30 @@ impl InputBuffer { self.is_binary = false; } + /// Fill the contents of this buffer with the reader given. The reader + /// given should be the same in every call to fill unless reset has been + /// called. + /// + /// The bytes in buf[keep_from..end] are rolled over into the beginning + /// of the buffer. fn fill( &mut self, rdr: &mut R, keep_from: usize, ) -> Result { - self.pos = 0; - self.tmp1.clear(); - self.tmp2.clear(); - - // Save the leftovers from the previous fill before anything else. - if self.lastnl < self.end { - self.tmp1.extend_from_slice(&self.buf[self.lastnl..self.end]); - } - // If we need to save lines to account for context, do that here. - // These context lines have already been searched, but make up the - // first bytes of this buffer. - if keep_from < self.lastnl { - self.tmp2.extend_from_slice(&self.buf[keep_from..self.lastnl]); - self.buf[0..self.tmp2.len()].copy_from_slice(&self.tmp2); - self.pos = self.tmp2.len(); - } - if !self.tmp1.is_empty() { - let (start, end) = (self.pos, self.pos + self.tmp1.len()); - self.buf[start..end].copy_from_slice(&self.tmp1); - self.end = end; - } else { - self.end = self.pos; - } + // Rollover bytes from buf[keep_from..end] and update our various + // pointers. N.B. This could be done with the unsafe ptr::copy, but + // I haven't been able to produce a benchmark that notices a difference + // in performance. (Invariably, ptr::copy is also clearer IMO.) + self.tmp.clear(); + self.tmp.extend_from_slice(&self.buf[keep_from..self.end]); + self.buf[0..self.tmp.len()].copy_from_slice(&self.tmp); + self.pos = self.lastnl - keep_from; self.lastnl = 0; + self.end = self.tmp.len(); while self.lastnl == 0 { + // If our buffer isn't big enough to hold the contents of a full + // read, expand it. if self.buf.len() - self.end < self.read_size { let min_len = self.read_size + self.buf.len() - self.end; let new_len = cmp::max(min_len, self.buf.len() * 2); @@ -423,22 +499,28 @@ impl InputBuffer { } let n = try!(rdr.read( &mut self.buf[self.end..self.end + self.read_size])); - if self.first { - if is_binary(&self.buf[self.end..self.end + n]) { - self.is_binary = true; - return Ok(false); - } + if self.first && is_binary(&self.buf[self.end..self.end + n]) { + self.is_binary = true; + } + if self.is_binary { + replace_buf( + &mut self.buf[self.end..self.end + n], b'\x00', self.eol); } self.first = false; + // We assume that reading 0 bytes means we've hit EOF. if n == 0 { + // If we've searched everything up to the end of the buffer, + // then there's nothing left to do. if self.end - self.pos == 0 { return Ok(false); } + // Even if we hit EOF, we might still have to search the + // last line if it didn't contain a trailing terminator. self.lastnl = self.end; break; } self.lastnl = - memrchr(b'\n', &self.buf[self.end..self.end + n]) + memrchr(self.eol, &self.buf[self.end..self.end + n]) .map(|i| self.end + i + 1) .unwrap_or(0); self.end += n; @@ -450,7 +532,7 @@ impl InputBuffer { /// Returns true if and only if the given buffer is determined to be "binary" /// or otherwise not contain text data that is usefully searchable. /// -/// Note that this may return both false positives and false negatives! +/// Note that this may return both false positives and false negatives. #[inline(always)] fn is_binary(buf: &[u8]) -> bool { if buf.len() >= 4 && &buf[0..4] == b"%PDF" { @@ -461,15 +543,31 @@ fn is_binary(buf: &[u8]) -> bool { /// Count the number of lines in the given buffer. #[inline(always)] -fn count_lines(mut buf: &[u8]) -> u64 { +fn count_lines(mut buf: &[u8], eol: u8) -> u64 { let mut count = 0; - while let Some(pos) = memchr(b'\n', buf) { + while let Some(pos) = memchr(eol, buf) { count += 1; buf = &buf[pos + 1..]; } count } +/// Replaces a with b in buf. +fn replace_buf(buf: &mut [u8], a: u8, b: u8) { + if a == b { + return; + } + let mut pos = 0; + while let Some(i) = memchr(a, &buf[pos..]).map(|i| pos + i) { + buf[i] = b; + pos = i + 1; + while buf.get(pos) == Some(&a) { + buf[pos] = b; + pos += 1; + } + } +} + /// An "iterator" over lines in a particular buffer. /// /// Idiomatic Rust would borrow the buffer and use it as internal state to @@ -477,6 +575,7 @@ fn count_lines(mut buf: &[u8]) -> u64 { /// the borrow in the search code. (Because the borrow prevents composition /// through other mutable methods.) struct IterLines { + eol: u8, pos: usize, } @@ -485,8 +584,9 @@ impl IterLines { /// /// The buffer is passed to the `next` method. #[inline(always)] - fn new(start: usize) -> IterLines { + fn new(eol: u8, start: usize) -> IterLines { IterLines { + eol: eol, pos: start, } } @@ -497,7 +597,7 @@ impl IterLines { /// The range returned includes the new line. #[inline(always)] fn next(&mut self, buf: &[u8]) -> Option<(usize, usize)> { - match memchr(b'\n', &buf[self.pos..]) { + match memchr(self.eol, &buf[self.pos..]) { None => { if self.pos < buf.len() { let start = self.pos; @@ -528,10 +628,13 @@ impl IterLines { /// The position returned corresponds to the first byte in the given line. #[inline(always)] fn start_of_previous_lines( + eol: u8, buf: &[u8], mut end: usize, mut count: usize, ) -> usize { + // TODO(burntsushi): This function needs to be badly simplified. The case + // analysis is impossible to follow. if buf[..end].is_empty() { return 0; } @@ -541,14 +644,14 @@ fn start_of_previous_lines( if end == buf.len() { end -= 1; } - if buf[end] == b'\n' { + if buf[end] == eol { if end == 0 { return end + 1; } end -= 1; } while count > 0 { - if buf[end] == b'\n' { + if buf[end] == eol { count -= 1; if count == 0 { return end + 1; @@ -559,7 +662,7 @@ fn start_of_previous_lines( end -= 1; continue; } - match memrchr(b'\n', &buf[..end]) { + match memrchr(eol, &buf[..end]) { None => { return 0; } @@ -567,7 +670,7 @@ fn start_of_previous_lines( count -= 1; end = i; if end == 0 { - if buf[end] == b'\n' && count == 0 { + if buf[end] == eol && count == 0 { end += 1; } return end; @@ -579,10 +682,6 @@ fn start_of_previous_lines( end + 2 } -fn show(bytes: &[u8]) -> &str { - ::std::str::from_utf8(bytes).unwrap() -} - #[cfg(test)] mod tests { use std::io; @@ -668,102 +767,105 @@ fn main() { #[test] fn previous_lines() { + let eol = b'\n'; let text = SHERLOCK.as_bytes(); assert_eq!(366, text.len()); - assert_eq!(0, start_of_previous_lines(text, 366, 100)); - assert_eq!(366, start_of_previous_lines(text, 366, 0)); + assert_eq!(0, start_of_previous_lines(eol, text, 366, 100)); + assert_eq!(366, start_of_previous_lines(eol, text, 366, 0)); - assert_eq!(321, start_of_previous_lines(text, 366, 1)); - assert_eq!(321, start_of_previous_lines(text, 365, 1)); - assert_eq!(321, start_of_previous_lines(text, 364, 1)); - assert_eq!(321, start_of_previous_lines(text, 322, 1)); - assert_eq!(321, start_of_previous_lines(text, 321, 1)); - assert_eq!(258, start_of_previous_lines(text, 320, 1)); + assert_eq!(321, start_of_previous_lines(eol, text, 366, 1)); + assert_eq!(321, start_of_previous_lines(eol, text, 365, 1)); + assert_eq!(321, start_of_previous_lines(eol, text, 364, 1)); + assert_eq!(321, start_of_previous_lines(eol, text, 322, 1)); + assert_eq!(321, start_of_previous_lines(eol, text, 321, 1)); + assert_eq!(258, start_of_previous_lines(eol, text, 320, 1)); - assert_eq!(258, start_of_previous_lines(text, 366, 2)); - assert_eq!(258, start_of_previous_lines(text, 365, 2)); - assert_eq!(258, start_of_previous_lines(text, 364, 2)); - assert_eq!(258, start_of_previous_lines(text, 322, 2)); - assert_eq!(258, start_of_previous_lines(text, 321, 2)); - assert_eq!(193, start_of_previous_lines(text, 320, 2)); + assert_eq!(258, start_of_previous_lines(eol, text, 366, 2)); + assert_eq!(258, start_of_previous_lines(eol, text, 365, 2)); + assert_eq!(258, start_of_previous_lines(eol, text, 364, 2)); + assert_eq!(258, start_of_previous_lines(eol, text, 322, 2)); + assert_eq!(258, start_of_previous_lines(eol, text, 321, 2)); + assert_eq!(193, start_of_previous_lines(eol, text, 320, 2)); - assert_eq!(65, start_of_previous_lines(text, 66, 1)); - assert_eq!(0, start_of_previous_lines(text, 66, 2)); - assert_eq!(64, start_of_previous_lines(text, 64, 0)); - assert_eq!(0, start_of_previous_lines(text, 64, 1)); - assert_eq!(0, start_of_previous_lines(text, 64, 2)); + assert_eq!(65, start_of_previous_lines(eol, text, 66, 1)); + assert_eq!(0, start_of_previous_lines(eol, text, 66, 2)); + assert_eq!(64, start_of_previous_lines(eol, text, 64, 0)); + assert_eq!(0, start_of_previous_lines(eol, text, 64, 1)); + assert_eq!(0, start_of_previous_lines(eol, text, 64, 2)); - assert_eq!(0, start_of_previous_lines(text, 0, 2)); - assert_eq!(0, start_of_previous_lines(text, 0, 1)); + assert_eq!(0, start_of_previous_lines(eol, text, 0, 2)); + assert_eq!(0, start_of_previous_lines(eol, text, 0, 1)); } #[test] fn previous_lines_short() { + let eol = b'\n'; let text = &b"a\nb\nc\nd\ne\nf\n"[..]; assert_eq!(12, text.len()); - assert_eq!(10, start_of_previous_lines(text, 12, 1)); - assert_eq!(8, start_of_previous_lines(text, 12, 2)); - assert_eq!(6, start_of_previous_lines(text, 12, 3)); - assert_eq!(4, start_of_previous_lines(text, 12, 4)); - assert_eq!(2, start_of_previous_lines(text, 12, 5)); - assert_eq!(0, start_of_previous_lines(text, 12, 6)); - assert_eq!(0, start_of_previous_lines(text, 12, 7)); - assert_eq!(10, start_of_previous_lines(text, 11, 1)); - assert_eq!(8, start_of_previous_lines(text, 11, 2)); - assert_eq!(6, start_of_previous_lines(text, 11, 3)); - assert_eq!(4, start_of_previous_lines(text, 11, 4)); - assert_eq!(2, start_of_previous_lines(text, 11, 5)); - assert_eq!(0, start_of_previous_lines(text, 11, 6)); - assert_eq!(0, start_of_previous_lines(text, 11, 7)); - assert_eq!(10, start_of_previous_lines(text, 10, 1)); - assert_eq!(8, start_of_previous_lines(text, 10, 2)); - assert_eq!(6, start_of_previous_lines(text, 10, 3)); - assert_eq!(4, start_of_previous_lines(text, 10, 4)); - assert_eq!(2, start_of_previous_lines(text, 10, 5)); - assert_eq!(0, start_of_previous_lines(text, 10, 6)); - assert_eq!(0, start_of_previous_lines(text, 10, 7)); + assert_eq!(10, start_of_previous_lines(eol, text, 12, 1)); + assert_eq!(8, start_of_previous_lines(eol, text, 12, 2)); + assert_eq!(6, start_of_previous_lines(eol, text, 12, 3)); + assert_eq!(4, start_of_previous_lines(eol, text, 12, 4)); + assert_eq!(2, start_of_previous_lines(eol, text, 12, 5)); + assert_eq!(0, start_of_previous_lines(eol, text, 12, 6)); + assert_eq!(0, start_of_previous_lines(eol, text, 12, 7)); + assert_eq!(10, start_of_previous_lines(eol, text, 11, 1)); + assert_eq!(8, start_of_previous_lines(eol, text, 11, 2)); + assert_eq!(6, start_of_previous_lines(eol, text, 11, 3)); + assert_eq!(4, start_of_previous_lines(eol, text, 11, 4)); + assert_eq!(2, start_of_previous_lines(eol, text, 11, 5)); + assert_eq!(0, start_of_previous_lines(eol, text, 11, 6)); + assert_eq!(0, start_of_previous_lines(eol, text, 11, 7)); + assert_eq!(10, start_of_previous_lines(eol, text, 10, 1)); + assert_eq!(8, start_of_previous_lines(eol, text, 10, 2)); + assert_eq!(6, start_of_previous_lines(eol, text, 10, 3)); + assert_eq!(4, start_of_previous_lines(eol, text, 10, 4)); + assert_eq!(2, start_of_previous_lines(eol, text, 10, 5)); + assert_eq!(0, start_of_previous_lines(eol, text, 10, 6)); + assert_eq!(0, start_of_previous_lines(eol, text, 10, 7)); - assert_eq!(8, start_of_previous_lines(text, 9, 1)); - assert_eq!(8, start_of_previous_lines(text, 8, 1)); + assert_eq!(8, start_of_previous_lines(eol, text, 9, 1)); + assert_eq!(8, start_of_previous_lines(eol, text, 8, 1)); - assert_eq!(6, start_of_previous_lines(text, 7, 1)); - assert_eq!(6, start_of_previous_lines(text, 6, 1)); + assert_eq!(6, start_of_previous_lines(eol, text, 7, 1)); + assert_eq!(6, start_of_previous_lines(eol, text, 6, 1)); - assert_eq!(4, start_of_previous_lines(text, 5, 1)); - assert_eq!(4, start_of_previous_lines(text, 4, 1)); + assert_eq!(4, start_of_previous_lines(eol, text, 5, 1)); + assert_eq!(4, start_of_previous_lines(eol, text, 4, 1)); - assert_eq!(2, start_of_previous_lines(text, 3, 1)); - assert_eq!(2, start_of_previous_lines(text, 2, 1)); + assert_eq!(2, start_of_previous_lines(eol, text, 3, 1)); + assert_eq!(2, start_of_previous_lines(eol, text, 2, 1)); - assert_eq!(0, start_of_previous_lines(text, 1, 1)); - assert_eq!(0, start_of_previous_lines(text, 0, 1)); + assert_eq!(0, start_of_previous_lines(eol, text, 1, 1)); + assert_eq!(0, start_of_previous_lines(eol, text, 0, 1)); } #[test] fn previous_lines_empty() { + let eol = b'\n'; let text = &b"\n\n\nd\ne\nf\n"[..]; assert_eq!(9, text.len()); - assert_eq!(7, start_of_previous_lines(text, 9, 1)); - assert_eq!(5, start_of_previous_lines(text, 9, 2)); - assert_eq!(3, start_of_previous_lines(text, 9, 3)); - assert_eq!(2, start_of_previous_lines(text, 9, 4)); - assert_eq!(1, start_of_previous_lines(text, 9, 5)); - assert_eq!(0, start_of_previous_lines(text, 9, 6)); - assert_eq!(0, start_of_previous_lines(text, 9, 7)); + assert_eq!(7, start_of_previous_lines(eol, text, 9, 1)); + assert_eq!(5, start_of_previous_lines(eol, text, 9, 2)); + assert_eq!(3, start_of_previous_lines(eol, text, 9, 3)); + assert_eq!(2, start_of_previous_lines(eol, text, 9, 4)); + assert_eq!(1, start_of_previous_lines(eol, text, 9, 5)); + assert_eq!(0, start_of_previous_lines(eol, text, 9, 6)); + assert_eq!(0, start_of_previous_lines(eol, text, 9, 7)); let text = &b"a\n\n\nd\ne\nf\n"[..]; assert_eq!(10, text.len()); - assert_eq!(8, start_of_previous_lines(text, 10, 1)); - assert_eq!(6, start_of_previous_lines(text, 10, 2)); - assert_eq!(4, start_of_previous_lines(text, 10, 3)); - assert_eq!(3, start_of_previous_lines(text, 10, 4)); - assert_eq!(2, start_of_previous_lines(text, 10, 5)); - assert_eq!(0, start_of_previous_lines(text, 10, 6)); - assert_eq!(0, start_of_previous_lines(text, 10, 7)); + assert_eq!(8, start_of_previous_lines(eol, text, 10, 1)); + assert_eq!(6, start_of_previous_lines(eol, text, 10, 2)); + assert_eq!(4, start_of_previous_lines(eol, text, 10, 3)); + assert_eq!(3, start_of_previous_lines(eol, text, 10, 4)); + assert_eq!(2, start_of_previous_lines(eol, text, 10, 5)); + assert_eq!(0, start_of_previous_lines(eol, text, 10, 6)); + assert_eq!(0, start_of_previous_lines(eol, text, 10, 7)); } #[test] @@ -776,6 +878,23 @@ fn main() { "); } + #[test] + fn binary() { + let text = "Sherlock\n\x00Holmes\n"; + let (count, out) = search("Sherlock|Holmes", text, |s|s); + assert_eq!(0, count); + assert_eq!(out, ""); + } + + + #[test] + fn binary_text() { + let text = "Sherlock\n\x00Holmes\n"; + let (count, out) = search("Sherlock|Holmes", text, |s| s.text(true)); + assert_eq!(2, count); + assert_eq!(out, "/baz.rs:Sherlock\n/baz.rs:Holmes\n"); + } + #[test] fn line_numbers() { let (count, out) = search_smallcap( diff --git a/src/types.rs b/src/types.rs new file mode 100644 index 00000000..027697d9 --- /dev/null +++ b/src/types.rs @@ -0,0 +1,358 @@ +/*! +The types module provides a way of associating glob patterns on file names to +file types. +*/ + +use std::collections::HashMap; +use std::error::Error as StdError; +use std::fmt; +use std::path::Path; + +use gitignore::{self, Gitignore, GitignoreBuilder, Match, Pattern}; + +const TYPE_EXTENSIONS: &'static [(&'static str, &'static [&'static str])] = &[ + ("asm", &["*.asm", "*.s", "*.S"]), + ("awk", &["*.awk"]), + ("c", &["*.c", "*.h", "*.H"]), + ("cbor", &["*.cbor"]), + ("clojure", &["*.clj", "*.cljs"]), + ("cmake", &["CMakeLists.txt"]), + ("coffeescript", &["*.coffee"]), + ("cpp", &[ + "*.C", "*.cc", "*.cpp", "*.cxx", + "*.h", "*.H", "*.hh", "*.hpp", + ]), + ("csharp", &["*.cs"]), + ("css", &["*.css"]), + ("cython", &["*.pyx"]), + ("dart", &["*.dart"]), + ("d", &["*.d"]), + ("elisp", &["*.el"]), + ("erlang", &["*.erl", "*.hrl"]), + ("fortran", &[ + "*.f", "*.F", "*.f77", "*.F77", "*.pfo", + "*.f90", "*.F90", "*.f95", "*.F95", + ]), + ("go", &["*.go"]), + ("groovy", &["*.groovy"]), + ("haskell", &["*.hs", "*.lhs"]), + ("html", &["*.htm", "*.html"]), + ("java", &["*.java"]), + ("js", &["*.js"]), + ("json", &["*.json"]), + ("jsonl", &["*.jsonl"]), + ("lisp", &["*.el", "*.jl", "*.lisp", "*.lsp", "*.sc", "*.scm"]), + ("lua", &["*.lua"]), + ("m4", &["*.ac", "*.m4"]), + ("make", &["gnumakefile", "Gnumakefile", "makefile", "Makefile", "*.mk"]), + ("markdown", &["*.md"]), + ("matlab", &["*.m"]), + ("mk", &["mkfile"]), + ("ml", &["*.ml"]), + ("objc", &["*.h", "*.m"]), + ("objcpp", &["*.h", "*.mm"]), + ("ocaml", &["*.ml", "*.mli", "*.mll", "*.mly"]), + ("perl", &["*.perl", "*.pl", "*.PL", "*.plh", "*.plx", "*.pm"]), + ("php", &["*.php", "*.php3", "*.php4", "*.php5", "*.phtml"]), + ("py", &["*.py"]), + ("rr", &["*.R"]), + ("rst", &["*.rst"]), + ("ruby", &["*.rb"]), + ("rust", &["*.rs"]), + ("scala", &["*.scala"]), + ("sh", &["*.bash", "*.csh", "*.ksh", "*.sh", "*.tcsh"]), + ("sql", &["*.sql"]), + ("tex", &["*.tex", "*.cls", "*.sty"]), + ("txt", &["*.txt"]), + ("toml", &["*.toml", "Cargo.lock"]), + ("vala", &["*.vala"]), + ("vimscript", &["*.vim"]), + ("xml", &["*.xml"]), + ("yacc", &["*.y"]), + ("yaml", &["*.yaml", "*.yml"]), +]; + +/// Describes all the possible failure conditions for building a file type +/// matcher. +#[derive(Debug)] +pub enum Error { + /// We tried to select (or negate) a file type that is not defined. + UnrecognizedFileType(String), + /// A user specified file type definition could not be parsed. + InvalidDefinition, + /// There was an error building the matcher (probably a bad glob). + Gitignore(gitignore::Error), +} + +impl StdError for Error { + fn description(&self) -> &str { + match *self { + Error::UnrecognizedFileType(_) => "unrecognized file type", + Error::InvalidDefinition => "invalid definition", + Error::Gitignore(ref err) => err.description(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Error::UnrecognizedFileType(ref ty) => { + write!(f, "unrecognized file type: {}", ty) + } + Error::InvalidDefinition => { + write!(f, "invalid definition (format is type:glob, e.g., \ + html:*.html)") + } + Error::Gitignore(ref err) => err.fmt(f), + } + } +} + +impl From for Error { + fn from(err: gitignore::Error) -> Error { + Error::Gitignore(err) + } +} + +/// A single file type definition. +#[derive(Clone, Debug)] +pub struct FileTypeDef { + name: String, + pats: Vec, +} + +impl FileTypeDef { + /// Return the name of this file type. + pub fn name(&self) -> &str { + &self.name + } + + /// Return the glob patterns used to recognize this file type. + pub fn patterns(&self) -> &[String] { + &self.pats + } +} + +/// Types is a file type matcher. +#[derive(Clone, Debug)] +pub struct Types { + gi: Option, + has_selected: bool, + unmatched_pat: Pattern, +} + +impl Types { + /// Creates a new file type matcher from the given Gitignore matcher. If + /// not Gitignore matcher is provided, then the file type matcher has no + /// effect. + /// + /// If has_selected is true, then at least one file type was selected. + /// Therefore, any non-matches should be ignored. + fn new(gi: Option, has_selected: bool) -> Types { + Types { + gi: gi, + has_selected: has_selected, + unmatched_pat: Pattern { + from: Path::new("").to_path_buf(), + original: "".to_string(), + pat: "".to_string(), + whitelist: false, + only_dir: false, + }, + } + } + + /// Returns a match for the given path against this file type matcher. + /// + /// The path is considered whitelisted if it matches a selected file type. + /// The path is considered ignored if it matched a negated file type. + /// If at least one file type is selected and path doesn't match, then + /// the path is also considered ignored. + pub fn matched>(&self, path: P, is_dir: bool) -> Match { + // File types don't apply to directories. + if is_dir { + return Match::None; + } + let path = path.as_ref(); + self.gi.as_ref() + .map(|gi| { + let path = &*path.to_string_lossy(); + let mat = gi.matched_utf8(path, is_dir).invert(); + if self.has_selected && mat.is_none() { + Match::Ignored(&self.unmatched_pat) + } else { + mat + } + }) + .unwrap_or(Match::None) + } +} + +/// TypesBuilder builds a type matcher from a set of file type definitions and +/// a set of file type selections. +pub struct TypesBuilder { + types: HashMap>, + select: Vec, + select_not: Vec, +} + +impl TypesBuilder { + /// Create a new builder for a file type matcher. + pub fn new() -> TypesBuilder { + TypesBuilder { + types: HashMap::new(), + select: vec![], + select_not: vec![], + } + } + + /// Build the current set of file type definitions *and* selections into + /// a file type matcher. + pub fn build(&self) -> Result { + if self.select.is_empty() && self.select_not.is_empty() { + return Ok(Types::new(None, false)); + } + let mut bgi = GitignoreBuilder::new("/"); + for name in &self.select { + let globs = match self.types.get(name) { + Some(globs) => globs, + None => { + return Err(Error::UnrecognizedFileType(name.to_string())); + } + }; + for glob in globs { + try!(bgi.add("", glob)); + } + } + for name in &self.select_not { + let globs = match self.types.get(name) { + Some(globs) => globs, + None => { + return Err(Error::UnrecognizedFileType(name.to_string())); + } + }; + for glob in globs { + try!(bgi.add("", &format!("!{}", glob))); + } + } + Ok(Types::new(Some(try!(bgi.build())), !self.select.is_empty())) + } + + /// Return the set of current file type definitions. + pub fn definitions(&self) -> Vec { + let mut defs = vec![]; + for (ref name, ref pats) in &self.types { + let mut pats = pats.to_vec(); + pats.sort(); + defs.push(FileTypeDef { + name: name.to_string(), + pats: pats, + }); + } + defs.sort_by(|def1, def2| def1.name().cmp(def2.name())); + defs + } + + /// Select the file type given by `name`. + pub fn select(&mut self, name: &str) -> &mut TypesBuilder { + self.select.push(name.to_string()); + self + } + + /// Ignore the file type given by `name`. + pub fn select_not(&mut self, name: &str) -> &mut TypesBuilder { + self.select_not.push(name.to_string()); + self + } + + /// Clear any file type definitions for the type given. + pub fn clear(&mut self, name: &str) -> &mut TypesBuilder { + self.types.remove(name); + self + } + + /// Add a new file type definition. `name` can be arbitrary and `pat` + /// should be a glob recognizing file paths belonging to the `name` type. + pub fn add(&mut self, name: &str, pat: &str) -> &mut TypesBuilder { + self.types.entry(name.to_string()) + .or_insert(vec![]).push(pat.to_string()); + self + } + + /// Add a new file type definition specified in string form. The format + /// is `name:glob`. Names may not include a colon. + pub fn add_def(&mut self, def: &str) -> Result<(), Error> { + let name: String = def.chars().take_while(|&c| c != ':').collect(); + let pat: String = def.chars().skip(name.chars().count() + 1).collect(); + if name.is_empty() || pat.is_empty() { + return Err(Error::InvalidDefinition); + } + self.add(&name, &pat); + Ok(()) + } + + /// Add a set of default file type definitions. + pub fn add_defaults(&mut self) -> &mut TypesBuilder { + for &(name, exts) in TYPE_EXTENSIONS { + for ext in exts { + self.add(name, ext); + } + } + self + } +} + +#[cfg(test)] +mod tests { + use super::TypesBuilder; + + macro_rules! matched { + ($name:ident, $types:expr, $sel:expr, $selnot:expr, + $path:expr) => { + matched!($name, $types, $sel, $selnot, $path, true); + }; + (not, $name:ident, $types:expr, $sel:expr, $selnot:expr, + $path:expr) => { + matched!($name, $types, $sel, $selnot, $path, false); + }; + ($name:ident, $types:expr, $sel:expr, $selnot:expr, + $path:expr, $matched:expr) => { + #[test] + fn $name() { + let mut btypes = TypesBuilder::new(); + for tydef in $types { + btypes.add_def(tydef).unwrap(); + } + for sel in $sel { + btypes.select(sel); + } + for selnot in $selnot { + btypes.select_not(selnot); + } + let types = btypes.build().unwrap(); + let mat = types.matched($path, false); + assert_eq!($matched, !mat.is_ignored()); + } + }; + } + + fn types() -> Vec<&'static str> { + vec![ + "html:*.html", + "html:*.htm", + "rust:*.rs", + "js:*.js", + ] + } + + matched!(match1, types(), vec!["rust"], vec![], "lib.rs"); + matched!(match2, types(), vec!["html"], vec![], "index.html"); + matched!(match3, types(), vec!["html"], vec![], "index.htm"); + matched!(match4, types(), vec!["html", "rust"], vec![], "main.rs"); + matched!(match5, types(), vec![], vec![], "index.html"); + matched!(match6, types(), vec![], vec!["rust"], "index.html"); + + matched!(not, matchnot1, types(), vec!["rust"], vec![], "index.html"); + matched!(not, matchnot2, types(), vec![], vec!["rust"], "main.rs"); +}