From 66efbad871620fe2dbe675438fdc8d9922e72826 Mon Sep 17 00:00:00 2001 From: Marc Tiehuis Date: Sun, 2 Apr 2017 09:55:58 +1200 Subject: [PATCH] Add dfa-size-limit and regex-size-limit arguments Fixes #362. --- src/app.rs | 22 ++++++++++-- src/args.rs | 96 ++++++++++++++++++++++++++++++++++++++++---------- tests/tests.rs | 44 +++++++++++++++++++++-- 3 files changed, 139 insertions(+), 23 deletions(-) diff --git a/src/app.rs b/src/app.rs index b8f25b87..44e49517 100644 --- a/src/app.rs +++ b/src/app.rs @@ -41,7 +41,9 @@ OPTIONS: /// in a `build.rs` script to build shell completion files. pub fn app() -> App<'static, 'static> { let arg = |name| { - Arg::with_name(name).help(USAGES[name].short).long_help(USAGES[name].long) + Arg::with_name(name) + .help(USAGES[name].short) + .long_help(USAGES[name].long) }; let flag = |name| arg(name).long(name); @@ -53,7 +55,7 @@ pub fn app() -> App<'static, 'static> { .setting(AppSettings::UnifiedHelpMessage) .usage(USAGE) .template(TEMPLATE) - .help_message("Prints help information. Use --help for more details.") + .help_message("Prints help information. Use --help for more details.") // First, set up primary positional/flag arguments. .arg(arg("pattern") .required_unless_one(&[ @@ -114,6 +116,8 @@ pub fn app() -> App<'static, 'static> { .arg(flag("column")) .arg(flag("context-separator") .value_name("SEPARATOR").takes_value(true)) + .arg(flag("dfa-size-limit") + .value_name("NUM+SUFFIX?").takes_value(true)) .arg(flag("debug")) .arg(flag("file").short("f") .value_name("FILE").takes_value(true) @@ -148,6 +152,8 @@ pub fn app() -> App<'static, 'static> { .arg(flag("path-separator").value_name("SEPARATOR").takes_value(true)) .arg(flag("pretty").short("p")) .arg(flag("replace").short("r").value_name("ARG").takes_value(true)) + .arg(flag("regex-size-limit") + .value_name("NUM+SUFFIX?").takes_value(true)) .arg(flag("case-sensitive").short("s")) .arg(flag("smart-case").short("S")) .arg(flag("sort-files")) @@ -326,6 +332,13 @@ lazy_static! { doc!(h, "debug", "Show debug messages.", "Show debug messages. Please use this when filing a bug report."); + doc!(h, "dfa-size-limit", + "The upper size limit of the generated dfa.", + "The upper size limit of the generated dfa. The default limit is \ + 10M. This should only be changed on very large regex inputs \ + where the (slower) fallback regex engine may otherwise be used. \ + \n\nThe argument accepts the same size suffixes as allowed in \ + the 'max-filesize' argument."); doc!(h, "file", "Search for patterns from the given file.", "Search for patterns from the given file, with one pattern per \ @@ -444,6 +457,11 @@ lazy_static! { Note that the replacement by default replaces each match, and \ NOT the entire line. To replace the entire line, you should \ match the entire line."); + doc!(h, "regex-size-limit", + "The upper size limit of the compiled regex.", + "The upper size limit of the compiled regex. The default limit \ + is 10M. \n\nThe argument accepts the same size suffixes as \ + allowed in the 'max-filesize' argument."); doc!(h, "case-sensitive", "Search case sensitively.", "Search case sensitively. This overrides -i/--ignore-case and \ diff --git a/src/args.rs b/src/args.rs index 34f38792..48c4ad56 100644 --- a/src/args.rs +++ b/src/args.rs @@ -771,12 +771,18 @@ impl<'a> ArgMatches<'a> { let casei = self.is_present("ignore-case") && !self.is_present("case-sensitive"); - GrepBuilder::new(&try!(self.pattern())) + let mut gb = GrepBuilder::new(&try!(self.pattern())) .case_smart(smart) .case_insensitive(casei) - .line_terminator(b'\n') - .build() - .map_err(From::from) + .line_terminator(b'\n'); + + if let Some(limit) = try!(self.dfa_size_limit()) { + gb = gb.dfa_size_limit(limit); + } + if let Some(limit) = try!(self.regex_size_limit()) { + gb = gb.size_limit(limit); + } + gb.build().map_err(From::from) } /// Builds the set of glob overrides from the command line flags. @@ -807,31 +813,64 @@ impl<'a> ArgMatches<'a> { btypes.build().map_err(From::from) } - /// Parses the max-filesize argument option into a byte count. - fn max_filesize(&self) -> Result> { - use regex::Regex; - - let max_filesize = match self.value_of_lossy("max-filesize") { + /// Parses an argument of the form `[0-9]+(KMG)?`. + /// + /// This always returns the result as a type `u64`. This must be converted + /// to the appropriate type by the caller. + fn parse_human_readable_size_arg( + &self, + arg_name: &str, + ) -> Result> { + let arg_value = match self.value_of_lossy(arg_name) { Some(x) => x, None => return Ok(None) }; + let re = regex::Regex::new("^([0-9]+)([KMG])?$").unwrap(); + let caps = try!( + re.captures(&arg_value).ok_or_else(|| { + format!("invalid format for {}", arg_name) + })); - let re = Regex::new("^([0-9]+)([KMG])?$").unwrap(); - let caps = try!(re.captures(&max_filesize) - .ok_or("invalid format for max-filesize argument")); - - let value = try!(caps[1].parse::().map_err(|err|err.to_string())); + let value = try!(caps[1].parse::()); let suffix = caps.get(2).map(|x| x.as_str()); + let v_10 = value.checked_mul(1024); + let v_20 = v_10.and_then(|x| x.checked_mul(1024)); + let v_30 = v_20.and_then(|x| x.checked_mul(1024)); + + let try_suffix = |x: Option| { + if x.is_some() { + Ok(x) + } else { + Err(From::from(format!("number too large for {}", arg_name))) + } + }; match suffix { None => Ok(Some(value)), - Some("K") => Ok(Some(value * 1024)), - Some("M") => Ok(Some(value * 1024 * 1024)), - Some("G") => Ok(Some(value * 1024 * 1024 * 1024)), - _ => Err(From::from("invalid suffix for max-filesize argument")) + Some("K") => try_suffix(v_10), + Some("M") => try_suffix(v_20), + Some("G") => try_suffix(v_30), + _ => Err(From::from(format!("invalid suffix for {}", arg_name))) } } + /// Parse the dfa-size-limit argument option into a byte count. + fn dfa_size_limit(&self) -> Result> { + let r = try!(self.parse_human_readable_size_arg("dfa-size-limit")); + human_readable_to_usize("dfa-size-limit", r) + } + + /// Parse the regex-size-limit argument option into a byte count. + fn regex_size_limit(&self) -> Result> { + let r = try!(self.parse_human_readable_size_arg("regex-size-limit")); + human_readable_to_usize("regex-size-limit", r) + } + + /// Parses the max-filesize argument option into a byte count. + fn max_filesize(&self) -> Result> { + self.parse_human_readable_size_arg("max-filesize") + } + /// Returns true if ignore files should be ignored. fn no_ignore(&self) -> bool { self.is_present("no-ignore") @@ -926,6 +965,27 @@ impl QuietMatched { } } +/// Convert the result of a `parse_human_readable_size_arg` call into +/// a `usize`, failing if the type does not fit. +fn human_readable_to_usize( + arg_name: &str, + value: Option, +) -> Result> { + use std::usize; + + match value { + None => Ok(None), + Some(v) => { + if v <= usize::MAX as u64 { + Ok(Some(v as usize)) + } else { + let msg = format!("number too large for {}", arg_name); + Err(From::from(msg)) + } + } + } +} + /// Returns true if and only if stdin is deemed searchable. #[cfg(unix)] fn stdin_is_readable() -> bool { diff --git a/tests/tests.rs b/tests/tests.rs index 578dbcba..ef332ae7 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -455,7 +455,6 @@ sherlock!(max_filesize_parse_no_suffix, "Sherlock", ".", let expected = "\ foo "; - assert_eq!(lines, expected); }); @@ -470,7 +469,6 @@ sherlock!(max_filesize_parse_k_suffix, "Sherlock", ".", let expected = "\ foo "; - assert_eq!(lines, expected); }); @@ -485,10 +483,19 @@ sherlock!(max_filesize_parse_m_suffix, "Sherlock", ".", let expected = "\ foo "; - assert_eq!(lines, expected); }); +sherlock!(max_filesize_suffix_overflow, "Sherlock", ".", +|wd: WorkDir, mut cmd: Command| { + wd.remove("sherlock"); + wd.create_size("foo", 1000000); + + // 2^35 * 2^30 would otherwise overflow + cmd.arg("--max-filesize").arg("34359738368G").arg("--files"); + wd.assert_err(&mut cmd); +}); + sherlock!(ignore_hidden, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| { wd.remove("sherlock"); wd.create(".sherlock", hay::SHERLOCK); @@ -1443,6 +1450,37 @@ clean!(feature_275_pathsep, "test", ".", |wd: WorkDir, mut cmd: Command| { assert_eq!(lines, "fooZbar:test\n"); }); +// See: https://github.com/BurntSushi/ripgrep/issues/362 +sherlock!(feature_362_dfa_size_limit, r"For\s", +|wd: WorkDir, mut cmd: Command| { + // This should fall back to the nfa engine but should still produce the + // expected result. + cmd.arg("--dfa-size-limit").arg("10"); + let lines: String = wd.stdout(&mut cmd); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +"; + assert_eq!(lines, expected); +}); + +sherlock!(feature_362_exceeds_regex_size_limit, r"[0-9]\w+", +|wd: WorkDir, mut cmd: Command| { + cmd.arg("--regex-size-limit").arg("10K"); + wd.assert_err(&mut cmd); +}); + +#[cfg(target_pointer_width = "32")] +sherlock!(feature_362_u64_to_narrow_usize_suffix_overflow, "Sherlock", ".", +|wd: WorkDir, mut cmd: Command| { + wd.remove("sherlock"); + wd.create_size("foo", 1000000); + + // 2^35 * 2^20 is ok for u64, but not for usize + cmd.arg("--dfa-size-limit").arg("34359738368M").arg("--files"); + wd.assert_err(&mut cmd); +}); + + // See: https://github.com/BurntSushi/ripgrep/issues/419 sherlock!(feature_419_zero_as_shortcut_for_null, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {