diff --git a/Cargo.lock b/Cargo.lock index 597aa63e..aa01606f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -174,6 +174,7 @@ name = "grep-pcre2" version = "0.1.6" dependencies = [ "grep-matcher", + "log", "pcre2", ] @@ -412,7 +413,6 @@ dependencies = [ "jemallocator", "lazy_static", "log", - "regex", "serde", "serde_derive", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index bb15e124..5bc66336 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,7 +52,6 @@ grep = { version = "0.2.12", path = "crates/grep" } ignore = { version = "0.4.19", path = "crates/ignore" } lazy_static = "1.1.0" log = "0.4.5" -regex = "1.8.3" serde_json = "1.0.23" termcolor = "1.1.0" diff --git a/crates/core/args.rs b/crates/core/args.rs index 3d5f7742..5ad038ed 100644 --- a/crates/core/args.rs +++ b/crates/core/args.rs @@ -31,7 +31,6 @@ use ignore::overrides::{Override, OverrideBuilder}; use ignore::types::{FileTypeDef, Types, TypesBuilder}; use ignore::{Walk, WalkBuilder, WalkParallel}; use log; -use regex; use termcolor::{BufferWriter, ColorChoice, WriteColor}; use crate::app; @@ -653,6 +652,8 @@ impl ArgMatches { .multi_line(true) .unicode(self.unicode()) .octal(false) + .fixed_strings(self.is_present("fixed-strings")) + .whole_line(self.is_present("line-regexp")) .word(self.is_present("word-regexp")); if self.is_present("multiline") { builder.dot_matches_new_line(self.is_present("multiline-dotall")); @@ -679,12 +680,7 @@ impl ArgMatches { if let Some(limit) = self.dfa_size_limit()? 
{ builder.dfa_size_limit(limit); } - let res = if self.is_present("fixed-strings") { - builder.build_literals(patterns) - } else { - builder.build(&patterns.join("|")) - }; - match res { + match builder.build_many(patterns) { Ok(m) => Ok(m), Err(err) => Err(From::from(suggest_multiline(err.to_string()))), } @@ -701,6 +697,8 @@ impl ArgMatches { .case_smart(self.case_smart()) .caseless(self.case_insensitive()) .multi_line(true) + .fixed_strings(self.is_present("fixed-strings")) + .whole_line(self.is_present("line-regexp")) .word(self.is_present("word-regexp")); // For whatever reason, the JIT craps out during regex compilation with // a "no more memory" error on 32 bit systems. So don't use it there. @@ -721,7 +719,7 @@ impl ArgMatches { if self.is_present("crlf") { builder.crlf(true); } - Ok(builder.build(&patterns.join("|"))?) + Ok(builder.build_many(patterns)?) } /// Build a JSON printer that writes results to the given writer. @@ -1385,11 +1383,6 @@ impl ArgMatches { /// Get a sequence of all available patterns from the command line. /// This includes reading the -e/--regexp and -f/--file flags. /// - /// Note that if -F/--fixed-strings is set, then all patterns will be - /// escaped. If -x/--line-regexp is set, then all patterns are surrounded - /// by `^...$`. Other things, such as --word-regexp, are handled by the - /// regex matcher itself. - /// /// If any pattern is invalid UTF-8, then an error is returned. fn patterns(&self) -> Result> { if self.is_present("files") || self.is_present("type-list") { @@ -1430,16 +1423,6 @@ impl ArgMatches { Ok(pats) } - /// Returns a pattern that is guaranteed to produce an empty regular - /// expression that is valid in any position. - fn pattern_empty(&self) -> String { - // This would normally just be an empty string, which works on its - // own, but if the patterns are joined in a set of alternations, then - // you wind up with `foo|`, which is currently invalid in Rust's regex - // engine. 
- "(?:)".to_string() - } - /// Converts an OsStr pattern to a String pattern. The pattern is escaped /// if -F/--fixed-strings is set. /// @@ -1458,30 +1441,12 @@ impl ArgMatches { /// Applies additional processing on the given pattern if necessary /// (such as escaping meta characters or turning it into a line regex). fn pattern_from_string(&self, pat: String) -> String { - let pat = self.pattern_line(self.pattern_literal(pat)); if pat.is_empty() { - self.pattern_empty() - } else { - pat - } - } - - /// Returns the given pattern as a line pattern if the -x/--line-regexp - /// flag is set. Otherwise, the pattern is returned unchanged. - fn pattern_line(&self, pat: String) -> String { - if self.is_present("line-regexp") { - format!(r"^(?:{})$", pat) - } else { - pat - } - } - - /// Returns the given pattern as a literal pattern if the - /// -F/--fixed-strings flag is set. Otherwise, the pattern is returned - /// unchanged. - fn pattern_literal(&self, pat: String) -> String { - if self.is_present("fixed-strings") { - regex::escape(&pat) + // This would normally just be an empty string, which works on its + // own, but if the patterns are joined in a set of alternations, + // then you wind up with `foo|`, which is currently invalid in + // Rust's regex engine. 
+ "(?:)".to_string() } else { pat } diff --git a/crates/pcre2/Cargo.toml b/crates/pcre2/Cargo.toml index c0c3ede2..2725ba7d 100644 --- a/crates/pcre2/Cargo.toml +++ b/crates/pcre2/Cargo.toml @@ -15,4 +15,5 @@ edition = "2018" [dependencies] grep-matcher = { version = "0.1.6", path = "../matcher" } +log = "0.4.19" pcre2 = "0.2.4" diff --git a/crates/pcre2/src/matcher.rs b/crates/pcre2/src/matcher.rs index a8c47c32..66c2b74a 100644 --- a/crates/pcre2/src/matcher.rs +++ b/crates/pcre2/src/matcher.rs @@ -11,6 +11,8 @@ pub struct RegexMatcherBuilder { builder: RegexBuilder, case_smart: bool, word: bool, + fixed_strings: bool, + whole_line: bool, } impl RegexMatcherBuilder { @@ -20,6 +22,8 @@ impl RegexMatcherBuilder { builder: RegexBuilder::new(), case_smart: false, word: false, + fixed_strings: false, + whole_line: false, } } @@ -29,17 +33,40 @@ impl RegexMatcherBuilder { /// If there was a problem compiling the pattern, then an error is /// returned. pub fn build(&self, pattern: &str) -> Result { + self.build_many(&[pattern]) + } + + /// Compile all of the given patterns into a single regex that matches when + /// at least one of the patterns matches. + /// + /// If there was a problem building the regex, then an error is returned. + pub fn build_many>( + &self, + patterns: &[P], + ) -> Result { let mut builder = self.builder.clone(); - if self.case_smart && !has_uppercase_literal(pattern) { + let mut pats = Vec::with_capacity(patterns.len()); + for p in patterns.iter() { + pats.push(if self.fixed_strings { + format!("(?:{})", pcre2::escape(p.as_ref())) + } else { + format!("(?:{})", p.as_ref()) + }); + } + let mut singlepat = pats.join("|"); + if self.case_smart && !has_uppercase_literal(&singlepat) { builder.caseless(true); } - let res = if self.word { - let pattern = format!(r"(? &mut RegexMatcherBuilder { + self.fixed_strings = yes; + self + } + + /// Whether each pattern should match the entire line or not. 
This is + /// equivalent to surrounding the pattern with `(?m:^)` and `(?m:$)`. + pub fn whole_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.whole_line = yes; + self + } + /// Enable Unicode matching mode. /// /// When enabled, the following patterns become Unicode aware: `\b`, `\B`, diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml index 13d2e5e1..8aece183 100644 --- a/crates/regex/Cargo.toml +++ b/crates/regex/Cargo.toml @@ -17,7 +17,7 @@ edition = "2021" aho-corasick = "1.0.2" bstr = "1.5.0" grep-matcher = { version = "0.1.6", path = "../matcher" } -log = "0.4.5" +log = "0.4.19" regex = "1.8.3" regex-automata = { version = "0.3.0" } regex-syntax = "0.7.2" diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs index a2aa3243..732cda70 100644 --- a/crates/regex/src/config.rs +++ b/crates/regex/src/config.rs @@ -1,14 +1,15 @@ use { grep_matcher::{ByteSet, LineTerminator}, regex_automata::meta::Regex, - regex_syntax::ast::{self, Ast}, - regex_syntax::hir::{self, Hir}, + regex_syntax::{ + ast, + hir::{self, Hir, HirKind}, + }, }; use crate::{ ast::AstAnalysis, error::Error, literal::LiteralSets, - multi::alternation_literals, non_matching::non_matching_bytes, - strip::strip_from_match, + non_matching::non_matching_bytes, strip::strip_from_match, }; /// Config represents the configuration of a regex matcher in this crate. @@ -36,6 +37,8 @@ pub(crate) struct Config { pub(crate) line_terminator: Option, pub(crate) crlf: bool, pub(crate) word: bool, + pub(crate) fixed_strings: bool, + pub(crate) whole_line: bool, } impl Default for Config { @@ -50,47 +53,28 @@ impl Default for Config { unicode: true, octal: false, // These size limits are much bigger than what's in the regex - // crate. + // crate by default. 
size_limit: 100 * (1 << 20), dfa_size_limit: 1000 * (1 << 20), nest_limit: 250, line_terminator: None, crlf: false, word: false, + fixed_strings: false, + whole_line: false, } } } impl Config { - /// Parse the given pattern and returned its HIR expression along with - /// the current configuration. - /// - /// If there was a problem parsing the given expression then an error - /// is returned. - pub(crate) fn hir(&self, pattern: &str) -> Result { - let ast = self.ast(pattern)?; - let analysis = self.analysis(&ast)?; - let expr = hir::translate::TranslatorBuilder::new() - .utf8(false) - .case_insensitive(self.is_case_insensitive(&analysis)) - .multi_line(self.multi_line) - .dot_matches_new_line(self.dot_matches_new_line) - .crlf(self.crlf) - .swap_greed(self.swap_greed) - .unicode(self.unicode) - .build() - .translate(pattern, &ast) - .map_err(Error::generic)?; - let expr = match self.line_terminator { - None => expr, - Some(line_term) => strip_from_match(expr, line_term)?, - }; - Ok(ConfiguredHIR { - original: pattern.to_string(), - config: self.clone(), - analysis, - expr, - }) + /// Use this configuration to build an HIR from the given patterns. The HIR + /// returned corresponds to a single regex that is an alternation of the + /// patterns given. + pub(crate) fn build_many>( + &self, + patterns: &[P], + ) -> Result { + ConfiguredHIR::new(self.clone(), patterns) } /// Accounting for the `smart_case` config knob, return true if and only if @@ -105,35 +89,55 @@ impl Config { analysis.any_literal() && !analysis.any_uppercase() } - /// Returns true if and only if this config is simple enough such that - /// if the pattern is a simple alternation of literals, then it can be - /// constructed via a plain Aho-Corasick automaton. + /// Returns whether the given patterns should be treated as "fixed strings" + /// literals. 
This is different from just querying the `fixed_strings` knob + /// in that if the knob is false, this will still return true in some cases + /// if the patterns are themselves indistinguishable from literals. /// - /// Note that it is OK to return true even when settings like `multi_line` - /// are enabled, since if multi-line can impact the match semantics of a - /// regex, then it is by definition not a simple alternation of literals. - pub(crate) fn can_plain_aho_corasick(&self) -> bool { - !self.word && !self.case_insensitive && !self.case_smart - } - - /// Perform analysis on the AST of this pattern. - /// - /// This returns an error if the given pattern failed to parse. - fn analysis(&self, ast: &Ast) -> Result { - Ok(AstAnalysis::from_ast(ast)) - } - - /// Parse the given pattern into its abstract syntax. - /// - /// This returns an error if the given pattern failed to parse. - fn ast(&self, pattern: &str) -> Result { - ast::parse::ParserBuilder::new() - .nest_limit(self.nest_limit) - .octal(self.octal) - .ignore_whitespace(self.ignore_whitespace) - .build() - .parse(pattern) - .map_err(Error::generic) + /// The main idea here is that if this returns true, then it is safe + /// to build an `regex_syntax::hir::Hir` value directly from the given + /// patterns as an alternation of `hir::Literal` values. + fn is_fixed_strings>(&self, patterns: &[P]) -> bool { + // When these are enabled, we really need to parse the patterns and + // let them go through the standard HIR translation process in order + // for case folding transforms to be applied. + if self.case_insensitive || self.case_smart { + return false; + } + // Even if whole_line or word is enabled, both of those things can + // be implemented by wrapping the Hir generated by an alternation of + // fixed string literals. So for here at least, we don't care about the + // word or whole_line settings. + if self.fixed_strings { + // ... 
but if any literal contains a line terminator, then we've + // got to bail out because this will ultimately result in an error. + if let Some(lineterm) = self.line_terminator { + for p in patterns.iter() { + if has_line_terminator(lineterm, p.as_ref()) { + return false; + } + } + } + return true; + } + // In this case, the only way we can hand construct the Hir is if none + // of the patterns contain meta characters. If they do, then we need to + // send them through the standard parsing/translation process. + for p in patterns.iter() { + let p = p.as_ref(); + if p.chars().any(regex_syntax::is_meta_character) { + return false; + } + // Same deal as when fixed_strings is set above. If the pattern has + // a line terminator anywhere, then we need to bail out and let + // an error occur. + if let Some(lineterm) = self.line_terminator { + if has_line_terminator(lineterm, p) { + return false; + } + } + } + true } } @@ -150,192 +154,278 @@ impl Config { /// subsequently constructed HIR or regular expression. #[derive(Clone, Debug)] pub(crate) struct ConfiguredHIR { - original: String, config: Config, - analysis: AstAnalysis, - expr: Hir, + hir: Hir, } impl ConfiguredHIR { - /// Return the configuration for this HIR expression. + /// Parse the given patterns into a single HIR expression that represents + /// an alternation of the patterns given. 
+ fn new>( + config: Config, + patterns: &[P], + ) -> Result { + let hir = if config.is_fixed_strings(patterns) { + let mut alts = vec![]; + for p in patterns.iter() { + alts.push(Hir::literal(p.as_ref().as_bytes())); + } + log::debug!( + "assembling HIR from {} fixed string literals", + alts.len() + ); + let hir = Hir::alternation(alts); + hir + } else { + let mut alts = vec![]; + for p in patterns.iter() { + alts.push(if config.fixed_strings { + format!("(?:{})", regex_syntax::escape(p.as_ref())) + } else { + format!("(?:{})", p.as_ref()) + }); + } + let pattern = alts.join("|"); + let ast = ast::parse::ParserBuilder::new() + .nest_limit(config.nest_limit) + .octal(config.octal) + .ignore_whitespace(config.ignore_whitespace) + .build() + .parse(&pattern) + .map_err(Error::generic)?; + let analysis = AstAnalysis::from_ast(&ast); + let mut hir = hir::translate::TranslatorBuilder::new() + .utf8(false) + .case_insensitive(config.is_case_insensitive(&analysis)) + .multi_line(config.multi_line) + .dot_matches_new_line(config.dot_matches_new_line) + .crlf(config.crlf) + .swap_greed(config.swap_greed) + .unicode(config.unicode) + .build() + .translate(&pattern, &ast) + .map_err(Error::generic)?; + // We don't need to do this for the fixed-strings case above + // because is_fixed_strings will return false if any pattern + // contains a line terminator. Therefore, we don't need to strip + // it. + // + // We go to some pains to avoid doing this in the fixed-strings + // case because this can result in building a new HIR when ripgrep + // is given a huge set of literals to search for. And this can + // actually take a little time. It's not huge, but it's noticeable. + hir = match config.line_terminator { + None => hir, + Some(line_term) => strip_from_match(hir, line_term)?, + }; + hir + }; + Ok(ConfiguredHIR { config, hir }) + } + + /// Return a reference to the underlying configuration. 
pub(crate) fn config(&self) -> &Config { &self.config } + /// Convert this HIR to a regex that can be used for matching. + pub(crate) fn to_regex(&self) -> Result { + let meta = Regex::config() + .utf8_empty(false) + .nfa_size_limit(Some(self.config.size_limit)) + .hybrid_cache_capacity(self.config.dfa_size_limit); + Regex::builder() + .configure(meta) + .build_from_hir(&self.hir) + .map_err(Error::regex) + } + + /// Convert this HIR to its concrete syntax. + pub(crate) fn to_pattern(&self) -> String { + self.hir.to_string() + } + + /// Attempt to extract a "fast" regex that can be used for quickly finding + /// candidates lines for a match. + /// + /// If no line terminator was configured, then this always returns + /// `Ok(None)`. If a line terminator is configured, then this may return a + /// regex. + pub(crate) fn to_fast_line_regex(&self) -> Result, Error> { + if self.config.line_terminator.is_none() { + return Ok(None); + } + match LiteralSets::new(&self.hir).one_regex(self.config.word) { + None => Ok(None), + Some(pattern) => { + let config = self.config.clone(); + let chir = ConfiguredHIR::new(config, &[pattern])?; + Ok(Some(chir.to_regex()?)) + } + } + } + /// Compute the set of non-matching bytes for this HIR expression. pub(crate) fn non_matching_bytes(&self) -> ByteSet { - non_matching_bytes(&self.expr) + non_matching_bytes(&self.hir) } /// Returns the line terminator configured on this expression. /// /// When we have beginning/end anchors (NOT line anchors), the fast line - /// searching path isn't quite correct. Or at least, doesn't match the - /// slow path. Namely, the slow path strips line terminators while the - /// fast path does not. Since '$' (when multi-line mode is disabled) - /// doesn't match at line boundaries, the existence of a line terminator - /// might cause it to not match when it otherwise would with the line - /// terminator stripped. + /// searching path isn't quite correct. Or at least, doesn't match the slow + /// path. 
Namely, the slow path strips line terminators while the fast path + /// does not. Since '$' (when multi-line mode is disabled) doesn't match at + /// line boundaries, the existence of a line terminator might cause it to + /// not match when it otherwise would with the line terminator stripped. /// - /// Since searching with text anchors is exceptionally rare in the - /// context of line oriented searching (multi-line mode is basically - /// always enabled), we just disable this optimization when there are - /// text anchors. We disable it by not returning a line terminator, since + /// Since searching with text anchors is exceptionally rare in the context + /// of line oriented searching (multi-line mode is basically always + /// enabled), we just disable this optimization when there are text + /// anchors. We disable it by not returning a line terminator, since /// without a line terminator, the fast search path can't be executed. /// + /// Actually, the above is no longer quite correct. Later on, another + /// optimization was added where if the line terminator was in the set of + /// bytes that was guaranteed to never be part of a match, then the higher + /// level search infrastructure assumes that the fast line-by-line search + /// path can still be taken. This optimization applies when multi-line + /// search (not multi-line mode) is enabled. In that case, there is no + /// configured line terminator since the regex is permitted to match a + /// line terminator. But if the regex is guaranteed to never match across + /// multiple lines despite multi-line search being requested, we can still + /// do the faster and more flexible line-by-line search. This is why the + /// non-matching extraction routine removes `\n` when `\A` and `\z` are + /// present even though that's not quite correct... 
+ /// /// See: pub(crate) fn line_terminator(&self) -> Option { - if self.is_any_anchored() { + if self.hir.properties().look_set().contains_anchor_haystack() { None } else { self.config.line_terminator } } - /// Returns true if and only if the underlying HIR has any text anchors. - fn is_any_anchored(&self) -> bool { - self.expr.properties().look_set().contains_anchor_haystack() - } - - /// Builds a regular expression from this HIR expression. - pub(crate) fn regex(&self) -> Result { - self.pattern_to_regex(&self.pattern()) - } - - /// Returns the pattern string by converting this HIR to its concrete - /// syntax. - pub(crate) fn pattern(&self) -> String { - self.expr.to_string() - } - - /// If this HIR corresponds to an alternation of literals with no - /// capturing groups, then this returns those literals. - pub(crate) fn alternation_literals(&self) -> Option>> { - if !self.config.can_plain_aho_corasick() { - return None; - } - alternation_literals(&self.expr) - } - - /// Applies the given function to the concrete syntax of this HIR and then - /// generates a new HIR based on the result of the function in a way that - /// preserves the configuration. + /// Turns this configured HIR into one that only matches when both sides of + /// the match correspond to a word boundary. /// - /// For example, this can be used to wrap a user provided regular - /// expression with additional semantics. e.g., See the `WordMatcher`. - pub(crate) fn with_pattern String>( - &self, - mut f: F, - ) -> Result { - self.pattern_to_hir(&f(&self.pattern())) + /// Note that the HIR returned is like turning `pat` into + /// `(?m:^|\W)(pat)(?m:$|\W)`. That is, the true match is at capture group + /// `1` and not `0`. + pub(crate) fn into_word(self) -> Result { + // In theory building the HIR for \W should never fail, but there are + // likely some pathological cases (particularly with respect to certain + // values of limits) where it could in theory fail. 
+ let non_word = { + let mut config = self.config.clone(); + config.fixed_strings = false; + ConfiguredHIR::new(config, &[r"\W"])? + }; + let line_anchor_start = Hir::look(self.line_anchor_start()); + let line_anchor_end = Hir::look(self.line_anchor_end()); + let hir = Hir::concat(vec![ + Hir::alternation(vec![line_anchor_start, non_word.hir.clone()]), + Hir::capture(hir::Capture { + index: 1, + name: None, + sub: Box::new(renumber_capture_indices(self.hir)?), + }), + Hir::alternation(vec![non_word.hir, line_anchor_end]), + ]); + Ok(ConfiguredHIR { config: self.config, hir }) } - /// If the current configuration has a line terminator set and if useful - /// literals could be extracted, then a regular expression matching those - /// literals is returned. If no line terminator is set, then `None` is - /// returned. - /// - /// If compiling the resulting regular expression failed, then an error - /// is returned. - /// - /// This method only returns something when a line terminator is set - /// because matches from this regex are generally candidates that must be - /// confirmed before reporting a match. When performing a line oriented - /// search, confirmation is easy: just extend the candidate match to its - /// respective line boundaries and then re-search that line for a full - /// match. This only works when the line terminator is set because the line - /// terminator setting guarantees that the regex itself can never match - /// through the line terminator byte. - pub(crate) fn fast_line_regex(&self) -> Result, Error> { - if self.config.line_terminator.is_none() { - return Ok(None); - } - match LiteralSets::new(&self.expr).one_regex(self.config.word) { - None => Ok(None), - Some(pattern) => self.pattern_to_regex(&pattern).map(Some), + /// Turns this configured HIR into an equivalent one, but where it must + /// match at the start and end of a line. 
+ pub(crate) fn into_whole_line(self) -> ConfiguredHIR { + let line_anchor_start = Hir::look(self.line_anchor_start()); + let line_anchor_end = Hir::look(self.line_anchor_end()); + let hir = + Hir::concat(vec![line_anchor_start, self.hir, line_anchor_end]); + ConfiguredHIR { config: self.config, hir } + } + + /// Turns this configured HIR into an equivalent one, but where it must + /// match at the start and end of the haystack. + pub(crate) fn into_anchored(self) -> ConfiguredHIR { + let hir = Hir::concat(vec![ + Hir::look(hir::Look::Start), + self.hir, + Hir::look(hir::Look::End), + ]); + ConfiguredHIR { config: self.config, hir } + } + + /// Returns the "start line" anchor for this configuration. + fn line_anchor_start(&self) -> hir::Look { + if self.config.crlf { + hir::Look::StartCRLF + } else { + hir::Look::StartLF } } - /// Create a regex from the given pattern using this HIR's configuration. - fn pattern_to_regex(&self, pattern: &str) -> Result { - // The settings we explicitly set here are intentionally a subset - // of the settings we have. The key point here is that our HIR - // expression is computed with the settings in mind, such that setting - // them here could actually lead to unintended behavior. For example, - // consider the pattern `(?U)a+`. This will get folded into the HIR - // as a non-greedy repetition operator which will in turn get printed - // to the concrete syntax as `a+?`, which is correct. But if we - // set the `swap_greed` option again, then we'll wind up with `(?U)a+?` - // which is equal to `a+` which is not the same as what we were given. - // - // We also don't need to apply `case_insensitive` since this gets - // folded into the HIR and would just cause us to do redundant work. - // - // Finally, we don't need to set `ignore_whitespace` since the concrete - // syntax emitted by the HIR printer never needs it. - // - // We set the rest of the options. 
Some of them are important, such as - // the size limit, and some of them are necessary to preserve the - // intention of the original pattern. For example, the Unicode flag - // will impact how the WordMatcher functions, namely, whether its - // word boundaries are Unicode aware or not. - let syntax = regex_automata::util::syntax::Config::new() - .utf8(false) - .nest_limit(self.config.nest_limit) - .octal(self.config.octal) - .multi_line(self.config.multi_line) - .dot_matches_new_line(self.config.dot_matches_new_line) - .crlf(self.config.crlf) - .unicode(self.config.unicode); - let meta = Regex::config() - .utf8_empty(false) - .nfa_size_limit(Some(self.config.size_limit)) - .hybrid_cache_capacity(self.config.dfa_size_limit); - Regex::builder() - .syntax(syntax) - .configure(meta) - .build(pattern) - .map_err(Error::regex) + /// Returns the "end line" anchor for this configuration. + fn line_anchor_end(&self) -> hir::Look { + if self.config.crlf { + hir::Look::EndCRLF + } else { + hir::Look::EndLF + } + } +} + +/// This increments the index of every capture group in the given hir by 1. If +/// any increment results in an overflow, then an error is returned. +fn renumber_capture_indices(hir: Hir) -> Result { + Ok(match hir.into_kind() { + HirKind::Empty => Hir::empty(), + HirKind::Literal(hir::Literal(lit)) => Hir::literal(lit), + HirKind::Class(cls) => Hir::class(cls), + HirKind::Look(x) => Hir::look(x), + HirKind::Repetition(mut x) => { + x.sub = Box::new(renumber_capture_indices(*x.sub)?); + Hir::repetition(x) + } + HirKind::Capture(mut cap) => { + cap.index = match cap.index.checked_add(1) { + Some(index) => index, + None => { + // This error message kind of sucks, but it's probably + // impossible for it to happen. The only way a capture + // index can overflow addition is if the regex is huge + // (or something else has gone horribly wrong). 
+ let msg = "could not renumber capture index, too big"; + return Err(Error::any(msg)); + } + }; + cap.sub = Box::new(renumber_capture_indices(*cap.sub)?); + Hir::capture(cap) + } + HirKind::Concat(subs) => { + let subs = subs + .into_iter() + .map(|sub| renumber_capture_indices(sub)) + .collect::, Error>>()?; + Hir::concat(subs) + } + HirKind::Alternation(subs) => { + let subs = subs + .into_iter() + .map(|sub| renumber_capture_indices(sub)) + .collect::, Error>>()?; + Hir::alternation(subs) + } + }) +} + +/// Returns true if the given literal string contains any byte from the line +/// terminator given. +fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool { + if lineterm.is_crlf() { + literal.as_bytes().iter().copied().any(|b| b == b'\r' || b == b'\n') + } else { + literal.as_bytes().iter().copied().any(|b| b == lineterm.as_byte()) } - - /// Create an HIR expression from the given pattern using this HIR's - /// configuration. - fn pattern_to_hir(&self, pattern: &str) -> Result { - // See `pattern_to_regex` comment for explanation of why we only set - // a subset of knobs here. e.g., `swap_greed` is explicitly left out. 
- let expr = regex_syntax::ParserBuilder::new() - .nest_limit(self.config.nest_limit) - .octal(self.config.octal) - .utf8(false) - .multi_line(self.config.multi_line) - .dot_matches_new_line(self.config.dot_matches_new_line) - .crlf(self.config.crlf) - .unicode(self.config.unicode) - .build() - .parse(pattern) - .map_err(Error::generic)?; - Ok(ConfiguredHIR { - original: self.original.clone(), - config: self.config.clone(), - analysis: self.analysis.clone(), - expr, - }) - } - - /* - fn syntax_config(&self) -> regex_automata::util::syntax::Config { - regex_automata::util::syntax::Config::new() - .nest_limit(self.config.nest_limit) - .octal(self.config.octal) - .multi_line(self.config.multi_line) - .dot_matches_new_line(self.config.dot_matches_new_line) - .unicode(self.config.unicode) - } - - fn meta_config(&self) -> regex_automata::meta::Config { - Regex::config() - .nfa_size_limit(Some(self.config.size_limit)) - .hybrid_cache_capacity(self.config.dfa_size_limit) - } - */ } diff --git a/crates/regex/src/error.rs b/crates/regex/src/error.rs index 88a8adbe..1c921773 100644 --- a/crates/regex/src/error.rs +++ b/crates/regex/src/error.rs @@ -30,6 +30,10 @@ impl Error { Error { kind: ErrorKind::Regex(err.to_string()) } } + pub(crate) fn any(msg: E) -> Error { + Error { kind: ErrorKind::Regex(msg.to_string()) } + } + /// Return the kind of this error. 
pub fn kind(&self) -> &ErrorKind { &self.kind diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs index a677b787..9175be9d 100644 --- a/crates/regex/src/lib.rs +++ b/crates/regex/src/lib.rs @@ -11,7 +11,6 @@ mod config; mod error; mod literal; mod matcher; -mod multi; mod non_matching; mod strip; mod word; diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs index 112222bc..95d25c43 100644 --- a/crates/regex/src/matcher.rs +++ b/crates/regex/src/matcher.rs @@ -12,7 +12,6 @@ use { use crate::{ config::{Config, ConfiguredHIR}, error::Error, - multi::MultiLiteralMatcher, word::WordMatcher, }; @@ -48,19 +47,30 @@ impl RegexMatcherBuilder { /// The syntax supported is documented as part of the regex crate: /// . pub fn build(&self, pattern: &str) -> Result { - let chir = self.config.hir(pattern)?; - let fast_line_regex = chir.fast_line_regex()?; + self.build_many(&[pattern]) + } + + /// Build a new matcher using the current configuration for the provided + /// patterns. The resulting matcher behaves as if all of the patterns + /// given are joined together into a single alternation. That is, it + /// reports matches where at least one of the given patterns matches. + pub fn build_many>( + &self, + patterns: &[P], + ) -> Result { + let chir = self.config.build_many(patterns)?; + let fast_line_regex = chir.to_fast_line_regex()?; let non_matching_bytes = chir.non_matching_bytes(); if let Some(ref re) = fast_line_regex { log::debug!("extracted fast line regex: {:?}", re); } - let matcher = RegexMatcherImpl::new(&chir)?; - log::trace!("final regex: {:?}", matcher.regex()); - let mut config = self.config.clone(); - // We override the line terminator in case the configured expr doesn't + // We override the line terminator in case the configured HIR doesn't // support it. 
+ let mut config = self.config.clone(); config.line_terminator = chir.line_terminator(); + let matcher = RegexMatcherImpl::new(chir)?; + log::trace!("final regex: {:?}", matcher.regex()); Ok(RegexMatcher { config, matcher, @@ -78,66 +88,7 @@ impl RegexMatcherBuilder { &self, literals: &[B], ) -> Result { - // BREADCRUMBS: Ideally we would remove this method and just let the - // underlying regex engine handle this case. But... this is tricky. - // Part of the problem is that ripgrep escapes all patterns by the - // time the regex engine is constructed, which is necessary for PCRE2 - // for example. So that logic would need to change so that we don't - // escape things first. - // - // If we adjusted that, then I think we could just build an HIR value - // directly from the literals, thus skipping the parser altogether. - // - // But that still requires using and keeping this method. But we could - // at least get rid of the MultiLiteral matcher since the regex engine - // should now handle that case. - // - // Getting rid of this method is trickier, unless we make multi-pattern - // support a first class concept. But I don't think I want to go down - // that path? That implies we still need to accept a single pattern - // everywhere, which in turn means ripgrep would be forced to join - // the literals together using | and escape meta characters. By that - // point, we've lost. So I do think we still need this special method. - // But we can at least simplify the implementation. - // - // I still wonder if "fast parse" is still a good idea though. - // Basically, reject all nesting except for single-depth alternation. - // And reject character classes and all options. Just basically - // support `foo|bar|..|quux`. Maybe skip this for now I think. 
- - let mut has_escape = false; - let mut slices = vec![]; - for lit in literals { - slices.push(lit.as_ref()); - has_escape = has_escape || lit.as_ref().contains('\\'); - } - // Even when we have a fixed set of literals, we might still want to - // use the regex engine. Specifically, if any string has an escape - // in it, then we probably can't feed it to Aho-Corasick without - // removing the escape. Additionally, if there are any particular - // special match semantics we need to honor, that Aho-Corasick isn't - // enough. Finally, the regex engine can do really well with a small - // number of literals (at time of writing, this is changing soon), so - // we use it when there's a small set. - // - // Yes, this is one giant hack. Ideally, this entirely separate literal - // matcher that uses Aho-Corasick would be pushed down into the regex - // engine. - if has_escape - || !self.config.can_plain_aho_corasick() - || literals.len() < 40 - { - return self.build(&slices.join("|")); - } - - let matcher = MultiLiteralMatcher::new(&slices)?; - let imp = RegexMatcherImpl::MultiLiteral(matcher); - Ok(RegexMatcher { - config: self.config.clone(), - matcher: imp, - fast_line_regex: None, - non_matching_bytes: ByteSet::empty(), - }) + self.build_many(literals) } /// Set the value for the case insensitive (`i`) flag. @@ -338,20 +289,15 @@ impl RegexMatcherBuilder { /// 1. It causes the line terminator for the matcher to be `\r\n`. Namely, /// this prevents the matcher from ever producing a match that contains /// a `\r` or `\n`. - /// 2. It translates all instances of `$` in the pattern to `(?:\r??$)`. - /// This works around the fact that the regex engine does not support - /// matching CRLF as a line terminator when using `$`. + /// 2. It enables CRLF mode for `^` and `$`. This means that line anchors + /// will treat both `\r` and `\n` as line terminators, but will never + /// match between a `\r` and `\n`. 
/// - /// In particular, because of (2), the matches produced by the matcher may - /// be slightly different than what one would expect given the pattern. - /// This is the trade off made: in many cases, `$` will "just work" in the - /// presence of `\r\n` line terminators, but matches may require some - /// trimming to faithfully represent the intended match. - /// - /// Note that if you do not wish to set the line terminator but would still - /// like `$` to match `\r\n` line terminators, then it is valid to call - /// `crlf(true)` followed by `line_terminator(None)`. Ordering is - /// important, since `crlf` and `line_terminator` override each other. + /// Note that if you do not wish to set the line terminator but would + /// still like `$` to match `\r\n` line terminators, then it is valid to + /// call `crlf(true)` followed by `line_terminator(None)`. Ordering is + /// important, since `crlf` sets the line terminator, but `line_terminator` + /// does not touch the `crlf` setting. pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder { if yes { self.config.line_terminator = Some(LineTerminator::crlf()); @@ -377,6 +323,21 @@ impl RegexMatcherBuilder { self.config.word = yes; self } + + /// Whether the patterns should be treated as literal strings or not. When + /// this is active, all characters, including ones that would normally be + /// special regex meta characters, are matched literally. + pub fn fixed_strings(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.config.fixed_strings = yes; + self + } + + /// Whether each pattern should match the entire line or not. This is + /// equivalent to surrounding the pattern with `(?m:^)` and `(?m:$)`. 
+ pub fn whole_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.config.whole_line = yes; + self + } } /// An implementation of the `Matcher` trait using Rust's standard regex @@ -406,10 +367,10 @@ impl RegexMatcher { /// Create a new matcher from the given pattern using the default /// configuration, but matches lines terminated by `\n`. /// - /// This is meant to be a convenience constructor for using a - /// `RegexMatcherBuilder` and setting its - /// [`line_terminator`](struct.RegexMatcherBuilder.html#method.line_terminator) - /// to `\n`. The purpose of using this constructor is to permit special + /// This is meant to be a convenience constructor for + /// using a `RegexMatcherBuilder` and setting its + /// [`line_terminator`](RegexMatcherBuilder::line_terminator) to + /// `\n`. The purpose of using this constructor is to permit special /// optimizations that help speed up line oriented search. These types of /// optimizations are only appropriate when matches span no more than one /// line. For this reason, this constructor will return an error if the @@ -425,8 +386,6 @@ impl RegexMatcher { enum RegexMatcherImpl { /// The standard matcher used for all regular expressions. Standard(StandardMatcher), - /// A matcher for an alternation of plain literals. - MultiLiteral(MultiLiteralMatcher), /// A matcher that only matches at word boundaries. This transforms the /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`. /// Because of this, the WordMatcher provides its own implementation of @@ -438,25 +397,24 @@ enum RegexMatcherImpl { impl RegexMatcherImpl { /// Based on the configuration, create a new implementation of the /// `Matcher` trait. - fn new(expr: &ConfiguredHIR) -> Result { - if expr.config().word { - Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?)) + fn new(mut chir: ConfiguredHIR) -> Result { + // When whole_line is set, we don't use a word matcher even if word + // matching was requested. Why?
Because `(?m:^)(pat)(?m:$)` implies + // word matching. + Ok(if chir.config().word && !chir.config().whole_line { + RegexMatcherImpl::Word(WordMatcher::new(chir)?) } else { - if let Some(lits) = expr.alternation_literals() { - if lits.len() >= 40 { - let matcher = MultiLiteralMatcher::new(&lits)?; - return Ok(RegexMatcherImpl::MultiLiteral(matcher)); - } + if chir.config().whole_line { + chir = chir.into_whole_line(); } - Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?)) - } + RegexMatcherImpl::Standard(StandardMatcher::new(chir)?) + }) } /// Return the underlying regex object used. fn regex(&self) -> String { match *self { RegexMatcherImpl::Word(ref x) => x.pattern().to_string(), - RegexMatcherImpl::MultiLiteral(_) => "".to_string(), RegexMatcherImpl::Standard(ref x) => x.pattern.clone(), } } @@ -477,7 +435,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.find_at(haystack, at), - MultiLiteral(ref m) => m.find_at(haystack, at), Word(ref m) => m.find_at(haystack, at), } } @@ -486,7 +443,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.new_captures(), - MultiLiteral(ref m) => m.new_captures(), Word(ref m) => m.new_captures(), } } @@ -495,7 +451,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.capture_count(), - MultiLiteral(ref m) => m.capture_count(), Word(ref m) => m.capture_count(), } } @@ -504,7 +459,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.capture_index(name), - MultiLiteral(ref m) => m.capture_index(name), Word(ref m) => m.capture_index(name), } } @@ -513,7 +467,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.find(haystack), - MultiLiteral(ref m) => m.find(haystack), Word(ref m) => m.find(haystack), } } @@ -525,7 +478,6 @@ impl Matcher for 
RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.find_iter(haystack, matched), - MultiLiteral(ref m) => m.find_iter(haystack, matched), Word(ref m) => m.find_iter(haystack, matched), } } @@ -541,7 +493,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.try_find_iter(haystack, matched), - MultiLiteral(ref m) => m.try_find_iter(haystack, matched), Word(ref m) => m.try_find_iter(haystack, matched), } } @@ -554,7 +505,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.captures(haystack, caps), - MultiLiteral(ref m) => m.captures(haystack, caps), Word(ref m) => m.captures(haystack, caps), } } @@ -571,7 +521,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.captures_iter(haystack, caps, matched), - MultiLiteral(ref m) => m.captures_iter(haystack, caps, matched), Word(ref m) => m.captures_iter(haystack, caps, matched), } } @@ -588,9 +537,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.try_captures_iter(haystack, caps, matched), - MultiLiteral(ref m) => { - m.try_captures_iter(haystack, caps, matched) - } Word(ref m) => m.try_captures_iter(haystack, caps, matched), } } @@ -604,7 +550,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.captures_at(haystack, at, caps), - MultiLiteral(ref m) => m.captures_at(haystack, at, caps), Word(ref m) => m.captures_at(haystack, at, caps), } } @@ -621,7 +566,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.replace(haystack, dst, append), - MultiLiteral(ref m) => m.replace(haystack, dst, append), Word(ref m) => m.replace(haystack, dst, append), } } @@ -641,9 +585,6 @@ impl Matcher for RegexMatcher { Standard(ref m) => { 
m.replace_with_captures(haystack, caps, dst, append) } - MultiLiteral(ref m) => { - m.replace_with_captures(haystack, caps, dst, append) - } Word(ref m) => { m.replace_with_captures(haystack, caps, dst, append) } @@ -654,7 +595,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.is_match(haystack), - MultiLiteral(ref m) => m.is_match(haystack), Word(ref m) => m.is_match(haystack), } } @@ -667,7 +607,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.is_match_at(haystack, at), - MultiLiteral(ref m) => m.is_match_at(haystack, at), Word(ref m) => m.is_match_at(haystack, at), } } @@ -679,7 +618,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.shortest_match(haystack), - MultiLiteral(ref m) => m.shortest_match(haystack), Word(ref m) => m.shortest_match(haystack), } } @@ -692,7 +630,6 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.shortest_match_at(haystack, at), - MultiLiteral(ref m) => m.shortest_match_at(haystack, at), Word(ref m) => m.shortest_match_at(haystack, at), } } @@ -734,9 +671,9 @@ struct StandardMatcher { } impl StandardMatcher { - fn new(expr: &ConfiguredHIR) -> Result { - let regex = expr.regex()?; - let pattern = expr.pattern(); + fn new(chir: ConfiguredHIR) -> Result { + let regex = chir.to_regex()?; + let pattern = chir.to_pattern(); Ok(StandardMatcher { regex, pattern }) } } @@ -821,63 +758,38 @@ impl Matcher for StandardMatcher { /// index of the group using the corresponding matcher's `capture_index` /// method, and then use that index with `RegexCaptures::get`. #[derive(Clone, Debug)] -pub struct RegexCaptures(RegexCapturesImp); - -#[derive(Clone, Debug)] -enum RegexCapturesImp { - AhoCorasick { - /// The start and end of the match, corresponding to capture group 0. 
- mat: Option, - }, - Regex { - /// Where the captures are stored. - caps: AutomataCaptures, - /// These captures behave as if the capturing groups begin at the given - /// offset. When set to `0`, this has no affect and capture groups are - /// indexed like normal. - /// - /// This is useful when building matchers that wrap arbitrary regular - /// expressions. For example, `WordMatcher` takes an existing regex - /// `re` and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that - /// the regex has been wrapped from the caller. In order to do this, - /// the matcher and the capturing groups must behave as if `(re)` is - /// the `0`th capture group. - offset: usize, - }, +pub struct RegexCaptures { + /// Where the captures are stored. + caps: AutomataCaptures, + /// These captures behave as if the capturing groups begin at the given + /// offset. When set to `0`, this has no effect and capture groups are + /// indexed like normal. + /// + /// This is useful when building matchers that wrap arbitrary regular + /// expressions. For example, `WordMatcher` takes an existing regex + /// `re` and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that + /// the regex has been wrapped from the caller. In order to do this, + /// the matcher and the capturing groups must behave as if `(re)` is + /// the `0`th capture group. + offset: usize, } impl Captures for RegexCaptures { fn len(&self) -> usize { - match self.0 { - RegexCapturesImp::AhoCorasick { .. } => 1, - RegexCapturesImp::Regex { ref caps, offset, .. } => { - caps.group_info().all_group_len().checked_sub(offset).unwrap() - } - } + self.caps + .group_info() + .all_group_len() + .checked_sub(self.offset) + .unwrap() } fn get(&self, i: usize) -> Option { - match self.0 { - RegexCapturesImp::AhoCorasick { mat, ..
} => { - if i == 0 { - mat - } else { - None - } - } - RegexCapturesImp::Regex { ref caps, offset } => { - let actual = i.checked_add(offset).unwrap(); - caps.get_group(actual).map(|sp| Match::new(sp.start, sp.end)) - } - } + let actual = i.checked_add(self.offset).unwrap(); + self.caps.get_group(actual).map(|sp| Match::new(sp.start, sp.end)) } } impl RegexCaptures { - pub(crate) fn simple() -> RegexCaptures { - RegexCaptures(RegexCapturesImp::AhoCorasick { mat: None }) - } - pub(crate) fn new(caps: AutomataCaptures) -> RegexCaptures { RegexCaptures::with_offset(caps, 0) } @@ -886,27 +798,11 @@ impl RegexCaptures { caps: AutomataCaptures, offset: usize, ) -> RegexCaptures { - RegexCaptures(RegexCapturesImp::Regex { caps, offset }) + RegexCaptures { caps, offset } } pub(crate) fn captures_mut(&mut self) -> &mut AutomataCaptures { - match self.0 { - RegexCapturesImp::AhoCorasick { .. } => { - panic!("getting captures for multi-literal matcher is invalid") - } - RegexCapturesImp::Regex { ref mut caps, .. } => caps, - } - } - - pub(crate) fn set_simple(&mut self, one: Option) { - match self.0 { - RegexCapturesImp::AhoCorasick { ref mut mat } => { - *mat = one; - } - RegexCapturesImp::Regex { .. 
} => { - panic!("setting simple captures for regex is invalid") - } - } + &mut self.caps } } diff --git a/crates/regex/src/strip.rs b/crates/regex/src/strip.rs index 3e141563..f0da9446 100644 --- a/crates/regex/src/strip.rs +++ b/crates/regex/src/strip.rs @@ -67,6 +67,9 @@ fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result { Hir::literal(lit) } HirKind::Class(hir::Class::Unicode(mut cls)) => { + if cls.ranges().is_empty() { + return Ok(Hir::class(hir::Class::Unicode(cls))); + } let remove = hir::ClassUnicode::new(Some( hir::ClassUnicodeRange::new(ch, ch), )); @@ -77,6 +80,9 @@ fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result { Hir::class(hir::Class::Unicode(cls)) } HirKind::Class(hir::Class::Bytes(mut cls)) => { + if cls.ranges().is_empty() { + return Ok(Hir::class(hir::Class::Bytes(cls))); + } let remove = hir::ClassBytes::new(Some( hir::ClassBytesRange::new(byte, byte), )); diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs index 5aa603b0..643ae76a 100644 --- a/crates/regex/src/word.rs +++ b/crates/regex/src/word.rs @@ -59,16 +59,11 @@ impl WordMatcher { /// /// The given options are used to construct the regular expression /// internally. - pub(crate) fn new(expr: &ConfiguredHIR) -> Result { - let original = - expr.with_pattern(|pat| format!("^(?:{})$", pat))?.regex()?; - let word_expr = expr.with_pattern(|pat| { - let pat = format!(r"(?:(?m:^)|\W)({})(?:\W|(?m:$))", pat); - log::debug!("word regex: {:?}", pat); - pat - })?; - let regex = word_expr.regex()?; - let pattern = word_expr.pattern(); + pub(crate) fn new(chir: ConfiguredHIR) -> Result { + let original = chir.clone().into_anchored().to_regex()?; + let word_chir = chir.into_word()?; + let regex = word_chir.to_regex()?; + let pattern = word_chir.to_pattern(); let caps = Arc::new(Pool::new({ let regex = regex.clone(); Box::new(move || regex.create_captures()) as PoolFn @@ -104,7 +99,7 @@ impl WordMatcher { // slower regex engine to extract capture groups. 
Remember, our word // regex looks like this: // - // (^|\W)(<original regex>)($|\W) + // (^|\W)(<original regex>)(\W|$) // // What we want are the match offsets of <original regex>. So in the // easy/common case, the original regex will be sandwiched between @@ -217,8 +212,8 @@ mod tests { use grep_matcher::{Captures, Match, Matcher}; fn matcher(pattern: &str) -> WordMatcher { - let chir = Config::default().hir(pattern).unwrap(); - WordMatcher::new(&chir).unwrap() + let chir = Config::default().build_many(&[pattern]).unwrap(); + WordMatcher::new(chir).unwrap() } fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> {