mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-06-14 22:15:13 +02:00
libripgrep: initial commit introducing libripgrep
libripgrep is not any one library, but rather, a collection of libraries that roughly separate the following key distinct phases in a grep implementation: 1. Pattern matching (e.g., by a regex engine). 2. Searching a file using a pattern matcher. 3. Printing results. Ultimately, both (1) and (3) are defined by de-coupled interfaces, of which there may be multiple implementations. Namely, (1) is satisfied by the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by the `Sink` trait in the `grep2` crate. The searcher (2) ties everything together and finds results using a matcher and reports those results using a `Sink` implementation. Closes #162
This commit is contained in:
21
grep-regex/Cargo.toml
Normal file
21
grep-regex/Cargo.toml
Normal file
@ -0,0 +1,21 @@
|
||||
[package]
|
||||
name = "grep-regex"
|
||||
version = "0.0.1" #:version
|
||||
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||
description = """
|
||||
Use Rust's regex library with the 'grep' crate.
|
||||
"""
|
||||
documentation = "https://docs.rs/grep-regex"
|
||||
homepage = "https://github.com/BurntSushi/ripgrep"
|
||||
repository = "https://github.com/BurntSushi/ripgrep"
|
||||
readme = "README.md"
|
||||
keywords = ["regex", "grep", "search", "pattern", "line"]
|
||||
license = "Unlicense/MIT"
|
||||
|
||||
[dependencies]
|
||||
log = "0.4"
|
||||
grep-matcher = { version = "0.0.1", path = "../grep-matcher" }
|
||||
regex = "1"
|
||||
regex-syntax = "0.6"
|
||||
thread_local = "0.3.5"
|
||||
utf8-ranges = "1"
|
21
grep-regex/LICENSE-MIT
Normal file
21
grep-regex/LICENSE-MIT
Normal file
@ -0,0 +1,21 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Andrew Gallant
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
35
grep-regex/README.md
Normal file
35
grep-regex/README.md
Normal file
@ -0,0 +1,35 @@
|
||||
grep-regex
|
||||
----------
|
||||
The `grep-regex` crate provides an implementation of the `Matcher` trait from
|
||||
the `grep-matcher` crate. This implementation permits Rust's regex engine to
|
||||
be used in the `grep` crate for fast line oriented searching.
|
||||
|
||||
[](https://travis-ci.org/BurntSushi/ripgrep)
|
||||
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
||||
[](https://crates.io/crates/grep-regex)
|
||||
|
||||
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
|
||||
|
||||
### Documentation
|
||||
|
||||
[https://docs.rs/grep-regex](https://docs.rs/grep-regex)
|
||||
|
||||
**NOTE:** You probably don't want to use this crate directly. Instead, you
|
||||
should prefer the facade defined in the
|
||||
[`grep`](https://docs.rs/grep)
|
||||
crate.
|
||||
|
||||
### Usage
|
||||
|
||||
Add this to your `Cargo.toml`:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
grep-regex = "0.1"
|
||||
```
|
||||
|
||||
and this to your crate root:
|
||||
|
||||
```rust
|
||||
extern crate grep_regex;
|
||||
```
|
24
grep-regex/UNLICENSE
Normal file
24
grep-regex/UNLICENSE
Normal file
@ -0,0 +1,24 @@
|
||||
This is free and unencumbered software released into the public domain.
|
||||
|
||||
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||
distribute this software, either in source code form or as a compiled
|
||||
binary, for any purpose, commercial or non-commercial, and by any
|
||||
means.
|
||||
|
||||
In jurisdictions that recognize copyright laws, the author or authors
|
||||
of this software dedicate any and all copyright interest in the
|
||||
software to the public domain. We make this dedication for the benefit
|
||||
of the public at large and to the detriment of our heirs and
|
||||
successors. We intend this dedication to be an overt act of
|
||||
relinquishment in perpetuity of all present and future rights to this
|
||||
software under copyright law.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
For more information, please refer to <http://unlicense.org/>
|
263
grep-regex/src/ast.rs
Normal file
263
grep-regex/src/ast.rs
Normal file
@ -0,0 +1,263 @@
|
||||
use regex_syntax::ast::{self, Ast};
|
||||
use regex_syntax::ast::parse::Parser;
|
||||
|
||||
/// The results of analyzing AST of a regular expression (e.g., for supporting
|
||||
/// smart case).
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AstAnalysis {
|
||||
/// True if and only if a literal uppercase character occurs in the regex.
|
||||
any_uppercase: bool,
|
||||
/// True if and only if the regex contains any literal at all.
|
||||
any_literal: bool,
|
||||
/// True if and only if the regex consists entirely of a literal and no
|
||||
/// other special regex characters.
|
||||
all_verbatim_literal: bool,
|
||||
}
|
||||
|
||||
impl AstAnalysis {
|
||||
/// Returns a `AstAnalysis` value by doing analysis on the AST of `pattern`.
|
||||
///
|
||||
/// If `pattern` is not a valid regular expression, then `None` is
|
||||
/// returned.
|
||||
#[allow(dead_code)]
|
||||
pub fn from_pattern(pattern: &str) -> Option<AstAnalysis> {
|
||||
Parser::new()
|
||||
.parse(pattern)
|
||||
.map(|ast| AstAnalysis::from_ast(&ast))
|
||||
.ok()
|
||||
}
|
||||
|
||||
/// Perform an AST analysis given the AST.
|
||||
pub fn from_ast(ast: &Ast) -> AstAnalysis {
|
||||
let mut analysis = AstAnalysis::new();
|
||||
analysis.from_ast_impl(ast);
|
||||
analysis
|
||||
}
|
||||
|
||||
/// Returns true if and only if a literal uppercase character occurs in
|
||||
/// the pattern.
|
||||
///
|
||||
/// For example, a pattern like `\pL` contains no uppercase literals,
|
||||
/// even though `L` is uppercase and the `\pL` class contains uppercase
|
||||
/// characters.
|
||||
pub fn any_uppercase(&self) -> bool {
|
||||
self.any_uppercase
|
||||
}
|
||||
|
||||
/// Returns true if and only if the regex contains any literal at all.
|
||||
///
|
||||
/// For example, a pattern like `\pL` reports `false`, but a pattern like
|
||||
/// `\pLfoo` reports `true`.
|
||||
pub fn any_literal(&self) -> bool {
|
||||
self.any_literal
|
||||
}
|
||||
|
||||
/// Returns true if and only if the entire pattern is a verbatim literal
|
||||
/// with no special meta characters.
|
||||
///
|
||||
/// When this is true, then the pattern satisfies the following law:
|
||||
/// `escape(pattern) == pattern`. Notable examples where this returns
|
||||
/// `false` include patterns like `a\u0061` even though `\u0061` is just
|
||||
/// a literal `a`.
|
||||
///
|
||||
/// The purpose of this flag is to determine whether the patterns can be
|
||||
/// given to non-regex substring search algorithms as-is.
|
||||
#[allow(dead_code)]
|
||||
pub fn all_verbatim_literal(&self) -> bool {
|
||||
self.all_verbatim_literal
|
||||
}
|
||||
|
||||
/// Creates a new `AstAnalysis` value with an initial configuration.
|
||||
fn new() -> AstAnalysis {
|
||||
AstAnalysis {
|
||||
any_uppercase: false,
|
||||
any_literal: false,
|
||||
all_verbatim_literal: true,
|
||||
}
|
||||
}
|
||||
|
||||
fn from_ast_impl(&mut self, ast: &Ast) {
|
||||
if self.done() {
|
||||
return;
|
||||
}
|
||||
match *ast {
|
||||
Ast::Empty(_) => {}
|
||||
Ast::Flags(_)
|
||||
| Ast::Dot(_)
|
||||
| Ast::Assertion(_)
|
||||
| Ast::Class(ast::Class::Unicode(_))
|
||||
| Ast::Class(ast::Class::Perl(_)) => {
|
||||
self.all_verbatim_literal = false;
|
||||
}
|
||||
Ast::Literal(ref x) => {
|
||||
self.from_ast_literal(x);
|
||||
}
|
||||
Ast::Class(ast::Class::Bracketed(ref x)) => {
|
||||
self.all_verbatim_literal = false;
|
||||
self.from_ast_class_set(&x.kind);
|
||||
}
|
||||
Ast::Repetition(ref x) => {
|
||||
self.all_verbatim_literal = false;
|
||||
self.from_ast_impl(&x.ast);
|
||||
}
|
||||
Ast::Group(ref x) => {
|
||||
self.all_verbatim_literal = false;
|
||||
self.from_ast_impl(&x.ast);
|
||||
}
|
||||
Ast::Alternation(ref alt) => {
|
||||
self.all_verbatim_literal = false;
|
||||
for x in &alt.asts {
|
||||
self.from_ast_impl(x);
|
||||
}
|
||||
}
|
||||
Ast::Concat(ref alt) => {
|
||||
for x in &alt.asts {
|
||||
self.from_ast_impl(x);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn from_ast_class_set(&mut self, ast: &ast::ClassSet) {
|
||||
if self.done() {
|
||||
return;
|
||||
}
|
||||
match *ast {
|
||||
ast::ClassSet::Item(ref item) => {
|
||||
self.from_ast_class_set_item(item);
|
||||
}
|
||||
ast::ClassSet::BinaryOp(ref x) => {
|
||||
self.from_ast_class_set(&x.lhs);
|
||||
self.from_ast_class_set(&x.rhs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) {
|
||||
if self.done() {
|
||||
return;
|
||||
}
|
||||
match *ast {
|
||||
ast::ClassSetItem::Empty(_)
|
||||
| ast::ClassSetItem::Ascii(_)
|
||||
| ast::ClassSetItem::Unicode(_)
|
||||
| ast::ClassSetItem::Perl(_) => {}
|
||||
ast::ClassSetItem::Literal(ref x) => {
|
||||
self.from_ast_literal(x);
|
||||
}
|
||||
ast::ClassSetItem::Range(ref x) => {
|
||||
self.from_ast_literal(&x.start);
|
||||
self.from_ast_literal(&x.end);
|
||||
}
|
||||
ast::ClassSetItem::Bracketed(ref x) => {
|
||||
self.from_ast_class_set(&x.kind);
|
||||
}
|
||||
ast::ClassSetItem::Union(ref union) => {
|
||||
for x in &union.items {
|
||||
self.from_ast_class_set_item(x);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn from_ast_literal(&mut self, ast: &ast::Literal) {
|
||||
if ast.kind != ast::LiteralKind::Verbatim {
|
||||
self.all_verbatim_literal = false;
|
||||
}
|
||||
self.any_literal = true;
|
||||
self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
|
||||
}
|
||||
|
||||
/// Returns true if and only if the attributes can never change no matter
|
||||
/// what other AST it might see.
|
||||
fn done(&self) -> bool {
|
||||
self.any_uppercase && self.any_literal && !self.all_verbatim_literal
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn analysis(pattern: &str) -> AstAnalysis {
|
||||
AstAnalysis::from_pattern(pattern).unwrap()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn various() {
|
||||
let x = analysis("");
|
||||
assert!(!x.any_uppercase);
|
||||
assert!(!x.any_literal);
|
||||
assert!(x.all_verbatim_literal);
|
||||
|
||||
let x = analysis("foo");
|
||||
assert!(!x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(x.all_verbatim_literal);
|
||||
|
||||
let x = analysis("Foo");
|
||||
assert!(x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(x.all_verbatim_literal);
|
||||
|
||||
let x = analysis("foO");
|
||||
assert!(x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(x.all_verbatim_literal);
|
||||
|
||||
let x = analysis(r"foo\\");
|
||||
assert!(!x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(!x.all_verbatim_literal);
|
||||
|
||||
let x = analysis(r"foo\w");
|
||||
assert!(!x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(!x.all_verbatim_literal);
|
||||
|
||||
let x = analysis(r"foo\S");
|
||||
assert!(!x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(!x.all_verbatim_literal);
|
||||
|
||||
let x = analysis(r"foo\p{Ll}");
|
||||
assert!(!x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(!x.all_verbatim_literal);
|
||||
|
||||
let x = analysis(r"foo[a-z]");
|
||||
assert!(!x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(!x.all_verbatim_literal);
|
||||
|
||||
let x = analysis(r"foo[A-Z]");
|
||||
assert!(x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(!x.all_verbatim_literal);
|
||||
|
||||
let x = analysis(r"foo[\S\t]");
|
||||
assert!(!x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(!x.all_verbatim_literal);
|
||||
|
||||
let x = analysis(r"foo\\S");
|
||||
assert!(x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(!x.all_verbatim_literal);
|
||||
|
||||
let x = analysis(r"\p{Ll}");
|
||||
assert!(!x.any_uppercase);
|
||||
assert!(!x.any_literal);
|
||||
assert!(!x.all_verbatim_literal);
|
||||
|
||||
let x = analysis(r"aBc\w");
|
||||
assert!(x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(!x.all_verbatim_literal);
|
||||
|
||||
let x = analysis(r"a\u0061");
|
||||
assert!(!x.any_uppercase);
|
||||
assert!(x.any_literal);
|
||||
assert!(!x.all_verbatim_literal);
|
||||
}
|
||||
}
|
265
grep-regex/src/config.rs
Normal file
265
grep-regex/src/config.rs
Normal file
@ -0,0 +1,265 @@
|
||||
use grep_matcher::{ByteSet, LineTerminator};
|
||||
use regex::bytes::{Regex, RegexBuilder};
|
||||
use regex_syntax::ast::{self, Ast};
|
||||
use regex_syntax::hir::Hir;
|
||||
|
||||
use ast::AstAnalysis;
|
||||
use crlf::crlfify;
|
||||
use error::Error;
|
||||
use literal::LiteralSets;
|
||||
use non_matching::non_matching_bytes;
|
||||
use strip::strip_from_match;
|
||||
|
||||
/// Config represents the configuration of a regex matcher in this crate.
|
||||
/// The configuration is itself a rough combination of the knobs found in
|
||||
/// the `regex` crate itself, along with additional `grep-matcher` specific
|
||||
/// options.
|
||||
///
|
||||
/// The configuration can be used to build a "configured" HIR expression. A
|
||||
/// configured HIR expression is an HIR expression that is aware of the
|
||||
/// configuration which generated it, and provides transformation on that HIR
|
||||
/// such that the configuration is preserved.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Config {
|
||||
pub case_insensitive: bool,
|
||||
pub case_smart: bool,
|
||||
pub multi_line: bool,
|
||||
pub dot_matches_new_line: bool,
|
||||
pub swap_greed: bool,
|
||||
pub ignore_whitespace: bool,
|
||||
pub unicode: bool,
|
||||
pub octal: bool,
|
||||
pub size_limit: usize,
|
||||
pub dfa_size_limit: usize,
|
||||
pub nest_limit: u32,
|
||||
pub line_terminator: Option<LineTerminator>,
|
||||
pub crlf: bool,
|
||||
pub word: bool,
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Config {
|
||||
Config {
|
||||
case_insensitive: false,
|
||||
case_smart: false,
|
||||
multi_line: false,
|
||||
dot_matches_new_line: false,
|
||||
swap_greed: false,
|
||||
ignore_whitespace: false,
|
||||
unicode: true,
|
||||
octal: false,
|
||||
// These size limits are much bigger than what's in the regex
|
||||
// crate.
|
||||
size_limit: 100 * (1<<20),
|
||||
dfa_size_limit: 1000 * (1<<20),
|
||||
nest_limit: 250,
|
||||
line_terminator: None,
|
||||
crlf: false,
|
||||
word: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Parse the given pattern and returned its HIR expression along with
|
||||
/// the current configuration.
|
||||
///
|
||||
/// If there was a problem parsing the given expression then an error
|
||||
/// is returned.
|
||||
pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
|
||||
let analysis = self.analysis(pattern)?;
|
||||
let expr = ::regex_syntax::ParserBuilder::new()
|
||||
.nest_limit(self.nest_limit)
|
||||
.octal(self.octal)
|
||||
.allow_invalid_utf8(true)
|
||||
.ignore_whitespace(self.ignore_whitespace)
|
||||
.case_insensitive(self.is_case_insensitive(&analysis)?)
|
||||
.multi_line(self.multi_line)
|
||||
.dot_matches_new_line(self.dot_matches_new_line)
|
||||
.swap_greed(self.swap_greed)
|
||||
.unicode(self.unicode)
|
||||
.build()
|
||||
.parse(pattern)
|
||||
.map_err(Error::regex)?;
|
||||
let expr = match self.line_terminator {
|
||||
None => expr,
|
||||
Some(line_term) => strip_from_match(expr, line_term)?,
|
||||
};
|
||||
Ok(ConfiguredHIR {
|
||||
original: pattern.to_string(),
|
||||
config: self.clone(),
|
||||
analysis: analysis,
|
||||
// If CRLF mode is enabled, replace `$` with `(?:\r?$)`.
|
||||
expr: if self.crlf { crlfify(expr) } else { expr },
|
||||
})
|
||||
}
|
||||
|
||||
/// Accounting for the `smart_case` config knob, return true if and only if
|
||||
/// this pattern should be matched case insensitively.
|
||||
fn is_case_insensitive(
|
||||
&self,
|
||||
analysis: &AstAnalysis,
|
||||
) -> Result<bool, Error> {
|
||||
if self.case_insensitive {
|
||||
return Ok(true);
|
||||
}
|
||||
if !self.case_smart {
|
||||
return Ok(false);
|
||||
}
|
||||
Ok(analysis.any_literal() && !analysis.any_uppercase())
|
||||
}
|
||||
|
||||
/// Perform analysis on the AST of this pattern.
|
||||
///
|
||||
/// This returns an error if the given pattern failed to parse.
|
||||
fn analysis(&self, pattern: &str) -> Result<AstAnalysis, Error> {
|
||||
Ok(AstAnalysis::from_ast(&self.ast(pattern)?))
|
||||
}
|
||||
|
||||
/// Parse the given pattern into its abstract syntax.
|
||||
///
|
||||
/// This returns an error if the given pattern failed to parse.
|
||||
fn ast(&self, pattern: &str) -> Result<Ast, Error> {
|
||||
ast::parse::ParserBuilder::new()
|
||||
.nest_limit(self.nest_limit)
|
||||
.octal(self.octal)
|
||||
.ignore_whitespace(self.ignore_whitespace)
|
||||
.build()
|
||||
.parse(pattern)
|
||||
.map_err(Error::regex)
|
||||
}
|
||||
}
|
||||
|
||||
/// A "configured" HIR expression, which is aware of the configuration which
|
||||
/// produced this HIR.
|
||||
///
|
||||
/// Since the configuration is tracked, values with this type can be
|
||||
/// transformed into other HIR expressions (or regular expressions) in a way
|
||||
/// that preserves the configuration. For example, the `fast_line_regex`
|
||||
/// method will apply literal extraction to the inner HIR and use that to build
|
||||
/// a new regex that matches the extracted literals in a way that is
|
||||
/// consistent with the configuration that produced this HIR. For example, the
|
||||
/// size limits set on the configured HIR will be propagated out to any
|
||||
/// subsequently constructed HIR or regular expression.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ConfiguredHIR {
|
||||
original: String,
|
||||
config: Config,
|
||||
analysis: AstAnalysis,
|
||||
expr: Hir,
|
||||
}
|
||||
|
||||
impl ConfiguredHIR {
|
||||
/// Return the configuration for this HIR expression.
|
||||
pub fn config(&self) -> &Config {
|
||||
&self.config
|
||||
}
|
||||
|
||||
/// Compute the set of non-matching bytes for this HIR expression.
|
||||
pub fn non_matching_bytes(&self) -> ByteSet {
|
||||
non_matching_bytes(&self.expr)
|
||||
}
|
||||
|
||||
/// Builds a regular expression from this HIR expression.
|
||||
pub fn regex(&self) -> Result<Regex, Error> {
|
||||
self.pattern_to_regex(&self.expr.to_string())
|
||||
}
|
||||
|
||||
/// Applies the given function to the concrete syntax of this HIR and then
|
||||
/// generates a new HIR based on the result of the function in a way that
|
||||
/// preserves the configuration.
|
||||
///
|
||||
/// For example, this can be used to wrap a user provided regular
|
||||
/// expression with additional semantics. e.g., See the `WordMatcher`.
|
||||
pub fn with_pattern<F: FnMut(&str) -> String>(
|
||||
&self,
|
||||
mut f: F,
|
||||
) -> Result<ConfiguredHIR, Error>
|
||||
{
|
||||
self.pattern_to_hir(&f(&self.expr.to_string()))
|
||||
}
|
||||
|
||||
/// If the current configuration has a line terminator set and if useful
|
||||
/// literals could be extracted, then a regular expression matching those
|
||||
/// literals is returned. If no line terminator is set, then `None` is
|
||||
/// returned.
|
||||
///
|
||||
/// If compiling the resulting regular expression failed, then an error
|
||||
/// is returned.
|
||||
///
|
||||
/// This method only returns something when a line terminator is set
|
||||
/// because matches from this regex are generally candidates that must be
|
||||
/// confirmed before reporting a match. When performing a line oriented
|
||||
/// search, confirmation is easy: just extend the candidate match to its
|
||||
/// respective line boundaries and then re-search that line for a full
|
||||
/// match. This only works when the line terminator is set because the line
|
||||
/// terminator setting guarantees that the regex itself can never match
|
||||
/// through the line terminator byte.
|
||||
pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
|
||||
if self.config.line_terminator.is_none() {
|
||||
return Ok(None);
|
||||
}
|
||||
match LiteralSets::new(&self.expr).one_regex() {
|
||||
None => Ok(None),
|
||||
Some(pattern) => self.pattern_to_regex(&pattern).map(Some),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a regex from the given pattern using this HIR's configuration.
|
||||
fn pattern_to_regex(&self, pattern: &str) -> Result<Regex, Error> {
|
||||
// The settings we explicitly set here are intentionally a subset
|
||||
// of the settings we have. The key point here is that our HIR
|
||||
// expression is computed with the settings in mind, such that setting
|
||||
// them here could actually lead to unintended behavior. For example,
|
||||
// consider the pattern `(?U)a+`. This will get folded into the HIR
|
||||
// as a non-greedy repetition operator which will in turn get printed
|
||||
// to the concrete syntax as `a+?`, which is correct. But if we
|
||||
// set the `swap_greed` option again, then we'll wind up with `(?U)a+?`
|
||||
// which is equal to `a+` which is not the same as what we were given.
|
||||
//
|
||||
// We also don't need to apply `case_insensitive` since this gets
|
||||
// folded into the HIR and would just cause us to do redundant work.
|
||||
//
|
||||
// Finally, we don't need to set `ignore_whitespace` since the concrete
|
||||
// syntax emitted by the HIR printer never needs it.
|
||||
//
|
||||
// We set the rest of the options. Some of them are important, such as
|
||||
// the size limit, and some of them are necessary to preserve the
|
||||
// intention of the original pattern. For example, the Unicode flag
|
||||
// will impact how the WordMatcher functions, namely, whether its
|
||||
// word boundaries are Unicode aware or not.
|
||||
RegexBuilder::new(&pattern)
|
||||
.nest_limit(self.config.nest_limit)
|
||||
.octal(self.config.octal)
|
||||
.multi_line(self.config.multi_line)
|
||||
.dot_matches_new_line(self.config.dot_matches_new_line)
|
||||
.unicode(self.config.unicode)
|
||||
.size_limit(self.config.size_limit)
|
||||
.dfa_size_limit(self.config.dfa_size_limit)
|
||||
.build()
|
||||
.map_err(Error::regex)
|
||||
}
|
||||
|
||||
/// Create an HIR expression from the given pattern using this HIR's
|
||||
/// configuration.
|
||||
fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
|
||||
// See `pattern_to_regex` comment for explanation of why we only set
|
||||
// a subset of knobs here. e.g., `swap_greed` is explicitly left out.
|
||||
let expr = ::regex_syntax::ParserBuilder::new()
|
||||
.nest_limit(self.config.nest_limit)
|
||||
.octal(self.config.octal)
|
||||
.allow_invalid_utf8(true)
|
||||
.multi_line(self.config.multi_line)
|
||||
.dot_matches_new_line(self.config.dot_matches_new_line)
|
||||
.unicode(self.config.unicode)
|
||||
.build()
|
||||
.parse(pattern)
|
||||
.map_err(Error::regex)?;
|
||||
Ok(ConfiguredHIR {
|
||||
original: self.original.clone(),
|
||||
config: self.config.clone(),
|
||||
analysis: self.analysis.clone(),
|
||||
expr: expr,
|
||||
})
|
||||
}
|
||||
}
|
83
grep-regex/src/crlf.rs
Normal file
83
grep-regex/src/crlf.rs
Normal file
@ -0,0 +1,83 @@
|
||||
use regex_syntax::hir::{self, Hir, HirKind};
|
||||
|
||||
/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`.
|
||||
///
|
||||
/// This does not preserve the exact semantics of the given expression,
|
||||
/// however, it does have the useful property that anything that matched the
|
||||
/// given expression will also match the returned expression. The difference is
|
||||
/// that the returned expression can match possibly other things as well.
|
||||
///
|
||||
/// The principle reason why we do this is because the underlying regex engine
|
||||
/// doesn't support CRLF aware `$` look-around. It's planned to fix it at that
|
||||
/// level, but we perform this kludge in the mean time.
|
||||
///
|
||||
/// Note that while the match preserving semantics are nice and neat, the
|
||||
/// match position semantics are quite a bit messier. Namely, `$` only ever
|
||||
/// matches the position between characters where as `\r??` can match a
|
||||
/// character and change the offset. This is regretable, but works out pretty
|
||||
/// nicely in most cases, especially when a match is limited to a single line.
|
||||
pub fn crlfify(expr: Hir) -> Hir {
|
||||
match expr.into_kind() {
|
||||
HirKind::Anchor(hir::Anchor::EndLine) => {
|
||||
let concat = Hir::concat(vec![
|
||||
Hir::repetition(hir::Repetition {
|
||||
kind: hir::RepetitionKind::ZeroOrOne,
|
||||
greedy: false,
|
||||
hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))),
|
||||
}),
|
||||
Hir::anchor(hir::Anchor::EndLine),
|
||||
]);
|
||||
Hir::group(hir::Group {
|
||||
kind: hir::GroupKind::NonCapturing,
|
||||
hir: Box::new(concat),
|
||||
})
|
||||
}
|
||||
HirKind::Empty => Hir::empty(),
|
||||
HirKind::Literal(x) => Hir::literal(x),
|
||||
HirKind::Class(x) => Hir::class(x),
|
||||
HirKind::Anchor(x) => Hir::anchor(x),
|
||||
HirKind::WordBoundary(x) => Hir::word_boundary(x),
|
||||
HirKind::Repetition(mut x) => {
|
||||
x.hir = Box::new(crlfify(*x.hir));
|
||||
Hir::repetition(x)
|
||||
}
|
||||
HirKind::Group(mut x) => {
|
||||
x.hir = Box::new(crlfify(*x.hir));
|
||||
Hir::group(x)
|
||||
}
|
||||
HirKind::Concat(xs) => {
|
||||
Hir::concat(xs.into_iter().map(crlfify).collect())
|
||||
}
|
||||
HirKind::Alternation(xs) => {
|
||||
Hir::alternation(xs.into_iter().map(crlfify).collect())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use regex_syntax::Parser;
|
||||
use super::crlfify;
|
||||
|
||||
fn roundtrip(pattern: &str) -> String {
|
||||
let expr1 = Parser::new().parse(pattern).unwrap();
|
||||
let expr2 = crlfify(expr1);
|
||||
expr2.to_string()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn various() {
|
||||
assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))");
|
||||
assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$))(?:\r??(?m:$))");
|
||||
assert_eq!(
|
||||
roundtrip(r"(?m)(?:foo$|bar$)"),
|
||||
"(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))"
|
||||
);
|
||||
assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$))a");
|
||||
|
||||
// Not a multiline `$`, so no crlfifying occurs.
|
||||
assert_eq!(roundtrip(r"$"), "\\z");
|
||||
// It's a literal, derp.
|
||||
assert_eq!(roundtrip(r"\$"), "\\$");
|
||||
}
|
||||
}
|
88
grep-regex/src/error.rs
Normal file
88
grep-regex/src/error.rs
Normal file
@ -0,0 +1,88 @@
|
||||
use std::error;
|
||||
use std::fmt;
|
||||
|
||||
use util;
|
||||
|
||||
/// An error that can occur in this crate.
|
||||
///
|
||||
/// Generally, this error corresponds to problems building a regular
|
||||
/// expression, whether it's in parsing, compilation or a problem with
|
||||
/// guaranteeing a configured optimization.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Error {
|
||||
kind: ErrorKind,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
pub(crate) fn new(kind: ErrorKind) -> Error {
|
||||
Error { kind }
|
||||
}
|
||||
|
||||
pub(crate) fn regex<E: error::Error>(err: E) -> Error {
|
||||
Error { kind: ErrorKind::Regex(err.to_string()) }
|
||||
}
|
||||
|
||||
/// Return the kind of this error.
|
||||
pub fn kind(&self) -> &ErrorKind {
|
||||
&self.kind
|
||||
}
|
||||
}
|
||||
|
||||
/// The kind of an error that can occur.
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum ErrorKind {
|
||||
/// An error that occurred as a result of parsing a regular expression.
|
||||
/// This can be a syntax error or an error that results from attempting to
|
||||
/// compile a regular expression that is too big.
|
||||
///
|
||||
/// The string here is the underlying error converted to a string.
|
||||
Regex(String),
|
||||
/// An error that occurs when a building a regex that isn't permitted to
|
||||
/// match a line terminator. In general, building the regex will do its
|
||||
/// best to make matching a line terminator impossible (e.g., by removing
|
||||
/// `\n` from the `\s` character class), but if the regex contains a
|
||||
/// `\n` literal, then there is no reasonable choice that can be made and
|
||||
/// therefore an error is reported.
|
||||
///
|
||||
/// The string is the literal sequence found in the regex that is not
|
||||
/// allowed.
|
||||
NotAllowed(String),
|
||||
/// This error occurs when a non-ASCII line terminator was provided.
|
||||
///
|
||||
/// The invalid byte is included in this error.
|
||||
InvalidLineTerminator(u8),
|
||||
/// Hints that destructuring should not be exhaustive.
|
||||
///
|
||||
/// This enum may grow additional variants, so this makes sure clients
|
||||
/// don't count on exhaustive matching. (Otherwise, adding a new variant
|
||||
/// could break existing code.)
|
||||
#[doc(hidden)]
|
||||
__Nonexhaustive,
|
||||
}
|
||||
|
||||
impl error::Error for Error {
|
||||
fn description(&self) -> &str {
|
||||
match self.kind {
|
||||
ErrorKind::Regex(_) => "regex error",
|
||||
ErrorKind::NotAllowed(_) => "literal not allowed",
|
||||
ErrorKind::InvalidLineTerminator(_) => "invalid line terminator",
|
||||
ErrorKind::__Nonexhaustive => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self.kind {
|
||||
ErrorKind::Regex(ref s) => write!(f, "{}", s),
|
||||
ErrorKind::NotAllowed(ref lit) => {
|
||||
write!(f, "the literal '{:?}' is not allowed in a regex", lit)
|
||||
}
|
||||
ErrorKind::InvalidLineTerminator(byte) => {
|
||||
let x = util::show_bytes(&[byte]);
|
||||
write!(f, "line terminators must be ASCII, but '{}' is not", x)
|
||||
}
|
||||
ErrorKind::__Nonexhaustive => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
27
grep-regex/src/lib.rs
Normal file
27
grep-regex/src/lib.rs
Normal file
@ -0,0 +1,27 @@
|
||||
/*!
|
||||
An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine.
|
||||
*/
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
extern crate grep_matcher;
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
extern crate regex;
|
||||
extern crate regex_syntax;
|
||||
extern crate thread_local;
|
||||
extern crate utf8_ranges;
|
||||
|
||||
pub use error::{Error, ErrorKind};
|
||||
pub use matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder};
|
||||
|
||||
mod ast;
|
||||
mod config;
|
||||
mod crlf;
|
||||
mod error;
|
||||
mod literal;
|
||||
mod matcher;
|
||||
mod non_matching;
|
||||
mod strip;
|
||||
mod util;
|
||||
mod word;
|
304
grep-regex/src/literal.rs
Normal file
304
grep-regex/src/literal.rs
Normal file
@ -0,0 +1,304 @@
|
||||
/*
|
||||
This module is responsible for extracting *inner* literals out of the AST of a
|
||||
regular expression. Normally this is the job of the regex engine itself, but
|
||||
the regex engine doesn't look for inner literals. Since we're doing line based
|
||||
searching, we can use them, so we need to do it ourselves.
|
||||
*/
|
||||
|
||||
use std::cmp;
|
||||
|
||||
use regex_syntax::hir::{self, Hir, HirKind};
|
||||
use regex_syntax::hir::literal::{Literal, Literals};
|
||||
|
||||
use util;
|
||||
|
||||
/// Represents prefix, suffix and inner "required" literals for a regular
/// expression.
///
/// Prefixes and suffixes are detected using regex-syntax. The inner required
/// literals are detected using something custom (but based on the code in
/// regex-syntax).
#[derive(Clone, Debug)]
pub struct LiteralSets {
    /// A set of prefix literals.
    prefixes: Literals,
    /// A set of suffix literals.
    suffixes: Literals,
    /// A set of literals such that at least one of them must appear in every
    /// match. A literal in this set may be neither a prefix nor a suffix.
    required: Literals,
}
|
||||
|
||||
impl LiteralSets {
    /// Create a set of literals from the given HIR expression.
    pub fn new(expr: &Hir) -> LiteralSets {
        let mut required = Literals::empty();
        union_required(expr, &mut required);
        LiteralSets {
            prefixes: Literals::prefixes(expr),
            suffixes: Literals::suffixes(expr),
            required: required,
        }
    }

    /// If it is deemed advantageous to do so (via various suspicious
    /// heuristics), this will return a single regular expression pattern that
    /// matches a subset of the language matched by the regular expression that
    /// generated these literal sets. The idea here is that the pattern
    /// returned by this method is much cheaper to search for. i.e., It is
    /// usually a single literal or an alternation of literals.
    pub fn one_regex(&self) -> Option<String> {
        // TODO: The logic in this function is basically inscrutable. It grew
        // organically in the old grep 0.1 crate. Ideally, it would be
        // re-worked. In fact, the entire inner literal extraction should be
        // re-worked. Actually, most of regex-syntax's literal extraction
        // should also be re-worked. Alas... only so much time in the day.

        if self.prefixes.all_complete() && !self.prefixes.is_empty() {
            debug!("literal prefixes detected: {:?}", self.prefixes);
            // When this is true, the regex engine will do a literal scan,
            // so we don't need to return anything.
            return None;
        }

        // Out of inner required literals, prefixes and suffixes, which one
        // is the longest? We pick the longest to do fast literal scan under
        // the assumption that a longer literal will have a lower false
        // positive rate.
        let pre_lcp = self.prefixes.longest_common_prefix();
        let pre_lcs = self.prefixes.longest_common_suffix();
        let suf_lcp = self.suffixes.longest_common_prefix();
        let suf_lcs = self.suffixes.longest_common_suffix();

        // The longest single required literal, or the empty slice if there
        // are no required literals at all.
        let req_lits = self.required.literals();
        let req = match req_lits.iter().max_by_key(|lit| lit.len()) {
            None => &[],
            Some(req) => &***req,
        };

        let mut lit = pre_lcp;
        if pre_lcs.len() > lit.len() {
            lit = pre_lcs;
        }
        if suf_lcp.len() > lit.len() {
            lit = suf_lcp;
        }
        if suf_lcs.len() > lit.len() {
            lit = suf_lcs;
        }
        if req_lits.len() == 1 && req.len() > lit.len() {
            lit = req;
        }

        // Special case: if we detected an alternation of inner required
        // literals and its longest literal is bigger than the longest
        // prefix/suffix, then choose the alternation. In practice, this
        // helps with case insensitive matching, which can generate lots of
        // inner required literals.
        let any_empty = req_lits.iter().any(|lit| lit.is_empty());
        if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
            debug!("required literals found: {:?}", req_lits);
            let alts: Vec<String> = req_lits
                .into_iter()
                .map(|x| util::bytes_to_regex(x))
                .collect();
            // We're matching raw bytes, so disable Unicode mode.
            Some(format!("(?-u:{})", alts.join("|")))
        } else if lit.is_empty() {
            None
        } else {
            debug!("required literal found: {:?}", util::show_bytes(lit));
            Some(format!("(?-u:{})", util::bytes_to_regex(&lit)))
        }
    }
}
|
||||
|
||||
/// Accumulate the "required" literals of `expr` into `lits` by structural
/// recursion on the HIR.
///
/// "Required" here means: at least one literal in the resulting set must
/// appear in every match of `expr`. `lits.cut()` marks the current literals
/// as frozen (no longer extendable by subsequent concatenation), which is how
/// this function gives up on sub-expressions it cannot extract from.
fn union_required(expr: &Hir, lits: &mut Literals) {
    match *expr.kind() {
        HirKind::Literal(hir::Literal::Unicode(c)) => {
            // A `char` is at most 4 bytes when UTF-8 encoded.
            let mut buf = [0u8; 4];
            lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
        }
        HirKind::Literal(hir::Literal::Byte(b)) => {
            lits.cross_add(&[b]);
        }
        HirKind::Class(hir::Class::Unicode(ref cls)) => {
            // A large class would blow up the literal set, so give up
            // (cut) instead of expanding it. The `5` is a heuristic.
            if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) {
                lits.cut();
            }
        }
        HirKind::Class(hir::Class::Bytes(ref cls)) => {
            if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) {
                lits.cut();
            }
        }
        HirKind::Group(hir::Group { ref hir, .. }) => {
            // Groups are transparent for literal extraction.
            union_required(&**hir, lits);
        }
        HirKind::Repetition(ref x) => {
            match x.kind {
                hir::RepetitionKind::ZeroOrOne => lits.cut(),
                hir::RepetitionKind::ZeroOrMore => lits.cut(),
                hir::RepetitionKind::OneOrMore => {
                    // At least one occurrence is required, but anything
                    // after the first is not, so cut after extracting once.
                    union_required(&x.hir, lits);
                    lits.cut();
                }
                hir::RepetitionKind::Range(ref rng) => {
                    let (min, max) = match *rng {
                        hir::RepetitionRange::Exactly(m) => (m, Some(m)),
                        hir::RepetitionRange::AtLeast(m) => (m, None),
                        hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
                    };
                    repeat_range_literals(
                        &x.hir, min, max, x.greedy, lits, union_required);
                }
            }
        }
        HirKind::Concat(ref es) if es.is_empty() => {}
        HirKind::Concat(ref es) if es.len() == 1 => {
            union_required(&es[0], lits)
        }
        HirKind::Concat(ref es) => {
            // Extract from each concatenated expression in turn and extend
            // the running set via cross product.
            for e in es {
                let mut lits2 = lits.to_empty();
                union_required(e, &mut lits2);
                if lits2.is_empty() {
                    lits.cut();
                    continue;
                }
                if lits2.contains_empty() {
                    lits.cut();
                }
                if !lits.cross_product(&lits2) {
                    // If this expression couldn't yield any literal that
                    // could be extended, then we need to quit. Since we're
                    // short-circuiting, we also need to freeze every member.
                    lits.cut();
                    break;
                }
            }
        }
        HirKind::Alternation(ref es) => {
            alternate_literals(es, lits, union_required);
        }
        // Anchors, word boundaries, etc. yield no literals.
        _ => lits.cut(),
    }
}
|
||||
|
||||
/// Extract literals from a counted repetition `e{min,max}` into `lits`,
/// using `f` to recurse into the repeated sub-expression.
///
/// Greediness (`_greedy`) does not affect which literals are required, so it
/// is ignored.
fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
    e: &Hir,
    min: u32,
    max: Option<u32>,
    _greedy: bool,
    lits: &mut Literals,
    mut f: F,
) {
    if min == 0 {
        // This is a bit conservative. If `max` is set, then we could
        // treat this as a finite set of alternations. For now, we
        // just treat it as `e*`.
        lits.cut();
    } else {
        let n = cmp::min(lits.limit_size(), min as usize);
        // We only extract literals from a single repetition, even though
        // we could do more. e.g., `a{3}` will have `a` extracted instead of
        // `aaa`. The reason is that inner literal extraction can't be unioned
        // across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}`
        // is wrong.
        f(e, lits);
        if n < min as usize {
            // The set's size limit kept us from representing even one full
            // repetition, so freeze what we have.
            lits.cut();
        }
        if max.map_or(true, |max| min < max) {
            // A variable repetition count means anything beyond the first
            // occurrence is optional; freeze.
            lits.cut();
        }
    }
}
|
||||
|
||||
/// Extract literals common to all branches of an alternation `es` into
/// `lits`, using `f` to recurse into each branch.
fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
    es: &[Hir],
    lits: &mut Literals,
    mut f: F,
) {
    // Union of the literals of every alternate.
    let mut lits2 = lits.to_empty();
    for e in es {
        let mut lits3 = lits.to_empty();
        // Shrink the per-branch budget so the union stays bounded.
        lits3.set_limit_size(lits.limit_size() / 5);
        f(e, &mut lits3);
        if lits3.is_empty() || !lits2.union(lits3) {
            // If we couldn't find suffixes for *any* of the
            // alternates, then the entire alternation has to be thrown
            // away and any existing members must be frozen. Similarly,
            // if the union couldn't complete, stop and freeze.
            lits.cut();
            return;
        }
    }
    // All we do at the moment is look for prefixes and suffixes. If both
    // are empty, then we report nothing. We should be able to do better than
    // this, but we'll need something more expressive than just a "set of
    // literals."
    let lcp = lits2.longest_common_prefix();
    let lcs = lits2.longest_common_suffix();
    if !lcp.is_empty() {
        lits.cross_add(lcp);
    }
    lits.cut();
    if !lcs.is_empty() {
        // The empty literal keeps the set "open" alongside the suffix.
        lits.add(Literal::empty());
        lits.add(Literal::new(lcs.to_vec()));
    }
}
|
||||
|
||||
/// Return the number of characters in the given class.
|
||||
fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
|
||||
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
|
||||
}
|
||||
|
||||
/// Return the number of bytes in the given class.
|
||||
fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
|
||||
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use regex_syntax::Parser;
    use super::LiteralSets;

    /// Parse `pattern` and build its literal sets.
    fn sets(pattern: &str) -> LiteralSets {
        let hir = Parser::new().parse(pattern).unwrap();
        LiteralSets::new(&hir)
    }

    /// Convenience wrapper around `LiteralSets::one_regex`.
    fn one_regex(pattern: &str) -> Option<String> {
        sets(pattern).one_regex()
    }

    // Put a pattern into the same format as the one returned by `one_regex`.
    fn pat(pattern: &str) -> Option<String> {
        Some(format!("(?-u:{})", pattern))
    }

    #[test]
    fn various() {
        // Obviously no literals.
        assert!(one_regex(r"\w").is_none());
        assert!(one_regex(r"\pL").is_none());

        // Tantalizingly close.
        assert!(one_regex(r"\w|foo").is_none());

        // There's a literal, but it's better if the regex engine handles it
        // internally.
        assert!(one_regex(r"abc").is_none());

        // Core use cases.
        assert_eq!(one_regex(r"\wabc\w"), pat("abc"));
        assert_eq!(one_regex(r"abc\w"), pat("abc"));

        // TODO: Make these pass. We're missing some potentially big wins
        // without these.
        // assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz"));
        // assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz"));
    }
}
|
864
grep-regex/src/matcher.rs
Normal file
864
grep-regex/src/matcher.rs
Normal file
@ -0,0 +1,864 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use grep_matcher::{
|
||||
Captures, LineMatchKind, LineTerminator, Match, Matcher, NoError, ByteSet,
|
||||
};
|
||||
use regex::bytes::{CaptureLocations, Regex};
|
||||
|
||||
use config::{Config, ConfiguredHIR};
|
||||
use error::Error;
|
||||
use word::WordMatcher;
|
||||
|
||||
/// A builder for constructing a `Matcher` using regular expressions.
///
/// This builder re-exports many of the same options found on the regex crate's
/// builder, in addition to a few other options such as smart case, word
/// matching and the ability to set a line terminator which may enable certain
/// types of optimizations.
///
/// The syntax supported is documented as part of the regex crate:
/// https://docs.rs/regex/*/regex/#syntax
#[derive(Clone, Debug)]
pub struct RegexMatcherBuilder {
    // All knobs live in a single `Config`; the builder methods below are
    // thin setters over it.
    config: Config,
}
|
||||
|
||||
impl Default for RegexMatcherBuilder {
|
||||
fn default() -> RegexMatcherBuilder {
|
||||
RegexMatcherBuilder::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl RegexMatcherBuilder {
    /// Create a new builder for configuring a regex matcher.
    pub fn new() -> RegexMatcherBuilder {
        RegexMatcherBuilder {
            config: Config::default(),
        }
    }

    /// Build a new matcher using the current configuration for the provided
    /// pattern.
    ///
    /// The syntax supported is documented as part of the regex crate:
    /// https://docs.rs/regex/*/regex/#syntax
    pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
        let chir = self.config.hir(pattern)?;
        // A cheaper regex (usually a literal alternation) that over-matches
        // the real one; used by `find_candidate_line`.
        let fast_line_regex = chir.fast_line_regex()?;
        let non_matching_bytes = chir.non_matching_bytes();
        if let Some(ref re) = fast_line_regex {
            trace!("extracted fast line regex: {:?}", re);
        }
        Ok(RegexMatcher {
            config: self.config.clone(),
            matcher: RegexMatcherImpl::new(&chir)?,
            fast_line_regex: fast_line_regex,
            non_matching_bytes: non_matching_bytes,
        })
    }

    /// Set the value for the case insensitive (`i`) flag.
    ///
    /// When enabled, letters in the pattern will match both upper case and
    /// lower case variants.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.case_insensitive = yes;
        self
    }

    /// Whether to enable "smart case" or not.
    ///
    /// When smart case is enabled, the builder will automatically enable
    /// case insensitive matching based on how the pattern is written. Namely,
    /// case insensitive mode is enabled when both of the following things
    /// are true:
    ///
    /// 1. The pattern contains at least one literal character. For example,
    ///    `a\w` contains a literal (`a`) but `\w` does not.
    /// 2. Of the literals in the pattern, none of them are considered to be
    ///    uppercase according to Unicode. For example, `foo\pL` has no
    ///    uppercase literals but `Foo\pL` does.
    pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.case_smart = yes;
        self
    }

    /// Set the value for the multi-line matching (`m`) flag.
    ///
    /// When enabled, `^` matches the beginning of lines and `$` matches the
    /// end of lines.
    ///
    /// By default, they match beginning/end of the input.
    pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.multi_line = yes;
        self
    }

    /// Set the value for the any character (`s`) flag, where in `.` matches
    /// anything when `s` is set and matches anything except for new line when
    /// it is not set (the default).
    ///
    /// N.B. "matches anything" means "any byte" when Unicode is disabled and
    /// means "any valid UTF-8 encoding of any Unicode scalar value" when
    /// Unicode is enabled.
    pub fn dot_matches_new_line(
        &mut self,
        yes: bool,
    ) -> &mut RegexMatcherBuilder {
        self.config.dot_matches_new_line = yes;
        self
    }

    /// Set the value for the greedy swap (`U`) flag.
    ///
    /// When enabled, a pattern like `a*` is lazy (tries to find shortest
    /// match) and `a*?` is greedy (tries to find longest match).
    ///
    /// By default, `a*` is greedy and `a*?` is lazy.
    pub fn swap_greed(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.swap_greed = yes;
        self
    }

    /// Set the value for the ignore whitespace (`x`) flag.
    ///
    /// When enabled, whitespace such as new lines and spaces will be ignored
    /// between expressions of the pattern, and `#` can be used to start a
    /// comment until the next new line.
    pub fn ignore_whitespace(
        &mut self,
        yes: bool,
    ) -> &mut RegexMatcherBuilder {
        self.config.ignore_whitespace = yes;
        self
    }

    /// Set the value for the Unicode (`u`) flag.
    ///
    /// Enabled by default. When disabled, character classes such as `\w` only
    /// match ASCII word characters instead of all Unicode word characters.
    pub fn unicode(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.unicode = yes;
        self
    }

    /// Whether to support octal syntax or not.
    ///
    /// Octal syntax is a little-known way of uttering Unicode codepoints in
    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
    /// `\141` are all equivalent regular expressions, where the last example
    /// shows octal syntax.
    ///
    /// While supporting octal syntax isn't in and of itself a problem, it does
    /// make good error messages harder. That is, in PCRE based regex engines,
    /// syntax like `\0` invokes a backreference, which is explicitly
    /// unsupported in Rust's regex engine. However, many users expect it to
    /// be supported. Therefore, when octal support is disabled, the error
    /// message will explicitly mention that backreferences aren't supported.
    ///
    /// Octal syntax is disabled by default.
    pub fn octal(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.octal = yes;
        self
    }

    /// Set the approximate size limit of the compiled regular expression.
    ///
    /// This roughly corresponds to the number of bytes occupied by a single
    /// compiled program. If the program exceeds this number, then a
    /// compilation error is returned.
    pub fn size_limit(&mut self, bytes: usize) -> &mut RegexMatcherBuilder {
        self.config.size_limit = bytes;
        self
    }

    /// Set the approximate size of the cache used by the DFA.
    ///
    /// This roughly corresponds to the number of bytes that the DFA will
    /// use while searching.
    ///
    /// Note that this is a *per thread* limit. There is no way to set a global
    /// limit. In particular, if a regex is used from multiple threads
    /// simultaneously, then each thread may use up to the number of bytes
    /// specified here.
    pub fn dfa_size_limit(
        &mut self,
        bytes: usize,
    ) -> &mut RegexMatcherBuilder {
        self.config.dfa_size_limit = bytes;
        self
    }

    /// Set the nesting limit for this parser.
    ///
    /// The nesting limit controls how deep the abstract syntax tree is allowed
    /// to be. If the AST exceeds the given limit (e.g., with too many nested
    /// groups), then an error is returned by the parser.
    ///
    /// The purpose of this limit is to act as a heuristic to prevent stack
    /// overflow for consumers that do structural induction on an `Ast` using
    /// explicit recursion. While this crate never does this (instead using
    /// constant stack space and moving the call stack to the heap), other
    /// crates may.
    ///
    /// This limit is not checked until the entire Ast is parsed. Therefore,
    /// if callers want to put a limit on the amount of heap space used, then
    /// they should impose a limit on the length, in bytes, of the concrete
    /// pattern string. In particular, this is viable since this parser
    /// implementation will limit itself to heap space proportional to the
    /// length of the pattern string.
    ///
    /// Note that a nest limit of `0` will return a nest limit error for most
    /// patterns but not all. For example, a nest limit of `0` permits `a` but
    /// not `ab`, since `ab` requires a concatenation, which results in a nest
    /// depth of `1`. In general, a nest limit is not something that manifests
    /// in an obvious way in the concrete syntax, therefore, it should not be
    /// used in a granular way.
    pub fn nest_limit(&mut self, limit: u32) -> &mut RegexMatcherBuilder {
        self.config.nest_limit = limit;
        self
    }

    /// Set an ASCII line terminator for the matcher.
    ///
    /// The purpose of setting a line terminator is to enable a certain class
    /// of optimizations that can make line oriented searching faster. Namely,
    /// when a line terminator is enabled, then the builder will guarantee that
    /// the resulting matcher will never be capable of producing a match that
    /// contains the line terminator. Because of this guarantee, users of the
    /// resulting matcher do not need to slowly execute a search line by line
    /// for line oriented search.
    ///
    /// If the aforementioned guarantee about not matching a line terminator
    /// cannot be made because of how the pattern was written, then the builder
    /// will return an error when attempting to construct the matcher. For
    /// example, the pattern `a\sb` will be transformed such that it can never
    /// match `a\nb` (when `\n` is the line terminator), but the pattern `a\nb`
    /// will result in an error since the `\n` cannot be easily removed without
    /// changing the fundamental intent of the pattern.
    ///
    /// If the given line terminator isn't an ASCII byte (`<=127`), then the
    /// builder will return an error when constructing the matcher.
    pub fn line_terminator(
        &mut self,
        line_term: Option<u8>,
    ) -> &mut RegexMatcherBuilder {
        self.config.line_terminator = line_term.map(LineTerminator::byte);
        self
    }

    /// Set the line terminator to `\r\n` and enable CRLF matching for `$` in
    /// regex patterns.
    ///
    /// This method sets two distinct settings:
    ///
    /// 1. It causes the line terminator for the matcher to be `\r\n`. Namely,
    ///    this prevents the matcher from ever producing a match that contains
    ///    a `\r` or `\n`.
    /// 2. It translates all instances of `$` in the pattern to `(?:\r??$)`.
    ///    This works around the fact that the regex engine does not support
    ///    matching CRLF as a line terminator when using `$`.
    ///
    /// In particular, because of (2), the matches produced by the matcher may
    /// be slightly different than what one would expect given the pattern.
    /// This is the trade off made: in many cases, `$` will "just work" in the
    /// presence of `\r\n` line terminators, but matches may require some
    /// trimming to faithfully represent the intended match.
    ///
    /// Note that if you do not wish to set the line terminator but would still
    /// like `$` to match `\r\n` line terminators, then it is valid to call
    /// `crlf(true)` followed by `line_terminator(None)`. Ordering is
    /// important, since `crlf` and `line_terminator` override each other.
    pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        if yes {
            self.config.line_terminator = Some(LineTerminator::crlf());
        } else {
            self.config.line_terminator = None;
        }
        self.config.crlf = yes;
        self
    }

    /// Require that all matches occur on word boundaries.
    ///
    /// Enabling this option is subtly different than putting `\b` assertions
    /// on both sides of your pattern. In particular, a `\b` assertion requires
    /// that one side of it match a word character while the other match a
    /// non-word character. This option, in contrast, merely requires that
    /// one side match a non-word character.
    ///
    /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
    /// word character. However, `-2` with this `word` option enabled will
    /// match the `-2` in `foo -2 bar`.
    pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.word = yes;
        self
    }
}
|
||||
|
||||
/// An implementation of the `Matcher` trait using Rust's standard regex
/// library.
#[derive(Clone, Debug)]
pub struct RegexMatcher {
    /// The configuration specified by the caller.
    config: Config,
    /// The underlying matcher implementation.
    matcher: RegexMatcherImpl,
    /// A regex that never reports false negatives but may report false
    /// positives that is believed to be capable of being matched more quickly
    /// than `regex`. Typically, this is a single literal or an alternation
    /// of literals.
    fast_line_regex: Option<Regex>,
    /// A set of bytes that will never appear in a match.
    non_matching_bytes: ByteSet,
}
|
||||
|
||||
impl RegexMatcher {
|
||||
/// Create a new matcher from the given pattern using the default
|
||||
/// configuration.
|
||||
pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
|
||||
RegexMatcherBuilder::new().build(pattern)
|
||||
}
|
||||
|
||||
/// Create a new matcher from the given pattern using the default
|
||||
/// configuration, but matches lines terminated by `\n`.
|
||||
///
|
||||
/// This returns an error if the given pattern contains a literal `\n`.
|
||||
/// Other uses of `\n` (such as in `\s`) are removed transparently.
|
||||
pub fn new_line_matcher(pattern: &str) -> Result<RegexMatcher, Error> {
|
||||
RegexMatcherBuilder::new()
|
||||
.line_terminator(Some(b'\n'))
|
||||
.build(pattern)
|
||||
}
|
||||
}
|
||||
|
||||
/// An encapsulation of the type of matcher we use in `RegexMatcher`.
#[derive(Clone, Debug)]
enum RegexMatcherImpl {
    /// The standard matcher used for all regular expressions.
    Standard(StandardMatcher),
    /// A matcher that only matches at word boundaries. This transforms the
    /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
    /// Because of this, the WordMatcher provides its own implementation of
    /// `Matcher` to encapsulate its use of capture groups to make them
    /// invisible to the caller.
    Word(WordMatcher),
}
|
||||
|
||||
impl RegexMatcherImpl {
|
||||
/// Based on the configuration, create a new implementation of the
|
||||
/// `Matcher` trait.
|
||||
fn new(expr: &ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
|
||||
if expr.config().word {
|
||||
Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?))
|
||||
} else {
|
||||
Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This implementation just dispatches on the internal matcher impl except
|
||||
// for the line terminator optimization, which is possibly executed via
|
||||
// `fast_line_regex`.
|
||||
impl Matcher for RegexMatcher {
|
||||
type Captures = RegexCaptures;
|
||||
type Error = NoError;
|
||||
|
||||
fn find_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Result<Option<Match>, NoError> {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.find_at(haystack, at),
|
||||
Word(ref m) => m.find_at(haystack, at),
|
||||
}
|
||||
}
|
||||
|
||||
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.new_captures(),
|
||||
Word(ref m) => m.new_captures(),
|
||||
}
|
||||
}
|
||||
|
||||
fn capture_count(&self) -> usize {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.capture_count(),
|
||||
Word(ref m) => m.capture_count(),
|
||||
}
|
||||
}
|
||||
|
||||
fn capture_index(&self, name: &str) -> Option<usize> {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.capture_index(name),
|
||||
Word(ref m) => m.capture_index(name),
|
||||
}
|
||||
}
|
||||
|
||||
fn find(&self, haystack: &[u8]) -> Result<Option<Match>, NoError> {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.find(haystack),
|
||||
Word(ref m) => m.find(haystack),
|
||||
}
|
||||
}
|
||||
|
||||
fn find_iter<F>(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
matched: F,
|
||||
) -> Result<(), NoError>
|
||||
where F: FnMut(Match) -> bool
|
||||
{
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.find_iter(haystack, matched),
|
||||
Word(ref m) => m.find_iter(haystack, matched),
|
||||
}
|
||||
}
|
||||
|
||||
fn try_find_iter<F, E>(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
matched: F,
|
||||
) -> Result<Result<(), E>, NoError>
|
||||
where F: FnMut(Match) -> Result<bool, E>
|
||||
{
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.try_find_iter(haystack, matched),
|
||||
Word(ref m) => m.try_find_iter(haystack, matched),
|
||||
}
|
||||
}
|
||||
|
||||
fn captures(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
caps: &mut RegexCaptures,
|
||||
) -> Result<bool, NoError> {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.captures(haystack, caps),
|
||||
Word(ref m) => m.captures(haystack, caps),
|
||||
}
|
||||
}
|
||||
|
||||
fn captures_iter<F>(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
caps: &mut RegexCaptures,
|
||||
matched: F,
|
||||
) -> Result<(), NoError>
|
||||
where F: FnMut(&RegexCaptures) -> bool
|
||||
{
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.captures_iter(haystack, caps, matched),
|
||||
Word(ref m) => m.captures_iter(haystack, caps, matched),
|
||||
}
|
||||
}
|
||||
|
||||
fn try_captures_iter<F, E>(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
caps: &mut RegexCaptures,
|
||||
matched: F,
|
||||
) -> Result<Result<(), E>, NoError>
|
||||
where F: FnMut(&RegexCaptures) -> Result<bool, E>
|
||||
{
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||
Word(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||
}
|
||||
}
|
||||
|
||||
fn captures_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
caps: &mut RegexCaptures,
|
||||
) -> Result<bool, NoError> {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.captures_at(haystack, at, caps),
|
||||
Word(ref m) => m.captures_at(haystack, at, caps),
|
||||
}
|
||||
}
|
||||
|
||||
fn replace<F>(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
dst: &mut Vec<u8>,
|
||||
append: F,
|
||||
) -> Result<(), NoError>
|
||||
where F: FnMut(Match, &mut Vec<u8>) -> bool
|
||||
{
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.replace(haystack, dst, append),
|
||||
Word(ref m) => m.replace(haystack, dst, append),
|
||||
}
|
||||
}
|
||||
|
||||
fn replace_with_captures<F>(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
caps: &mut RegexCaptures,
|
||||
dst: &mut Vec<u8>,
|
||||
append: F,
|
||||
) -> Result<(), NoError>
|
||||
where F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool
|
||||
{
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => {
|
||||
m.replace_with_captures(haystack, caps, dst, append)
|
||||
}
|
||||
Word(ref m) => {
|
||||
m.replace_with_captures(haystack, caps, dst, append)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn is_match(&self, haystack: &[u8]) -> Result<bool, NoError> {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.is_match(haystack),
|
||||
Word(ref m) => m.is_match(haystack),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_match_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Result<bool, NoError> {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.is_match_at(haystack, at),
|
||||
Word(ref m) => m.is_match_at(haystack, at),
|
||||
}
|
||||
}
|
||||
|
||||
fn shortest_match(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
) -> Result<Option<usize>, NoError> {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.shortest_match(haystack),
|
||||
Word(ref m) => m.shortest_match(haystack),
|
||||
}
|
||||
}
|
||||
|
||||
fn shortest_match_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Result<Option<usize>, NoError> {
|
||||
use self::RegexMatcherImpl::*;
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.shortest_match_at(haystack, at),
|
||||
Word(ref m) => m.shortest_match_at(haystack, at),
|
||||
}
|
||||
}
|
||||
|
||||
fn non_matching_bytes(&self) -> Option<&ByteSet> {
    // Report the precomputed set of bytes that are confirmed to never
    // appear in a match for this pattern.
    Some(&self.non_matching_bytes)
}
|
||||
|
||||
fn line_terminator(&self) -> Option<LineTerminator> {
    // The line terminator, if any, that was set on this matcher's config.
    self.config.line_terminator
}
|
||||
|
||||
fn find_candidate_line(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
) -> Result<Option<LineMatchKind>, NoError> {
|
||||
Ok(match self.fast_line_regex {
|
||||
Some(ref regex) => {
|
||||
regex.shortest_match(haystack).map(LineMatchKind::Candidate)
|
||||
}
|
||||
None => {
|
||||
self.shortest_match(haystack)?.map(LineMatchKind::Confirmed)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// The implementation of the standard regex matcher.
///
/// This is a thin wrapper around `regex::bytes::Regex` that implements the
/// `Matcher` trait directly, with no additional match semantics layered on
/// top (contrast with `WordMatcher`).
#[derive(Clone, Debug)]
struct StandardMatcher {
    /// The regular expression compiled from the pattern provided by the
    /// caller.
    regex: Regex,
    /// A map from capture group name to its corresponding index.
    names: HashMap<String, usize>,
}
|
||||
|
||||
impl StandardMatcher {
|
||||
fn new(expr: &ConfiguredHIR) -> Result<StandardMatcher, Error> {
|
||||
let regex = expr.regex()?;
|
||||
let mut names = HashMap::new();
|
||||
for (i, optional_name) in regex.capture_names().enumerate() {
|
||||
if let Some(name) = optional_name {
|
||||
names.insert(name.to_string(), i);
|
||||
}
|
||||
}
|
||||
Ok(StandardMatcher { regex, names })
|
||||
}
|
||||
}
|
||||
|
||||
impl Matcher for StandardMatcher {
|
||||
type Captures = RegexCaptures;
|
||||
type Error = NoError;
|
||||
|
||||
fn find_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Result<Option<Match>, NoError> {
|
||||
Ok(self.regex
|
||||
.find_at(haystack, at)
|
||||
.map(|m| Match::new(m.start(), m.end())))
|
||||
}
|
||||
|
||||
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||
Ok(RegexCaptures::new(self.regex.capture_locations()))
|
||||
}
|
||||
|
||||
fn capture_count(&self) -> usize {
|
||||
self.regex.captures_len()
|
||||
}
|
||||
|
||||
fn capture_index(&self, name: &str) -> Option<usize> {
|
||||
self.names.get(name).map(|i| *i)
|
||||
}
|
||||
|
||||
fn try_find_iter<F, E>(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
mut matched: F,
|
||||
) -> Result<Result<(), E>, NoError>
|
||||
where F: FnMut(Match) -> Result<bool, E>
|
||||
{
|
||||
for m in self.regex.find_iter(haystack) {
|
||||
match matched(Match::new(m.start(), m.end())) {
|
||||
Ok(true) => continue,
|
||||
Ok(false) => return Ok(Ok(())),
|
||||
Err(err) => return Ok(Err(err)),
|
||||
}
|
||||
}
|
||||
Ok(Ok(()))
|
||||
}
|
||||
|
||||
fn captures_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
caps: &mut RegexCaptures,
|
||||
) -> Result<bool, NoError> {
|
||||
Ok(self.regex.captures_read_at(&mut caps.locs, haystack, at).is_some())
|
||||
}
|
||||
|
||||
fn shortest_match_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Result<Option<usize>, NoError> {
|
||||
Ok(self.regex.shortest_match_at(haystack, at))
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents the match offsets of each capturing group in a match.
///
/// The first, or `0`th capture group, always corresponds to the entire match
/// and is guaranteed to be present when a match occurs. The next capture
/// group, at index `1`, corresponds to the first capturing group in the regex,
/// ordered by the position at which the left opening parenthesis occurs.
///
/// Note that not all capturing groups are guaranteed to be present in a match.
/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo`
/// or `bar` will ever be set in any given match.
///
/// In order to access a capture group by name, you'll need to first find the
/// index of the group using the corresponding matcher's `capture_index`
/// method, and then use that index with `RegexCaptures::get`.
#[derive(Clone, Debug)]
pub struct RegexCaptures {
    /// Where the locations are stored.
    locs: CaptureLocations,
    /// These captures behave as if the capturing groups begin at the given
    /// offset. When set to `0`, this has no effect and capture groups are
    /// indexed like normal.
    ///
    /// This is useful when building matchers that wrap arbitrary regular
    /// expressions. For example, `WordMatcher` takes an existing regex `re`
    /// and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that the regex
    /// has been wrapped from the caller. In order to do this, the matcher
    /// and the capturing groups must behave as if `(re)` is the `0`th capture
    /// group.
    offset: usize,
}
|
||||
|
||||
impl Captures for RegexCaptures {
|
||||
fn len(&self) -> usize {
|
||||
self.locs.len().checked_sub(self.offset).unwrap()
|
||||
}
|
||||
|
||||
fn get(&self, i: usize) -> Option<Match> {
|
||||
let actual = i.checked_add(self.offset).unwrap();
|
||||
self.locs.pos(actual).map(|(s, e)| Match::new(s, e))
|
||||
}
|
||||
}
|
||||
|
||||
impl RegexCaptures {
    /// Create captures with no hidden leading groups.
    pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
        RegexCaptures::with_offset(locs, 0)
    }

    /// Create captures that hide the first `offset` capture groups from the
    /// caller (used by wrapping matchers such as `WordMatcher`).
    pub(crate) fn with_offset(
        locs: CaptureLocations,
        offset: usize,
    ) -> RegexCaptures {
        RegexCaptures { locs, offset }
    }

    /// Mutable access to the underlying capture locations, for matchers
    /// that need to fill them directly.
    pub(crate) fn locations(&mut self) -> &mut CaptureLocations {
        &mut self.locs
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use grep_matcher::{LineMatchKind, Matcher};
    use super::*;

    // Test that enabling word matches does the right thing and demonstrate
    // the difference between it and surrounding the regex in `\b`.
    #[test]
    fn word() {
        let matcher = RegexMatcherBuilder::new()
            .word(true)
            .build(r"-2")
            .unwrap();
        assert!(matcher.is_match(b"abc -2 foo").unwrap());

        // `\b-2\b` fails here because `\b` looks at word characters, and
        // `-` is not one; the word(true) wrapper uses `\W` boundaries
        // instead.
        let matcher = RegexMatcherBuilder::new()
            .word(false)
            .build(r"\b-2\b")
            .unwrap();
        assert!(!matcher.is_match(b"abc -2 foo").unwrap());
    }

    // Test that enabling a line terminator prevents it from matching through
    // said line terminator.
    #[test]
    fn line_terminator() {
        // This works, because there's no line terminator specified.
        let matcher = RegexMatcherBuilder::new()
            .build(r"abc\sxyz")
            .unwrap();
        assert!(matcher.is_match(b"abc\nxyz").unwrap());

        // This doesn't.
        let matcher = RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"abc\sxyz")
            .unwrap();
        assert!(!matcher.is_match(b"abc\nxyz").unwrap());
    }

    // Ensure that the builder returns an error if a line terminator is set
    // and the regex could not be modified to remove a line terminator.
    #[test]
    fn line_terminator_error() {
        assert!(RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"a\nz")
            .is_err())
    }

    // Test that enabling CRLF permits `$` to match at the end of a line.
    #[test]
    fn line_terminator_crlf() {
        // Test normal use of `$` with a `\n` line terminator.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .build(r"abc$")
            .unwrap();
        assert!(matcher.is_match(b"abc\n").unwrap());

        // Test that `$` doesn't match at `\r\n` boundary normally.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .build(r"abc$")
            .unwrap();
        assert!(!matcher.is_match(b"abc\r\n").unwrap());

        // Now check the CRLF handling.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .crlf(true)
            .build(r"abc$")
            .unwrap();
        assert!(matcher.is_match(b"abc\r\n").unwrap());
    }

    // Test that smart case works.
    #[test]
    fn case_smart() {
        // All-lowercase pattern => case-insensitive search.
        let matcher = RegexMatcherBuilder::new()
            .case_smart(true)
            .build(r"abc")
            .unwrap();
        assert!(matcher.is_match(b"ABC").unwrap());

        // Mixed-case pattern => case-sensitive search.
        let matcher = RegexMatcherBuilder::new()
            .case_smart(true)
            .build(r"aBc")
            .unwrap();
        assert!(!matcher.is_match(b"ABC").unwrap());
    }

    // Test that finding candidate lines works as expected.
    #[test]
    fn candidate_lines() {
        fn is_confirmed(m: LineMatchKind) -> bool {
            match m {
                LineMatchKind::Confirmed(_) => true,
                _ => false,
            }
        }
        fn is_candidate(m: LineMatchKind) -> bool {
            match m {
                LineMatchKind::Candidate(_) => true,
                _ => false,
            }
        }

        // With no line terminator set, we can't employ any optimizations,
        // so we get a confirmed match.
        let matcher = RegexMatcherBuilder::new()
            .build(r"\wfoo\s")
            .unwrap();
        let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(is_confirmed(m));

        // With a line terminator and a regex specially crafted to have an
        // easy-to-detect inner literal, we can apply an optimization that
        // quickly finds candidate matches.
        let matcher = RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"\wfoo\s")
            .unwrap();
        let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(is_candidate(m));
    }
}
|
128
grep-regex/src/non_matching.rs
Normal file
128
grep-regex/src/non_matching.rs
Normal file
@ -0,0 +1,128 @@
|
||||
use grep_matcher::ByteSet;
|
||||
use regex_syntax::hir::{self, Hir, HirKind};
|
||||
use utf8_ranges::Utf8Sequences;
|
||||
|
||||
/// Return a confirmed set of non-matching bytes from the given expression.
|
||||
pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
|
||||
let mut set = ByteSet::full();
|
||||
remove_matching_bytes(expr, &mut set);
|
||||
set
|
||||
}
|
||||
|
||||
/// Remove any bytes from the given set that can occur in a matched produced by
|
||||
/// the given expression.
|
||||
fn remove_matching_bytes(
|
||||
expr: &Hir,
|
||||
set: &mut ByteSet,
|
||||
) {
|
||||
match *expr.kind() {
|
||||
HirKind::Empty
|
||||
| HirKind::Anchor(_)
|
||||
| HirKind::WordBoundary(_) => {}
|
||||
HirKind::Literal(hir::Literal::Unicode(c)) => {
|
||||
for &b in c.encode_utf8(&mut [0; 4]).as_bytes() {
|
||||
set.remove(b);
|
||||
}
|
||||
}
|
||||
HirKind::Literal(hir::Literal::Byte(b)) => {
|
||||
set.remove(b);
|
||||
}
|
||||
HirKind::Class(hir::Class::Unicode(ref cls)) => {
|
||||
for range in cls.iter() {
|
||||
// This is presumably faster than encoding every codepoint
|
||||
// to UTF-8 and then removing those bytes from the set.
|
||||
for seq in Utf8Sequences::new(range.start(), range.end()) {
|
||||
for byte_range in seq.as_slice() {
|
||||
set.remove_all(byte_range.start, byte_range.end);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
HirKind::Class(hir::Class::Bytes(ref cls)) => {
|
||||
for range in cls.iter() {
|
||||
set.remove_all(range.start(), range.end());
|
||||
}
|
||||
}
|
||||
HirKind::Repetition(ref x) => {
|
||||
remove_matching_bytes(&x.hir, set);
|
||||
}
|
||||
HirKind::Group(ref x) => {
|
||||
remove_matching_bytes(&x.hir, set);
|
||||
}
|
||||
HirKind::Concat(ref xs) => {
|
||||
for x in xs {
|
||||
remove_matching_bytes(x, set);
|
||||
}
|
||||
}
|
||||
HirKind::Alternation(ref xs) => {
|
||||
for x in xs {
|
||||
remove_matching_bytes(x, set);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use grep_matcher::ByteSet;
    use regex_syntax::ParserBuilder;

    use super::non_matching_bytes;

    // Parse a pattern (allowing invalid UTF-8) and compute its
    // non-matching byte set.
    fn extract(pattern: &str) -> ByteSet {
        let expr = ParserBuilder::new()
            .allow_invalid_utf8(true)
            .build()
            .parse(pattern)
            .unwrap();
        non_matching_bytes(&expr)
    }

    // Expand a ByteSet into the sorted list of bytes it contains.
    fn sparse(set: &ByteSet) -> Vec<u8> {
        let mut sparse_set = vec![];
        for b in (0..256).map(|b| b as u8) {
            if set.contains(b) {
                sparse_set.push(b);
            }
        }
        sparse_set
    }

    // All 256 bytes except the given ones, in sorted order.
    fn sparse_except(except: &[u8]) -> Vec<u8> {
        let mut except_set = vec![false; 256];
        for &b in except {
            except_set[b as usize] = true;
        }

        let mut set = vec![];
        for b in (0..256).map(|b| b as u8) {
            if !except_set[b as usize] {
                set.push(b);
            }
        }
        set
    }

    #[test]
    fn dot() {
        // `.` can't match `\n` nor bytes that never appear in valid UTF-8.
        assert_eq!(sparse(&extract(".")), vec![
            b'\n',
            192, 193, 245, 246, 247, 248, 249,
            250, 251, 252, 253, 254, 255,
        ]);
        assert_eq!(sparse(&extract("(?s).")), vec![
            192, 193, 245, 246, 247, 248, 249,
            250, 251, 252, 253, 254, 255,
        ]);
        assert_eq!(sparse(&extract("(?-u).")), vec![b'\n']);
        assert_eq!(sparse(&extract("(?s-u).")), vec![]);
    }

    #[test]
    fn literal() {
        assert_eq!(sparse(&extract("a")), sparse_except(&[b'a']));
        assert_eq!(sparse(&extract("☃")), sparse_except(&[0xE2, 0x98, 0x83]));
        assert_eq!(sparse(&extract(r"\xFF")), sparse_except(&[0xC3, 0xBF]));
        assert_eq!(sparse(&extract(r"(?-u)\xFF")), sparse_except(&[0xFF]));
    }
}
|
154
grep-regex/src/strip.rs
Normal file
154
grep-regex/src/strip.rs
Normal file
@ -0,0 +1,154 @@
|
||||
use grep_matcher::LineTerminator;
|
||||
use regex_syntax::hir::{self, Hir, HirKind};
|
||||
|
||||
use error::{Error, ErrorKind};
|
||||
|
||||
/// Return an HIR that is guaranteed to never match the given line terminator,
|
||||
/// if possible.
|
||||
///
|
||||
/// If the transformation isn't possible, then an error is returned.
|
||||
///
|
||||
/// In general, if a literal line terminator occurs anywhere in the HIR, then
|
||||
/// this will return an error. However, if the line terminator occurs within
|
||||
/// a character class with at least one other character (that isn't also a line
|
||||
/// terminator), then the line terminator is simply stripped from that class.
|
||||
///
|
||||
/// If the given line terminator is not ASCII, then this function returns an
|
||||
/// error.
|
||||
pub fn strip_from_match(
|
||||
expr: Hir,
|
||||
line_term: LineTerminator,
|
||||
) -> Result<Hir, Error> {
|
||||
if line_term.is_crlf() {
|
||||
let expr1 = strip_from_match_ascii(expr, b'\r')?;
|
||||
strip_from_match_ascii(expr1, b'\n')
|
||||
} else {
|
||||
let b = line_term.as_byte();
|
||||
if b > 0x7F {
|
||||
return Err(Error::new(ErrorKind::InvalidLineTerminator(b)));
|
||||
}
|
||||
strip_from_match_ascii(expr, b)
|
||||
}
|
||||
}
|
||||
|
||||
/// The implementation of strip_from_match. The given byte must be ASCII. This
/// function panics otherwise.
fn strip_from_match_ascii(
    expr: Hir,
    byte: u8,
) -> Result<Hir, Error> {
    assert!(byte <= 0x7F);
    // Since the byte is ASCII, it maps to exactly one `char` and its UTF-8
    // encoding is a single byte, so character- and byte-oriented HIR nodes
    // can be compared uniformly below.
    let chr = byte as char;
    assert_eq!(chr.len_utf8(), 1);

    let invalid = || Err(Error::new(ErrorKind::NotAllowed(chr.to_string())));

    // Rebuild the HIR bottom-up, failing if the terminator must match and
    // stripping it from classes where other characters remain.
    Ok(match expr.into_kind() {
        HirKind::Empty => Hir::empty(),
        HirKind::Literal(hir::Literal::Unicode(c)) => {
            if c == chr {
                return invalid();
            }
            Hir::literal(hir::Literal::Unicode(c))
        }
        HirKind::Literal(hir::Literal::Byte(b)) => {
            if b as char == chr {
                return invalid();
            }
            Hir::literal(hir::Literal::Byte(b))
        }
        HirKind::Class(hir::Class::Unicode(mut cls)) => {
            // Remove the terminator from the class; error only if that
            // leaves the class unable to match anything.
            let remove = hir::ClassUnicode::new(Some(
                hir::ClassUnicodeRange::new(chr, chr),
            ));
            cls.difference(&remove);
            if cls.ranges().is_empty() {
                return invalid();
            }
            Hir::class(hir::Class::Unicode(cls))
        }
        HirKind::Class(hir::Class::Bytes(mut cls)) => {
            let remove = hir::ClassBytes::new(Some(
                hir::ClassBytesRange::new(byte, byte),
            ));
            cls.difference(&remove);
            if cls.ranges().is_empty() {
                return invalid();
            }
            Hir::class(hir::Class::Bytes(cls))
        }
        // Zero-width assertions can never match the terminator.
        HirKind::Anchor(x) => Hir::anchor(x),
        HirKind::WordBoundary(x) => Hir::word_boundary(x),
        HirKind::Repetition(mut x) => {
            x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?);
            Hir::repetition(x)
        }
        HirKind::Group(mut x) => {
            x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?);
            Hir::group(x)
        }
        HirKind::Concat(xs) => {
            let xs = xs.into_iter()
                .map(|e| strip_from_match_ascii(e, byte))
                .collect::<Result<Vec<Hir>, Error>>()?;
            Hir::concat(xs)
        }
        HirKind::Alternation(xs) => {
            let xs = xs.into_iter()
                .map(|e| strip_from_match_ascii(e, byte))
                .collect::<Result<Vec<Hir>, Error>>()?;
            Hir::alternation(xs)
        }
    })
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use regex_syntax::Parser;

    use error::Error;
    use super::{LineTerminator, strip_from_match};

    // Strip `byte` from `pattern` and return the resulting pattern string.
    fn roundtrip(pattern: &str, byte: u8) -> String {
        roundtrip_line_term(pattern, LineTerminator::byte(byte)).unwrap()
    }

    // Same, but strip a CRLF terminator.
    fn roundtrip_crlf(pattern: &str) -> String {
        roundtrip_line_term(pattern, LineTerminator::crlf()).unwrap()
    }

    // Same as `roundtrip`, but propagate the error for patterns that can't
    // be stripped.
    fn roundtrip_err(pattern: &str, byte: u8) -> Result<String, Error> {
        roundtrip_line_term(pattern, LineTerminator::byte(byte))
    }

    fn roundtrip_line_term(
        pattern: &str,
        line_term: LineTerminator,
    ) -> Result<String, Error> {
        let expr1 = Parser::new().parse(pattern).unwrap();
        let expr2 = strip_from_match(expr1, line_term)?;
        Ok(expr2.to_string())
    }

    #[test]
    fn various() {
        // Terminators inside classes with other members are stripped.
        assert_eq!(roundtrip(r"[a\n]", b'\n'), "[a]");
        assert_eq!(roundtrip(r"[a\n]", b'a'), "[\n]");
        assert_eq!(roundtrip_crlf(r"[a\n]"), "[a]");
        assert_eq!(roundtrip_crlf(r"[a\r]"), "[a]");
        assert_eq!(roundtrip_crlf(r"[a\r\n]"), "[a]");

        assert_eq!(roundtrip(r"(?-u)\s", b'a'), r"(?-u:[\x09-\x0D\x20])");
        assert_eq!(roundtrip(r"(?-u)\s", b'\n'), r"(?-u:[\x09\x0B-\x0D\x20])");

        // A required literal terminator, in any spelling, is an error.
        assert!(roundtrip_err(r"\n", b'\n').is_err());
        assert!(roundtrip_err(r"abc\n", b'\n').is_err());
        assert!(roundtrip_err(r"\nabc", b'\n').is_err());
        assert!(roundtrip_err(r"abc\nxyz", b'\n').is_err());
        assert!(roundtrip_err(r"\x0A", b'\n').is_err());
        assert!(roundtrip_err(r"\u000A", b'\n').is_err());
        assert!(roundtrip_err(r"\U0000000A", b'\n').is_err());
        assert!(roundtrip_err(r"\u{A}", b'\n').is_err());
        assert!(roundtrip_err("\n", b'\n').is_err());
    }
}
|
29
grep-regex/src/util.rs
Normal file
29
grep-regex/src/util.rs
Normal file
@ -0,0 +1,29 @@
|
||||
/// Converts an arbitrary sequence of bytes to a literal suitable for building
|
||||
/// a regular expression.
|
||||
pub fn bytes_to_regex(bs: &[u8]) -> String {
|
||||
use std::fmt::Write;
|
||||
use regex_syntax::is_meta_character;
|
||||
|
||||
let mut s = String::with_capacity(bs.len());
|
||||
for &b in bs {
|
||||
if b <= 0x7F && !is_meta_character(b as char) {
|
||||
write!(s, r"{}", b as char).unwrap();
|
||||
} else {
|
||||
write!(s, r"\x{:02x}", b).unwrap();
|
||||
}
|
||||
}
|
||||
s
|
||||
}
|
||||
|
||||
/// Converts arbitrary bytes to a nice string.
///
/// Each byte is rendered via `std::ascii::escape_default`, so printable
/// ASCII passes through unchanged and everything else becomes an escape
/// sequence like `\t` or `\x01`.
pub fn show_bytes(bs: &[u8]) -> String {
    use std::ascii::escape_default;

    let mut nice = String::new();
    for &b in bs {
        // `escape_default` yields only ASCII bytes, so each one is a valid
        // `char`; extending directly avoids the per-byte `Vec` allocation
        // and `from_utf8(..).unwrap()` round-trip of the naive approach.
        nice.extend(escape_default(b).map(|escaped| escaped as char));
    }
    nice
}
|
196
grep-regex/src/word.rs
Normal file
196
grep-regex/src/word.rs
Normal file
@ -0,0 +1,196 @@
|
||||
use std::collections::HashMap;
|
||||
use std::cell::RefCell;
|
||||
use std::sync::Arc;
|
||||
|
||||
use grep_matcher::{Match, Matcher, NoError};
|
||||
use regex::bytes::{CaptureLocations, Regex};
|
||||
use thread_local::CachedThreadLocal;
|
||||
|
||||
use config::ConfiguredHIR;
|
||||
use error::Error;
|
||||
use matcher::RegexCaptures;
|
||||
|
||||
/// A matcher for implementing "word match" semantics.
#[derive(Debug)]
pub struct WordMatcher {
    /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
    regex: Regex,
    /// A map from capture group name to capture group index.
    ///
    /// Indices are shifted down by one relative to the compiled regex so
    /// the wrapper group added above stays hidden from callers.
    names: HashMap<String, usize>,
    /// A reusable buffer for finding the match location of the inner group.
    locs: Arc<CachedThreadLocal<RefCell<CaptureLocations>>>,
}
|
||||
|
||||
impl Clone for WordMatcher {
    fn clone(&self) -> WordMatcher {
        // We implement Clone manually so that we get a fresh CachedThreadLocal
        // such that it can set its own thread owner. This permits each thread
        // using `locs` to hit the fast path.
        WordMatcher {
            regex: self.regex.clone(),
            names: self.names.clone(),
            locs: Arc::new(CachedThreadLocal::new()),
        }
    }
}
|
||||
|
||||
impl WordMatcher {
|
||||
/// Create a new matcher from the given pattern that only produces matches
|
||||
/// that are considered "words."
|
||||
///
|
||||
/// The given options are used to construct the regular expression
|
||||
/// internally.
|
||||
pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
|
||||
let word_expr = expr.with_pattern(|pat| {
|
||||
format!(r"(?:(?m:^)|\W)({})(?:(?m:$)|\W)", pat)
|
||||
})?;
|
||||
let regex = word_expr.regex()?;
|
||||
let locs = Arc::new(CachedThreadLocal::new());
|
||||
|
||||
let mut names = HashMap::new();
|
||||
for (i, optional_name) in regex.capture_names().enumerate() {
|
||||
if let Some(name) = optional_name {
|
||||
names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
||||
}
|
||||
}
|
||||
Ok(WordMatcher { regex, names, locs })
|
||||
}
|
||||
}
|
||||
|
||||
impl Matcher for WordMatcher {
|
||||
type Captures = RegexCaptures;
|
||||
type Error = NoError;
|
||||
|
||||
fn find_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Result<Option<Match>, NoError> {
|
||||
// To make this easy to get right, we extract captures here instead of
|
||||
// calling `find_at`. The actual match is at capture group `1` instead
|
||||
// of `0`. We *could* use `find_at` here and then trim the match after
|
||||
// the fact, but that's a bit harder to get right, and it's not clear
|
||||
// if it's worth it.
|
||||
|
||||
let cell = self.locs.get_or(|| {
|
||||
Box::new(RefCell::new(self.regex.capture_locations()))
|
||||
});
|
||||
let mut caps = cell.borrow_mut();
|
||||
self.regex.captures_read_at(&mut caps, haystack, at);
|
||||
Ok(caps.get(1).map(|m| Match::new(m.0, m.1)))
|
||||
}
|
||||
|
||||
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||
Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1))
|
||||
}
|
||||
|
||||
fn capture_count(&self) -> usize {
|
||||
self.regex.captures_len().checked_sub(1).unwrap()
|
||||
}
|
||||
|
||||
fn capture_index(&self, name: &str) -> Option<usize> {
|
||||
self.names.get(name).map(|i| *i)
|
||||
}
|
||||
|
||||
fn captures_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
caps: &mut RegexCaptures,
|
||||
) -> Result<bool, NoError> {
|
||||
let r = self.regex.captures_read_at(caps.locations(), haystack, at);
|
||||
Ok(r.is_some())
|
||||
}
|
||||
|
||||
// We specifically do not implement other methods like find_iter or
|
||||
// captures_iter. Namely, the iter methods are guaranteed to be correct
|
||||
// by virtue of implementing find_at and captures_at above.
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use grep_matcher::{Captures, Match, Matcher};
    use config::Config;
    use super::WordMatcher;

    // Build a WordMatcher from a raw pattern with default config.
    fn matcher(pattern: &str) -> WordMatcher {
        let chir = Config::default().hir(pattern).unwrap();
        WordMatcher::new(&chir).unwrap()
    }

    // Find via the standard `find` API, returning (start, end) offsets.
    fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
        matcher(pattern)
            .find(haystack.as_bytes())
            .unwrap()
            .map(|m| (m.start(), m.end()))
    }

    // Find via the captures API (group 0), which exercises a different
    // code path than `find`.
    fn find_by_caps(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
        let m = matcher(pattern);
        let mut caps = m.new_captures().unwrap();
        if !m.captures(haystack.as_bytes(), &mut caps).unwrap() {
            None
        } else {
            caps.get(0).map(|m| (m.start(), m.end()))
        }
    }

    // Test that the standard `find` API reports offsets correctly.
    #[test]
    fn various_find() {
        assert_eq!(Some((0, 3)), find(r"foo", "foo"));
        assert_eq!(Some((0, 3)), find(r"foo", "foo("));
        assert_eq!(Some((1, 4)), find(r"foo", "!foo("));
        assert_eq!(None, find(r"foo", "!afoo("));

        assert_eq!(Some((0, 3)), find(r"foo", "foo☃"));
        assert_eq!(None, find(r"foo", "fooб"));
        // assert_eq!(Some((0, 3)), find(r"foo", "fooб"));

        // See: https://github.com/BurntSushi/ripgrep/issues/389
        assert_eq!(Some((0, 2)), find(r"-2", "-2"));
    }

    // Test that the captures API also reports offsets correctly, just as
    // find does. This exercises a different path in the code since captures
    // are handled differently.
    #[test]
    fn various_captures() {
        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo"));
        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo("));
        assert_eq!(Some((1, 4)), find_by_caps(r"foo", "!foo("));
        assert_eq!(None, find_by_caps(r"foo", "!afoo("));

        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo☃"));
        assert_eq!(None, find_by_caps(r"foo", "fooб"));
        // assert_eq!(Some((0, 3)), find_by_caps(r"foo", "fooб"));

        // See: https://github.com/BurntSushi/ripgrep/issues/389
        assert_eq!(Some((0, 2)), find_by_caps(r"-2", "-2"));
    }

    // Test that the capture reporting methods work as advertised.
    #[test]
    fn capture_indexing() {
        let m = matcher(r"(a)(?P<foo>b)(c)");
        assert_eq!(4, m.capture_count());
        assert_eq!(Some(2), m.capture_index("foo"));

        let mut caps = m.new_captures().unwrap();
        assert_eq!(4, caps.len());

        assert!(m.captures(b"abc", &mut caps).unwrap());
        assert_eq!(caps.get(0), Some(Match::new(0, 3)));
        assert_eq!(caps.get(1), Some(Match::new(0, 1)));
        assert_eq!(caps.get(2), Some(Match::new(1, 2)));
        assert_eq!(caps.get(3), Some(Match::new(2, 3)));
        assert_eq!(caps.get(4), None);

        assert!(m.captures(b"#abc#", &mut caps).unwrap());
        assert_eq!(caps.get(0), Some(Match::new(1, 4)));
        assert_eq!(caps.get(1), Some(Match::new(1, 2)));
        assert_eq!(caps.get(2), Some(Match::new(2, 3)));
        assert_eq!(caps.get(3), Some(Match::new(3, 4)));
        assert_eq!(caps.get(4), None);
    }
}
|
Reference in New Issue
Block a user