1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-04-08 16:53:58 +02:00

regex: simplify AST analysis a bit

The verbatim literal stuff hasn't been used for a while and I don't
foresee it being used. If it's really needed, it would probably better
to just implement it by looking at the pattern string itself, which
avoids parsing it into an AST altogether.
This commit is contained in:
Andrew Gallant 2023-06-17 10:00:07 -04:00
parent d9bd261be8
commit 51480d57a6

@ -1,17 +1,13 @@
use regex_syntax::ast::parse::Parser;
use regex_syntax::ast::{self, Ast};
/// The results of analyzing AST of a regular expression (e.g., for supporting
/// smart case).
#[derive(Clone, Debug)]
pub struct AstAnalysis {
pub(crate) struct AstAnalysis {
/// True if and only if a literal uppercase character occurs in the regex.
any_uppercase: bool,
/// True if and only if the regex contains any literal at all.
any_literal: bool,
/// True if and only if the regex consists entirely of a literal and no
/// other special regex characters.
all_verbatim_literal: bool,
}
impl AstAnalysis {
@ -19,16 +15,16 @@ impl AstAnalysis {
///
/// If `pattern` is not a valid regular expression, then `None` is
/// returned.
#[allow(dead_code)]
pub fn from_pattern(pattern: &str) -> Option<AstAnalysis> {
Parser::new()
#[cfg(test)]
pub(crate) fn from_pattern(pattern: &str) -> Option<AstAnalysis> {
regex_syntax::ast::parse::Parser::new()
.parse(pattern)
.map(|ast| AstAnalysis::from_ast(&ast))
.ok()
}
/// Perform an AST analysis given the AST.
pub fn from_ast(ast: &Ast) -> AstAnalysis {
pub(crate) fn from_ast(ast: &Ast) -> AstAnalysis {
let mut analysis = AstAnalysis::new();
analysis.from_ast_impl(ast);
analysis
@ -40,7 +36,7 @@ impl AstAnalysis {
/// For example, a pattern like `\pL` contains no uppercase literals,
/// even though `L` is uppercase and the `\pL` class contains uppercase
/// characters.
pub fn any_uppercase(&self) -> bool {
pub(crate) fn any_uppercase(&self) -> bool {
self.any_uppercase
}
@ -48,32 +44,13 @@ impl AstAnalysis {
///
/// For example, a pattern like `\pL` reports `false`, but a pattern like
/// `\pLfoo` reports `true`.
pub fn any_literal(&self) -> bool {
pub(crate) fn any_literal(&self) -> bool {
self.any_literal
}
/// Returns true if and only if the entire pattern is a verbatim literal
/// with no special meta characters.
///
/// When this is true, then the pattern satisfies the following law:
/// `escape(pattern) == pattern`. Notable examples where this returns
/// `false` include patterns like `a\u0061` even though `\u0061` is just
/// a literal `a`.
///
/// The purpose of this flag is to determine whether the patterns can be
/// given to non-regex substring search algorithms as-is.
#[allow(dead_code)]
pub fn all_verbatim_literal(&self) -> bool {
self.all_verbatim_literal
}
/// Creates a new `AstAnalysis` value with an initial configuration.
fn new() -> AstAnalysis {
AstAnalysis {
any_uppercase: false,
any_literal: false,
all_verbatim_literal: true,
}
AstAnalysis { any_uppercase: false, any_literal: false }
}
fn from_ast_impl(&mut self, ast: &Ast) {
@ -86,26 +63,20 @@ impl AstAnalysis {
| Ast::Dot(_)
| Ast::Assertion(_)
| Ast::Class(ast::Class::Unicode(_))
| Ast::Class(ast::Class::Perl(_)) => {
self.all_verbatim_literal = false;
}
| Ast::Class(ast::Class::Perl(_)) => {}
Ast::Literal(ref x) => {
self.from_ast_literal(x);
}
Ast::Class(ast::Class::Bracketed(ref x)) => {
self.all_verbatim_literal = false;
self.from_ast_class_set(&x.kind);
}
Ast::Repetition(ref x) => {
self.all_verbatim_literal = false;
self.from_ast_impl(&x.ast);
}
Ast::Group(ref x) => {
self.all_verbatim_literal = false;
self.from_ast_impl(&x.ast);
}
Ast::Alternation(ref alt) => {
self.all_verbatim_literal = false;
for x in &alt.asts {
self.from_ast_impl(x);
}
@ -161,9 +132,6 @@ impl AstAnalysis {
}
fn from_ast_literal(&mut self, ast: &ast::Literal) {
if ast.kind != ast::LiteralKind::Verbatim {
self.all_verbatim_literal = false;
}
self.any_literal = true;
self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
}
@ -171,7 +139,7 @@ impl AstAnalysis {
/// Returns true if and only if the attributes can never change no matter
/// what other AST it might see.
fn done(&self) -> bool {
self.any_uppercase && self.any_literal && !self.all_verbatim_literal
self.any_uppercase && self.any_literal
}
}
@ -188,76 +156,61 @@ mod tests {
let x = analysis("");
assert!(!x.any_uppercase);
assert!(!x.any_literal);
assert!(x.all_verbatim_literal);
let x = analysis("foo");
assert!(!x.any_uppercase);
assert!(x.any_literal);
assert!(x.all_verbatim_literal);
let x = analysis("Foo");
assert!(x.any_uppercase);
assert!(x.any_literal);
assert!(x.all_verbatim_literal);
let x = analysis("foO");
assert!(x.any_uppercase);
assert!(x.any_literal);
assert!(x.all_verbatim_literal);
let x = analysis(r"foo\\");
assert!(!x.any_uppercase);
assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo\w");
assert!(!x.any_uppercase);
assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo\S");
assert!(!x.any_uppercase);
assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo\p{Ll}");
assert!(!x.any_uppercase);
assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo[a-z]");
assert!(!x.any_uppercase);
assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo[A-Z]");
assert!(x.any_uppercase);
assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo[\S\t]");
assert!(!x.any_uppercase);
assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo\\S");
assert!(x.any_uppercase);
assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"\p{Ll}");
assert!(!x.any_uppercase);
assert!(!x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"aBc\w");
assert!(x.any_uppercase);
assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"a\u0061");
assert!(!x.any_uppercase);
assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
}
}