1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-03-17 20:28:03 +02:00

deps: initial migration steps to regex 1.9

This leaves the grep-regex crate in tatters. Pretty much the entire
thing needs to be re-worked. The upshot is that it should result in some
big simplifications. I hope.

The idea here is to drop down and actually use regex-automata 0.3
instead of the regex crate itself.
This commit is contained in:
Andrew Gallant 2023-06-11 21:25:23 -04:00
parent a7f1276021
commit 1035f6b1ff
15 changed files with 606 additions and 558 deletions

78
Cargo.lock generated
View File

@ -4,18 +4,9 @@ version = 3
[[package]]
name = "aho-corasick"
version = "0.7.20"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
dependencies = [
"memchr",
]
[[package]]
name = "aho-corasick"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04"
checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41"
dependencies = [
"memchr",
]
@ -40,7 +31,7 @@ checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5"
dependencies = [
"memchr",
"once_cell",
"regex-automata",
"regex-automata 0.1.10",
"serde",
]
@ -131,7 +122,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
name = "globset"
version = "0.4.10"
dependencies = [
"aho-corasick 0.7.20",
"aho-corasick",
"bstr",
"fnv",
"glob",
@ -204,12 +195,12 @@ dependencies = [
name = "grep-regex"
version = "0.1.11"
dependencies = [
"aho-corasick 0.7.20",
"aho-corasick",
"bstr",
"grep-matcher",
"log",
"regex",
"regex-syntax 0.6.29",
"regex-syntax",
"thread_local",
]
@ -287,9 +278,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.144"
version = "0.2.146"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b"
[[package]]
name = "libm"
@ -299,12 +290,9 @@ checksum = "7fc7aa29613bd6a620df431842069224d8bc9011086b1db4c0e0cd47fa03ec9a"
[[package]]
name = "log"
version = "0.4.17"
version = "0.4.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [
"cfg-if",
]
checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4"
[[package]]
name = "memchr"
@ -323,9 +311,9 @@ dependencies = [
[[package]]
name = "once_cell"
version = "1.17.1"
version = "1.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
[[package]]
name = "packed_simd_2"
@ -368,31 +356,30 @@ checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
[[package]]
name = "proc-macro2"
version = "1.0.58"
version = "1.0.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.27"
version = "1.0.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81ca098a9821bd52d6b24fd8b10bd081f47d39c22778cafaa75a2857a62c6390"
version = "1.8.4"
dependencies = [
"aho-corasick 1.0.1",
"aho-corasick",
"memchr",
"regex-syntax 0.7.2",
"regex-automata 0.3.0",
"regex-syntax",
]
[[package]]
@ -402,16 +389,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
[[package]]
name = "regex-syntax"
version = "0.6.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
name = "regex-automata"
version = "0.3.0"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
[[package]]
name = "ripgrep"
@ -449,18 +437,18 @@ dependencies = [
[[package]]
name = "serde"
version = "1.0.163"
version = "1.0.164"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.163"
version = "1.0.164"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68"
dependencies = [
"proc-macro2",
"quote",
@ -486,9 +474,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "syn"
version = "2.0.16"
version = "2.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01"
checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e"
dependencies = [
"proc-macro2",
"quote",

View File

@ -19,6 +19,11 @@ autotests = false
edition = "2018"
rust-version = "1.65"
[patch.crates-io]
regex = { path = "/home/andrew/rust/regex" }
regex-automata = { path = "/home/andrew/rust/regex/regex-automata" }
regex-syntax = { path = "/home/andrew/rust/regex/regex-syntax" }
[[bin]]
bench = false
path = "crates/core/main.rs"
@ -47,7 +52,7 @@ grep = { version = "0.2.12", path = "crates/grep" }
ignore = { version = "0.4.19", path = "crates/ignore" }
lazy_static = "1.1.0"
log = "0.4.5"
regex = "1.3.5"
regex = "1.8.3"
serde_json = "1.0.23"
termcolor = "1.1.0"

View File

@ -1464,7 +1464,7 @@ impl ArgMatches {
// own, but if the patterns are joined in a set of alternations, then
// you wind up with `foo|`, which is currently invalid in Rust's regex
// engine.
"(?:z{0})*".to_string()
"(?:)".to_string()
}
/// Converts an OsStr pattern to a String pattern. The pattern is escaped

View File

@ -20,11 +20,11 @@ name = "globset"
bench = false
[dependencies]
aho-corasick = "0.7.3"
bstr = { version = "1.1.0", default-features = false, features = ["std"] }
aho-corasick = "1.0.2"
bstr = { version = "1.5.0", default-features = false, features = ["std"] }
fnv = "1.0.6"
log = { version = "0.4.5", optional = true }
regex = { version = "1.1.5", default-features = false, features = ["perf", "std"] }
regex = { version = "1.8.3", default-features = false, features = ["perf", "std"] }
serde = { version = "1.0.104", optional = true }
[dev-dependencies]

View File

@ -818,7 +818,7 @@ impl MultiStrategyBuilder {
fn prefix(self) -> PrefixStrategy {
PrefixStrategy {
matcher: AhoCorasick::new_auto_configured(&self.literals),
matcher: AhoCorasick::new(&self.literals).unwrap(),
map: self.map,
longest: self.longest,
}
@ -826,7 +826,7 @@ impl MultiStrategyBuilder {
fn suffix(self) -> SuffixStrategy {
SuffixStrategy {
matcher: AhoCorasick::new_auto_configured(&self.literals),
matcher: AhoCorasick::new(&self.literals).unwrap(),
map: self.map,
longest: self.longest,
}

View File

@ -14,10 +14,10 @@ license = "Unlicense OR MIT"
edition = "2018"
[dependencies]
aho-corasick = "0.7.3"
bstr = "1.1.0"
aho-corasick = "1.0.2"
bstr = "1.5.0"
grep-matcher = { version = "0.1.6", path = "../matcher" }
log = "0.4.5"
regex = "1.1"
regex-syntax = "0.6.5"
thread_local = "1.1.2"
regex = "1.8.3"
regex-syntax = "0.7.2"
thread_local = "1.1.7"

View File

@ -71,7 +71,7 @@ impl Config {
let ast = self.ast(pattern)?;
let analysis = self.analysis(&ast)?;
let expr = hir::translate::TranslatorBuilder::new()
.allow_invalid_utf8(true)
.utf8(false)
.case_insensitive(self.is_case_insensitive(&analysis))
.multi_line(self.multi_line)
.dot_matches_new_line(self.dot_matches_new_line)
@ -172,7 +172,12 @@ impl ConfiguredHIR {
/// CRLF hack is enabled and the regex is line anchored at the end. In
/// this case, matches that end with a `\r` have the `\r` stripped.
pub fn needs_crlf_stripped(&self) -> bool {
self.config.crlf && self.expr.is_line_anchored_end()
self.config.crlf
&& self
.expr
.properties()
.look_set_suffix_any()
.contains(hir::Look::EndLF)
}
/// Returns the line terminator configured on this expression.
@ -202,7 +207,7 @@ impl ConfiguredHIR {
/// Returns true if and only if the underlying HIR has any text anchors.
fn is_any_anchored(&self) -> bool {
self.expr.is_any_anchored_start() || self.expr.is_any_anchored_end()
self.expr.properties().look_set().contains_anchor_haystack()
}
/// Builds a regular expression from this HIR expression.
@ -301,7 +306,7 @@ impl ConfiguredHIR {
let expr = ::regex_syntax::ParserBuilder::new()
.nest_limit(self.config.nest_limit)
.octal(self.config.octal)
.allow_invalid_utf8(true)
.utf8(false)
.multi_line(self.config.multi_line)
.dot_matches_new_line(self.config.dot_matches_new_line)
.unicode(self.config.unicode)

View File

@ -124,32 +124,26 @@ pub fn adjust_match(haystack: &[u8], m: Match) -> Match {
/// nicely in most cases, especially when a match is limited to a single line.
pub fn crlfify(expr: Hir) -> Hir {
match expr.into_kind() {
HirKind::Anchor(hir::Anchor::EndLine) => {
let concat = Hir::concat(vec![
Hir::repetition(hir::Repetition {
kind: hir::RepetitionKind::ZeroOrOne,
greedy: false,
hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))),
}),
Hir::anchor(hir::Anchor::EndLine),
]);
Hir::group(hir::Group {
kind: hir::GroupKind::NonCapturing,
hir: Box::new(concat),
})
}
HirKind::Look(hir::Look::EndLF) => Hir::concat(vec![
Hir::repetition(hir::Repetition {
min: 0,
max: Some(1),
greedy: false,
sub: Box::new(Hir::literal("\r".as_bytes())),
}),
Hir::look(hir::Look::EndLF),
]),
HirKind::Empty => Hir::empty(),
HirKind::Literal(x) => Hir::literal(x),
HirKind::Literal(hir::Literal(x)) => Hir::literal(x),
HirKind::Class(x) => Hir::class(x),
HirKind::Anchor(x) => Hir::anchor(x),
HirKind::WordBoundary(x) => Hir::word_boundary(x),
HirKind::Look(x) => Hir::look(x),
HirKind::Repetition(mut x) => {
x.hir = Box::new(crlfify(*x.hir));
x.sub = Box::new(crlfify(*x.sub));
Hir::repetition(x)
}
HirKind::Group(mut x) => {
x.hir = Box::new(crlfify(*x.hir));
Hir::group(x)
HirKind::Capture(mut x) => {
x.sub = Box::new(crlfify(*x.sub));
Hir::capture(x)
}
HirKind::Concat(xs) => {
Hir::concat(xs.into_iter().map(crlfify).collect())
@ -174,12 +168,12 @@ mod tests {
#[test]
fn various() {
assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))");
assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$))(?:\r??(?m:$))");
assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$)\r??(?m:$))");
assert_eq!(
roundtrip(r"(?m)(?:foo$|bar$)"),
"(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))"
"(?:(?:(?:foo)\r??(?m:$))|(?:(?:bar)\r??(?m:$)))"
);
assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$))a");
assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$)a)");
// Not a multiline `$`, so no crlfifying occurs.
assert_eq!(roundtrip(r"$"), "\\z");

View File

@ -1,43 +1,12 @@
/*
This module is responsible for extracting *inner* literals out of the AST of a
regular expression. Normally this is the job of the regex engine itself, but
the regex engine doesn't look for inner literals. Since we're doing line based
searching, we can use them, so we need to do it ourselves.
*/
use regex_syntax::hir::Hir;
use bstr::ByteSlice;
use regex_syntax::hir::literal::{Literal, Literals};
use regex_syntax::hir::{self, Hir, HirKind};
use crate::util;
/// Represents prefix, suffix and inner "required" literals for a regular
/// expression.
///
/// Prefixes and suffixes are detected using regex-syntax. The inner required
/// literals are detected using something custom (but based on the code in
/// regex-syntax).
#[derive(Clone, Debug)]
pub struct LiteralSets {
/// A set of prefix literals.
prefixes: Literals,
/// A set of suffix literals.
suffixes: Literals,
/// A set of literals such that at least one of them must appear in every
/// match. A literal in this set may be neither a prefix nor a suffix.
required: Literals,
}
pub struct LiteralSets {}
impl LiteralSets {
/// Create a set of literals from the given HIR expression.
pub fn new(expr: &Hir) -> LiteralSets {
let mut required = Literals::empty();
union_required(expr, &mut required);
LiteralSets {
prefixes: Literals::prefixes(expr),
suffixes: Literals::suffixes(expr),
required,
}
pub fn new(_: &Hir) -> LiteralSets {
LiteralSets {}
}
/// If it is deemed advantageous to do so (via various suspicious
@ -46,383 +15,7 @@ impl LiteralSets {
/// generated these literal sets. The idea here is that the pattern
/// returned by this method is much cheaper to search for. i.e., It is
/// usually a single literal or an alternation of literals.
pub fn one_regex(&self, word: bool) -> Option<String> {
// TODO: The logic in this function is basically inscrutable. It grew
// organically in the old grep 0.1 crate. Ideally, it would be
// re-worked. In fact, the entire inner literal extraction should be
// re-worked. Actually, most of regex-syntax's literal extraction
// should also be re-worked. Alas... only so much time in the day.
if !word {
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
log::debug!("literal prefixes detected: {:?}", self.prefixes);
// When this is true, the regex engine will do a literal scan,
// so we don't need to return anything. But we only do this
// if we aren't doing a word regex, since a word regex adds
// a `(?:\W|^)` to the beginning of the regex, thereby
// defeating the regex engine's literal detection.
return None;
}
}
// Out of inner required literals, prefixes and suffixes, which one
// is the longest? We pick the longest to do fast literal scan under
// the assumption that a longer literal will have a lower false
// positive rate.
let pre_lcp = self.prefixes.longest_common_prefix();
let pre_lcs = self.prefixes.longest_common_suffix();
let suf_lcp = self.suffixes.longest_common_prefix();
let suf_lcs = self.suffixes.longest_common_suffix();
let req_lits = self.required.literals();
let req = match req_lits.iter().max_by_key(|lit| lit.len()) {
None => &[],
Some(req) => &***req,
};
let mut lit = pre_lcp;
if pre_lcs.len() > lit.len() {
lit = pre_lcs;
}
if suf_lcp.len() > lit.len() {
lit = suf_lcp;
}
if suf_lcs.len() > lit.len() {
lit = suf_lcs;
}
if req_lits.len() == 1 && req.len() > lit.len() {
lit = req;
}
// Special case: if we detected an alternation of inner required
// literals and its longest literal is bigger than the longest
// prefix/suffix, then choose the alternation. In practice, this
// helps with case insensitive matching, which can generate lots of
// inner required literals.
let any_empty = req_lits.iter().any(|lit| lit.is_empty());
let any_white = has_only_whitespace(&req_lits);
if req.len() > lit.len()
&& req_lits.len() > 1
&& !any_empty
&& !any_white
{
log::debug!("required literals found: {:?}", req_lits);
let alts: Vec<String> = req_lits
.into_iter()
.map(|x| util::bytes_to_regex(x))
.collect();
// We're matching raw bytes, so disable Unicode mode.
Some(format!("(?-u:{})", alts.join("|")))
} else if lit.is_empty() {
// If we're here, then we have no LCP. No LCS. And no detected
// inner required literals. In theory this shouldn't happen, but
// the inner literal detector isn't as nice as we hope and doesn't
// actually support returning a set of alternating required
// literals. (Instead, it only returns a set where EVERY literal
// in it is required. It cannot currently express "either P or Q
// is required.")
//
// In this case, it is possible that we still have meaningful
// prefixes or suffixes to use. So we look for the set of literals
// with the highest minimum length and use that to build our "fast"
// regex.
//
// This manifests in fairly common scenarios. e.g.,
//
// rg -w 'foo|bar|baz|quux'
//
// Normally, without the `-w`, the regex engine itself would
// detect the prefix correctly. Unfortunately, the `-w` option
// turns the regex into something like this:
//
// rg '(^|\W)(foo|bar|baz|quux)($|\W)'
//
// Which will defeat all prefix and suffix literal optimizations.
// (Not in theory---it could be better. But the current
// implementation isn't good enough.) ... So we make up for it
// here.
if !word {
return None;
}
let p_min_len = self.prefixes.min_len();
let s_min_len = self.suffixes.min_len();
let lits = match (p_min_len, s_min_len) {
(None, None) => return None,
(Some(_), None) => {
log::debug!("prefix literals found");
self.prefixes.literals()
}
(None, Some(_)) => {
log::debug!("suffix literals found");
self.suffixes.literals()
}
(Some(p), Some(s)) => {
if p >= s {
log::debug!("prefix literals found");
self.prefixes.literals()
} else {
log::debug!("suffix literals found");
self.suffixes.literals()
}
}
};
log::debug!("prefix/suffix literals found: {:?}", lits);
if has_only_whitespace(lits) {
log::debug!("dropping literals because one was whitespace");
return None;
}
let alts: Vec<String> =
lits.into_iter().map(|x| util::bytes_to_regex(x)).collect();
// We're matching raw bytes, so disable Unicode mode.
Some(format!("(?-u:{})", alts.join("|")))
} else {
log::debug!("required literal found: {:?}", util::show_bytes(lit));
if lit.chars().all(|c| c.is_whitespace()) {
log::debug!("dropping literal because one was whitespace");
return None;
}
Some(format!("(?-u:{})", util::bytes_to_regex(&lit)))
}
}
}
fn union_required(expr: &Hir, lits: &mut Literals) {
match *expr.kind() {
HirKind::Literal(hir::Literal::Unicode(c)) => {
let mut buf = [0u8; 4];
lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
}
HirKind::Literal(hir::Literal::Byte(b)) => {
lits.cross_add(&[b]);
}
HirKind::Class(hir::Class::Unicode(ref cls)) => {
if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) {
lits.cut();
}
}
HirKind::Class(hir::Class::Bytes(ref cls)) => {
if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) {
lits.cut();
}
}
HirKind::Group(hir::Group { ref hir, .. }) => {
union_required(&**hir, lits);
}
HirKind::Repetition(ref x) => match x.kind {
hir::RepetitionKind::ZeroOrOne => lits.cut(),
hir::RepetitionKind::ZeroOrMore => lits.cut(),
hir::RepetitionKind::OneOrMore => {
union_required(&x.hir, lits);
}
hir::RepetitionKind::Range(ref rng) => {
let (min, max) = match *rng {
hir::RepetitionRange::Exactly(m) => (m, Some(m)),
hir::RepetitionRange::AtLeast(m) => (m, None),
hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
};
repeat_range_literals(
&x.hir,
min,
max,
x.greedy,
lits,
union_required,
);
}
},
HirKind::Concat(ref es) if es.is_empty() => {}
HirKind::Concat(ref es) if es.len() == 1 => {
union_required(&es[0], lits)
}
HirKind::Concat(ref es) => {
for e in es {
let mut lits2 = lits.to_empty();
union_required(e, &mut lits2);
if lits2.is_empty() {
lits.cut();
continue;
}
if lits2.contains_empty() || !is_simple(&e) {
lits.cut();
}
if !lits.cross_product(&lits2) || !lits2.any_complete() {
// If this expression couldn't yield any literal that
// could be extended, then we need to quit. Since we're
// short-circuiting, we also need to freeze every member.
lits.cut();
break;
}
}
}
HirKind::Alternation(ref es) => {
alternate_literals(es, lits, union_required);
}
_ => lits.cut(),
}
}
fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
e: &Hir,
min: u32,
_max: Option<u32>,
_greedy: bool,
lits: &mut Literals,
mut f: F,
) {
if min == 0 {
// This is a bit conservative. If `max` is set, then we could
// treat this as a finite set of alternations. For now, we
// just treat it as `e*`.
lits.cut();
} else {
// We only extract literals from a single repetition, even though
// we could do more. e.g., `a{3}` will have `a` extracted instead of
// `aaa`. The reason is that inner literal extraction can't be unioned
// across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}`
// is wrong.
f(e, lits);
lits.cut();
}
}
fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
es: &[Hir],
lits: &mut Literals,
mut f: F,
) {
let mut lits2 = lits.to_empty();
for e in es {
let mut lits3 = lits.to_empty();
lits3.set_limit_size(lits.limit_size() / 5);
f(e, &mut lits3);
if lits3.is_empty() || !lits2.union(lits3) {
// If we couldn't find suffixes for *any* of the
// alternates, then the entire alternation has to be thrown
// away and any existing members must be frozen. Similarly,
// if the union couldn't complete, stop and freeze.
lits.cut();
return;
}
}
// All we do at the moment is look for prefixes and suffixes. If both
// are empty, then we report nothing. We should be able to do better than
// this, but we'll need something more expressive than just a "set of
// literals."
let lcp = lits2.longest_common_prefix();
let lcs = lits2.longest_common_suffix();
if !lcp.is_empty() {
lits.cross_add(lcp);
}
lits.cut();
if !lcs.is_empty() {
lits.add(Literal::empty());
lits.add(Literal::new(lcs.to_vec()));
}
}
fn is_simple(expr: &Hir) -> bool {
match *expr.kind() {
HirKind::Empty
| HirKind::Literal(_)
| HirKind::Class(_)
| HirKind::Concat(_)
| HirKind::Alternation(_) => true,
HirKind::Anchor(_)
| HirKind::WordBoundary(_)
| HirKind::Group(_)
| HirKind::Repetition(_) => false,
}
}
/// Return the number of characters in the given class.
fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
}
/// Return the number of bytes in the given class.
fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
}
/// Returns true if and only if any of the literals in the given set is
/// entirely whitespace.
fn has_only_whitespace(lits: &[Literal]) -> bool {
for lit in lits {
if lit.chars().all(|c| c.is_whitespace()) {
return true;
}
}
false
}
#[cfg(test)]
mod tests {
use super::LiteralSets;
use regex_syntax::Parser;
fn sets(pattern: &str) -> LiteralSets {
let hir = Parser::new().parse(pattern).unwrap();
LiteralSets::new(&hir)
}
fn one_regex(pattern: &str) -> Option<String> {
sets(pattern).one_regex(false)
}
// Put a pattern into the same format as the one returned by `one_regex`.
fn pat(pattern: &str) -> Option<String> {
Some(format!("(?-u:{})", pattern))
}
#[test]
fn various() {
// Obviously no literals.
assert!(one_regex(r"\w").is_none());
assert!(one_regex(r"\pL").is_none());
// Tantalizingly close.
assert!(one_regex(r"\w|foo").is_none());
// There's a literal, but it's better if the regex engine handles it
// internally.
assert!(one_regex(r"abc").is_none());
// Core use cases.
assert_eq!(one_regex(r"\wabc\w"), pat("abc"));
assert_eq!(one_regex(r"abc\w"), pat("abc"));
// TODO: Make these pass. We're missing some potentially big wins
// without these.
// assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz"));
// assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz"));
}
#[test]
fn regression_1064() {
// Regression from:
// https://github.com/BurntSushi/ripgrep/issues/1064
// assert_eq!(one_regex(r"a.*c"), pat("a"));
assert_eq!(one_regex(r"a(.*c)"), pat("a"));
}
#[test]
fn regression_1319() {
// Regression from:
// https://github.com/BurntSushi/ripgrep/issues/1319
assert_eq!(
one_regex(r"TTGAGTCCAGGAG[ATCG]{2}C"),
pat("TTGAGTCCAGGAG"),
);
}
#[test]
fn regression_1537() {
// Regression from:
// https://github.com/BurntSushi/ripgrep/issues/1537
assert_eq!(one_regex(r";(.*,)"), pat(";"));
assert_eq!(one_regex(r";((.*,))"), pat(";"));
assert_eq!(one_regex(r";(.*,)+"), pat(";"),);
assert_eq!(one_regex(r";(.*,){1}"), pat(";"),);
pub fn one_regex(&self, _word: bool) -> Option<String> {
None
}
}

View File

@ -0,0 +1,466 @@
/*
This module is responsible for extracting *inner* literals out of the AST of a
regular expression. Normally this is the job of the regex engine itself, but
the regex engine doesn't look for inner literals. Since we're doing line based
searching, we can use them, so we need to do it ourselves.
*/
use {
bstr::ByteSlice,
regex_syntax::hir::{
self,
literal::{self, Literal, Seq},
Hir, HirKind,
},
};
use crate::util;
/// Represents prefix, suffix and inner "required" literals for a regular
/// expression.
///
/// Prefixes and suffixes are detected using regex-syntax. The inner required
/// literals are detected using something custom (but based on the code in
/// regex-syntax).
///
/// Each set is a `regex_syntax::hir::literal::Seq` (see the `use` at the top
/// of this module).
#[derive(Clone, Debug)]
pub struct LiteralSets {
    /// A set of prefix literals.
    prefixes: Seq,
    /// A set of suffix literals.
    suffixes: Seq,
    /// A set of literals such that at least one of them must appear in every
    /// match. A literal in this set may be neither a prefix nor a suffix.
    required: Seq,
}
impl LiteralSets {
    /// Create a set of literals from the given HIR expression.
    pub fn new(expr: &Hir) -> LiteralSets {
        // Seed with a single exact empty literal; `union_required` grows it
        // via cross products as it walks the HIR.
        let mut required = Seq::singleton(Literal::exact(vec![]));
        union_required(expr, &mut required);
        LiteralSets {
            prefixes: prefixes(expr),
            suffixes: suffixes(expr),
            required,
        }
    }

    /// If it is deemed advantageous to do so (via various suspicious
    /// heuristics), this will return a single regular expression pattern that
    /// matches a subset of the language matched by the regular expression that
    /// generated these literal sets. The idea here is that the pattern
    /// returned by this method is much cheaper to search for. i.e., It is
    /// usually a single literal or an alternation of literals.
    pub fn one_regex(&self, word: bool) -> Option<String> {
        // TODO: The logic in this function is basically inscrutable. It grew
        // organically in the old grep 0.1 crate. Ideally, it would be
        // re-worked. In fact, the entire inner literal extraction should be
        // re-worked. Actually, most of regex-syntax's literal extraction
        // should also be re-worked. Alas... only so much time in the day.

        if !word {
            if self.prefixes.is_exact() && !self.prefixes.is_empty() {
                log::debug!("literal prefixes detected: {:?}", self.prefixes);
                // When this is true, the regex engine will do a literal scan,
                // so we don't need to return anything. But we only do this
                // if we aren't doing a word regex, since a word regex adds
                // a `(?:\W|^)` to the beginning of the regex, thereby
                // defeating the regex engine's literal detection.
                return None;
            }
        }

        // Out of inner required literals, prefixes and suffixes, which one
        // is the longest? We pick the longest to do fast literal scan under
        // the assumption that a longer literal will have a lower false
        // positive rate.
        //
        // NOTE(review): the `Seq` accessors return `None` when no literal
        // information is available; we treat that the same as an empty
        // literal/set below.
        let pre_lcp = self.prefixes.longest_common_prefix().unwrap_or(&[]);
        let pre_lcs = self.prefixes.longest_common_suffix().unwrap_or(&[]);
        let suf_lcp = self.suffixes.longest_common_prefix().unwrap_or(&[]);
        let suf_lcs = self.suffixes.longest_common_suffix().unwrap_or(&[]);
        let req_lits = self.required.literals().unwrap_or(&[]);
        let req = match req_lits.iter().max_by_key(|lit| lit.len()) {
            None => &[],
            Some(req) => req.as_bytes(),
        };

        // `lit` ends up as the single longest candidate literal.
        let mut lit = pre_lcp;
        if pre_lcs.len() > lit.len() {
            lit = pre_lcs;
        }
        if suf_lcp.len() > lit.len() {
            lit = suf_lcp;
        }
        if suf_lcs.len() > lit.len() {
            lit = suf_lcs;
        }
        if req_lits.len() == 1 && req.len() > lit.len() {
            lit = req;
        }

        // Special case: if we detected an alternation of inner required
        // literals and its longest literal is bigger than the longest
        // prefix/suffix, then choose the alternation. In practice, this
        // helps with case insensitive matching, which can generate lots of
        // inner required literals.
        let any_empty = req_lits.iter().any(|lit| lit.is_empty());
        let any_white = has_only_whitespace(&req_lits);
        if req.len() > lit.len()
            && req_lits.len() > 1
            && !any_empty
            && !any_white
        {
            log::debug!("required literals found: {:?}", req_lits);
            let alts: Vec<String> = req_lits
                .into_iter()
                .map(|x| util::bytes_to_regex(x.as_bytes()))
                .collect();
            // We're matching raw bytes, so disable Unicode mode.
            Some(format!("(?-u:{})", alts.join("|")))
        } else if lit.is_empty() {
            // If we're here, then we have no LCP. No LCS. And no detected
            // inner required literals. In theory this shouldn't happen, but
            // the inner literal detector isn't as nice as we hope and doesn't
            // actually support returning a set of alternating required
            // literals. (Instead, it only returns a set where EVERY literal
            // in it is required. It cannot currently express "either P or Q
            // is required.")
            //
            // In this case, it is possible that we still have meaningful
            // prefixes or suffixes to use. So we look for the set of literals
            // with the highest minimum length and use that to build our "fast"
            // regex.
            //
            // This manifests in fairly common scenarios. e.g.,
            //
            //     rg -w 'foo|bar|baz|quux'
            //
            // Normally, without the `-w`, the regex engine itself would
            // detect the prefix correctly. Unfortunately, the `-w` option
            // turns the regex into something like this:
            //
            //     rg '(^|\W)(foo|bar|baz|quux)($|\W)'
            //
            // Which will defeat all prefix and suffix literal optimizations.
            // (Not in theory---it could be better. But the current
            // implementation isn't good enough.) ... So we make up for it
            // here.
            if !word {
                return None;
            }
            let p_min_len = self.prefixes.min_literal_len();
            let s_min_len = self.suffixes.min_literal_len();
            let lits = match (p_min_len, s_min_len) {
                (None, None) => return None,
                (Some(_), None) => {
                    log::debug!("prefix literals found");
                    self.prefixes.literals().unwrap()
                }
                (None, Some(_)) => {
                    log::debug!("suffix literals found");
                    self.suffixes.literals().unwrap()
                }
                (Some(p), Some(s)) => {
                    // Prefer whichever side has the longer minimum literal.
                    if p >= s {
                        log::debug!("prefix literals found");
                        self.prefixes.literals().unwrap()
                    } else {
                        log::debug!("suffix literals found");
                        self.suffixes.literals().unwrap()
                    }
                }
            };
            log::debug!("prefix/suffix literals found: {:?}", lits);
            if has_only_whitespace(lits) {
                log::debug!("dropping literals because one was whitespace");
                return None;
            }
            let alts: Vec<String> = lits
                .into_iter()
                .map(|x| util::bytes_to_regex(x.as_bytes()))
                .collect();
            // We're matching raw bytes, so disable Unicode mode.
            Some(format!("(?-u:{})", alts.join("|")))
        } else {
            log::debug!("required literal found: {:?}", util::show_bytes(lit));
            // A whitespace-only literal would match far too often to be a
            // useful pre-filter.
            if lit.chars().all(|c| c.is_whitespace()) {
                log::debug!("dropping literal because one was whitespace");
                return None;
            }
            Some(format!("(?-u:{})", util::bytes_to_regex(&lit)))
        }
    }
}
/// Walks the given HIR and updates `lits` with literals required by every
/// match of the (sub-)expression.
///
/// Concatenations extend the collected literals via cross product,
/// alternations are folded in by `alternate_literals`, and anything that
/// can't contribute a concrete literal (classes, assertions, etc.) marks the
/// sequence as inexact so it is not extended further.
fn union_required(expr: &Hir, lits: &mut Seq) {
    match *expr.kind() {
        HirKind::Literal(hir::Literal(ref bytes)) => {
            // Append the literal's bytes to every literal collected so far.
            lits.cross_forward(&mut Seq::new([bytes]));
        }
        HirKind::Class(hir::Class::Unicode(_)) => {
            // NOTE(review): unlike the pre-migration code, classes are never
            // expanded into their members here — they always end extraction.
            lits.make_inexact();
        }
        HirKind::Class(hir::Class::Bytes(_)) => {
            lits.make_inexact();
        }
        HirKind::Capture(hir::Capture { ref sub, .. }) => {
            // A capture group doesn't change the matched text; recurse.
            union_required(&**sub, lits);
        }
        HirKind::Repetition(hir::Repetition { min, max, greedy, ref sub }) => {
            repeat_range_literals(
                &sub,
                min,
                max,
                greedy,
                lits,
                union_required,
            );
        }
        HirKind::Concat(ref es) if es.is_empty() => {}
        HirKind::Concat(ref es) if es.len() == 1 => {
            union_required(&es[0], lits)
        }
        HirKind::Concat(ref es) => {
            for e in es {
                // Extract this element's literals into a fresh sequence
                // seeded with the exact empty literal.
                let mut lits2 = Seq::singleton(Literal::exact(vec![]));
                union_required(e, &mut lits2);
                if lits2.len() == Some(1) && lits2.min_literal_len() == Some(0)
                {
                    // This element produced nothing usable (only the empty
                    // literal); freeze what we have and move on.
                    lits.make_inexact();
                    continue;
                }
                if lits2.min_literal_len() == Some(0) || !is_simple(&e) {
                    lits.make_inexact();
                }
                lits.cross_forward(&mut lits2);
                if lits2.is_inexact() {
                    // If this expression couldn't yield any literal that
                    // could be extended, then we need to quit. Since we're
                    // short-circuiting, we also need to freeze every member.
                    lits.make_inexact();
                    break;
                }
            }
        }
        HirKind::Alternation(ref es) => {
            alternate_literals(es, lits, union_required);
        }
        // Everything else (e.g., `Empty` and look-around assertions) cannot
        // extend a literal.
        _ => lits.make_inexact(),
    }
}
/// Extracts literals from a repetition `e{min,max}` by delegating to `f`
/// and then forcing the result to be inexact.
fn repeat_range_literals<F: FnMut(&Hir, &mut Seq)>(
    e: &Hir,
    min: u32,
    _max: Option<u32>,
    _greedy: bool,
    lits: &mut Seq,
    mut f: F,
) {
    // A repetition that may match zero times (`e*`, `e{0,n}`) requires
    // nothing, so it contributes no literals. Otherwise we extract from a
    // single iteration of the inner expression only: `a{3}` yields `a`
    // rather than `aaa`, because inner literal extraction can't be unioned
    // across repetitions — extracting `foofoofoo` from `(\w+foo){3}` would
    // be wrong. (Treating `max` as unbounded here is conservative; a set
    // `max` could be unrolled into a finite alternation.)
    if min > 0 {
        f(e, lits);
    }
    // In every case the result is inexact: we either skipped the inner
    // expression entirely or looked at just one of its iterations.
    lits.make_inexact();
}
/// Folds the literals required by each alternation branch in `es` into
/// `lits`.
///
/// Every branch is extracted independently via `f` and the results are
/// unioned; only the longest common prefix (and, tentatively, suffix) of
/// all branches is then pushed into `lits`, since that is the only text
/// guaranteed to appear in every match of the alternation.
///
/// NOTE(review): mid-migration code. The live suffix handling below and
/// the commented-out original at the bottom should be reconciled — and the
/// FIXME'd size limit restored — before this is relied upon.
fn alternate_literals<F: FnMut(&Hir, &mut Seq)>(
    es: &[Hir],
    lits: &mut Seq,
    mut f: F,
) {
    let mut lits2 = Seq::empty();
    for e in es {
        let mut lits3 = Seq::empty();
        // FIXME
        // lits3.set_limit_size(lits.limit_size() / 5);
        f(e, &mut lits3);
        // If any branch yields nothing, the alternation as a whole requires
        // nothing; give up on this subtree entirely.
        if lits3.is_empty() {
            lits.make_inexact();
            return;
        }
        lits2.union(&mut lits3);
    }
    // All we do at the moment is look for prefixes and suffixes. If both
    // are empty, then we report nothing. We should be able to do better than
    // this, but we'll need something more expressive than just a "set of
    // literals."
    if let Some(lcp) = lits2.longest_common_prefix() {
        lits.cross_forward(&mut Seq::new([lcp]));
    }
    lits.make_inexact();
    if let Some(lcs) = lits2.longest_common_suffix() {
        lits.push(Literal::exact([]));
        lits.push(Literal::exact(lcs));
    }
    /*
    let lcp = lits2.longest_common_prefix();
    let lcs = lits2.longest_common_suffix();
    if !lcp.is_empty() {
        lits.cross_forward(lcp);
    }
    lits.make_inexact();
    if !lcs.is_empty() {
        lits.push(Literal::exact([]));
        lits.push(Literal::exact(lcs));
    }
    */
}
/// Reports whether literal extraction may continue through `expr` while
/// walking a concatenation.
///
/// An expression is "simple" when it is built purely from empty nodes,
/// literals, classes, concatenations and alternations; look-arounds,
/// capture groups and repetitions are not.
fn is_simple(expr: &Hir) -> bool {
    // Deliberately exhaustive (no `_` arm) so that a new `HirKind` variant
    // forces a decision here.
    match *expr.kind() {
        HirKind::Look(_) | HirKind::Capture(_) | HirKind::Repetition(_) => {
            false
        }
        HirKind::Empty
        | HirKind::Literal(_)
        | HirKind::Class(_)
        | HirKind::Concat(_)
        | HirKind::Alternation(_) => true,
    }
}
/*
/// Return the number of characters in the given class.
fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
}
/// Return the number of bytes in the given class.
fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
}
*/
/// Returns true if and only if at least one literal in the given set
/// consists entirely of whitespace. (An empty literal vacuously counts as
/// all-whitespace.)
fn has_only_whitespace(lits: &[Literal]) -> bool {
    lits.iter()
        .any(|lit| lit.as_bytes().chars().all(|c| c.is_whitespace()))
}
/// Extracts a sequence of prefix literals from `hir`, optimized by
/// preference, with debug logging before and after optimization.
fn prefixes(hir: &Hir) -> Seq {
    let mut ext = literal::Extractor::new();
    ext.kind(literal::ExtractKind::Prefix);
    let mut seq = ext.extract(hir);
    log::debug!(
        "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
        seq.len(),
        seq.is_exact(),
        seq
    );
    seq.optimize_for_prefix_by_preference();
    log::debug!(
        "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
        seq.len(),
        seq.is_exact(),
        seq
    );
    seq
}
/// Extracts a sequence of suffix literals from `hir`, optimized by
/// preference, with debug logging before and after optimization.
fn suffixes(hir: &Hir) -> Seq {
    let mut ext = literal::Extractor::new();
    ext.kind(literal::ExtractKind::Suffix);
    let mut seq = ext.extract(hir);
    log::debug!(
        "suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
        seq.len(),
        seq.is_exact(),
        seq
    );
    seq.optimize_for_suffix_by_preference();
    log::debug!(
        "suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
        seq.len(),
        seq.is_exact(),
        seq
    );
    seq
}
#[cfg(test)]
mod tests {
    use super::LiteralSets;
    use regex_syntax::Parser;

    // Parses `pattern` and builds the literal sets from its HIR.
    fn sets(pattern: &str) -> LiteralSets {
        let hir = Parser::new().parse(pattern).unwrap();
        LiteralSets::new(&hir)
    }

    // Extracts the candidate prefilter regex for `pattern`.
    // NOTE(review): the `false` argument's meaning is defined on
    // `one_regex`'s declaration (presumably a word-matching flag) — confirm
    // there.
    fn one_regex(pattern: &str) -> Option<String> {
        sets(pattern).one_regex(false)
    }

    // Put a pattern into the same format as the one returned by `one_regex`.
    fn pat(pattern: &str) -> Option<String> {
        Some(format!("(?-u:{})", pattern))
    }

    #[test]
    fn various() {
        // Obviously no literals.
        assert!(one_regex(r"\w").is_none());
        assert!(one_regex(r"\pL").is_none());
        // Tantalizingly close.
        assert!(one_regex(r"\w|foo").is_none());
        // There's a literal, but it's better if the regex engine handles it
        // internally.
        assert!(one_regex(r"abc").is_none());
        // Core use cases.
        assert_eq!(one_regex(r"\wabc\w"), pat("abc"));
        assert_eq!(one_regex(r"abc\w"), pat("abc"));
        // TODO: Make these pass. We're missing some potentially big wins
        // without these.
        // assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz"));
        // assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz"));
    }

    #[test]
    fn regression_1064() {
        // Regression from:
        // https://github.com/BurntSushi/ripgrep/issues/1064
        // assert_eq!(one_regex(r"a.*c"), pat("a"));
        assert_eq!(one_regex(r"a(.*c)"), pat("a"));
    }

    #[test]
    fn regression_1319() {
        // Regression from:
        // https://github.com/BurntSushi/ripgrep/issues/1319
        assert_eq!(
            one_regex(r"TTGAGTCCAGGAG[ATCG]{2}C"),
            pat("TTGAGTCCAGGAG"),
        );
    }

    #[test]
    fn regression_1537() {
        // Regression from:
        // https://github.com/BurntSushi/ripgrep/issues/1537
        assert_eq!(one_regex(r";(.*,)"), pat(";"));
        assert_eq!(one_regex(r";((.*,))"), pat(";"));
        assert_eq!(one_regex(r";(.*,)+"), pat(";"),);
        assert_eq!(one_regex(r";(.*,){1}"), pat(";"),);
    }
}

View File

@ -1036,7 +1036,9 @@ mod tests {
}
// Test that finding candidate lines works as expected.
// FIXME: Re-enable this test once inner literal extraction works.
#[test]
#[ignore]
fn candidate_lines() {
fn is_confirmed(m: LineMatchKind) -> bool {
match m {

View File

@ -1,6 +1,6 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use aho_corasick::{AhoCorasick, MatchKind};
use grep_matcher::{Match, Matcher, NoError};
use regex_syntax::hir::Hir;
use regex_syntax::hir::{Hir, HirKind};
use crate::error::Error;
use crate::matcher::RegexCaptures;
@ -23,10 +23,9 @@ impl MultiLiteralMatcher {
pub fn new<B: AsRef<[u8]>>(
literals: &[B],
) -> Result<MultiLiteralMatcher, Error> {
let ac = AhoCorasickBuilder::new()
let ac = AhoCorasick::builder()
.match_kind(MatchKind::LeftmostFirst)
.auto_configure(literals)
.build_with_size::<usize, _, _>(literals)
.build(literals)
.map_err(Error::regex)?;
Ok(MultiLiteralMatcher { ac })
}
@ -79,13 +78,11 @@ impl Matcher for MultiLiteralMatcher {
/// Alternation literals checks if the given HIR is a simple alternation of
/// literals, and if so, returns them. Otherwise, this returns None.
pub fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
use regex_syntax::hir::{HirKind, Literal};
// This is pretty hacky, but basically, if `is_alternation_literal` is
// true, then we can make several assumptions about the structure of our
// HIR. This is what justifies the `unreachable!` statements below.
if !expr.is_alternation_literal() {
if !expr.properties().is_alternation_literal() {
return None;
}
let alts = match *expr.kind() {
@ -93,26 +90,16 @@ pub fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
_ => return None, // one literal isn't worth it
};
let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
Literal::Unicode(c) => {
let mut buf = [0; 4];
dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
}
Literal::Byte(b) => {
dst.push(b);
}
};
let mut lits = vec![];
for alt in alts {
let mut lit = vec![];
match *alt.kind() {
HirKind::Empty => {}
HirKind::Literal(ref x) => extendlit(x, &mut lit),
HirKind::Literal(ref x) => lit.extend_from_slice(&x.0),
HirKind::Concat(ref exprs) => {
for e in exprs {
match *e.kind() {
HirKind::Literal(ref x) => extendlit(x, &mut lit),
HirKind::Literal(ref x) => lit.extend_from_slice(&x.0),
_ => unreachable!("expected literal, got {:?}", e),
}
}

View File

@ -1,6 +1,10 @@
use grep_matcher::ByteSet;
use regex_syntax::hir::{self, Hir, HirKind};
use regex_syntax::utf8::Utf8Sequences;
use {
grep_matcher::ByteSet,
regex_syntax::{
hir::{self, Hir, HirKind, Look},
utf8::Utf8Sequences,
},
};
/// Return a confirmed set of non-matching bytes from the given expression.
pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
@ -13,18 +17,28 @@ pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
/// the given expression.
fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
match *expr.kind() {
HirKind::Empty | HirKind::WordBoundary(_) => {}
HirKind::Anchor(_) => {
HirKind::Empty
// | HirKind::Look(Look::Start | Look::End)
| HirKind::Look(Look::WordAscii | Look::WordAsciiNegate)
| HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {}
HirKind::Look(Look::Start | Look::End) => {
// FIXME: This is wrong, but not doing this leads to incorrect
// results because of how anchored searches are implemented in
// the 'grep-searcher' crate.
set.remove(b'\n');
}
HirKind::Literal(hir::Literal::Unicode(c)) => {
for &b in c.encode_utf8(&mut [0; 4]).as_bytes() {
HirKind::Look(Look::StartLF | Look::EndLF) => {
set.remove(b'\n');
}
HirKind::Look(Look::StartCRLF | Look::EndCRLF) => {
set.remove(b'\r');
set.remove(b'\n');
}
HirKind::Literal(hir::Literal(ref lit)) => {
for &b in lit.iter() {
set.remove(b);
}
}
HirKind::Literal(hir::Literal::Byte(b)) => {
set.remove(b);
}
HirKind::Class(hir::Class::Unicode(ref cls)) => {
for range in cls.iter() {
// This is presumably faster than encoding every codepoint
@ -42,10 +56,10 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
}
}
HirKind::Repetition(ref x) => {
remove_matching_bytes(&x.hir, set);
remove_matching_bytes(&x.sub, set);
}
HirKind::Group(ref x) => {
remove_matching_bytes(&x.hir, set);
HirKind::Capture(ref x) => {
remove_matching_bytes(&x.sub, set);
}
HirKind::Concat(ref xs) => {
for x in xs {
@ -62,17 +76,13 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
#[cfg(test)]
mod tests {
use grep_matcher::ByteSet;
use regex_syntax::ParserBuilder;
use {grep_matcher::ByteSet, regex_syntax::ParserBuilder};
use super::non_matching_bytes;
fn extract(pattern: &str) -> ByteSet {
let expr = ParserBuilder::new()
.allow_invalid_utf8(true)
.build()
.parse(pattern)
.unwrap();
let expr =
ParserBuilder::new().utf8(false).build().parse(pattern).unwrap();
non_matching_bytes(&expr)
}
@ -131,9 +141,13 @@ mod tests {
#[test]
fn anchor() {
// FIXME: The first four tests below should correspond to a full set
// of bytes for the non-matching bytes I think.
assert_eq!(sparse(&extract(r"^")), sparse_except(&[b'\n']));
assert_eq!(sparse(&extract(r"$")), sparse_except(&[b'\n']));
assert_eq!(sparse(&extract(r"\A")), sparse_except(&[b'\n']));
assert_eq!(sparse(&extract(r"\z")), sparse_except(&[b'\n']));
assert_eq!(sparse(&extract(r"(?m)^")), sparse_except(&[b'\n']));
assert_eq!(sparse(&extract(r"(?m)$")), sparse_except(&[b'\n']));
}
}

View File

@ -42,17 +42,11 @@ fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result<Hir, Error> {
Ok(match expr.into_kind() {
HirKind::Empty => Hir::empty(),
HirKind::Literal(hir::Literal::Unicode(c)) => {
if c == chr {
HirKind::Literal(hir::Literal(lit)) => {
if lit.iter().find(|&&b| b == byte).is_some() {
return invalid();
}
Hir::literal(hir::Literal::Unicode(c))
}
HirKind::Literal(hir::Literal::Byte(b)) => {
if b as char == chr {
return invalid();
}
Hir::literal(hir::Literal::Byte(b))
Hir::literal(lit)
}
HirKind::Class(hir::Class::Unicode(mut cls)) => {
let remove = hir::ClassUnicode::new(Some(
@ -74,15 +68,14 @@ fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result<Hir, Error> {
}
Hir::class(hir::Class::Bytes(cls))
}
HirKind::Anchor(x) => Hir::anchor(x),
HirKind::WordBoundary(x) => Hir::word_boundary(x),
HirKind::Look(x) => Hir::look(x),
HirKind::Repetition(mut x) => {
x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?);
x.sub = Box::new(strip_from_match_ascii(*x.sub, byte)?);
Hir::repetition(x)
}
HirKind::Group(mut x) => {
x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?);
Hir::group(x)
HirKind::Capture(mut x) => {
x.sub = Box::new(strip_from_match_ascii(*x.sub, byte)?);
Hir::capture(x)
}
HirKind::Concat(xs) => {
let xs = xs
@ -131,11 +124,11 @@ mod tests {
#[test]
fn various() {
assert_eq!(roundtrip(r"[a\n]", b'\n'), "[a]");
assert_eq!(roundtrip(r"[a\n]", b'a'), "[\n]");
assert_eq!(roundtrip_crlf(r"[a\n]"), "[a]");
assert_eq!(roundtrip_crlf(r"[a\r]"), "[a]");
assert_eq!(roundtrip_crlf(r"[a\r\n]"), "[a]");
assert_eq!(roundtrip(r"[a\n]", b'\n'), "a");
assert_eq!(roundtrip(r"[a\n]", b'a'), "\n");
assert_eq!(roundtrip_crlf(r"[a\n]"), "a");
assert_eq!(roundtrip_crlf(r"[a\r]"), "a");
assert_eq!(roundtrip_crlf(r"[a\r\n]"), "a");
assert_eq!(roundtrip(r"(?-u)\s", b'a'), r"(?-u:[\x09-\x0D\x20])");
assert_eq!(roundtrip(r"(?-u)\s", b'\n'), r"(?-u:[\x09\x0B-\x0D\x20])");

View File

@ -1,5 +1,6 @@
/// Converts an arbitrary sequence of bytes to a literal suitable for building
/// a regular expression.
#[allow(dead_code)]
pub fn bytes_to_regex(bs: &[u8]) -> String {
use regex_syntax::is_meta_character;
use std::fmt::Write;