1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-06-04 05:57:39 +02:00
ripgrep/crates/regex/src/non_matching.rs
Andrew Gallant e028ea3792 regex: migrate grep-regex to regex-automata
We just do a "basic" dumb migration. We don't try to improve anything
here.
2023-07-05 14:04:29 -04:00

153 lines
4.8 KiB
Rust

use {
grep_matcher::ByteSet,
regex_syntax::{
hir::{self, Hir, HirKind, Look},
utf8::Utf8Sequences,
},
};
/// Return a confirmed set of non-matching bytes from the given expression.
pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
let mut set = ByteSet::full();
remove_matching_bytes(expr, &mut set);
set
}
/// Remove any bytes from the given set that can occur in a matched produced by
/// the given expression.
fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
match *expr.kind() {
HirKind::Empty
| HirKind::Look(Look::WordAscii | Look::WordAsciiNegate)
| HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {}
HirKind::Look(Look::Start | Look::End) => {
// FIXME: This is wrong, but not doing this leads to incorrect
// results because of how anchored searches are implemented in
// the 'grep-searcher' crate.
set.remove(b'\n');
}
HirKind::Look(Look::StartLF | Look::EndLF) => {
set.remove(b'\n');
}
HirKind::Look(Look::StartCRLF | Look::EndCRLF) => {
set.remove(b'\r');
set.remove(b'\n');
}
HirKind::Literal(hir::Literal(ref lit)) => {
for &b in lit.iter() {
set.remove(b);
}
}
HirKind::Class(hir::Class::Unicode(ref cls)) => {
for range in cls.iter() {
// This is presumably faster than encoding every codepoint
// to UTF-8 and then removing those bytes from the set.
for seq in Utf8Sequences::new(range.start(), range.end()) {
for byte_range in seq.as_slice() {
set.remove_all(byte_range.start, byte_range.end);
}
}
}
}
HirKind::Class(hir::Class::Bytes(ref cls)) => {
for range in cls.iter() {
set.remove_all(range.start(), range.end());
}
}
HirKind::Repetition(ref x) => {
remove_matching_bytes(&x.sub, set);
}
HirKind::Capture(ref x) => {
remove_matching_bytes(&x.sub, set);
}
HirKind::Concat(ref xs) => {
for x in xs {
remove_matching_bytes(x, set);
}
}
HirKind::Alternation(ref xs) => {
for x in xs {
remove_matching_bytes(x, set);
}
}
}
}
#[cfg(test)]
mod tests {
use {grep_matcher::ByteSet, regex_syntax::ParserBuilder};
use super::non_matching_bytes;
fn extract(pattern: &str) -> ByteSet {
let expr =
ParserBuilder::new().utf8(false).build().parse(pattern).unwrap();
non_matching_bytes(&expr)
}
fn sparse(set: &ByteSet) -> Vec<u8> {
let mut sparse_set = vec![];
for b in (0..256).map(|b| b as u8) {
if set.contains(b) {
sparse_set.push(b);
}
}
sparse_set
}
fn sparse_except(except: &[u8]) -> Vec<u8> {
let mut except_set = vec![false; 256];
for &b in except {
except_set[b as usize] = true;
}
let mut set = vec![];
for b in (0..256).map(|b| b as u8) {
if !except_set[b as usize] {
set.push(b);
}
}
set
}
#[test]
fn dot() {
assert_eq!(
sparse(&extract(".")),
vec![
b'\n', 192, 193, 245, 246, 247, 248, 249, 250, 251, 252, 253,
254, 255,
]
);
assert_eq!(
sparse(&extract("(?s).")),
vec![
192, 193, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
255,
]
);
assert_eq!(sparse(&extract("(?-u).")), vec![b'\n']);
assert_eq!(sparse(&extract("(?s-u).")), vec![]);
}
#[test]
fn literal() {
assert_eq!(sparse(&extract("a")), sparse_except(&[b'a']));
assert_eq!(sparse(&extract("")), sparse_except(&[0xE2, 0x98, 0x83]));
assert_eq!(sparse(&extract(r"\xFF")), sparse_except(&[0xC3, 0xBF]));
assert_eq!(sparse(&extract(r"(?-u)\xFF")), sparse_except(&[0xFF]));
}
#[test]
fn anchor() {
// FIXME: The first four tests below should correspond to a full set
// of bytes for the non-matching bytes I think.
assert_eq!(sparse(&extract(r"^")), sparse_except(&[b'\n']));
assert_eq!(sparse(&extract(r"$")), sparse_except(&[b'\n']));
assert_eq!(sparse(&extract(r"\A")), sparse_except(&[b'\n']));
assert_eq!(sparse(&extract(r"\z")), sparse_except(&[b'\n']));
assert_eq!(sparse(&extract(r"(?m)^")), sparse_except(&[b'\n']));
assert_eq!(sparse(&extract(r"(?m)$")), sparse_except(&[b'\n']));
}
}