ripgrep/grep-regex/src/non_matching.rs

use grep_matcher::ByteSet;
use regex_syntax::hir::{self, Hir, HirKind};
use utf8_ranges::Utf8Sequences;

/// Compute the set of bytes that are confirmed never to appear in any match
/// of the given expression.
///
/// The result is built by starting from `ByteSet::full()` and discarding
/// every byte that `remove_matching_bytes` finds reachable in `expr`.
pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
    let mut candidates = ByteSet::full();
    remove_matching_bytes(expr, &mut candidates);
    candidates
}

/// Discard from `set` every byte that can occur in a match produced by
/// `expr`.
///
/// The expression tree is walked recursively: literals and character classes
/// remove their bytes directly, while composite nodes delegate to their
/// sub-expressions.
fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
    match *expr.kind() {
        // These kinds leave the set untouched.
        HirKind::Empty | HirKind::Anchor(_) | HirKind::WordBoundary(_) => {}
        HirKind::Literal(hir::Literal::Byte(byte)) => {
            set.remove(byte);
        }
        HirKind::Literal(hir::Literal::Unicode(ch)) => {
            // Each byte of the character's UTF-8 encoding is removed.
            let mut utf8 = [0u8; 4];
            for byte in ch.encode_utf8(&mut utf8).bytes() {
                set.remove(byte);
            }
        }
        HirKind::Class(hir::Class::Bytes(ref class)) => {
            for bytes in class.iter() {
                set.remove_all(bytes.start(), bytes.end());
            }
        }
        HirKind::Class(hir::Class::Unicode(ref class)) => {
            for scalars in class.iter() {
                // Working with UTF-8 byte-range sequences is presumably
                // faster than encoding each codepoint individually and
                // removing its bytes one at a time.
                let sequences =
                    Utf8Sequences::new(scalars.start(), scalars.end());
                for sequence in sequences {
                    for bytes in sequence.as_slice().iter() {
                        set.remove_all(bytes.start, bytes.end);
                    }
                }
            }
        }
        // Wrapper nodes: recurse into the single inner expression.
        HirKind::Repetition(ref rep) => {
            remove_matching_bytes(&rep.hir, set);
        }
        HirKind::Group(ref group) => {
            remove_matching_bytes(&group.hir, set);
        }
        // Both kinds carry a list of sub-expressions, and every one of them
        // is visited.
        HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => {
            subs.iter().for_each(|sub| remove_matching_bytes(sub, set));
        }
    }
}

#[cfg(test)]
mod tests {
    use grep_matcher::ByteSet;
    use regex_syntax::ParserBuilder;

    use super::non_matching_bytes;

    /// Parse `pattern` (with invalid UTF-8 allowed) and return the set of
    /// bytes reported as non-matching for it. Panics if parsing fails.
    fn extract(pattern: &str) -> ByteSet {
        let mut parser = ParserBuilder::new().allow_invalid_utf8(true).build();
        let hir = parser.parse(pattern).unwrap();
        non_matching_bytes(&hir)
    }

    /// List, in ascending order, every byte contained in `set`.
    fn sparse(set: &ByteSet) -> Vec<u8> {
        (0..256)
            .map(|b| b as u8)
            .filter(|&b| set.contains(b))
            .collect()
    }

    /// List, in ascending order, every byte that is not in `except`.
    fn sparse_except(except: &[u8]) -> Vec<u8> {
        (0..256)
            .map(|b| b as u8)
            .filter(|b| !except.contains(b))
            .collect()
    }

    #[test]
    fn dot() {
        // Bytes expected to stay non-matching in both Unicode-mode cases.
        let unicode_leftovers: Vec<u8> = vec![
            192, 193, 245, 246, 247, 248, 249,
            250, 251, 252, 253, 254, 255,
        ];
        // Without the `s` flag, `\n` is expected to be non-matching as well.
        let mut with_newline = vec![b'\n'];
        with_newline.extend_from_slice(&unicode_leftovers);

        assert_eq!(sparse(&extract(".")), with_newline);
        assert_eq!(sparse(&extract("(?s).")), unicode_leftovers);
        assert_eq!(sparse(&extract("(?-u).")), vec![b'\n']);
        assert_eq!(sparse(&extract("(?s-u).")), Vec::<u8>::new());
    }

    #[test]
    fn literal() {
        // `matched` lists the only bytes expected to be missing from the
        // non-matching set for `pattern`.
        let check = |pattern: &str, matched: &[u8]| {
            assert_eq!(sparse(&extract(pattern)), sparse_except(matched));
        };

        check("a", &[b'a']);
        check("☃", &[0xE2, 0x98, 0x83]);
        check(r"\xFF", &[0xC3, 0xBF]);
        check(r"(?-u)\xFF", &[0xFF]);
    }
}
libripgrep: initial commit introducing libripgrep. libripgrep is not any one library, but rather, a collection of libraries that roughly separate the following key distinct phases in a grep implementation: 1. Pattern matching (e.g., by a regex engine). 2. Searching a file using a pattern matcher. 3. Printing results. Ultimately, both (1) and (3) are defined by de-coupled interfaces, of which there may be multiple implementations. Namely, (1) is satisfied by the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by the `Sink` trait in the `grep2` crate. The searcher (2) ties everything together and finds results using a matcher and reports those results using a `Sink` implementation. Closes #162 2018-04-29 09:29:52 -04:00			`use grep_matcher::ByteSet;`
			`use regex_syntax::hir::{self, Hir, HirKind};`
			`use utf8_ranges::Utf8Sequences;`

			`/// Return a confirmed set of non-matching bytes from the given expression.`
			`pub fn non_matching_bytes(expr: &Hir) -> ByteSet {`
			`let mut set = ByteSet::full();`
			`remove_matching_bytes(expr, &mut set);`
			`set`
			`}`

			`/// Remove any bytes from the given set that can occur in a match produced by`
			`/// the given expression.`
			`fn remove_matching_bytes(`
			`expr: &Hir,`
			`set: &mut ByteSet,`
			`) {`
			`match *expr.kind() {`
			`HirKind::Empty`
			`\| HirKind::Anchor(_)`
			`\| HirKind::WordBoundary(_) => {}`
			`HirKind::Literal(hir::Literal::Unicode(c)) => {`
			`for &b in c.encode_utf8(&mut [0; 4]).as_bytes() {`
			`set.remove(b);`
			`}`
			`}`
			`HirKind::Literal(hir::Literal::Byte(b)) => {`
			`set.remove(b);`
			`}`
			`HirKind::Class(hir::Class::Unicode(ref cls)) => {`
			`for range in cls.iter() {`
			`// This is presumably faster than encoding every codepoint`
			`// to UTF-8 and then removing those bytes from the set.`
			`for seq in Utf8Sequences::new(range.start(), range.end()) {`
			`for byte_range in seq.as_slice() {`
			`set.remove_all(byte_range.start, byte_range.end);`
			`}`
			`}`
			`}`
			`}`
			`HirKind::Class(hir::Class::Bytes(ref cls)) => {`
			`for range in cls.iter() {`
			`set.remove_all(range.start(), range.end());`
			`}`
			`}`
			`HirKind::Repetition(ref x) => {`
			`remove_matching_bytes(&x.hir, set);`
			`}`
			`HirKind::Group(ref x) => {`
			`remove_matching_bytes(&x.hir, set);`
			`}`
			`HirKind::Concat(ref xs) => {`
			`for x in xs {`
			`remove_matching_bytes(x, set);`
			`}`
			`}`
			`HirKind::Alternation(ref xs) => {`
			`for x in xs {`
			`remove_matching_bytes(x, set);`
			`}`
			`}`
			`}`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use grep_matcher::ByteSet;`
			`use regex_syntax::ParserBuilder;`

			`use super::non_matching_bytes;`

			`fn extract(pattern: &str) -> ByteSet {`
			`let expr = ParserBuilder::new()`
			`.allow_invalid_utf8(true)`
			`.build()`
			`.parse(pattern)`
			`.unwrap();`
			`non_matching_bytes(&expr)`
			`}`

			`fn sparse(set: &ByteSet) -> Vec<u8> {`
			`let mut sparse_set = vec![];`
			`for b in (0..256).map(\|b\| b as u8) {`
			`if set.contains(b) {`
			`sparse_set.push(b);`
			`}`
			`}`
			`sparse_set`
			`}`

			`fn sparse_except(except: &[u8]) -> Vec<u8> {`
			`let mut except_set = vec![false; 256];`
			`for &b in except {`
			`except_set[b as usize] = true;`
			`}`

			`let mut set = vec![];`
			`for b in (0..256).map(\|b\| b as u8) {`
			`if !except_set[b as usize] {`
			`set.push(b);`
			`}`
			`}`
			`set`
			`}`

			`#[test]`
			`fn dot() {`
			`assert_eq!(sparse(&extract(".")), vec![`
			`b'\n',`
			`192, 193, 245, 246, 247, 248, 249,`
			`250, 251, 252, 253, 254, 255,`
			`]);`
			`assert_eq!(sparse(&extract("(?s).")), vec![`
			`192, 193, 245, 246, 247, 248, 249,`
			`250, 251, 252, 253, 254, 255,`
			`]);`
			`assert_eq!(sparse(&extract("(?-u).")), vec![b'\n']);`
			`assert_eq!(sparse(&extract("(?s-u).")), vec![]);`
			`}`

			`#[test]`
			`fn literal() {`
			`assert_eq!(sparse(&extract("a")), sparse_except(&[b'a']));`
			`assert_eq!(sparse(&extract("☃")), sparse_except(&[0xE2, 0x98, 0x83]));`
			`assert_eq!(sparse(&extract(r"\xFF")), sparse_except(&[0xC3, 0xBF]));`
			`assert_eq!(sparse(&extract(r"(?-u)\xFF")), sparse_except(&[0xFF]));`
			`}`
			`}`