use grep_matcher::ByteSet;
use regex_syntax::hir::{self, Hir, HirKind};
use regex_syntax::utf8::Utf8Sequences;

/// Return a confirmed set of non-matching bytes from the given expression.
pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
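    // Start with the full set of 256 byte values and remove every byte that
    // could occur in a match; whatever remains is guaranteed non-matching.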
    let mut set = ByteSet::full();
    remove_matching_bytes(expr, &mut set);
    set
}
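
// A minimal usage sketch (illustrative, not part of the original source):
// parse a pattern into an `Hir` and ask which bytes can never occur in any
// of its matches. For `foo|bar`, the returned set still contains `b'\n'`,
// since a line feed can never appear inside a match.
//
//     let expr = regex_syntax::ParserBuilder::new()
//         .allow_invalid_utf8(true)
//         .build()
//         .parse("foo|bar")
//         .unwrap();
//     let set = non_matching_bytes(&expr);
//     assert!(set.contains(b'\n'));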

/// Remove any bytes from the given set that can occur in a match produced by
/// the given expression.
fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
    match *expr.kind() {
        HirKind::Empty | HirKind::Anchor(_) | HirKind::WordBoundary(_) => {}
        HirKind::Literal(hir::Literal::Unicode(c)) => {
            for &b in c.encode_utf8(&mut [0; 4]).as_bytes() {
                set.remove(b);
            }
        }
        HirKind::Literal(hir::Literal::Byte(b)) => {
            set.remove(b);
        }
        HirKind::Class(hir::Class::Unicode(ref cls)) => {
            for range in cls.iter() {
                // This is presumably faster than encoding every codepoint
                // to UTF-8 and then removing those bytes from the set.
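                //
                // For example (an illustrative note, not in the original
                // source): the single codepoint U+2603 (☃) yields one UTF-8
                // sequence [E2][98][83], so only the bytes 0xE2, 0x98 and
                // 0x83 are removed here.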
                for seq in Utf8Sequences::new(range.start(), range.end()) {
                    for byte_range in seq.as_slice() {
                        set.remove_all(byte_range.start, byte_range.end);
                    }
                }
            }
        }
        HirKind::Class(hir::Class::Bytes(ref cls)) => {
            for range in cls.iter() {
                set.remove_all(range.start(), range.end());
            }
        }
        HirKind::Repetition(ref x) => {
            remove_matching_bytes(&x.hir, set);
        }
        HirKind::Group(ref x) => {
            remove_matching_bytes(&x.hir, set);
        }
        HirKind::Concat(ref xs) => {
            for x in xs {
                remove_matching_bytes(x, set);
            }
        }
        HirKind::Alternation(ref xs) => {
            for x in xs {
                remove_matching_bytes(x, set);
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use grep_matcher::ByteSet;
    use regex_syntax::ParserBuilder;

    use super::non_matching_bytes;

    fn extract(pattern: &str) -> ByteSet {
        let expr = ParserBuilder::new()
            .allow_invalid_utf8(true)
            .build()
            .parse(pattern)
            .unwrap();
        non_matching_bytes(&expr)
    }

    fn sparse(set: &ByteSet) -> Vec<u8> {
        let mut sparse_set = vec![];
        for b in (0..256).map(|b| b as u8) {
            if set.contains(b) {
                sparse_set.push(b);
            }
        }
        sparse_set
    }

    fn sparse_except(except: &[u8]) -> Vec<u8> {
        let mut except_set = vec![false; 256];
        for &b in except {
            except_set[b as usize] = true;
        }

        let mut set = vec![];
        for b in (0..256).map(|b| b as u8) {
            if !except_set[b as usize] {
                set.push(b);
            }
        }
        set
    }

    #[test]
    fn dot() {
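        // Bytes 0xC0, 0xC1 and 0xF5-0xFF can never appear in valid UTF-8,
        // so they never match `.` in Unicode mode; `\n` is also excluded
        // because `.` does not match line terminators by default.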
        assert_eq!(
            sparse(&extract(".")),
            vec![
                b'\n', 192, 193, 245, 246, 247, 248, 249, 250, 251, 252, 253,
                254, 255,
            ]
        );
        assert_eq!(
            sparse(&extract("(?s).")),
            vec![
                192, 193, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
                255,
            ]
        );
        assert_eq!(sparse(&extract("(?-u).")), vec![b'\n']);
        assert_eq!(sparse(&extract("(?s-u).")), vec![]);
    }

    #[test]
    fn literal() {
        assert_eq!(sparse(&extract("a")), sparse_except(&[b'a']));
        assert_eq!(sparse(&extract("☃")), sparse_except(&[0xE2, 0x98, 0x83]));
        assert_eq!(sparse(&extract(r"\xFF")), sparse_except(&[0xC3, 0xBF]));
        assert_eq!(sparse(&extract(r"(?-u)\xFF")), sparse_except(&[0xFF]));
    }
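
    // An illustrative extra case (an addition, not part of the original
    // suite): classes and alternations only remove the bytes that can
    // actually occur in a match.
    #[test]
    fn class_and_alternation() {
        assert_eq!(
            sparse(&extract("[a-c]")),
            sparse_except(&[b'a', b'b', b'c'])
        );
        assert_eq!(
            sparse(&extract("foo|bar")),
            sparse_except(&[b'a', b'b', b'f', b'o', b'r'])
        );
    }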
}