mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-03-23 04:34:39 +02:00
154 lines
5.2 KiB
Rust
154 lines
5.2 KiB
Rust
use grep_matcher::LineTerminator;
|
|
use regex_syntax::hir::{self, Hir, HirKind};
|
|
|
|
use crate::error::{Error, ErrorKind};
|
|
|
|
/// Return an HIR that is guaranteed to never match the given line terminator,
|
|
/// if possible.
|
|
///
|
|
/// If the transformation isn't possible, then an error is returned.
|
|
///
|
|
/// In general, if a literal line terminator occurs anywhere in the HIR, then
|
|
/// this will return an error. However, if the line terminator occurs within
|
|
/// a character class with at least one other character (that isn't also a line
|
|
/// terminator), then the line terminator is simply stripped from that class.
|
|
///
|
|
/// If the given line terminator is not ASCII, then this function returns an
|
|
/// error.
|
|
pub fn strip_from_match(
|
|
expr: Hir,
|
|
line_term: LineTerminator,
|
|
) -> Result<Hir, Error> {
|
|
if line_term.is_crlf() {
|
|
let expr1 = strip_from_match_ascii(expr, b'\r')?;
|
|
strip_from_match_ascii(expr1, b'\n')
|
|
} else {
|
|
let b = line_term.as_byte();
|
|
if b > 0x7F {
|
|
return Err(Error::new(ErrorKind::InvalidLineTerminator(b)));
|
|
}
|
|
strip_from_match_ascii(expr, b)
|
|
}
|
|
}
|
|
|
|
/// The implementation of strip_from_match. The given byte must be ASCII. This
|
|
/// function panics otherwise.
|
|
fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result<Hir, Error> {
|
|
assert!(byte <= 0x7F);
|
|
let chr = byte as char;
|
|
assert_eq!(chr.len_utf8(), 1);
|
|
|
|
let invalid = || Err(Error::new(ErrorKind::NotAllowed(chr.to_string())));
|
|
|
|
Ok(match expr.into_kind() {
|
|
HirKind::Empty => Hir::empty(),
|
|
HirKind::Literal(hir::Literal::Unicode(c)) => {
|
|
if c == chr {
|
|
return invalid();
|
|
}
|
|
Hir::literal(hir::Literal::Unicode(c))
|
|
}
|
|
HirKind::Literal(hir::Literal::Byte(b)) => {
|
|
if b as char == chr {
|
|
return invalid();
|
|
}
|
|
Hir::literal(hir::Literal::Byte(b))
|
|
}
|
|
HirKind::Class(hir::Class::Unicode(mut cls)) => {
|
|
let remove = hir::ClassUnicode::new(Some(
|
|
hir::ClassUnicodeRange::new(chr, chr),
|
|
));
|
|
cls.difference(&remove);
|
|
if cls.ranges().is_empty() {
|
|
return invalid();
|
|
}
|
|
Hir::class(hir::Class::Unicode(cls))
|
|
}
|
|
HirKind::Class(hir::Class::Bytes(mut cls)) => {
|
|
let remove = hir::ClassBytes::new(Some(
|
|
hir::ClassBytesRange::new(byte, byte),
|
|
));
|
|
cls.difference(&remove);
|
|
if cls.ranges().is_empty() {
|
|
return invalid();
|
|
}
|
|
Hir::class(hir::Class::Bytes(cls))
|
|
}
|
|
HirKind::Anchor(x) => Hir::anchor(x),
|
|
HirKind::WordBoundary(x) => Hir::word_boundary(x),
|
|
HirKind::Repetition(mut x) => {
|
|
x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?);
|
|
Hir::repetition(x)
|
|
}
|
|
HirKind::Group(mut x) => {
|
|
x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?);
|
|
Hir::group(x)
|
|
}
|
|
HirKind::Concat(xs) => {
|
|
let xs = xs
|
|
.into_iter()
|
|
.map(|e| strip_from_match_ascii(e, byte))
|
|
.collect::<Result<Vec<Hir>, Error>>()?;
|
|
Hir::concat(xs)
|
|
}
|
|
HirKind::Alternation(xs) => {
|
|
let xs = xs
|
|
.into_iter()
|
|
.map(|e| strip_from_match_ascii(e, byte))
|
|
.collect::<Result<Vec<Hir>, Error>>()?;
|
|
Hir::alternation(xs)
|
|
}
|
|
})
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use regex_syntax::Parser;

    use super::{strip_from_match, LineTerminator};
    use crate::error::Error;

    /// Parses `pattern`, strips `line_term` from it and returns the
    /// printed form of the resulting expression.
    ///
    /// Panics if `pattern` itself fails to parse.
    fn roundtrip_line_term(
        pattern: &str,
        line_term: LineTerminator,
    ) -> Result<String, Error> {
        let parsed = Parser::new().parse(pattern).unwrap();
        strip_from_match(parsed, line_term)
            .map(|stripped| stripped.to_string())
    }

    /// Like `roundtrip_line_term`, with a single-byte line terminator.
    fn roundtrip_err(pattern: &str, byte: u8) -> Result<String, Error> {
        roundtrip_line_term(pattern, LineTerminator::byte(byte))
    }

    /// Like `roundtrip_err`, but panics if stripping fails.
    fn roundtrip(pattern: &str, byte: u8) -> String {
        roundtrip_err(pattern, byte).unwrap()
    }

    /// Strips CRLF from `pattern`, panicking if stripping fails.
    fn roundtrip_crlf(pattern: &str) -> String {
        roundtrip_line_term(pattern, LineTerminator::crlf()).unwrap()
    }

    #[test]
    fn various() {
        // (pattern, byte to strip, expected printed expression)
        let stripped: [(&str, u8, &str); 4] = [
            (r"[a\n]", b'\n', "[a]"),
            (r"[a\n]", b'a', "[\n]"),
            (r"(?-u)\s", b'a', r"(?-u:[\x09-\x0D\x20])"),
            (r"(?-u)\s", b'\n', r"(?-u:[\x09\x0B-\x0D\x20])"),
        ];
        for &(pattern, byte, expected) in &stripped {
            assert_eq!(roundtrip(pattern, byte), expected);
        }

        // With CRLF, both \r and \n are removed from classes.
        for &pattern in &[r"[a\n]", r"[a\r]", r"[a\r\n]"] {
            assert_eq!(roundtrip_crlf(pattern), "[a]");
        }

        // Every spelling of a literal line feed must be rejected.
        let rejected: [&str; 9] = [
            r"\n",
            r"abc\n",
            r"\nabc",
            r"abc\nxyz",
            r"\x0A",
            r"\u000A",
            r"\U0000000A",
            r"\u{A}",
            "\n",
        ];
        for &pattern in &rejected {
            assert!(
                roundtrip_err(pattern, b'\n').is_err(),
                "expected {:?} to be rejected",
                pattern
            );
        }
    }
}
|