mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-04-24 17:12:16 +02:00
When a pattern with invalid UTF-8 is given, the error message suggests unqualified use of hex escape sequences to match arbitrary bytes. But you *also* need to disable Unicode mode. So include that in the error message. Fixes #1339
193 lines
5.7 KiB
Rust
193 lines
5.7 KiB
Rust
use std::error;
|
|
use std::ffi::OsStr;
|
|
use std::fmt;
|
|
use std::fs::File;
|
|
use std::io;
|
|
use std::path::Path;
|
|
use std::str;
|
|
|
|
use bstr::io::BufReadExt;
|
|
|
|
use escape::{escape, escape_os};
|
|
|
|
/// The error reported when a pattern cannot be interpreted as valid UTF-8.
///
/// It exists to provide a more targeted failure mode for patterns written by
/// end users that are not valid UTF-8, rather than a generic conversion
/// failure.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InvalidPatternError {
    /// The offending pattern, as rendered by `escape` or `escape_os`.
    original: String,
    /// The byte index up to which the pattern was verified as valid UTF-8.
    valid_up_to: usize,
}
|
|
|
|
impl InvalidPatternError {
|
|
/// Returns the index in the given string up to which valid UTF-8 was
|
|
/// verified.
|
|
pub fn valid_up_to(&self) -> usize {
|
|
self.valid_up_to
|
|
}
|
|
}
|
|
|
|
impl error::Error for InvalidPatternError {
|
|
fn description(&self) -> &str {
|
|
"invalid pattern"
|
|
}
|
|
}
|
|
|
|
impl fmt::Display for InvalidPatternError {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
write!(
|
|
f,
|
|
"found invalid UTF-8 in pattern at byte offset {}: {} \
|
|
(disable Unicode mode and use hex escape sequences to match \
|
|
arbitrary bytes in a pattern, e.g., '(?-u)\\xFF')",
|
|
self.valid_up_to, self.original,
|
|
)
|
|
}
|
|
}
|
|
|
|
impl From<InvalidPatternError> for io::Error {
|
|
fn from(paterr: InvalidPatternError) -> io::Error {
|
|
io::Error::new(io::ErrorKind::Other, paterr)
|
|
}
|
|
}
|
|
|
|
/// Convert an OS string into a regular expression pattern.
|
|
///
|
|
/// This conversion fails if the given pattern is not valid UTF-8, in which
|
|
/// case, a targeted error with more information about where the invalid UTF-8
|
|
/// occurs is given. The error also suggests the use of hex escape sequences,
|
|
/// which are supported by many regex engines.
|
|
pub fn pattern_from_os(pattern: &OsStr) -> Result<&str, InvalidPatternError> {
|
|
pattern.to_str().ok_or_else(|| {
|
|
let valid_up_to = pattern
|
|
.to_string_lossy()
|
|
.find('\u{FFFD}')
|
|
.expect("a Unicode replacement codepoint for invalid UTF-8");
|
|
InvalidPatternError { original: escape_os(pattern), valid_up_to }
|
|
})
|
|
}
|
|
|
|
/// Convert arbitrary bytes into a regular expression pattern.
|
|
///
|
|
/// This conversion fails if the given pattern is not valid UTF-8, in which
|
|
/// case, a targeted error with more information about where the invalid UTF-8
|
|
/// occurs is given. The error also suggests the use of hex escape sequences,
|
|
/// which are supported by many regex engines.
|
|
pub fn pattern_from_bytes(
|
|
pattern: &[u8],
|
|
) -> Result<&str, InvalidPatternError> {
|
|
str::from_utf8(pattern).map_err(|err| InvalidPatternError {
|
|
original: escape(pattern),
|
|
valid_up_to: err.valid_up_to(),
|
|
})
|
|
}
|
|
|
|
/// Read patterns from a file path, one per line.
|
|
///
|
|
/// If there was a problem reading or if any of the patterns contain invalid
|
|
/// UTF-8, then an error is returned. If there was a problem with a specific
|
|
/// pattern, then the error message will include the line number and the file
|
|
/// path.
|
|
pub fn patterns_from_path<P: AsRef<Path>>(path: P) -> io::Result<Vec<String>> {
|
|
let path = path.as_ref();
|
|
let file = File::open(path).map_err(|err| {
|
|
io::Error::new(
|
|
io::ErrorKind::Other,
|
|
format!("{}: {}", path.display(), err),
|
|
)
|
|
})?;
|
|
patterns_from_reader(file).map_err(|err| {
|
|
io::Error::new(
|
|
io::ErrorKind::Other,
|
|
format!("{}:{}", path.display(), err),
|
|
)
|
|
})
|
|
}
|
|
|
|
/// Read patterns from stdin, one per line.
|
|
///
|
|
/// If there was a problem reading or if any of the patterns contain invalid
|
|
/// UTF-8, then an error is returned. If there was a problem with a specific
|
|
/// pattern, then the error message will include the line number and the fact
|
|
/// that it came from stdin.
|
|
pub fn patterns_from_stdin() -> io::Result<Vec<String>> {
|
|
let stdin = io::stdin();
|
|
let locked = stdin.lock();
|
|
patterns_from_reader(locked).map_err(|err| {
|
|
io::Error::new(io::ErrorKind::Other, format!("<stdin>:{}", err))
|
|
})
|
|
}
|
|
|
|
/// Read patterns from any reader, one per line.
|
|
///
|
|
/// If there was a problem reading or if any of the patterns contain invalid
|
|
/// UTF-8, then an error is returned. If there was a problem with a specific
|
|
/// pattern, then the error message will include the line number.
|
|
///
|
|
/// Note that this routine uses its own internal buffer, so the caller should
|
|
/// not provide their own buffered reader if possible.
|
|
///
|
|
/// # Example
|
|
///
|
|
/// This shows how to parse patterns, one per line.
|
|
///
|
|
/// ```
|
|
/// use grep_cli::patterns_from_reader;
|
|
///
|
|
/// # fn example() -> Result<(), Box<::std::error::Error>> {
|
|
/// let patterns = "\
|
|
/// foo
|
|
/// bar\\s+foo
|
|
/// [a-z]{3}
|
|
/// ";
|
|
///
|
|
/// assert_eq!(patterns_from_reader(patterns.as_bytes())?, vec![
|
|
/// r"foo",
|
|
/// r"bar\s+foo",
|
|
/// r"[a-z]{3}",
|
|
/// ]);
|
|
/// # Ok(()) }
|
|
/// ```
|
|
pub fn patterns_from_reader<R: io::Read>(rdr: R) -> io::Result<Vec<String>> {
|
|
let mut patterns = vec![];
|
|
let mut line_number = 0;
|
|
io::BufReader::new(rdr).for_byte_line(|line| {
|
|
line_number += 1;
|
|
match pattern_from_bytes(line) {
|
|
Ok(pattern) => {
|
|
patterns.push(pattern.to_string());
|
|
Ok(true)
|
|
}
|
|
Err(err) => Err(io::Error::new(
|
|
io::ErrorKind::Other,
|
|
format!("{}: {}", line_number, err),
|
|
)),
|
|
}
|
|
})?;
|
|
Ok(patterns)
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A byte pattern with an invalid byte reports the offset of that byte.
    #[test]
    fn bytes() {
        let invalid = b"abc\xFFxyz";
        let offset = pattern_from_bytes(invalid).unwrap_err().valid_up_to();
        assert_eq!(3, offset);
    }

    /// An OS string pattern with an invalid byte reports the offset of that
    /// byte. Unix-only because the OS string is built from raw bytes.
    #[test]
    #[cfg(unix)]
    fn os() {
        use std::ffi::OsStr;
        use std::os::unix::ffi::OsStrExt;

        let invalid = OsStr::from_bytes(b"abc\xFFxyz");
        let offset = pattern_from_os(invalid).unwrap_err().valid_up_to();
        assert_eq!(3, offset);
    }
}
|