config: switch to using bstrs

This lets us implement correct Unicode trimming and also simplifies the parsing logic a bit. This also removes the last platform-specific bits of code in ripgrep core.
2025-09-16 08:26:28 +02:00 · 2019-04-04 15:14:29 -04:00
parent 5e50a3c43c
commit 26a83c6301
4 changed files with 17 additions and 48 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -46,6 +46,7 @@ members = [
 ]

 [dependencies]
+bstr = "0.1.2"
 grep = { version = "0.2.3", path = "grep" }
 ignore = { version = "0.4.4", path = "ignore" }
 lazy_static = "1.1.0"
--- a/GUIDE.md
+++ b/GUIDE.md
@@ -525,9 +525,9 @@ config file. Once the environment variable is set, open the file and just type
 in the flags you want set automatically. There are only two rules for
 describing the format of the config file:

-1. Every line is a shell argument, after trimming ASCII whitespace.
-2. Lines starting with `#` (optionally preceded by any amount of
-   ASCII whitespace) are ignored.
+1. Every line is a shell argument, after trimming whitespace.
+2. Lines starting with `#` (optionally preceded by any amount of whitespace)
+are ignored.

 In particular, there is no escaping. Each line is given to ripgrep as a single
 command line argument verbatim.
--- a/doc/rg.1.txt.tpl
+++ b/doc/rg.1.txt.tpl
@@ -107,9 +107,9 @@ ripgrep supports reading configuration files that change ripgrep's default
 behavior. The format of the configuration file is an "rc" style and is very
 simple. It is defined by two rules:

-    1. Every line is a shell argument, after trimming ASCII whitespace.
+    1. Every line is a shell argument, after trimming whitespace.
    2. Lines starting with *#* (optionally preceded by any amount of
-       ASCII whitespace) are ignored.
+       whitespace) are ignored.

 ripgrep will look for a single configuration file if and only if the
 *RIPGREP_CONFIG_PATH* environment variable is set and is non-empty.
--- a/src/config.rs
+++ b/src/config.rs
@@ -5,10 +5,11 @@
 use std::env;
 use std::error::Error;
 use std::fs::File;
-use std::io::{self, BufRead};
+use std::io;
 use std::ffi::OsString;
 use std::path::{Path, PathBuf};

+use bstr::io::BufReadExt;
 use log;

 use crate::Result;
@@ -76,62 +77,29 @@ fn parse<P: AsRef<Path>>(
 fn parse_reader<R: io::Read>(
    rdr: R,
 ) -> Result<(Vec<OsString>, Vec<Box<Error>>)> {
-    let mut bufrdr = io::BufReader::new(rdr);
+    let bufrdr = io::BufReader::new(rdr);
    let (mut args, mut errs) = (vec![], vec![]);
-    let mut line = vec![];
    let mut line_number = 0;
-    while {
-        line.clear();
+    bufrdr.for_byte_line_with_terminator(|line| {
        line_number += 1;
-        bufrdr.read_until(b'\n', &mut line)? > 0
-    } {
-        trim(&mut line);
+
+        let line = line.trim();
        if line.is_empty() || line[0] == b'#' {
-            continue;
+            return Ok(true);
        }
-        match bytes_to_os_string(&line) {
+        match line.to_os_str() {
            Ok(osstr) => {
-                args.push(osstr);
+                args.push(osstr.to_os_string());
            }
            Err(err) => {
                errs.push(format!("{}: {}", line_number, err).into());
            }
        }
-    }
+        Ok(true)
+    })?;
    Ok((args, errs))
 }

-/// Trim the given bytes of whitespace according to the ASCII definition.
-fn trim(x: &mut Vec<u8>) {
-    let upto = x.iter().take_while(|b| is_space(**b)).count();
-    x.drain(..upto);
-    let revto = x.len() - x.iter().rev().take_while(|b| is_space(**b)).count();
-    x.drain(revto..);
-}
-
-/// Returns true if and only if the given byte is an ASCII space character.
-fn is_space(b: u8) -> bool {
-    b == b'\t'
-    || b == b'\n'
-    || b == b'\x0B'
-    || b == b'\x0C'
-    || b == b'\r'
-    || b == b' '
-}
-
-/// On Unix, get an OsString from raw bytes.
-#[cfg(unix)]
-fn bytes_to_os_string(bytes: &[u8]) -> Result<OsString> {
-    use std::os::unix::ffi::OsStringExt;
-    Ok(OsString::from_vec(bytes.to_vec()))
-}
-
-/// On non-Unix (like Windows), require UTF-8.
-#[cfg(not(unix))]
-fn bytes_to_os_string(bytes: &[u8]) -> Result<OsString> {
-    String::from_utf8(bytes.to_vec()).map(OsString::from).map_err(From::from)
-}
-
 #[cfg(test)]
 mod tests {
    use std::ffi::OsString;