1
0
mirror of https://github.com/BurntSushi/ripgrep.git synced 2025-01-19 05:49:14 +02:00

searcher: partially migrate to bstr

This commit causes grep-searcher to use byte strings internally for its
line buffer support. We manage to remove a use of `unsafe` by doing this
(by pushing it down into `bstr`).

We stop short of using byte strings everywhere else because we rely
heavily on the `impl ops::Index<[u8]> for grep_matcher::Match` impl,
which isn't available for byte strings. (It is premature to make bstr a
public dep of a core crate like grep-matcher, but maybe some day.)
This commit is contained in:
Andrew Gallant 2019-04-03 13:04:52 -04:00
parent bef1f0e770
commit 85417e52e9
No known key found for this signature in database
GPG Key ID: B2E3A4923F8B0D44
6 changed files with 63 additions and 89 deletions

View File

@ -13,12 +13,12 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
license = "Unlicense/MIT"
[dependencies]
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
bytecount = "0.5"
encoding_rs = "0.8.14"
encoding_rs_io = "0.1.4"
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
log = "0.4.5"
memchr = "2.1"
memmap = "0.7"
[dev-dependencies]

View File

@ -99,13 +99,13 @@ searches stdin.
#![deny(missing_docs)]
extern crate bstr;
extern crate bytecount;
extern crate encoding_rs;
extern crate encoding_rs_io;
extern crate grep_matcher;
#[macro_use]
extern crate log;
extern crate memchr;
extern crate memmap;
#[cfg(test)]
extern crate regex;

View File

@ -1,8 +1,7 @@
use std::cmp;
use std::io;
use std::ptr;
use memchr::{memchr, memrchr};
use bstr::{BStr, BString};
/// The default buffer capacity that we use for the line buffer.
pub(crate) const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1<<10); // 8 KB
@ -123,7 +122,7 @@ impl LineBufferBuilder {
pub fn build(&self) -> LineBuffer {
LineBuffer {
config: self.config,
buf: vec![0; self.config.capacity],
buf: BString::from(vec![0; self.config.capacity]),
pos: 0,
last_lineterm: 0,
end: 0,
@ -255,6 +254,12 @@ impl<'b, R: io::Read> LineBufferReader<'b, R> {
/// Return the contents of this buffer.
pub fn buffer(&self) -> &[u8] {
self.line_buffer.buffer().as_bytes()
}
/// Return the underlying buffer as a byte string. Used for tests only.
#[cfg(test)]
fn bstr(&self) -> &BStr {
self.line_buffer.buffer()
}
@ -284,7 +289,7 @@ pub struct LineBuffer {
/// The configuration of this buffer.
config: Config,
/// The primary buffer with which to hold data.
buf: Vec<u8>,
buf: BString,
/// The current position of this buffer. This is always a valid sliceable
/// index into `buf`, and its maximum value is the length of `buf`.
pos: usize,
@ -339,13 +344,13 @@ impl LineBuffer {
}
/// Return the contents of this buffer.
fn buffer(&self) -> &[u8] {
fn buffer(&self) -> &BStr {
&self.buf[self.pos..self.last_lineterm]
}
/// Return the contents of the free space beyond the end of the buffer as
/// a mutable slice.
fn free_buffer(&mut self) -> &mut [u8] {
fn free_buffer(&mut self) -> &mut BStr {
&mut self.buf[self.end..]
}
@ -396,7 +401,7 @@ impl LineBuffer {
assert_eq!(self.pos, 0);
loop {
self.ensure_capacity()?;
let readlen = rdr.read(self.free_buffer())?;
let readlen = rdr.read(self.free_buffer().as_bytes_mut())?;
if readlen == 0 {
// We're only done reading for good once the caller has
// consumed everything.
@ -416,7 +421,7 @@ impl LineBuffer {
match self.config.binary {
BinaryDetection::None => {} // nothing to do
BinaryDetection::Quit(byte) => {
if let Some(i) = memchr(byte, newbytes) {
if let Some(i) = newbytes.find_byte(byte) {
self.end = oldend + i;
self.last_lineterm = self.end;
self.binary_byte_offset =
@ -444,7 +449,7 @@ impl LineBuffer {
}
// Update our `last_lineterm` positions if we read one.
if let Some(i) = memrchr(self.config.lineterm, newbytes) {
if let Some(i) = newbytes.rfind_byte(self.config.lineterm) {
self.last_lineterm = oldend + i + 1;
return Ok(true);
}
@ -467,40 +472,8 @@ impl LineBuffer {
return;
}
assert!(self.pos < self.end && self.end <= self.buf.len());
let roll_len = self.end - self.pos;
unsafe {
// SAFETY: A buffer contains Copy data, so there's no problem
// moving it around. Safety also depends on our indices being
// in bounds, which they should always be, and we enforce with
// an assert above.
//
// It seems like it should be possible to do this in safe code that
// results in the same codegen. I tried the obvious:
//
// for (src, dst) in (self.pos..self.end).zip(0..) {
// self.buf[dst] = self.buf[src];
// }
//
// But the above does not work, and in fact compiles down to a slow
// byte-by-byte loop. I tried a few other minor variations, but
// alas, better minds might prevail.
//
// Overall, this doesn't save us *too* much. It mostly matters when
// the number of bytes we're copying is large, which can happen
// if the searcher is asked to produce a lot of context. We could
// decide this isn't worth it, but it does make an appreciable
// impact at or around the context=30 range on my machine.
//
// We could also use a temporary buffer that compiles down to two
// memcpys and is faster than the byte-at-a-time loop, but it
// complicates our options for limiting memory allocation a bit.
ptr::copy(
self.buf[self.pos..].as_ptr(),
self.buf.as_mut_ptr(),
roll_len,
);
}
self.buf.copy_within(self.pos.., 0);
self.pos = 0;
self.last_lineterm = roll_len;
self.end = roll_len;
@ -536,14 +509,15 @@ impl LineBuffer {
}
}
/// Replaces `src` with `replacement` in bytes.
fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option<usize> {
/// Replaces `src` with `replacement` in bytes, and return the offset of the
/// first replacement, if one exists.
fn replace_bytes(bytes: &mut BStr, src: u8, replacement: u8) -> Option<usize> {
if src == replacement {
return None;
}
let mut first_pos = None;
let mut pos = 0;
while let Some(i) = memchr(src, &bytes[pos..]).map(|i| pos + i) {
while let Some(i) = bytes[pos..].find_byte(src).map(|i| pos + i) {
if first_pos.is_none() {
first_pos = Some(i);
}
@ -560,6 +534,7 @@ fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option<usize> {
#[cfg(test)]
mod tests {
use std::str;
use bstr::BString;
use super::*;
const SHERLOCK: &'static str = "\
@ -575,18 +550,14 @@ and exhibited clearly, with a label attached.\
slice.to_string()
}
fn btos(slice: &[u8]) -> &str {
str::from_utf8(slice).unwrap()
}
fn replace_str(
slice: &str,
src: u8,
replacement: u8,
) -> (String, Option<usize>) {
let mut dst = slice.to_string().into_bytes();
let mut dst = BString::from(slice);
let result = replace_bytes(&mut dst, src, replacement);
(String::from_utf8(dst).unwrap(), result)
(dst.into_string().unwrap(), result)
}
#[test]
@ -607,7 +578,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\n");
assert_eq!(rdr.bstr(), "homer\nlisa\n");
assert_eq!(rdr.absolute_byte_offset(), 0);
rdr.consume(5);
assert_eq!(rdr.absolute_byte_offset(), 5);
@ -615,7 +586,7 @@ and exhibited clearly, with a label attached.\
assert_eq!(rdr.absolute_byte_offset(), 11);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "maggie");
assert_eq!(rdr.bstr(), "maggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -630,7 +601,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -645,7 +616,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\n");
assert_eq!(rdr.bstr(), "\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -660,7 +631,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\n\n");
assert_eq!(rdr.bstr(), "\n\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -698,12 +669,12 @@ and exhibited clearly, with a label attached.\
let mut linebuf = LineBufferBuilder::new().capacity(1).build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
let mut got = vec![];
let mut got = BString::new();
while rdr.fill().unwrap() {
got.extend(rdr.buffer());
got.push(rdr.buffer());
rdr.consume_all();
}
assert_eq!(bytes, btos(&got));
assert_eq!(bytes, got);
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), None);
}
@ -718,11 +689,11 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\n");
assert_eq!(rdr.bstr(), "homer\n");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "lisa\n");
assert_eq!(rdr.bstr(), "lisa\n");
rdr.consume_all();
// This returns an error because while we have just enough room to
@ -732,11 +703,11 @@ and exhibited clearly, with a label attached.\
assert!(rdr.fill().is_err());
// We can mush on though!
assert_eq!(btos(rdr.buffer()), "m");
assert_eq!(rdr.bstr(), "m");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "aggie");
assert_eq!(rdr.bstr(), "aggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -752,16 +723,16 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\n");
assert_eq!(rdr.bstr(), "homer\n");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "lisa\n");
assert_eq!(rdr.bstr(), "lisa\n");
rdr.consume_all();
// We have just enough space.
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "maggie");
assert_eq!(rdr.bstr(), "maggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -777,7 +748,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().is_err());
assert_eq!(btos(rdr.buffer()), "");
assert_eq!(rdr.bstr(), "");
}
#[test]
@ -789,7 +760,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nli\x00sa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nli\x00sa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -808,7 +779,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nli");
assert_eq!(rdr.bstr(), "homer\nli");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -825,7 +796,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(!rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "");
assert_eq!(rdr.bstr(), "");
assert_eq!(rdr.absolute_byte_offset(), 0);
assert_eq!(rdr.binary_byte_offset(), Some(0));
}
@ -841,7 +812,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -860,7 +831,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -878,7 +849,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\
assert_eq!(rdr.bstr(), "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, s\
");
@ -901,7 +872,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nli\nsa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nli\nsa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -920,7 +891,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\nhomer\nlisa\nmaggie\n");
assert_eq!(rdr.bstr(), "\nhomer\nlisa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -939,7 +910,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@ -958,7 +929,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());

View File

@ -2,8 +2,8 @@
A collection of routines for performing operations on lines.
*/
use bstr::B;
use bytecount;
use memchr::{memchr, memrchr};
use grep_matcher::{LineTerminator, Match};
/// An iterator over lines in a particular slice of bytes.
@ -85,7 +85,7 @@ impl LineStep {
#[inline(always)]
fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> {
bytes = &bytes[..self.end];
match memchr(self.line_term, &bytes[self.pos..]) {
match B(&bytes[self.pos..]).find_byte(self.line_term) {
None => {
if self.pos < bytes.len() {
let m = (self.pos, bytes.len());
@ -135,14 +135,16 @@ pub fn locate(
line_term: u8,
range: Match,
) -> Match {
let line_start = memrchr(line_term, &bytes[0..range.start()])
let line_start = B(&bytes[..range.start()])
.rfind_byte(line_term)
.map_or(0, |i| i + 1);
let line_end =
if range.end() > line_start && bytes[range.end() - 1] == line_term {
range.end()
} else {
memchr(line_term, &bytes[range.end()..])
.map_or(bytes.len(), |i| range.end() + i + 1)
B(&bytes[range.end()..])
.find_byte(line_term)
.map_or(bytes.len(), |i| range.end() + i + 1)
};
Match::new(line_start, line_end)
}
@ -180,7 +182,7 @@ fn preceding_by_pos(
pos -= 1;
}
loop {
match memrchr(line_term, &bytes[..pos]) {
match B(&bytes[..pos]).rfind_byte(line_term) {
None => {
return 0;
}

View File

@ -1,6 +1,6 @@
use std::cmp;
use memchr::memchr;
use bstr::B;
use grep_matcher::{LineMatchKind, Matcher};
use lines::{self, LineStep};
@ -149,7 +149,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
BinaryDetection::Quit(b) => b,
_ => return false,
};
if let Some(i) = memchr(binary_byte, &buf[*range]) {
if let Some(i) = B(&buf[*range]).find_byte(binary_byte) {
self.binary_byte_offset = Some(range.start() + i);
true
} else {

View File

@ -1,10 +1,10 @@
use std::io::{self, Write};
use std::str;
use bstr::B;
use grep_matcher::{
LineMatchKind, LineTerminator, Match, Matcher, NoCaptures, NoError,
};
use memchr::memchr;
use regex::bytes::{Regex, RegexBuilder};
use searcher::{BinaryDetection, Searcher, SearcherBuilder};
@ -94,7 +94,8 @@ impl Matcher for RegexMatcher {
}
// Make it interesting and return the last byte in the current
// line.
let i = memchr(self.line_term.unwrap().as_byte(), haystack)
let i = B(haystack)
.find_byte(self.line_term.unwrap().as_byte())
.map(|i| i)
.unwrap_or(haystack.len() - 1);
Ok(Some(LineMatchKind::Candidate(i)))