progress

2025-06-30 22:23:44 +02:00 · 2016-06-22 21:19:02 -04:00
parent 0163b39faa
commit a3f609222c
3 changed files with 13290 additions and 46 deletions
--- a/grep/src/data/sherlock.txt
+++ b/grep/src/data/sherlock.txt
--- a/grep/src/lib.rs
+++ b/grep/src/lib.rs
@ -4,6 +4,7 @@ extern crate regex_syntax as syntax;
 use std::error;
 use std::fmt;
 use std::io;
 use std::result;
 pub use search::{Grep, GrepBuilder};
@ -26,6 +27,13 @@ pub enum Error {
    /// pattern. For example, if the line terminator is `\n` and the regex
    /// pattern is `\w+\n\w+`, then the presence of `\n` will cause this error.
    LiteralNotAllowed(char),
    /// This error occurs when a line exceeds the buffer size. The buffer
    /// size is given.
    LineTooLong(usize),
    /// An IO error occurred while searching.
    Io(io::Error),
    /// An unused enum variant that indicates this enum may be expanded in
    /// the future and therefore should not be exhaustively matched.
    #[doc(hidden)]
    __Nonexhaustive,
 }
@ -35,6 +43,8 @@ impl error::Error for Error {
        match *self {
            Error::Regex(ref err) => err.description(),
            Error::LiteralNotAllowed(_) => "use of forbidden literal",
            Error::LineTooLong(_) => "line exceeds buffer size",
            Error::Io(ref err) => err.description(),
            Error::__Nonexhaustive => unreachable!(),
        }
    }
@ -42,6 +52,7 @@ impl error::Error for Error {
    fn cause(&self) -> Option<&error::Error> {
        match *self {
            Error::Regex(ref err) => err.cause(),
            Error::Io(ref err) => err.cause(),
            _ => None,
        }
    }
@ -54,6 +65,11 @@ impl fmt::Display for Error {
            Error::LiteralNotAllowed(chr) => {
                write!(f, "Literal '{}' not allowed.", chr)
            }
            Error::LineTooLong(limit) => {
                write!(f, "Line exceeded buffer size of {} bytes, try \
                           searching with memory maps instead.", limit)
            }
            Error::Io(ref err) => err.fmt(f),
            Error::__Nonexhaustive => unreachable!(),
        }
    }
@ -70,3 +86,9 @@ impl From<syntax::Error> for Error {
        Error::Regex(regex::Error::Syntax(err))
    }
 }
 impl From<io::Error> for Error {
    fn from(err: io::Error) -> Error {
        Error::Io(err)
    }
 }
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@ -1,3 +1,4 @@
 use std::cmp;
 use std::io;
 use memchr::{memchr, memrchr};
@ -6,7 +7,52 @@ use syntax;
 use literals::LiteralSets;
 use nonl;
-use Result;
+use {Error, Result};
 /// A description of one matching line.
 ///
 /// All offsets are byte offsets. A freshly created value has every field at
 /// its default: zero offsets, no line number and no exact locations.
 #[derive(Clone, Debug, Default, Eq, PartialEq)]
 pub struct Match {
    // Byte offset where the matching line begins.
    start: usize,
    // Byte offset where the matching line ends.
    end: usize,
    // Line number of the match, when line numbers are being tracked.
    line: Option<usize>,
    // (start, end) byte offsets of each regex match inside the line, when
    // exact locations are being tracked.
    locations: Vec<(usize, usize)>,
 }
 impl Match {
    /// Build an empty match; equivalent to `Match::default()`.
    pub fn new() -> Match {
        Default::default()
    }
    /// The byte offset at which the matching line starts.
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }
    /// The byte offset at which the matching line ends.
    #[inline]
    pub fn end(&self) -> usize {
        self.end
    }
    /// The line number of this match.
    ///
    /// This is `None` unless line number tracking was turned on through
    /// `GrepBuilder`.
    #[inline]
    pub fn line(&self) -> Option<usize> {
        self.line
    }
    /// The start and end byte offsets of every regex match found in this
    /// line.
    ///
    /// The slice is always empty unless exact location tracking was turned
    /// on through `GrepBuilder`.
    #[inline]
    pub fn locations(&self) -> &[(usize, usize)] {
        &self.locations[..]
    }
 }
 #[derive(Clone, Debug)]
 pub struct Grep {
@ -164,6 +210,22 @@ impl Grep {
        }
    }
    /// Create a searcher that reads from `rdr` through the buffer `buf`.
    ///
    /// The returned `GrepBuffered` borrows this `Grep` and takes ownership of
    /// both the reader and the buffer. All of its position counters start at
    /// zero, so nothing is read until the first search is requested.
    pub fn buffered_reader<'g, R: io::Read>(
        &'g self,
        buf: Buffer,
        rdr: R,
    ) -> GrepBuffered<'g, R> {
        GrepBuffered {
            grep: self,
            b: buf,
            rdr: rdr,
            // No bytes have been read or searched yet.
            end: 0,
            lastnl: 0,
            start: 0,
            pos: 0,
        }
    }
    pub fn read_match(
        &self,
        mat: &mut Match,
@ -222,48 +284,133 @@ impl Grep {
    }
 }
-#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Buffer {
-pub struct Match {
+    buf: Vec<u8>,
-    start: usize,
+    tmp: Vec<u8>,
    end: usize,
    line: Option<usize>,
    locations: Vec<(usize, usize)>,
 }
-impl Match {
+impl Buffer {
-    pub fn new() -> Match {
+    pub fn new() -> Buffer {
-        Match::default()
+        Buffer::with_capacity(16 * (1<<10))
    }
-    /// Return the starting byte offset of the line that matched.
+    pub fn with_capacity(cap: usize) -> Buffer {
-    #[inline]
+        Buffer {
-    pub fn start(&self) -> usize {
+            buf: vec![0; cap],
-        self.start
+            tmp: Vec::new(),
        }
    }
 }
 /// Searches the bytes produced by a reader, one buffer-full at a time.
 ///
 /// Values of this type are created with `Grep::buffered_reader`.
 pub struct GrepBuffered<'g, R> {
    // The searcher used to look for matches in the buffered bytes.
    grep: &'g Grep,
    // The reader that supplies the bytes to search.
    rdr: R,
    // Storage for the bytes currently being searched.
    b: Buffer,
    // Offset added to a match's `start` and `end` to turn positions within
    // the buffer into positions within the whole input.
    pos: usize,
    // Position in the buffer at which the next search begins.
    start: usize,
    // Position just past the last line terminator found in the buffer (or
    // equal to `end` once the reader has run dry). Searches never look
    // beyond this position.
    lastnl: usize,
    // Number of valid bytes currently held in the buffer.
    end: usize,
 }
 impl<'g, R: io::Read> GrepBuffered<'g, R> {
    pub fn into_buffer(self) -> Buffer {
        self.b
    }
-    /// Return the ending byte offset of the line that matched.
+    pub fn iter<'b>(&'b mut self) -> IterBuffered<'b, 'g, R> {
-    #[inline]
+        IterBuffered { grep: self }
    pub fn end(&self) -> usize {
        self.end
    }
-    /// Return the line number that this match corresponds to.
+    pub fn read_match(
-    ///
+        &mut self,
-    /// Note that this is `None` if line numbers aren't being computed. Line
+        mat: &mut Match,
-    /// number tracking can be enabled using `GrepBuilder`.
+    ) -> Result<bool> {
-    #[inline]
+        loop {
-    pub fn line(&self) -> Option<usize> {
+            if self.start == self.lastnl {
-        self.line
+                if !try!(self.fill()) {
                    return Ok(false);
                }
            }
            let ok = self.grep.read_match(
                mat, &self.b.buf[..self.lastnl], self.start);
            if !ok {
                self.start = self.lastnl;
                continue;
            }
            // Move start to the first possible byte of the next line.
            self.start = cmp::min(
                self.lastnl, mat.end.checked_add(1).unwrap());
            mat.start += self.pos;
            mat.end += self.pos;
            return Ok(true);
        }
    }
-    /// Return the exact start and end locations (in byte offsets) of every
+    fn fill(&mut self) -> Result<bool> {
-    /// regex match in this line.
+        {
-    ///
+            // The buffer might have leftover bytes that have not been
-    /// Note that this always returns an empty slice if exact locations aren't
+            // searched yet. Leftovers correspond to all bytes proceding the
-    /// computed. Exact location tracking can be enabled using `GrepBuilder`.
+            // final \n in the current buffer.
-    #[inline]
+            //
-    pub fn locations(&self) -> &[(usize, usize)] {
+            // TODO(ag): Seems like we should be able to memmove from the end
-        &self.locations
+            // of the buffer to the beginning, but let's do it the stupid (but
            // safe) way for now.
            let leftovers = &self.b.buf[self.lastnl..self.end];
            self.b.tmp.clear();
            self.b.tmp.resize(leftovers.len(), 0);
            self.b.tmp.copy_from_slice(leftovers);
        }
        // Move the leftovers to the beginning of our buffer.
        self.b.buf[0..self.b.tmp.len()].copy_from_slice(&self.b.tmp);
        // Fill the rest with fresh bytes.
        let nread = try!(self.rdr.read(&mut self.b.buf[self.b.tmp.len()..]));
        // Now update our various positions.
        self.pos += self.start;
        println!("start: {:?}, pos: {:?}", self.start, self.pos);
        self.start = 0;
        // The end is the total number of bytes read plus whatever we had for
        // leftovers.
        self.end = self.b.tmp.len() + nread;
        // Find the last new line. All searches on this buffer will be capped
        // at this position since any proceding bytes may correspond to a
        // partial line.
        //
        // This is a little complicated because must handle the case where
        // the buffer is not full and no new line character could be found.
        // We detect this case because this could potentially be a partial
        // line. If we fill our buffer and still can't find a `\n`, then we
        // give up.
        let mut start = 0;
        let term = self.grep.opts.line_terminator;
        loop {
            match memrchr(term, &self.b.buf[start..self.end]) {
                Some(i) => {
                    self.lastnl = start + i + 1;
                    break;
                }
                None => {
                    // If we couldn't find a new line and our buffer is
                    // completely full, then this line is terribly long and we
                    // return an error.
                    if self.end == self.b.buf.len() {
                        return Err(Error::LineTooLong(self.b.buf.len()));
                    }
                    // Otherwise we try to ask for more bytes and look again.
                    let nread = try!(
                        self.rdr.read(&mut self.b.buf[self.end..]));
                    // If we got nothing than we're at EOF and we no longer
                    // need to care about leftovers.
                    if nread == 0 {
                        self.lastnl = self.end;
                        break;
                    }
                    start = self.end;
                    self.end += nread;
                }
            }
        }
        // If end is zero, then we've hit EOF and we have no leftovers.
        Ok(self.end > 0)
    }
 }
@ -287,21 +434,44 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
    }
 }
-pub struct GrepBuffered<'g, B> {
+pub struct IterBuffered<'b, 'g: 'b, R: 'b> {
-    grep: &'g Grep,
+    grep: &'b mut GrepBuffered<'g, R>,
    buf: B,
    start: usize,
 }
-impl<'g, B: BufRead> GrepBuffered {
+impl<'b, 'g, R: io::Read> Iterator for IterBuffered<'b, 'g, R> {
-    pub fn read_match(
+    type Item = Result<Match>;
-        &self,
+
-        mat: &mut Match,
+    fn next(&mut self) -> Option<Result<Match>> {
-    ) -> io::Result<bool> {
+        let mut mat = Match::default();
-        let buf = try!(self.buf.fill_buf());
+        match self.grep.read_match(&mut mat) {
-        if buf.is_empty() {
+            Err(err) => Some(Err(err)),
-            return Ok(false);
+            Ok(false) => None,
            Ok(true) => Some(Ok(mat)),
        }
-        Ok(false)
+    }
 }
 /// Copy `bytes` into an owned `String`.
 ///
 /// Panics if `bytes` is not valid UTF-8.
 #[allow(dead_code)]
 fn s(bytes: &[u8]) -> String {
    let owned = bytes.to_vec();
    String::from_utf8(owned).unwrap()
 }
 #[cfg(test)]
 mod tests {
    #![allow(unused_imports)]
    use super::{Buffer, GrepBuilder, s};
    // The text that the buffered search is run against.
    static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");
    /// Searching the text through a default-sized buffer must report every
    /// matching line, with offsets relative to the start of the whole text
    /// rather than to the buffer.
    #[test]
    fn buffered() {
        let grep = GrepBuilder::new("Sherlock Holmes").create().unwrap();
        let mut searcher = grep.buffered_reader(Buffer::new(), SHERLOCK);
        let found: Vec<_> = searcher.iter().map(|res| res.unwrap()).collect();
        let final_match = found.last().unwrap();
        assert_eq!(91, found.len());
        assert_eq!(575707, final_match.start());
        assert_eq!(575784, final_match.end());
    }
 }