From d4b77a8d8967ce1bf701ec65ceb9a75e85e5f2e0 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 14 Oct 2025 13:46:02 -0400 Subject: [PATCH] searcher: fix a performance bug with `-A/--after-context` Previously (with the previous commit): ``` $ cat bigger.txt | (time rg ZQZQZQZQZQ -A999) | wc -l real 2.321 user 0.674 sys 0.735 maxmem 30 MB faults 0 1000 $ cat bigger.txt | (time rg ZQZQZQZQZQ -A9999) | wc -l real 2.513 user 0.823 sys 0.686 maxmem 30 MB faults 0 10000 $ cat bigger.txt | (time rg ZQZQZQZQZQ -A99999) | wc -l real 5.067 user 3.254 sys 0.676 maxmem 30 MB faults 0 100000 $ cat bigger.txt | (time rg ZQZQZQZQZQ -A999999) | wc -l real 6.658 user 4.841 sys 0.778 maxmem 51 MB faults 0 1000000 ``` Now with this commit: ``` $ cat bigger.txt | (time rg ZQZQZQZQZQ -A999) | wc -l real 1.845 user 0.328 sys 0.757 maxmem 30 MB faults 0 1000 $ cat bigger.txt | (time rg ZQZQZQZQZQ -A9999) | wc -l real 1.917 user 0.334 sys 0.771 maxmem 30 MB faults 0 10000 $ cat bigger.txt | (time rg ZQZQZQZQZQ -A99999) | wc -l real 1.972 user 0.319 sys 0.812 maxmem 30 MB faults 0 100000 $ cat bigger.txt | (time rg ZQZQZQZQZQ -A999999) | wc -l real 2.005 user 0.333 sys 0.855 maxmem 30 MB faults 0 1000000 ``` And compare to GNU grep: ``` $ cat bigger.txt | (time grep ZQZQZQZQZQ -A999) | wc -l real 1.488 user 0.143 sys 0.866 maxmem 30 MB faults 0 1000 $ cat bigger.txt | (time grep ZQZQZQZQZQ -A9999) | wc -l real 1.697 user 0.170 sys 0.986 maxmem 30 MB faults 1 10000 $ cat bigger.txt | (time grep ZQZQZQZQZQ -A99999) | wc -l real 1.515 user 0.166 sys 0.856 maxmem 29 MB faults 0 100000 $ cat bigger.txt | (time grep ZQZQZQZQZQ -A999999) | wc -l real 1.490 user 0.174 sys 0.851 maxmem 30 MB faults 0 1000000 ``` Interestingly, GNU grep is still a bit faster. But both commands remain roughly invariant in search time as `-A` is increased. There is definitely something "odd" about searching `stdin`, where it seems substantially slower. We can also observe with GNU grep: ``` $ (time grep ZQZQZQZQZQ -A999999 bigger.txt) | wc -l real 0.692 user 0.184 sys 0.506 maxmem 30 MB faults 0 1000000 $ cat bigger.txt | (time grep ZQZQZQZQZQ -A999999) | wc -l real 1.700 user 0.201 sys 0.954 maxmem 30 MB faults 0 1000000 $ (time rg ZQZQZQZQZQ -A999999 bigger.txt) | wc -l real 0.640 user 0.428 sys 0.209 maxmem 7734 MB faults 0 1000000 $ (time rg ZQZQZQZQZQ --no-mmap -A999999 bigger.txt) | wc -l real 0.866 user 0.282 sys 0.581 maxmem 30 MB faults 0 1000000 $ cat bigger.txt | (time rg ZQZQZQZQZQ -A999999) | wc -l real 1.991 user 0.338 sys 0.819 maxmem 30 MB faults 0 1000000 ``` I wonder if this is related to my discovery in the previous commit where `read` calls on `stdin` seem to never return anything more than ~64K. Oh well, I'm satisfied at this point, especially given that GNU grep seems to do a lot worse than ripgrep with bigger values of `-B/--before-context`: ``` $ cat bigger.txt | (time grep ZQZQZQZQZQ -B9) | wc -l real 1.568 user 0.170 sys 0.885 maxmem 30 MB faults 0 1 $ cat bigger.txt | (time grep ZQZQZQZQZQ -B99) | wc -l real 1.734 user 0.338 sys 0.879 maxmem 30 MB faults 0 1 $ cat bigger.txt | (time grep ZQZQZQZQZQ -B999) | wc -l real 2.349 user 1.723 sys 0.620 maxmem 30 MB faults 0 1 $ cat bigger.txt | (time grep ZQZQZQZQZQ -B9999) | wc -l real 16.459 user 15.848 sys 0.586 maxmem 30 MB faults 0 1 $ time grep ZQZQZQZQZQ -B99999 bigger.txt ZQZQZQZQZQ real 1:45.06 user 1:44.12 sys 0.772 maxmem 30 MB faults 0 ``` The above pattern occurs regardless of whether you put `bigger.txt` on stdin or whether you search it directly. And now ripgrep: ``` $ cat bigger.txt | (time rg ZQZQZQZQZQ -B9) | wc -l real 1.965 user 0.326 sys 0.814 maxmem 29 MB faults 0 1 $ cat bigger.txt | (time rg ZQZQZQZQZQ -B99) | wc -l real 1.941 user 0.423 sys 0.813 maxmem 29 MB faults 0 1 $ cat bigger.txt | (time rg ZQZQZQZQZQ -B999) | wc -l real 2.372 user 0.759 sys 0.703 maxmem 30 MB faults 0 1 $ cat bigger.txt | (time rg ZQZQZQZQZQ -B9999) | wc -l real 2.638 user 0.895 sys 0.665 maxmem 29 MB faults 0 1 $ cat bigger.txt | (time rg ZQZQZQZQZQ -B99999) | wc -l real 5.172 user 3.282 sys 0.748 maxmem 29 MB faults 0 1 ``` NOTE: To get `bigger.txt`: ``` $ curl -LO 'https://burntsushi.net/stuff/opensubtitles/2018/en/sixteenth.txt.gz' $ gzip -d sixteenth.txt.gz $ (echo ZQZQZQZQZQ && for ((i=0;i<10;i++)); do cat sixteenth.txt; done) > bigger.txt ``` --- crates/searcher/src/searcher/core.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/searcher/src/searcher/core.rs b/crates/searcher/src/searcher/core.rs index 87eaf7f4..e0693c09 100644 --- a/crates/searcher/src/searcher/core.rs +++ b/crates/searcher/src/searcher/core.rs @@ -191,10 +191,14 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { // separator (when before_context==0 and after_context>0), we // need to know something about the position of the previous // line visited, even if we're at the beginning of the buffer. + // + // ... however, we only need to find the N preceding lines based + // on before context. We can skip this (potentially costly, for + // large values of N) step when before_context==0. let context_start = lines::preceding( buf, self.config.line_term.as_byte(), - self.config.max_context(), + self.config.before_context, ); let consumed = std::cmp::max(context_start, self.last_line_visited);