diff --git a/benchsuite/benchsuite b/benchsuite/benchsuite index c6a87220..8ab233a8 100755 --- a/benchsuite/benchsuite +++ b/benchsuite/benchsuite @@ -544,7 +544,11 @@ def bench_subtitles_ru_literal(suite_dir): Command('rg (lines)', ['rg', '-n', pat, ru]), Command('ag (lines)', ['ag', '-s', pat, ru]), Command('grep (lines)', ['grep', '-n', pat, ru], env=GREP_ASCII), - Command('ugrep (lines)', ['ugrep', '-n', pat, ru]) + # ugrep incorrectly identifies this corpus as binary, but it is + # entirely valid UTF-8. So we tell ugrep to always treat the corpus + # as text even though this technically gives it an edge over other + # tools. (It no longer needs to check for binary data.) + Command('ugrep (lines)', ['ugrep', '-a', '-n', pat, ru]) ]) @@ -562,7 +566,8 @@ def bench_subtitles_ru_literal_casei(suite_dir): Command('grep (ASCII)', ['grep', '-E', '-i', pat, ru], env=GREP_ASCII), Command('rg (lines)', ['rg', '-n', '-i', pat, ru]), Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]), - Command('ugrep (lines) (ASCII)', ['ugrep', '-n', '-i', pat, ru]) + # See bench_subtitles_ru_literal for why we use '-a' here. + Command('ugrep (lines) (ASCII)', ['ugrep', '-a', '-n', '-i', pat, ru]) ]) @@ -586,7 +591,8 @@ def bench_subtitles_ru_literal_word(suite_dir): Command('grep (ASCII)', [ 'grep', '-nw', pat, ru, ], env=GREP_ASCII), - Command('ugrep (ASCII)', ['ugrep', '-nw', pat, ru]), + # See bench_subtitles_ru_literal for why we use '-a' here. + Command('ugrep (ASCII)', ['ugrep', '-anw', pat, ru]), Command('rg', ['rg', '-nw', pat, ru]), Command('grep', ['grep', '-nw', pat, ru], env=GREP_UNICODE), ]) @@ -610,7 +616,8 @@ def bench_subtitles_ru_alternate(suite_dir): Command('rg (lines)', ['rg', '-n', pat, ru]), Command('ag (lines)', ['ag', '-s', pat, ru]), Command('grep (lines)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII), - Command('ugrep (lines)', ['ugrep', '-n', pat, ru]), + # See bench_subtitles_ru_literal for why we use '-a' here. + Command('ugrep (lines)', ['ugrep', '-an', pat, ru]), Command('rg', ['rg', pat, ru]), Command('grep', ['grep', '-E', pat, ru], env=GREP_ASCII), ]) @@ -635,7 +642,8 @@ def bench_subtitles_ru_alternate_casei(suite_dir): Command('grep (ASCII)', [ 'grep', '-E', '-ni', pat, ru, ], env=GREP_ASCII), - Command('ugrep (ASCII)', ['ugrep', '-n', '-i', pat, ru]), + # See bench_subtitles_ru_literal for why we use '-a' here. + Command('ugrep (ASCII)', ['ugrep', '-ani', pat, ru]), Command('rg', ['rg', '-n', '-i', pat, ru]), Command('grep', ['grep', '-E', '-ni', pat, ru], env=GREP_UNICODE), ]) @@ -652,10 +660,11 @@ def bench_subtitles_ru_surrounding_words(suite_dir): return Benchmark(pattern=pat, commands=[ Command('rg', ['rg', '-n', pat, ru]), Command('grep', ['grep', '-E', '-n', pat, ru], env=GREP_UNICODE), - Command('ugrep', ['ugrep', '-n', pat, ru]), + Command('ugrep', ['ugrep', '-an', pat, ru]), Command('ag (ASCII)', ['ag', '-s', pat, ru]), Command('grep (ASCII)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII), - Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, ru]), + # See bench_subtitles_ru_literal for why we use '-a' here. + Command('ugrep (ASCII)', ['ugrep', '-a', '-n', '-U', pat, ru]), ]) @@ -674,11 +683,13 @@ def bench_subtitles_ru_no_literal(suite_dir): return Benchmark(pattern=pat, commands=[ Command('rg', ['rg', '-n', pat, ru]), - Command('ugrep', ['ugrep', '-n', pat, ru]), + # See bench_subtitles_ru_literal for why we use '-a' here. + Command('ugrep', ['ugrep', '-an', pat, ru]), Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]), Command('ag (ASCII)', ['ag', '-s', pat, ru]), Command('grep (ASCII)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII), - Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, ru]) + # See bench_subtitles_ru_literal for why we use '-a' here. + Command('ugrep (ASCII)', ['ugrep', '-anU', pat, ru]) ])