mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2024-12-12 19:18:24 +02:00
0e46171e3b
We try to reduce the pressure on regexes and offload some of it to Aho-Corasick or exact lookups.
1084 lines
37 KiB
Python
Executable File
1084 lines
37 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
'''
|
|
benchsuite is a benchmark runner for comparing command line search tools.
|
|
'''
|
|
|
|
import argparse
|
|
import csv
|
|
import os
|
|
import os.path as path
|
|
from multiprocessing import cpu_count
|
|
import re
|
|
import statistics
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
|
|
# Some constants for identifying the corpora we use to run tests.
|
|
# We establish two very different kinds of corpora: a small number of large
|
|
# files and a large number of small files. These are vastly different use cases
|
|
# not only because of their performance characteristics, but also the
|
|
# strategies used to increase the relevance of results returned.
|
|
|
|
# Directory (relative to the suite directory) and file names of the
# OpenSubtitles corpora, plus the URLs they are fetched from.
SUBTITLES_DIR = 'subtitles'
SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en'
SUBTITLES_EN_NAME_SAMPLE = 'OpenSubtitles2016.raw.sample.en'
SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME
SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz'
SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru'
SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME
SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz'

# Directory (relative to the suite directory) of the Linux kernel checkout
# and the repository it is cloned from. GitHub no longer serves the
# unauthenticated git:// protocol, so the clone must go over HTTPS.
LINUX_DIR = 'linux'
LINUX_CLONE = 'https://github.com/BurntSushi/linux'

# Grep takes locale settings from the environment. There is a *substantial*
# performance impact for enabling Unicode, so we need to handle this explicitly
# in our benchmarks.
GREP_ASCII = {'LC_ALL': 'C'}
GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'}
|
|
|
|
|
|
def bench_linux_literal_default(suite_dir):
    '''
    Time a plain literal search with every tool left at its defaults.

    The comparison is intentionally unfair: each tool does something
    different out of the box. It is still a useful illustration of how
    those defaults differ.
    '''
    require(suite_dir, 'linux')
    pat = 'PM_RESUME'
    linux_root = path.join(suite_dir, LINUX_DIR)

    def in_tree(label, argv, **opts):
        # Every command is executed from the root of the kernel checkout.
        opts['cwd'] = linux_root
        return Command(label, argv, **opts)

    commands = [
        in_tree('rg', ['rg', pat]),
        in_tree('ag', ['ag', pat]),
        # ucg does not read gitignore files, yet its built-in file
        # whitelist happens to select the same files for this search, so
        # it reports the same matches as rg and ag.
        in_tree('ucg', ['ucg', pat]),
        # LC_ALL=en_US.UTF-8 is not necessarily the default everywhere,
        # but it is a reasonable guess for most desktop systems.
        in_tree('git grep', ['git', 'grep', pat],
                env={'LC_ALL': 'en_US.UTF-8'}),
        in_tree('pt', ['pt', pat]),
        # sift reports one extra line: a match inside a binary file.
        in_tree('sift', ['sift', pat]),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_linux_literal(suite_dir):
    '''
    Time a plain literal search while trying to treat every tool fairly.

    Each tool gets the smallest set of flags that puts them all on equal
    footing: matching is case sensitive and line numbers are computed.
    '''
    require(suite_dir, 'linux')
    pat = 'PM_RESUME'
    linux_root = path.join(suite_dir, LINUX_DIR)

    def in_tree(label, argv, **opts):
        # Every command is executed from the root of the kernel checkout.
        opts['cwd'] = linux_root
        return Command(label, argv, **opts)

    git_grep_argv = ['git', 'grep', '-I', '-n', pat]
    sift_argv = ['sift', '-n', '--binary-skip', '--exclude-files', '.*', pat]
    commands = [
        in_tree('rg', ['rg', '-n', pat]),
        in_tree('rg (mmap)', ['rg', '-n', '--mmap', pat]),
        in_tree('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
        in_tree('ag (mmap)', ['ag', '-s', pat]),
        in_tree('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
        in_tree('git grep', git_grep_argv, env={'LC_ALL': 'C'}),
        in_tree('pt', ['pt', pat]),
        in_tree('sift', sift_argv),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_linux_literal_casei(suite_dir):
    '''
    Benchmark the speed of a case insensitive literal search.

    This is like the linux_literal benchmark, except we ask the
    search tools to do case insensitive search.

    :param str suite_dir:
        The directory containing the corpora.
    :returns: The benchmark, with every command run from the Linux checkout.
    :rtype: Benchmark
    :raises MissingDependencies: If the Linux corpus is not available.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = 'PM_RESUME'

    def mkcmd(*args, **kwargs):
        # Every command is executed from the root of the kernel checkout.
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', '-i', pat]),
        # The mmap variant must actually request memory maps; without
        # --mmap it merely repeats the plain `rg` measurement above.
        mkcmd('rg (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
        mkcmd('rg (whitelist)', [
            'rg', '-n', '-i', '--no-ignore', '-tall', pat,
        ]),
        mkcmd('ag (mmap)', ['ag', '-i', pat]),
        mkcmd('ucg', ['ucg', '-i', pat]),
        # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
        # since that is certainly what ripgrep is doing, but this is for an
        # ASCII literal, so we should give `git grep` all the opportunity to
        # do its best.
        mkcmd('git grep', [
            'git', 'grep', '-I', '-n', '-i', pat,
        ], env={'LC_ALL': 'C'}),
        # sift yields more matches than it should here. Specifically, it gets
        # matches in Module.symvers and System.map in the repo root. Both of
        # those files show up in the repo root's .gitignore file.
        mkcmd('sift', [
            'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-i', pat,
        ]),
    ])
|
|
|
|
|
|
def bench_linux_re_literal_suffix(suite_dir):
    '''
    Time a regex whose only literal sits at the end of the pattern.

    Because the literal is a suffix, this inhibits the prefix byte
    optimization in Go's regex engine (relevant for sift and pt).
    '''
    require(suite_dir, 'linux')
    pat = '[A-Z]+_RESUME'
    linux_root = path.join(suite_dir, LINUX_DIR)

    def in_tree(label, argv, **opts):
        # Every command is executed from the root of the kernel checkout.
        opts['cwd'] = linux_root
        return Command(label, argv, **opts)

    sift_flags = ['sift', '-n', '--binary-skip', '--exclude-files', '.*']
    commands = [
        in_tree('rg', ['rg', '-n', pat]),
        in_tree('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
        in_tree('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]),
        in_tree('ag', ['ag', '-s', pat]),
        in_tree('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
        in_tree('ucg', ['ucg', '--nosmart-case', pat]),
        in_tree('git grep', ['git', 'grep', '-E', '-I', '-n', pat],
                env={'LC_ALL': 'C'}),
        in_tree('sift', sift_flags + [pat]),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_linux_word(suite_dir):
    r'''
    Benchmark use of the -w ("match word") flag in each tool.

    sift has a lot of trouble with this because it forces it into Go's
    regex engine by surrounding the pattern with \b assertions.

    :param str suite_dir:
        The directory containing the corpora.
    :returns: The benchmark, with every command run from the Linux checkout.
    :rtype: Benchmark
    :raises MissingDependencies: If the Linux corpus is not available.
    '''
    # N.B. The docstring is a raw string so that `\b` stays a literal
    # backslash-b instead of turning into a backspace character.
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = 'PM_RESUME'

    def mkcmd(*args, **kwargs):
        # Every command is executed from the root of the kernel checkout.
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', '-w', pat]),
        mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-w', pat]),
        mkcmd('rg-novcs-mmap', [
            'rg', '--mmap', '--no-ignore', '-n', '-w', pat,
        ]),
        mkcmd('ag', ['ag', '-s', '-w', pat]),
        mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', '-w', pat]),
        mkcmd('ucg', ['ucg', '--nosmart-case', '-w', pat]),
        mkcmd(
            'git grep',
            ['git', 'grep', '-E', '-I', '-n', '-w', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('sift', [
            'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-w', pat,
        ]),
    ])
|
|
|
|
|
|
def bench_linux_unicode_greek(suite_dir):
    '''
    Time a search for a Unicode script class.

    Only three tools (ripgrep, sift and pt) support this.
    '''
    require(suite_dir, 'linux')
    pat = r'\p{Greek}'
    linux_root = path.join(suite_dir, LINUX_DIR)

    def in_tree(label, argv, **opts):
        # Every command is executed from the root of the kernel checkout.
        opts['cwd'] = linux_root
        return Command(label, argv, **opts)

    # Even with --binary-skip, sift searches a bunch of PDF files and
    # clutters up the results. They are excluded explicitly, which has no
    # measurable impact on performance.
    sift_argv = ['sift', '-n', '--binary-skip']
    sift_argv += ['--exclude-files', '.*']
    sift_argv += ['--exclude-files', '*.pdf']
    sift_argv += [pat]

    commands = [
        in_tree('rg', ['rg', '-n', pat]),
        in_tree('sift', sift_argv),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_linux_unicode_greek_casei(suite_dir):
    '''
    Benchmark matching of a Unicode category, case insensitively.

    Only ripgrep gets this right (and it's still fast).

    :param str suite_dir:
        The directory containing the corpora.
    :returns: The benchmark, with every command run from the Linux checkout.
    :rtype: Benchmark
    :raises MissingDependencies: If the Linux corpus is not available.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = r'\p{Greek}'

    def mkcmd(*args, **kwargs):
        # Every command is executed from the root of the kernel checkout.
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', '-i', pat]),
        # sift tries to search a bunch of PDF files and clutters up the
        # results, even though --binary-skip is provided. They are excluded
        # here explicitly, but don't have a measurable impact on performance.
        #
        # N.B. sift is asked for case insensitive matching (-i) like rg
        # above; without it this would just repeat the case sensitive
        # linux_unicode_greek measurement.
        mkcmd('sift', [
            'sift', '-n', '--binary-skip',
            '--exclude-files', '.*',
            '--exclude-files', '*.pdf',
            '-i',
            pat,
        ]),
    ])
|
|
|
|
|
|
def bench_linux_unicode_word(suite_dir):
    r'''
    Benchmark Unicode aware \w character class.

    Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get
    this right. Everything else uses the standard ASCII interpretation
    of \w.

    :param str suite_dir:
        The directory containing the corpora.
    :returns: The benchmark, with every command run from the Linux checkout.
    :rtype: Benchmark
    :raises MissingDependencies: If the Linux corpus is not available.
    '''
    # N.B. The docstring is a raw string because `\w` is not a valid
    # string escape and triggers a warning in a regular string literal.
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = r'\wAh'

    def mkcmd(*args, **kwargs):
        # Every command is executed from the root of the kernel checkout.
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', pat]),
        # (?-u) turns off Unicode mode inside ripgrep's regex.
        mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]),
        mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
        mkcmd('rg-novcs-mmap', [
            'rg', '--mmap', '--no-ignore', '-n', pat,
        ]),
        mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
        mkcmd('ag-novcs (no Unicode)', [
            'ag', '--skip-vcs-ignores', '-s', pat,
        ]),
        mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
        mkcmd(
            'git grep',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'en_US.UTF-8'},
        ),
        mkcmd(
            'git grep (no Unicode)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('sift (no Unicode)', [
            'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
        ]),
    ])
|
|
|
|
|
|
def bench_linux_no_literal(suite_dir):
    '''
    Time a regex that contains no literal at all.

    Most real patterns contain a literal that lets a search tool take
    shortcuts, so how representative this benchmark is remains somewhat
    suspicious. The suite would feel incomplete without it, though.
    '''
    require(suite_dir, 'linux')
    pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
    # The same pattern with ripgrep's Unicode mode switched off.
    ascii_pat = '(?-u)' + pat
    linux_root = path.join(suite_dir, LINUX_DIR)

    def in_tree(label, argv, **opts):
        # Every command is executed from the root of the kernel checkout.
        opts['cwd'] = linux_root
        return Command(label, argv, **opts)

    commands = [
        in_tree('rg', ['rg', '-n', pat]),
        in_tree('rg-whitelist', ['rg', '-tall', '--no-ignore', '-n', pat]),
        in_tree('rg (no Unicode)', ['rg', '-n', ascii_pat]),
        in_tree('rg-whitelist (no Unicode)',
                ['rg', '-tall', '--no-ignore', '-n', ascii_pat]),
        in_tree('ag (no Unicode)', ['ag', '-s', pat]),
        in_tree('ag-novcs (no Unicode)',
                ['ag', '--skip-vcs-ignores', '-s', pat]),
        in_tree('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
        in_tree('git grep', ['git', 'grep', '-E', '-I', '-n', pat],
                env={'LC_ALL': 'en_US.UTF-8'}),
        in_tree('git grep (no Unicode)',
                ['git', 'grep', '-E', '-I', '-n', pat],
                env={'LC_ALL': 'C'}),
        in_tree('sift (no Unicode)',
                ['sift', '-n', '--binary-skip', '--exclude-files', '.*', pat]),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_linux_alternates(suite_dir):
    '''
    Time a search for a handful of alternated literals.

    sift is left out on purpose: it is more than 10x slower than the
    next fastest result. The likely reason is that Go's regexp engine
    performs no literal optimization here, since the alternatives share
    no common leading byte.
    '''
    require(suite_dir, 'linux')
    pat = '|'.join(
        ['ERR_SYS', 'PME_TURN_OFF', 'LINK_REQ_RST', 'CFG_BME_EVT'])
    linux_root = path.join(suite_dir, LINUX_DIR)

    def in_tree(label, argv, **opts):
        # Every command is executed from the root of the kernel checkout.
        opts['cwd'] = linux_root
        return Command(label, argv, **opts)

    commands = [
        in_tree('rg', ['rg', '-n', pat]),
        in_tree('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
        in_tree('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]),
        in_tree('ag', ['ag', '-s', pat]),
        in_tree('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
        in_tree('ucg', ['ucg', '--nosmart-case', pat]),
        in_tree('git grep', ['git', 'grep', '-E', '-I', '-n', pat],
                env={'LC_ALL': 'C'}),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_linux_alternates_casei(suite_dir):
    'Time a case insensitive search for a handful of alternated literals.'
    require(suite_dir, 'linux')
    pat = '|'.join(
        ['ERR_SYS', 'PME_TURN_OFF', 'LINK_REQ_RST', 'CFG_BME_EVT'])
    linux_root = path.join(suite_dir, LINUX_DIR)

    def in_tree(label, argv, **opts):
        # Every command is executed from the root of the kernel checkout.
        opts['cwd'] = linux_root
        return Command(label, argv, **opts)

    commands = [
        in_tree('rg', ['rg', '-n', '-i', pat]),
        in_tree('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]),
        in_tree('rg-novcs-mmap',
                ['rg', '--mmap', '--no-ignore', '-n', '-i', pat]),
        in_tree('ag', ['ag', '-i', pat]),
        in_tree('ag-novcs', ['ag', '--skip-vcs-ignores', '-i', pat]),
        in_tree('ucg', ['ucg', '-i', pat]),
        in_tree('git grep', ['git', 'grep', '-E', '-I', '-n', '-i', pat],
                env={'LC_ALL': 'C'}),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_subtitles_en_literal(suite_dir):
    '''
    Time a search for an ASCII literal in the English subtitle sample.
    '''
    require(suite_dir, 'subtitles-en')
    pat = 'Sherlock Holmes'
    # The searches run against the sampled English corpus.
    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)

    commands = [
        Command('rg', ['rg', '-n', pat, en]),
        Command('rg (no line numbers)', ['rg', pat, en]),
        Command('ag', ['ag', '-s', pat, en]),
        Command('ucg', ['ucg', '--nosmart-case', pat, en]),
        Command('grep', ['grep', '-an', pat, en], env=GREP_ASCII),
        Command('grep (no line numbers)', ['grep', '-a', pat, en],
                env=GREP_ASCII),
        Command('pt', ['pt', pat, en]),
        Command('pt (no line numbers)', ['pt', '-N', pat, en]),
        Command('sift', ['sift', '-n', pat, en]),
        Command('sift (no line numbers)', ['sift', pat, en]),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_subtitles_ru_literal(suite_dir):
    '''
    Time a search for a non-ASCII literal in the Russian subtitles.
    '''
    require(suite_dir, 'subtitles-ru')
    pat = 'Шерлок Холмс'  # Sherlock Holmes
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)

    commands = [
        Command('rg', ['rg', '-n', pat, corpus]),
        Command('rg (no line numbers)', ['rg', pat, corpus]),
        Command('ag', ['ag', '-s', pat, corpus]),
        Command('ucg', ['ucg', '--nosmart-case', pat, corpus]),
        Command('grep', ['grep', '-an', pat, corpus], env=GREP_ASCII),
        Command('grep (no line numbers)', ['grep', '-a', pat, corpus],
                env=GREP_ASCII),
        Command('pt', ['pt', pat, corpus]),
        Command('pt (no line numbers)', ['pt', '-N', pat, corpus]),
        Command('sift', ['sift', '-n', pat, corpus]),
        Command('sift (no line numbers)', ['sift', pat, corpus]),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_subtitles_ru_literal_casei(suite_dir):
    '''
    Time a case insensitive search for a non-ASCII literal.
    '''
    require(suite_dir, 'subtitles-ru')
    pat = 'Шерлок Холмс'  # Sherlock Holmes
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)

    commands = [
        Command('rg', ['rg', '-n', '-i', pat, corpus]),
        Command('ag (not Unicode)', ['ag', '-i', pat, corpus]),
        Command('ucg (not Unicode)', ['ucg', '-i', pat, corpus]),
        Command('grep', ['grep', '-ani', pat, corpus], env=GREP_UNICODE),
        Command('grep (not Unicode)', ['grep', '-E', '-ani', pat, corpus],
                env=GREP_ASCII),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_subtitles_ru_literal_word(suite_dir):
    '''
    Benchmark the speed of finding a literal inside word boundaries.

    :param str suite_dir:
        The directory containing the corpora.
    :returns: The benchmark over the Russian subtitle corpus.
    :rtype: Benchmark
    :raises MissingDependencies: If the Russian subtitles are not available.
    '''
    require(suite_dir, 'subtitles-ru')
    ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    pat = 'Шерлок Холмс'  # Sherlock Holmes

    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', '-nw', pat, ru]),
        # Same search, but with the word boundary assertions spelled out
        # explicitly with Unicode mode disabled.
        Command('rg (not Unicode)', [
            'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru,
        ]),
        Command('ag (not Unicode)', ['ag', '-sw', pat, ru]),
        # N.B. ucg is given -w like every other tool here (and like in the
        # linux_word benchmark); without it, it isn't doing a word search.
        Command('ucg (not Unicode)', [
            'ucg', '--nosmart-case', '-w', pat, ru,
        ]),
        Command('grep (not Unicode)', [
            'grep', '-anw', pat, ru,
        ], env=GREP_ASCII),
        Command('grep', ['grep', '-anw', pat, ru], env=GREP_UNICODE),
    ])
|
|
|
|
|
|
def bench_subtitles_ru_alternate(suite_dir):
    '''
    Time a search for an alternation of non-ASCII literals.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    names = (
        'Шерлок Холмс',  # Sherlock Holmes
        'Джон Уотсон',  # John Watson
        'Ирен Адлер',  # Irene Adler
        'инспектор Лестрейд',  # Inspector Lestrade
        'профессор Мориарти',  # Professor Moriarty
    )
    pat = '|'.join(names)

    commands = [
        Command('rg', ['rg', '-n', pat, corpus]),
        Command('rg (no line numbers)', ['rg', pat, corpus]),
        Command('ucg', ['ucg', '--nosmart-case', pat, corpus]),
        Command('grep', ['grep', '-E', '-an', pat, corpus], env=GREP_ASCII),
        Command('grep (no line numbers)', ['grep', '-E', '-a', pat, corpus],
                env=GREP_ASCII),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_subtitles_ru_alternate_casei(suite_dir):
    '''
    Time a case insensitive search for an alternation of non-ASCII
    literals.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    names = (
        'Шерлок Холмс',  # Sherlock Holmes
        'Джон Уотсон',  # John Watson
        'Ирен Адлер',  # Irene Adler
        'инспектор Лестрейд',  # Inspector Lestrade
        'профессор Мориарти',  # Professor Moriarty
    )
    pat = '|'.join(names)

    commands = [
        Command('rg', ['rg', '-n', '-i', pat, corpus]),
        Command('ucg (not Unicode)', ['ucg', '-i', pat, corpus]),
        Command('grep', ['grep', '-E', '-ani', pat, corpus],
                env=GREP_UNICODE),
        Command('grep (not Unicode)', ['grep', '-E', '-ani', pat, corpus],
                env=GREP_ASCII),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
def bench_subtitles_ru_no_literal(suite_dir):
    '''
    Time a regex that contains no literal at all.

    grep with Unicode support is deliberately not run here. It should
    eventually produce the right answer, but it was killed after two
    minutes of running with no sign of finishing soon.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'

    commands = [
        Command('rg', ['rg', '-n', pat, corpus]),
        Command('rg (no line numbers)', ['rg', pat, corpus]),
        Command('ucg (no Unicode)', ['ucg', '--nosmart-case', pat, corpus]),
        Command('grep (no Unicode)', ['grep', '-E', '-an', pat, corpus],
                env=GREP_ASCII),
    ]
    return Benchmark(pattern=pat, commands=commands)
|
|
|
|
|
|
class MissingDependencies(Exception):
    '''
    A missing dependency exception.

    This exception occurs when running a benchmark that requires a
    particular corpus that isn't available.

    :ivar list(str) missing_names:
        A list of missing dependency names. These names correspond to
        names that can be used with the --download flag.
    '''
    def __init__(self, missing_names):
        '''
        Create a new exception for the given missing dependencies.

        :param list(str) missing_names:
            The names of the corpora that could not be found.
        '''
        # Initialize the base class too, so that ``args`` is populated
        # like it is for any other exception.
        super().__init__(missing_names)
        self.missing_names = missing_names

    def __str__(self):
        return 'MissingDependency(%s)' % repr(self.missing_names)
|
|
|
|
|
|
class Benchmark(object):
    '''
    A single benchmark corresponding to a grouping of commands.

    The main purpose of a benchmark is to compare the performance
    characteristics of a group of commands.
    '''

    def __init__(self, name=None, pattern=None, commands=None,
                 warmup_count=1, count=3, line_count=True):
        '''
        Create a single benchmark.

        A single benchmark is composed of a set of commands that are
        benchmarked and compared against one another. A benchmark may
        have multiple commands that use the same search tool (but
        probably should have something differentiating them).

        The grouping of commands is a purely human driven process.

        The stderr of every command is discarded. Its stdout is either
        captured (when counting lines) or discarded as well.

        :param str name:
            A human readable string denoting the name of this
            benchmark.
        :param str pattern:
            The pattern that is used in search.
        :param list(Command) commands:
            A list of commands to initialize this benchmark with. More
            commands may be added before running the benchmark.
        :param int warmup_count:
            The number of times to run each command before recording
            samples.
        :param int count:
            The number of samples to collect from each command.
        :param bool line_count:
            When set, the lines of each search are counted and included
            in the samples produced.
        '''
        self.name = name
        self.pattern = pattern
        self.commands = commands or []
        self.warmup_count = warmup_count
        self.count = count
        self.line_count = line_count

    def run(self):
        '''
        Runs this benchmark and returns the results.

        Each command is first run ``warmup_count`` times without
        recording anything, and then ``count`` times with each run
        recorded as a sample.

        :rtype: Result
        '''
        result = Result(self)
        for cmd in self.commands:
            # Do a warmup first.
            for _ in range(self.warmup_count):
                self.run_one(cmd)
            for _ in range(self.count):
                result.add(cmd, **self.run_one(cmd))
        return result

    def run_one(self, cmd):
        '''
        Runs the given command exactly once.

        Returns an object that includes the time taken by the command.
        If this benchmark was configured to count the number of lines
        returned, then the line count is also returned.

        Note that this sets the ``stdout`` and ``stderr`` entries of
        ``cmd.kwargs``.

        :param Command cmd: The command to run.
        :returns:
            A dict with two fields, duration and line_count.
            The duration is in (fractional) seconds and is guaranteed
            to be available. The line_count is set to None unless line
            counting is enabled, in which case, it is the number of
            lines in the search output.
        :rtype: dict
        '''
        cmd.kwargs['stderr'] = subprocess.DEVNULL
        if self.line_count:
            # The output must be captured in order to count its lines.
            cmd.kwargs['stdout'] = subprocess.PIPE
        else:
            cmd.kwargs['stdout'] = subprocess.DEVNULL

        # perf_counter is monotonic and has the highest available
        # resolution, so a sample can't be skewed (or go negative) because
        # the system clock was adjusted while the command was running.
        start = time.perf_counter()
        completed = cmd.run()
        end = time.perf_counter()

        line_count = None
        if self.line_count:
            line_count = completed.stdout.count(b'\n')
        return {
            'duration': end - start,
            'line_count': line_count,
        }
|
|
|
|
|
|
class Result(object):
    '''
    The result of running a benchmark.

    Benchmark results consist of a set of samples, where each sample
    corresponds to a single run of a single command in the benchmark.
    Various statistics can be computed from these samples such as mean
    and standard deviation.
    '''
    def __init__(self, benchmark):
        '''
        Create a new set of results, initially empty.

        :param Benchmark benchmark:
            The benchmark that produced these results.
        '''
        self.benchmark = benchmark
        self.samples = []

    def add(self, cmd, duration, line_count=None):
        '''
        Add a new sample to this result set.

        :param Command cmd:
            The command that produced this sample.
        :param float duration:
            The duration that the command took to run, in whatever unit
            the caller measures it (every sample must use the same one).
        :param int line_count:
            The number of lines in the search output. This is optional.
        '''
        self.samples.append({
            'cmd': cmd,
            'duration': duration,
            'line_count': line_count,
        })

    def fastest_sample(self):
        '''
        Returns the sample with the smallest recorded duration.

        :raises ValueError: If no samples have been added.
        '''
        return min(self.samples, key=lambda s: s['duration'])

    def fastest_cmd(self):
        '''
        Returns the command of the benchmark with the smallest mean
        duration.
        '''
        means = []
        for cmd in self.benchmark.commands:
            mean, _ = self.distribution_for(cmd)
            means.append((cmd, mean))
        return min(means, key=lambda tup: tup[1])[0]

    def samples_for(self, cmd):
        'Returns an iterable of samples for cmd (matched by command name)'
        yield from (s for s in self.samples if s['cmd'].name == cmd.name)

    def line_counts_for(self, cmd):
        '''
        Returns the line counts recorded for the given command.

        :returns:
            The set of distinct line counts recorded for cmd. Samples
            without a line count are ignored.
        :rtype: set(int)
        '''
        return {s['line_count'] for s in self.samples_for(cmd)
                if s['line_count'] is not None}

    def distribution_for(self, cmd):
        '''
        Returns the distribution (mean +/- std) of the given command.

        :rtype: (float, float)
        :returns:
            A tuple containing the mean and standard deviation, in that
            order. When only a single sample is available, the standard
            deviation is reported as 0.0.
        :raises statistics.StatisticsError:
            If there are no samples at all for cmd.
        '''
        durations = [s['duration'] for s in self.samples_for(cmd)]
        mean = statistics.mean(durations)
        # The sample standard deviation is undefined for fewer than two
        # data points (statistics.stdev raises), which happens whenever a
        # benchmark is configured to collect a single sample.
        stdev = statistics.stdev(durations) if len(durations) > 1 else 0.0
        return mean, stdev
|
|
|
|
|
|
class Command(object):
    def __init__(self, name, cmd, *args, **kwargs):
        '''
        Describe one command to be executed by a benchmark.

        Any extra positional and keyword arguments are stored and later
        handed to ``subprocess.run`` unchanged. The only exception is
        stdin/stdout/stderr: redirection belongs to the benchmark
        harness, so passing any of them here trips an assertion.

        :param str name:
            A human readable label for this command. It tells apart
            several invocations of the same search tool with different
            arguments inside one benchmark.
        :param list(str) cmd:
            The argument vector to execute, program name included.
        '''
        for stream in ('stdin', 'stdout', 'stderr'):
            assert stream not in kwargs
        self.name, self.cmd = name, cmd
        self.args, self.kwargs = args, kwargs

    def run(self):
        '''
        Execute the command and hand back what ``subprocess.run`` returns.

        :rtype: subprocess.CompletedProcess
        '''
        completed = subprocess.run(self.cmd, *self.args, **self.kwargs)
        return completed
|
|
|
|
|
|
def eprint(*args, **kwargs):
    'Same calling convention as print, but always writes to stderr.'
    # A `file` supplied by the caller is deliberately overridden.
    print(*args, **dict(kwargs, file=sys.stderr))
|
|
|
|
|
|
def run_cmd(cmd, *args, **kwargs):
    '''
    Echo a command to stderr, then execute it.

    The command always runs with ``check=True``, so a non-zero exit
    status surfaces as an exception (and thus a traceback).
    '''
    command_line = ' '.join(cmd)
    eprint('# %s' % command_line)
    # A caller-supplied `check` is deliberately overridden.
    run_kwargs = dict(kwargs, check=True)
    return subprocess.run(cmd, *args, **run_kwargs)
|
|
|
|
|
|
def require(suite_dir, *names):
    '''
    Assert that the named corpora are available for a benchmark.

    Each name is checked with the module level ``has_<name>`` function
    (dashes in the name become underscores). All names that fail the
    check are reported together in a MissingDependencies exception.
    '''
    missing = [
        name for name in names
        if not globals()['has_%s' % name.replace('-', '_')](suite_dir)
    ]
    if missing:
        raise MissingDependencies(missing)
|
|
|
|
|
|
def download_linux(suite_dir):
    'Fetch the Linux kernel sources and build them.'
    checkout_dir = path.join(suite_dir, LINUX_DIR)
    have_checkout = os.path.isdir(checkout_dir)
    if not have_checkout:
        # A fork is cloned so that the corpus is always the same while
        # still permitting a shallow clone, which is much cheaper than a
        # full one.
        clone_argv = ['git', 'clone', '--depth', '1']
        run_cmd(clone_argv + [LINUX_CLONE, checkout_dir])

    # The kernel is built because building litters the repository with a
    # lot of junk that a search tool probably shouldn't touch.
    if os.path.exists(path.join(checkout_dir, 'vmlinux')):
        return
    eprint('# Building Linux kernel...')
    run_cmd(['make', 'defconfig'], cwd=checkout_dir)
    run_cmd(['make', '-j', str(cpu_count())], cwd=checkout_dir)
|
|
|
|
|
|
def has_linux(suite_dir):
    'Reports whether the kernel image exists, i.e. Linux looks built.'
    vmlinux = path.join(suite_dir, LINUX_DIR, 'vmlinux')
    return path.exists(vmlinux)
|
|
|
|
|
|
def download_subtitles_en(suite_dir):
    '''
    Download and decompress English subtitles, then create the sample.

    Steps that have already been completed (as judged by the files that
    exist) are skipped.

    :param str suite_dir:
        The directory in which corpora are stored. It may be relative.
    '''
    # The commands below run with cwd=subtitle_dir while also being given
    # paths inside of it, so the directory must be absolute. Otherwise a
    # relative suite_dir would be resolved twice.
    subtitle_dir = path.abspath(path.join(suite_dir, SUBTITLES_DIR))
    en_path_gz = path.join(subtitle_dir, SUBTITLES_EN_NAME_GZ)
    en_path = path.join(subtitle_dir, SUBTITLES_EN_NAME)
    en_path_sample = path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE)

    os.makedirs(subtitle_dir, exist_ok=True)
    if not os.path.exists(en_path):
        if not os.path.exists(en_path_gz):
            run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir)
        run_cmd(['gunzip', en_path_gz], cwd=subtitle_dir)
    if not os.path.exists(en_path_sample):
        # Get a sample roughly the same size as the Russian corpus so that
        # benchmarks finish in a reasonable time.
        try:
            with open(en_path_sample, 'wb') as f:
                run_cmd(
                    ['head', '-n', '32722372', en_path],
                    cwd=subtitle_dir, stdout=f)
        except BaseException:
            # Don't leave a truncated sample behind: its mere existence is
            # what marks the corpus as downloaded.
            if os.path.exists(en_path_sample):
                os.remove(en_path_sample)
            raise
|
|
|
|
|
|
def has_subtitles_en(suite_dir):
    'Reports whether the English subtitle sample file exists.'
    sample = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
    return path.exists(sample)
|
|
|
|
|
|
def download_subtitles_ru(suite_dir):
    '''
    Download and decompress Russian subtitles.

    Steps that have already been completed (as judged by the files that
    exist) are skipped.

    :param str suite_dir:
        The directory in which corpora are stored. It may be relative.
    '''
    # The commands below run with cwd=subtitle_dir while also being given
    # paths inside of it, so the directory must be absolute. Otherwise a
    # relative suite_dir would be resolved twice.
    subtitle_dir = path.abspath(path.join(suite_dir, SUBTITLES_DIR))
    ru_path_gz = path.join(subtitle_dir, SUBTITLES_RU_NAME_GZ)
    ru_path = path.join(subtitle_dir, SUBTITLES_RU_NAME)

    os.makedirs(subtitle_dir, exist_ok=True)
    if not os.path.exists(ru_path):
        if not os.path.exists(ru_path_gz):
            run_cmd(['curl', '-LO', SUBTITLES_RU_URL], cwd=subtitle_dir)
        run_cmd(['gunzip', ru_path_gz], cwd=subtitle_dir)
|
|
|
|
|
|
def has_subtitles_ru(suite_dir):
    'Reports whether the decompressed Russian subtitles exist.'
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    return path.exists(corpus)
|
|
|
|
|
|
def download(suite_dir, choices):
    '''
    Fetch the requested corpora into suite_dir.

    Choices are processed in the order given. An unrecognized choice
    prints a message to stderr and exits the process with status 1.

    :param str suite_dir:
        The directory in which to download corpora.
    :param list(str) choices:
        The corpora to download. Each one must be one of:
        all, linux, subtitles-en, subtitles-ru.
    '''
    known = ('all', 'linux', 'subtitles-en', 'subtitles-ru')
    for choice in choices:
        if choice not in known:
            eprint('Unrecognized download choice: %s' % choice)
            sys.exit(1)
        # 'all' selects every corpus, in this order.
        if choice in ('linux', 'all'):
            download_linux(suite_dir)
        if choice in ('subtitles-en', 'all'):
            download_subtitles_en(suite_dir)
        if choice in ('subtitles-ru', 'all'):
            download_subtitles_ru(suite_dir)
|
|
|
|
|
|
def collect_benchmarks(suite_dir, filter_pat=None):
    '''
    Lazily produce every benchmark that can actually be run.

    Benchmarks are the module level functions whose name starts with
    ``bench_``; they are visited in sorted name order and the prefix is
    dropped to form the benchmark name.

    :param str suite_dir:
        The directory containing corpora.
    :param str filter_pat:
        A regular expression searched for in each benchmark name. Only
        matching benchmarks are produced. When omitted, nothing is
        filtered out.
    :returns:
        An iterable over all runnable benchmarks. A benchmark whose
        corpora are missing is reported on stderr and skipped.
    '''
    prefix = 'bench_'
    for fun_name in sorted(globals()):
        if not fun_name.startswith(prefix):
            continue
        bench_name = fun_name[len(prefix):]
        if filter_pat is not None and not re.search(filter_pat, bench_name):
            continue
        try:
            benchmark = globals()[fun_name](suite_dir)
        except MissingDependencies as e:
            missing = ', '.join(e.missing_names)
            hint = ' '.join('--download %s' % n for n in e.missing_names)
            eprint(
                'missing: %s, skipping benchmark %s (try running with: %s)'
                % (missing, bench_name, hint))
            continue
        benchmark.name = bench_name
        yield benchmark
|
|
|
|
|
|
def _print_result(benchmark, result):
    'Print a human readable summary of one benchmark run to stdout.'
    fastest_cmd = result.fastest_cmd()
    fastest_sample = result.fastest_sample()
    max_name_len = max(len(cmd.name) for cmd in benchmark.commands)

    header = '%s (pattern: %s)' % (benchmark.name, benchmark.pattern)
    print('%s\n%s' % (header, '-' * len(header)))
    fmt = '{name:{pad}} {mean:0.3f} +/- {stdev:0.3f}{lines}{fast_cmd}'
    for cmd in benchmark.commands:
        name = cmd.name
        mean, stdev = result.distribution_for(cmd)
        line_counts = result.line_counts_for(cmd)
        show_fast_cmd, show_line_counts = '', ''
        # A trailing '*' marks the command with the best mean, while a '*'
        # on the name marks the command with the single fastest sample.
        if fastest_cmd.name == cmd.name:
            show_fast_cmd = '*'
        if fastest_sample['cmd'].name == cmd.name:
            name += '*'
        if len(line_counts) > 0:
            # Sorted so that the report is the same from run to run.
            counts = map(str, sorted(line_counts))
            show_line_counts = ' (lines: %s)' % ', '.join(counts)
        print(fmt.format(
            name=name, pad=max_name_len + 2, fast_cmd=show_fast_cmd,
            mean=mean, stdev=stdev, lines=show_line_counts))
    sys.stdout.flush()


def _write_raw_samples(csv_wtr, benchmark, result):
    'Write every sample collected for one benchmark as a CSV row.'
    for sample in result.samples:
        cmd, line_count = sample['cmd'], sample['line_count']
        env = ' '.join(['%s=%s' % (k, v)
                        for k, v in cmd.kwargs.get('env', {}).items()])
        csv_wtr.writerow({
            'benchmark': benchmark.name,
            'warmup_iter': benchmark.warmup_count,
            'iter': benchmark.count,
            'name': cmd.name,
            'command': ' '.join(cmd.cmd),
            'duration': sample['duration'],
            # Only a missing count is left blank; zero lines is real data.
            'lines': '' if line_count is None else line_count,
            'env': env,
        })


def main():
    '''
    Entry point: parse the command line, then either download corpora or
    run the selected benchmarks and report their results.

    Results are printed to stdout. With --raw, every sample is also
    written to a CSV file. The process exits early after a download, and
    with status 1 if the --raw file exists and --force wasn't given.
    '''
    # N.B. The first positional parameter of ArgumentParser is the program
    # name, so the description has to be passed by keyword.
    p = argparse.ArgumentParser(
        description='Command line search tool benchmark suite.')
    p.add_argument(
        '--dir', metavar='PATH', default=os.getcwd(),
        help='The directory in which to download data and perform searches.')
    p.add_argument(
        '--download', metavar='CORPUS', action='append',
        choices=['all', 'linux', 'subtitles-en', 'subtitles-ru'],
        help='Download and prepare corpus data, then exit without running '
             'any benchmarks. Note that this command is intended to be '
             'idempotent. WARNING: This downloads over a gigabyte of data, '
             'and also includes building the Linux kernel. If "all" is used '
             'then the total uncompressed size is around 13 GB.')
    p.add_argument(
        '-f', '--force', action='store_true',
        help='Overwrite existing files if there is a conflict.')
    p.add_argument(
        '--list', action='store_true',
        help='List available benchmarks by name.')
    p.add_argument(
        '--raw', metavar='PATH',
        help='Dump raw data (all samples collected) in CSV format to the '
             'file path provided.')
    p.add_argument(
        'bench', metavar='PAT', nargs='?',
        help='A regex pattern that will only run benchmarks that match.')
    args = p.parse_args()

    if args.download:
        download(args.dir, args.download)
        sys.exit(0)

    if not path.isdir(args.dir):
        os.makedirs(args.dir)
    if args.raw is not None and path.exists(args.raw) and not args.force:
        eprint('File %s already exists (delete it or use --force)' % args.raw)
        sys.exit(1)
    raw_handle, raw_csv_wtr = None, None
    if args.raw is not None:
        fields = [
            'benchmark', 'warmup_iter', 'iter',
            'name', 'command', 'duration', 'lines', 'env',
        ]
        # The csv module wants newline='' so that it controls line endings
        # itself. The encoding is fixed because some patterns aren't ASCII
        # and must be writable regardless of the current locale.
        raw_handle = open(args.raw, 'w', newline='', encoding='utf-8')
        raw_csv_wtr = csv.DictWriter(raw_handle, fields)
        raw_csv_wtr.writeheader()

    try:
        benchmarks = collect_benchmarks(args.dir, filter_pat=args.bench)
        for i, b in enumerate(benchmarks):
            result = b.run()
            if i > 0:
                # Separate consecutive reports with a blank line.
                print()
            _print_result(b, result)
            if raw_csv_wtr is not None:
                _write_raw_samples(raw_csv_wtr, b, result)
                # Flush per benchmark so that an interrupted run still
                # leaves the samples collected so far on disk.
                raw_handle.flush()
    finally:
        if raw_handle is not None:
            raw_handle.close()
|
|
|
|
|
|
# Only run the suite when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|