#!/usr/bin/env python3
'''
benchsuite is a benchmark runner for comparing command line search tools.
'''
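# A quick usage sketch; the directory, output file and pattern below are
# illustrative only:
#
#   ./benchsuite --dir /tmp/benchsuite --download subtitles-ru
#   ./benchsuite --dir /tmp/benchsuite --raw results.csv 'subtitles_ru'
#
# See main() at the bottom of this file for the full set of flags.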
import argparse
import csv
import os
import os.path as path
from multiprocessing import cpu_count
import re
import shutil
import statistics
import subprocess
import sys
import time
# Some constants for identifying the corpora we use to run tests.
# We establish two very different kinds of corpora: a small number of large
# files and a large number of small files. These are vastly different use cases
# not only because of their performance characteristics, but also the
# strategies used to increase the relevance of results returned.
SUBTITLES_DIR = 'subtitles'
SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en'
SUBTITLES_EN_NAME_SAMPLE = 'OpenSubtitles2016.raw.sample.en'
SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME
SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz' # noqa
SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru'
SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME
SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz' # noqa
LINUX_DIR = 'linux'
LINUX_CLONE = 'git://github.com/BurntSushi/linux'
# Grep takes locale settings from the environment. There is a *substantial*
# performance impact for enabling Unicode, so we need to handle this explicitly
# in our benchmarks.
GREP_ASCII = {'LC_ALL': 'C'}
GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'}
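# Note that these dicts are passed verbatim as the `env` keyword argument to
# subprocess.run (via Command below), so they become the child process's
# *entire* environment rather than being merged into os.environ.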
# Sift tries really hard to search everything by default. In our code search
# benchmarks, we don't want that.
SIFT = [
'sift',
'--binary-skip',
'--exclude-files', '.*',
'--exclude-files', '*.pdf',
]
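# These flags are prepended to sift invocations in the Linux benchmarks below,
# e.g., SIFT + ['-n', '--git', pat].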
def bench_linux_literal_default(suite_dir):
'''
Benchmark the speed of a literal using *default* settings.
    This is a purposefully unfair comparison for performance analysis,
    but it is pedagogically useful for demonstrating how the tools'
    default behaviors differ.
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = 'PM_RESUME'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', pat]),
mkcmd('ag', ['ag', pat]),
# ucg reports the exact same matches as ag and rg even though it
# doesn't read gitignore files. Instead, it has a file whitelist
# that happens to match up exactly with the gitignores for this search.
mkcmd('ucg', ['ucg', pat]),
        # Setting LC_ALL=en_US.UTF-8 (as we do for git grep below) isn't
        # necessarily the default, but probably is on most desktop systems.
mkcmd('pt', ['pt', pat]),
# sift reports an extra line here for a binary file matched.
mkcmd('sift', ['sift', pat]),
mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}),
])
def bench_linux_literal(suite_dir):
'''
Benchmark the speed of a literal, attempting to be fair.
This tries to use the minimum set of options available in all tools
to test how fast they are. For example, it makes sure there is
no case insensitive matching and that line numbers are computed
(because some tools don't permit disabling line numbers).
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = 'PM_RESUME'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('rg (ignore) (mmap)', ['rg', '-n', '--mmap', pat]),
mkcmd('ag (ignore) (mmap)', ['ag', '-s', pat]),
mkcmd('pt (ignore)', ['pt', pat]),
mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
mkcmd('git grep (ignore)', [
'git', 'grep', '-I', '-n', pat,
], env={'LC_ALL': 'C'}),
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
])
def bench_linux_literal_casei(suite_dir):
'''
Benchmark the speed of a case insensitive literal search.
This is like the linux_literal benchmark, except we ask the
search tools to do case insensitive search.
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = 'PM_RESUME'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
mkcmd('rg (ignore) (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
mkcmd('ag (ignore) (mmap)', ['ag', '-i', pat]),
mkcmd('pt (ignore)', ['pt', '-i', pat]),
mkcmd('sift (ignore)', SIFT + ['-n', '-i', '--git', pat]),
# It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
# since that is certainly what ripgrep is doing, but this is for an
# ASCII literal, so we should give `git grep` all the opportunity to
# do its best.
mkcmd('git grep (ignore)', [
'git', 'grep', '-I', '-n', '-i', pat,
], env={'LC_ALL': 'C'}),
mkcmd('rg (whitelist)', [
'rg', '-n', '-i', '--no-ignore', '-tall', pat,
]),
mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
])
def bench_linux_re_literal_suffix(suite_dir):
'''
Benchmark the speed of a literal inside a regex.
This, for example, inhibits a prefix byte optimization used
inside of Go's regex engine (relevant for sift and pt).
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = '[A-Z]+_RESUME'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('ag (ignore)', ['ag', '-s', pat]),
mkcmd('pt (ignore)', ['pt', '-e', pat]),
mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
mkcmd(
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'},
),
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
])
def bench_linux_word(suite_dir):
    r'''
Benchmark use of the -w ("match word") flag in each tool.
sift has a lot of trouble with this because it forces it into Go's
regex engine by surrounding the pattern with \b assertions.
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = 'PM_RESUME'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', '-w', pat]),
mkcmd('ag (ignore)', ['ag', '-s', '-w', pat]),
mkcmd('pt (ignore)', ['pt', '-w', pat]),
mkcmd('sift (ignore)', SIFT + ['-n', '-w', '--git', pat]),
mkcmd(
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', '-w', pat],
env={'LC_ALL': 'C'},
),
mkcmd('rg (whitelist)', [
'rg', '-n', '-w', '--no-ignore', '-tall', pat,
]),
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', '-w', pat]),
])
def bench_linux_unicode_greek(suite_dir):
'''
Benchmark matching of a Unicode category.
    Only three tools (ripgrep, sift and pt) support this.
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = r'\p{Greek}'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]),
mkcmd('pt', ['pt', '-e', pat]),
mkcmd('sift', SIFT + ['-n', '--git', pat]),
])
def bench_linux_unicode_greek_casei(suite_dir):
'''
Benchmark matching of a Unicode category, case insensitively.
Only ripgrep gets this right (and it's still fast).
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = r'\p{Greek}'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-i', pat]),
mkcmd('pt', ['pt', '-i', '-e', pat]),
mkcmd('sift', SIFT + ['-n', '-i', '--git', pat]),
])
def bench_linux_unicode_word(suite_dir):
    r'''
Benchmark Unicode aware \w character class.
Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get
this right. Everything else uses the standard ASCII interpretation
of \w.
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = r'\wAh'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]),
mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]),
mkcmd(
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'en_US.UTF-8'},
),
mkcmd(
'git grep (ignore) (ASCII)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'},
),
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
mkcmd('rg (whitelist) (ASCII)', [
'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
]),
mkcmd('ucg (ASCII)', ['ucg', '--nosmart-case', pat]),
])
def bench_linux_no_literal(suite_dir):
'''
Benchmark a regex that defeats all literal optimizations.
Most search patterns have some kind of literal in them, which
typically permits searches to take some shortcuts. Therefore, the
applicability of this benchmark is somewhat suspicious, but the
suite wouldn't feel complete without it.
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]),
mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]),
mkcmd(
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'en_US.UTF-8'},
),
mkcmd(
'git grep (ignore) (ASCII)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'},
),
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
mkcmd('rg (whitelist) (ASCII)', [
'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
]),
mkcmd('ucg (whitelist) (ASCII)', ['ucg', '--nosmart-case', pat]),
])
def bench_linux_alternates(suite_dir):
'''
Benchmark a small alternation of literals.
sift doesn't make the cut. It's more than 10x slower than the next
fastest result. The slowdown is likely because the Go regexp engine
doesn't do any literal optimizations for this case (there is no
common leading byte).
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('ag (ignore)', ['ag', '-s', pat]),
mkcmd(
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'},
),
mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', pat]),
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
])
def bench_linux_alternates_casei(suite_dir):
'Benchmark a small alternation of literals case insensitively.'
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
mkcmd('ag (ignore)', ['ag', '-i', pat]),
mkcmd(
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', '-i', pat],
env={'LC_ALL': 'C'},
),
mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', '-i', pat]),
mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
])
def bench_subtitles_en_literal(suite_dir):
'''
Benchmark the speed of an ASCII string literal.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', pat, en]),
Command('rg (no mmap)', ['rg', '--no-mmap', pat, en]),
Command('pt', ['pt', '-N', pat, en]),
Command('sift', ['sift', pat, en]),
Command('grep', ['grep', '-a', pat, en], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', pat, en]),
Command('ag (lines)', ['ag', '-s', pat, en]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
Command('pt (lines)', ['pt', pat, en]),
Command('sift (lines)', ['sift', '-n', pat, en]),
Command('grep (lines)', ['grep', '-an', pat, en], env=GREP_ASCII),
])
def bench_subtitles_en_literal_casei(suite_dir):
'''
    Benchmark the speed of an ASCII string literal, case insensitively.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-i', pat, en]),
Command('grep', ['grep', '-ai', pat, en], env=GREP_UNICODE),
Command('grep (ASCII)', [
'grep', '-E', '-ai', pat, en,
], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', '-i', pat, en]),
Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]),
Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, en]),
])
def bench_subtitles_en_literal_word(suite_dir):
'''
Benchmark the speed of finding a literal inside word boundaries.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[
Command('rg (ASCII)', [
'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en,
]),
Command('ag (ASCII)', ['ag', '-sw', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (ASCII)', [
'grep', '-anw', pat, en,
], env=GREP_ASCII),
Command('rg', ['rg', '-nw', pat, en]),
Command('grep', ['grep', '-anw', pat, en], env=GREP_UNICODE),
])
def bench_subtitles_en_alternate(suite_dir):
'''
Benchmark the speed of a set of alternate literals.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = '|'.join([
'Sherlock Holmes',
'John Watson',
'Irene Adler',
'Inspector Lestrade',
'Professor Moriarty',
])
return Benchmark(pattern=pat, commands=[
Command('rg (lines)', ['rg', '-n', pat, en]),
Command('ag (lines)', ['ag', '-s', pat, en]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (lines)', [
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII),
Command('rg', ['rg', pat, en]),
Command('grep', [
'grep', '-E', '-a', pat, en,
], env=GREP_ASCII),
])
def bench_subtitles_en_alternate_casei(suite_dir):
'''
    Benchmark the speed of a set of alternate literals, case insensitively.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = '|'.join([
'Sherlock Holmes',
'John Watson',
'Irene Adler',
'Inspector Lestrade',
'Professor Moriarty',
])
return Benchmark(pattern=pat, commands=[
Command('ag (ASCII)', ['ag', '-s', '-i', pat, en]),
Command('ucg (ASCII)', ['ucg', '-i', pat, en]),
Command('grep (ASCII)', [
'grep', '-E', '-ani', pat, en,
], env=GREP_ASCII),
Command('rg', ['rg', '-n', '-i', pat, en]),
Command('grep', ['grep', '-E', '-ani', pat, en], env=GREP_UNICODE),
])
def bench_subtitles_en_surrounding_words(suite_dir):
'''
Benchmark a more complex regex with an inner literal.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = r'\w+\s+Holmes\s+\w+'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, en]),
Command('grep', ['grep', '-E', '-an', pat, en], env=GREP_UNICODE),
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
Command('ag (ASCII)', ['ag', '-s', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII),
])
def bench_subtitles_en_no_literal(suite_dir):
'''
Benchmark the speed of a regex with no literals.
Note that we don't even try to run grep with Unicode support
on this one. While it should eventually get the right answer,
I killed it after it had already been running for two minutes
and showed no signs of finishing soon.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, en]),
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
Command('ag (ASCII)', ['ag', '-s', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII),
])
def bench_subtitles_ru_literal(suite_dir):
'''
Benchmark the speed of a Unicode-y string literal.
'''
require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', pat, ru]),
Command('rg (no mmap)', ['rg', '--no-mmap', pat, ru]),
Command('pt', ['pt', '-N', pat, ru]),
Command('sift', ['sift', pat, ru]),
Command('grep', ['grep', '-a', pat, ru], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', pat, ru]),
Command('ag (lines)', ['ag', '-s', pat, ru]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
Command('pt (lines)', ['pt', pat, ru]),
Command('sift (lines)', ['sift', '-n', pat, ru]),
Command('grep (lines)', ['grep', '-an', pat, ru], env=GREP_ASCII),
])
def bench_subtitles_ru_literal_casei(suite_dir):
'''
Benchmark the speed of a Unicode-y string case insensitively.
'''
require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-i', pat, ru]),
Command('grep', ['grep', '-ai', pat, ru], env=GREP_UNICODE),
Command('grep (ASCII)', [
'grep', '-E', '-ai', pat, ru,
], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', '-i', pat, ru]),
Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]),
Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, ru]),
])
def bench_subtitles_ru_literal_word(suite_dir):
'''
Benchmark the speed of finding a literal inside word boundaries.
'''
require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[
Command('rg (ASCII)', [
'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru,
]),
Command('ag (ASCII)', ['ag', '-sw', pat, ru]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (ASCII)', [
'grep', '-anw', pat, ru,
], env=GREP_ASCII),
Command('rg', ['rg', '-nw', pat, ru]),
Command('grep', ['grep', '-anw', pat, ru], env=GREP_UNICODE),
])
def bench_subtitles_ru_alternate(suite_dir):
'''
Benchmark the speed of a set of alternate literals.
'''
require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = '|'.join([
'Шерлок Холмс', # Sherlock Holmes
'Джон Уотсон', # John Watson
'Ирен Адлер', # Irene Adler
'инспектор Лестрейд', # Inspector Lestrade
'профессор Мориарти', # Professor Moriarty
])
return Benchmark(pattern=pat, commands=[
Command('rg (lines)', ['rg', '-n', pat, ru]),
Command('ag (lines)', ['ag', '-s', pat, ru]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (lines)', [
'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII),
Command('rg', ['rg', pat, ru]),
Command('grep', [
'grep', '-E', '-a', pat, ru,
], env=GREP_ASCII),
])
def bench_subtitles_ru_alternate_casei(suite_dir):
'''
    Benchmark the speed of a set of alternate literals, case insensitively.
'''
require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = '|'.join([
'Шерлок Холмс', # Sherlock Holmes
'Джон Уотсон', # John Watson
'Ирен Адлер', # Irene Adler
'инспектор Лестрейд', # Inspector Lestrade
'профессор Мориарти', # Professor Moriarty
])
return Benchmark(pattern=pat, commands=[
Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]),
Command('ucg (ASCII)', ['ucg', '-i', pat, ru]),
Command('grep (ASCII)', [
'grep', '-E', '-ani', pat, ru,
], env=GREP_ASCII),
Command('rg', ['rg', '-n', '-i', pat, ru]),
Command('grep', ['grep', '-E', '-ani', pat, ru], env=GREP_UNICODE),
])
def bench_subtitles_ru_surrounding_words(suite_dir):
'''
Benchmark a more complex regex with an inner literal.
'''
    require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = r'\w+\s+Холмс\s+\w+'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]),
Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_UNICODE),
Command('ag (ASCII)', ['ag', '-s', pat, ru]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII),
])
def bench_subtitles_ru_no_literal(suite_dir):
'''
Benchmark the speed of a regex with no literals.
Note that we don't even try to run grep with Unicode support
on this one. While it should eventually get the right answer,
I killed it after it had already been running for two minutes
and showed no signs of finishing soon.
'''
require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]),
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]),
Command('ag (ASCII)', ['ag', '-s', pat, ru]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII),
])
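# New benchmarks are discovered automatically: collect_benchmarks() below picks
# up any module-level function whose name starts with 'bench_' and that returns
# a Benchmark. A minimal sketch of what such a function looks like, kept as a
# comment so it is not actually run (the corpus and pattern are hypothetical):
#
#   def bench_linux_example(suite_dir):
#       'A sketch only; not part of the suite.'
#       require(suite_dir, 'linux')
#       cwd = path.join(suite_dir, LINUX_DIR)
#       pat = 'EXAMPLE_PATTERN'
#       return Benchmark(pattern=pat, commands=[
#           Command('rg', ['rg', '-n', pat], cwd=cwd),
#       ])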
class MissingDependencies(Exception):
'''
A missing dependency exception.
This exception occurs when running a benchmark that requires a
particular corpus that isn't available.
:ivar list(str) missing_names:
A list of missing dependency names. These names correspond to
names that can be used with the --download flag.
'''
def __init__(self, missing_names):
self.missing_names = missing_names
def __str__(self):
        return 'MissingDependencies(%s)' % repr(self.missing_names)
class MissingCommands(Exception):
'''
A missing command exception.
This exception occurs when running a command in a benchmark
where the command could not be found on the current system.
:ivar list(str) missing_names:
The names of the command binaries that could not be found.
'''
def __init__(self, missing_names):
self.missing_names = sorted(set(missing_names))
def __str__(self):
return 'MissingCommands(%s)' % repr(self.missing_names)
class Benchmark(object):
'''
A single benchmark corresponding to a grouping of commands.
The main purpose of a benchmark is to compare the performance
characteristics of a group of commands.
'''
def __init__(self, name=None, pattern=None, commands=None,
warmup_count=1, count=3, line_count=True,
allow_missing_commands=False,
disabled_cmds=None):
'''
Create a single benchmark.
A single benchmark is composed of a set of commands that are
benchmarked and compared against one another. A benchmark may
have multiple commands that use the same search tool (but
probably should have something differentiating them).
The grouping of commands is a purely human driven process.
By default, the output of every command is sent to /dev/null.
Other types of behavior are available via the methods defined
on this benchmark.
:param str name:
A human readable string denoting the name of this
benchmark.
:param str pattern:
The pattern that is used in search.
:param list(Command) commands:
A list of commands to initialize this benchmark with. More
commands may be added before running the benchmark.
:param int warmup_count:
The number of times to run each command before recording
samples.
:param int count:
The number of samples to collect from each command.
:param bool line_count:
When set, the lines of each search are counted and included
in the samples produced.
:param bool allow_missing_commands:
When set, if a command is missing, then the benchmark
will simply skip it.
:param list(str) disabled_cmds:
A list of commands to skip.
'''
self.name = name
self.pattern = pattern
self.commands = commands or []
self.warmup_count = warmup_count
self.count = count
self.line_count = line_count
self.allow_missing_commands = allow_missing_commands
self.disabled_cmds = set(disabled_cmds or [])
def raise_if_missing(self):
'''
Raises a MissingCommands exception if applicable.
A MissingCommands exception is raised when the following
criteria are met: 1) allow_missing_commands is False, and 2) at
least one command in this benchmark could not be found on this
system.
'''
missing_commands = []
for c in self.commands:
if c.binary_name in self.disabled_cmds or c.exists():
continue
missing_commands.append(c.binary_name)
if not self.allow_missing_commands and len(missing_commands) > 0:
raise MissingCommands(missing_commands)
def run(self):
'''
Runs this benchmark and returns the results.
:rtype: Result
:raises:
MissingCommands if any command doesn't exist.
(Unless allow_missing_commands is enabled.)
'''
self.raise_if_missing()
result = Result(self)
for cmd in self.commands:
if cmd.binary_name in self.disabled_cmds:
continue
if self.allow_missing_commands and not cmd.exists():
# Skip this command if we're OK with it.
continue
# Do a warmup first.
for _ in range(self.warmup_count):
self.run_one(cmd)
for _ in range(self.count):
result.add(cmd, **self.run_one(cmd))
return result
def run_one(self, cmd):
'''
Runs the given command exactly once.
Returns an object that includes the time taken by the command.
If this benchmark was configured to count the number of lines
returned, then the line count is also returned.
:param Command cmd: The command to run.
:returns:
A dict with two fields, duration and line_count.
The duration is in seconds, with fractional milliseconds,
and is guaranteed to be available. The line_count is set
to None unless line counting is enabled, in which case,
it is the number of lines in the search output.
        :rtype: dict
'''
if not cmd.exists():
raise MissingCommands([cmd.cmd[0]])
cmd.kwargs['stderr'] = subprocess.DEVNULL
if self.line_count:
cmd.kwargs['stdout'] = subprocess.PIPE
else:
cmd.kwargs['stdout'] = subprocess.DEVNULL
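        # Timing is wall-clock time around subprocess.run, so it includes
        # process startup and, when line counting is enabled, the cost of
        # buffering the command's entire stdout in memory.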
start = time.time()
completed = cmd.run()
end = time.time()
line_count = None
if self.line_count:
line_count = completed.stdout.count(b'\n')
return {
'duration': end - start,
'line_count': line_count,
}
class Result(object):
'''
The result of running a benchmark.
Benchmark results consist of a set of samples, where each sample
corresponds to a single run of a single command in the benchmark.
Various statistics can be computed from these samples such as mean
and standard deviation.
'''
def __init__(self, benchmark):
'''
Create a new set of results, initially empty.
        :param Benchmark benchmark:
The benchmark that produced these results.
'''
self.benchmark = benchmark
self.samples = []
def add(self, cmd, duration, line_count=None):
'''
Add a new sample to this result set.
:param Command cmd:
The command that produced this sample.
        :param float duration:
            The duration, in seconds, that the command took to
            run.
:param int line_count:
The number of lines in the search output. This is optional.
'''
self.samples.append({
'cmd': cmd,
'duration': duration,
'line_count': line_count,
})
def fastest_sample(self):
'''
Returns the fastest recorded sample.
'''
return min(self.samples, key=lambda s: s['duration'])
def fastest_cmd(self):
'''
Returns the fastest command according to distribution.
'''
means = []
for cmd in self.benchmark.commands:
mean, _ = self.distribution_for(cmd)
if mean is None:
continue
means.append((cmd, mean))
return min(means, key=lambda tup: tup[1])[0]
def samples_for(self, cmd):
'Returns an iterable of samples for cmd'
yield from (s for s in self.samples if s['cmd'].name == cmd.name)
def line_counts_for(self, cmd):
'''
        Returns the line counts recorded for the given command.
        :returns:
            A set of the distinct line counts recorded for cmd.
'''
return {s['line_count']
for s in self.samples_for(cmd)
if s['line_count'] is not None}
def distribution_for(self, cmd):
'''
Returns the distribution (mean +/- std) of the given command.
If there are no samples for this command (i.e., it was skipped),
then return ``(None, None)``.
:rtype: (float, float)
:returns:
A tuple containing the mean and standard deviation, in that
order.
'''
samples = list(s['duration'] for s in self.samples_for(cmd))
if len(samples) == 0:
return None, None
return statistics.mean(samples), statistics.stdev(samples)
class Command(object):
def __init__(self, name, cmd, *args, **kwargs):
'''
Create a new command that is run as part of a benchmark.
*args and **kwargs are passed directly to ``subprocess.run``.
An exception to this is stdin/stdout/stderr. Output
redirection is completely controlled by the benchmark harness.
Trying to set them here will trigger an assert.
:param str name:
The human readable name of this command. This is
particularly useful if the same search tool is used
multiple times in the same benchmark with different
arguments.
:param list(str) cmd:
The command to run as a list of arguments (including the
command name itself).
'''
assert 'stdin' not in kwargs
assert 'stdout' not in kwargs
assert 'stderr' not in kwargs
self.name = name
self.cmd = cmd
self.args = args
self.kwargs = kwargs
def exists(self):
'Returns true if and only if this command exists.'
return shutil.which(self.binary_name) is not None
@property
def binary_name(self):
'Return the binary name of this command.'
return self.cmd[0]
def run(self):
'''
Runs this command and returns its status.
:rtype: subprocess.CompletedProcess
'''
return subprocess.run(self.cmd, *self.args, **self.kwargs)
def eprint(*args, **kwargs):
'Like print, but to stderr.'
kwargs['file'] = sys.stderr
print(*args, **kwargs)
def run_cmd(cmd, *args, **kwargs):
'''
Print the command to stderr and run it.
    If the command fails, an exception is raised, since the command is
    run with check=True.
'''
eprint('# %s' % ' '.join(cmd))
kwargs['check'] = True
return subprocess.run(cmd, *args, **kwargs)
def require(suite_dir, *names):
'''
Declare a dependency on the given names for a benchmark.
If any dependency doesn't exist, then fail with an error message.
'''
errs = []
for name in names:
fun_name = name.replace('-', '_')
if not globals()['has_%s' % fun_name](suite_dir):
errs.append(name)
if len(errs) > 0:
raise MissingDependencies(errs)
def download_linux(suite_dir):
'Download and build the Linux kernel.'
checkout_dir = path.join(suite_dir, LINUX_DIR)
if not os.path.isdir(checkout_dir):
# Clone from my fork so that we always get the same corpus *and* still
# do a shallow clone. Shallow clones are much much cheaper than full
# clones.
run_cmd(['git', 'clone', '--depth', '1', LINUX_CLONE, checkout_dir])
# We want to build the kernel because the process of building it produces
# a lot of junk in the repository that a search tool probably shouldn't
# touch.
if not os.path.exists(path.join(checkout_dir, 'vmlinux')):
eprint('# Building Linux kernel...')
run_cmd(['make', 'defconfig'], cwd=checkout_dir)
run_cmd(['make', '-j', str(cpu_count())], cwd=checkout_dir)
def has_linux(suite_dir):
'Returns true if we believe the Linux kernel is built.'
checkout_dir = path.join(suite_dir, LINUX_DIR)
return path.exists(path.join(checkout_dir, 'vmlinux'))
def download_subtitles_en(suite_dir):
'Download and decompress English subtitles.'
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
en_path_gz = path.join(subtitle_dir, SUBTITLES_EN_NAME_GZ)
en_path = path.join(subtitle_dir, SUBTITLES_EN_NAME)
en_path_sample = path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE)
if not os.path.isdir(subtitle_dir):
os.makedirs(subtitle_dir)
if not os.path.exists(en_path):
if not os.path.exists(en_path_gz):
run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir)
run_cmd(['gunzip', en_path_gz])
if not os.path.exists(en_path_sample):
# Get a sample roughly the same size as the Russian corpus so that
# benchmarks finish in a reasonable time.
        with open(en_path_sample, 'wb+') as f:
run_cmd(
['head', '-n', '32722372', en_path],
cwd=subtitle_dir, stdout=f)
def has_subtitles_en(suite_dir):
'Returns true if English subtitles have been downloaded.'
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
return path.exists(path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE))
def download_subtitles_ru(suite_dir):
'Download and decompress Russian subtitles.'
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
ru_path_gz = path.join(subtitle_dir, SUBTITLES_RU_NAME_GZ)
ru_path = path.join(subtitle_dir, SUBTITLES_RU_NAME)
if not os.path.isdir(subtitle_dir):
os.makedirs(subtitle_dir)
if not os.path.exists(ru_path):
if not os.path.exists(ru_path_gz):
run_cmd(['curl', '-LO', SUBTITLES_RU_URL], cwd=subtitle_dir)
run_cmd(['gunzip', ru_path_gz])
def has_subtitles_ru(suite_dir):
'Returns true if Russian subtitles have been downloaded.'
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
return path.exists(path.join(subtitle_dir, SUBTITLES_RU_NAME))
def download(suite_dir, choices):
'''
Download choices into suite_dir.
Specifically, choices specifies a list of corpora to fetch.
:param str suite_dir:
The directory in which to download corpora.
:param list(str) choices:
A list of corpora to download. Available choices are:
all, linux, subtitles-en, subtitles-ru.
'''
for choice in choices:
if choice == 'linux':
download_linux(suite_dir)
elif choice == 'subtitles-en':
download_subtitles_en(suite_dir)
elif choice == 'subtitles-ru':
download_subtitles_ru(suite_dir)
elif choice == 'all':
download_linux(suite_dir)
download_subtitles_en(suite_dir)
download_subtitles_ru(suite_dir)
else:
eprint('Unrecognized download choice: %s' % choice)
sys.exit(1)
def collect_benchmarks(suite_dir, filter_pat=None,
allow_missing_commands=False,
disabled_cmds=None,
warmup_iter=1, bench_iter=3):
'''
Return an iterable of all runnable benchmarks.
:param str suite_dir:
The directory containing corpora.
:param str filter_pat:
A single regular expression that is used to filter benchmarks
by their name. When not specified, all benchmarks are run.
:returns:
An iterable over all runnable benchmarks. If a benchmark
requires corpora that are missing, then a log message is
emitted to stderr and it is not yielded.
'''
for fun in sorted(globals()):
if not fun.startswith('bench_'):
continue
name = re.sub('^bench_', '', fun)
if filter_pat is not None and not re.search(filter_pat, name):
continue
try:
benchmark = globals()[fun](suite_dir)
benchmark.name = name
benchmark.warmup_count = warmup_iter
benchmark.count = bench_iter
benchmark.allow_missing_commands = allow_missing_commands
            benchmark.disabled_cmds = set(disabled_cmds or [])
benchmark.raise_if_missing()
except MissingDependencies as e:
eprint(
'missing: %s, skipping benchmark %s (try running with: %s)' % (
', '.join(e.missing_names),
name,
' '.join(['--download %s' % n for n in e.missing_names]),
))
continue
except MissingCommands as e:
fmt = 'missing commands: %s, skipping benchmark %s ' \
'(run with --allow-missing to run incomplete benchmarks)'
eprint(fmt % (', '.join(e.missing_names), name))
continue
yield benchmark
def main():
download_choices = ['all', 'linux', 'subtitles-en', 'subtitles-ru']
    p = argparse.ArgumentParser(
        description='Command line search tool benchmark suite.')
p.add_argument(
'--dir', metavar='PATH', default=os.getcwd(),
help='The directory in which to download data and perform searches.')
p.add_argument(
'--download', metavar='CORPUS', action='append',
choices=download_choices,
help='Download and prepare corpus data, then exit without running '
'any benchmarks. Note that this command is intended to be '
'idempotent. WARNING: This downloads over a gigabyte of data, '
'and also includes building the Linux kernel. If "all" is used '
'then the total uncompressed size is around 13 GB. '
'Choices: %s' % ', '.join(download_choices))
p.add_argument(
'--allow-missing', action='store_true',
help='Permit benchmarks to run even if some commands are missing.')
p.add_argument(
'--disabled', help='A list of comma separated commands to skip.')
p.add_argument(
'-f', '--force', action='store_true',
help='Overwrite existing files if there is a conflict.')
p.add_argument(
'--list', action='store_true',
help='List available benchmarks by name.')
p.add_argument(
'--raw', metavar='PATH',
help='Dump raw data (all samples collected) in CSV format to the '
'file path provided.')
p.add_argument(
'--warmup-iter', metavar='INTEGER', type=int, default=1,
help='The number of iterations to run each command before '
'recording measurements.')
p.add_argument(
'--bench-iter', metavar='INTEGER', type=int, default=3,
help='The number of iterations to run each command while '
'recording measurements.')
p.add_argument(
'bench', metavar='PAT', nargs='?',
help='A regex pattern that will only run benchmarks that match.')
args = p.parse_args()
if args.list:
benchmarks = collect_benchmarks(
args.dir, filter_pat=args.bench,
allow_missing_commands=args.allow_missing,
disabled_cmds=(args.disabled or '').split(','),
warmup_iter=args.warmup_iter, bench_iter=args.bench_iter)
for b in benchmarks:
print(b.name)
sys.exit(0)
if args.download is not None and len(args.download) > 0:
download(args.dir, args.download)
sys.exit(0)
if not path.isdir(args.dir):
os.makedirs(args.dir)
if args.raw is not None and path.exists(args.raw) and not args.force:
eprint('File %s already exists (delete it or use --force)' % args.raw)
sys.exit(1)
raw_handle, raw_csv_wtr = None, None
if args.raw is not None:
fields = [
'benchmark', 'warmup_iter', 'iter',
'name', 'command', 'duration', 'lines', 'env',
]
raw_handle = open(args.raw, 'w+')
raw_csv_wtr = csv.DictWriter(raw_handle, fields)
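        # Write the header row by hand (each field name maps to itself).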
raw_csv_wtr.writerow({x: x for x in fields})
benchmarks = collect_benchmarks(
args.dir, filter_pat=args.bench,
allow_missing_commands=args.allow_missing,
disabled_cmds=(args.disabled or '').split(','),
warmup_iter=args.warmup_iter, bench_iter=args.bench_iter)
for i, b in enumerate(benchmarks):
result = b.run()
fastest_cmd = result.fastest_cmd()
fastest_sample = result.fastest_sample()
max_name_len = max(len(cmd.name) for cmd in b.commands)
if i > 0:
print()
header = '%s (pattern: %s)' % (b.name, b.pattern)
print('%s\n%s' % (header, '-' * len(header)))
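        # A '*' appended to a command's name marks the command that produced
        # the single fastest sample; a trailing '*' after the timing marks the
        # command with the fastest mean.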
for cmd in b.commands:
name = cmd.name
mean, stdev = result.distribution_for(cmd)
if mean is None:
# If we couldn't get a distribution for this command then
# it was skipped.
continue
line_counts = result.line_counts_for(cmd)
show_fast_cmd, show_line_counts = '', ''
if fastest_cmd.name == cmd.name:
show_fast_cmd = '*'
if fastest_sample['cmd'].name == cmd.name:
name += '*'
if len(line_counts) > 0:
counts = map(str, line_counts)
show_line_counts = ' (lines: %s)' % ', '.join(counts)
fmt = '{name:{pad}} {mean:0.3f} +/- {stdev:0.3f}{lines}{fast_cmd}'
print(fmt.format(
name=name, pad=max_name_len + 2, fast_cmd=show_fast_cmd,
mean=mean, stdev=stdev, lines=show_line_counts))
sys.stdout.flush()
if raw_csv_wtr is not None:
for sample in result.samples:
cmd, duration = sample['cmd'], sample['duration']
env = ' '.join(['%s=%s' % (k, v)
for k, v in cmd.kwargs.get('env', {}).items()])
raw_csv_wtr.writerow({
'benchmark': b.name,
'warmup_iter': b.warmup_count,
'iter': b.count,
'name': sample['cmd'].name,
'command': ' '.join(cmd.cmd),
'duration': duration,
'lines': sample['line_count'] or '',
'env': env,
})
raw_handle.flush()
if __name__ == '__main__':
main()