From 9bf7696ec8cacc74baaa4003cdfba0dab65245fd Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 11 Sep 2016 01:05:36 -0400 Subject: [PATCH] Initial cut at a benchmark suite for CLI search tools. --- benchsuite | 918 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 918 insertions(+) create mode 100755 benchsuite diff --git a/benchsuite b/benchsuite new file mode 100755 index 00000000..381b57fb --- /dev/null +++ b/benchsuite @@ -0,0 +1,918 @@ +#!/usr/bin/env python + +''' +benchsuite is a benchmark runner for comparing command line search tools. +''' + +import argparse +import csv +import os +import os.path as path +from multiprocessing import cpu_count +import re +import statistics +import subprocess +import sys +import time + +# Some constants for identifying the corpora we use to run tests. +# We establish two very different kinds of corpora: a small number of large +# files and a large number of small files. These are vastly different use cases +# not only because of their performance characteristics, but also the +# strategies used to increase the relevance of results returned. + +SUBTITLES_DIR = 'subtitles' +SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en' +SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME +SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz' +SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru' +SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME +SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz' + +LINUX_DIR = 'linux' +LINUX_CLONE = 'git://github.com/BurntSushi/linux' + + +def bench_linux_literal_default(suite_dir): + ''' + Benchmark the speed of a literal using *default* settings. + + This is a purposefully unfair benchmark for use in performance + analysis, but it is pedagogically useful. + ''' + require(suite_dir, 'linux') + cwd = path.join(suite_dir, LINUX_DIR) + pat = 'PM_RESUME' + + def mkcmd(*args, **kwargs): + kwargs['cwd'] = cwd + return Command(*args, **kwargs) + + # N.B. This is a purposefully unfair benchmark for illustrative purposes + # of how the default modes for each search tool differ. + return Benchmark(pattern=pat, commands=[ + mkcmd('rg', ['rg', pat]), + mkcmd('ag', ['ag', pat]), + # ucg reports the exact same matches as ag and rg even though it + # doesn't read gitignore files. Instead, it has a file whitelist + # that happens to match up exactly with the gitignores for this search. + mkcmd('ucg', ['ucg', pat]), + mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'C'}), + mkcmd('pt', ['pt', pat]), + # sift reports an extra line here for a binary file matched. + mkcmd('sift', ['sift', pat]), + ]) + + +def bench_linux_literal(suite_dir): + ''' + Benchmark the speed of a literal, attempting to be fair. + + This tries to use the minimum set of options available in all tools + to test how fast they are. For example, it makes sure there is no + case insensitive matching and that line numbers are computed. 
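+    Variants that disable gitignore handling (the -novcs commands below) are
+    also included, so the cost of the ignore handling itself can be seen.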
+ ''' + require(suite_dir, 'linux') + cwd = path.join(suite_dir, LINUX_DIR) + pat = 'PM_RESUME' + + def mkcmd(*args, **kwargs): + kwargs['cwd'] = cwd + return Command(*args, **kwargs) + + return Benchmark(pattern=pat, commands=[ + mkcmd('rg', ['rg', '-n', pat]), + mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]), + mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]), + mkcmd('ag', ['ag', '-s', pat]), + mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]), + mkcmd('ucg', ['ucg', '--nosmart-case', pat]), + mkcmd('git grep', [ + 'git', 'grep', '-I', '-n', pat, + ], env={'LC_ALL': 'C'}), + mkcmd('pt', ['pt', pat]), + mkcmd('sift', [ + 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat, + ]), + ]) + + +def bench_linux_literal_casei(suite_dir): + ''' + Benchmark the speed of a case insensitive literal search. + + This is like the linux_literal benchmark, except we ask the + search tools to do case insensitive search. + ''' + require(suite_dir, 'linux') + cwd = path.join(suite_dir, LINUX_DIR) + pat = 'PM_RESUME' + + def mkcmd(*args, **kwargs): + kwargs['cwd'] = cwd + return Command(*args, **kwargs) + + return Benchmark(pattern=pat, commands=[ + mkcmd('rg', ['rg', '-n', '-i', pat]), + mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]), + mkcmd('rg-novcs-mmap', [ + 'rg', '--mmap', '--no-ignore', '-n', '-i', pat, + ]), + mkcmd('ag', ['ag', '-i', pat]), + mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-i', pat]), + mkcmd('ucg', ['ucg', '-i', pat]), + mkcmd('git grep', [ + 'git', 'grep', '-I', '-n', '-i', pat, + ], env={'LC_ALL': 'C'}), + # sift yields more matches than it should here. Specifically, it gets + # matches in Module.symvers and System.map in the repo root. Both of + # those files show up in the repo root's .gitignore file. + mkcmd('sift', [ + 'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-i', pat, + ]), + ]) + + +def bench_linux_re_literal_suffix(suite_dir): + ''' + Benchmark the speed of a literal inside a regex. + + This, for example, inhibits a prefix byte optimization used + inside of Go's regex engine (relevant for sift and pt). + ''' + require(suite_dir, 'linux') + cwd = path.join(suite_dir, LINUX_DIR) + pat = '[A-Z]+_RESUME' + + def mkcmd(*args, **kwargs): + kwargs['cwd'] = cwd + return Command(*args, **kwargs) + + return Benchmark(pattern=pat, commands=[ + mkcmd('rg', ['rg', '-n', pat]), + mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]), + mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]), + mkcmd('ag', ['ag', '-s', pat]), + mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]), + mkcmd('ucg', ['ucg', '--nosmart-case', pat]), + mkcmd( + 'git grep', + ['git', 'grep', '-E', '-I', '-n', pat], + env={'LC_ALL': 'C'}, + ), + mkcmd('sift', [ + 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat, + ]), + ]) + + +def bench_linux_word(suite_dir): + ''' + Benchmark use of the -w ("match word") flag in each tool. + + sift has a lot of trouble with this because it forces it into Go's + regex engine by surrounding the pattern with \b assertions. 
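+    (For sift, that means this benchmark's pattern PM_RESUME is effectively
+    searched as \bPM_RESUME\b.)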
+ ''' + require(suite_dir, 'linux') + cwd = path.join(suite_dir, LINUX_DIR) + pat = 'PM_RESUME' + + def mkcmd(*args, **kwargs): + kwargs['cwd'] = cwd + return Command(*args, **kwargs) + + return Benchmark(pattern=pat, commands=[ + mkcmd('rg', ['rg', '-n', '-w', pat]), + mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-w', pat]), + mkcmd('rg-novcs-mmap', [ + 'rg', '--mmap', '--no-ignore', '-n', '-w', pat, + ]), + mkcmd('ag', ['ag', '-s', '-w', pat]), + mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', '-w', pat]), + mkcmd('ucg', ['ucg', '--nosmart-case', '-w', pat]), + mkcmd( + 'git grep', + ['git', 'grep', '-E', '-I', '-n', '-w', pat], + env={'LC_ALL': 'C'}, + ), + mkcmd('sift', [ + 'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-w', pat, + ]), + ]) + + +def bench_linux_unicode_greek(suite_dir): + ''' + Benchmark matching of a Unicode category. + + Only three tools (ripgrep, sift and pt) support this. + ''' + require(suite_dir, 'linux') + cwd = path.join(suite_dir, LINUX_DIR) + pat = r'\p{Greek}' + + def mkcmd(*args, **kwargs): + kwargs['cwd'] = cwd + return Command(*args, **kwargs) + + return Benchmark(pattern=pat, commands=[ + mkcmd('rg', ['rg', '-n', pat]), + # sift tries to search a bunch of PDF files and clutters up the + # results, even though --binary-skip is provided. They are excluded + # here explicitly, but don't have a measurable impact on performance. + mkcmd('sift', [ + 'sift', '-n', '--binary-skip', + '--exclude-files', '.*', + '--exclude-files', '*.pdf', + pat, + ]), + ]) + + +def bench_linux_unicode_greek_casei(suite_dir): + ''' + Benchmark matching of a Unicode category, case insensitively. + + Only ripgrep gets this right (and it's still fast). + ''' + require(suite_dir, 'linux') + cwd = path.join(suite_dir, LINUX_DIR) + pat = r'\p{Greek}' + + def mkcmd(*args, **kwargs): + kwargs['cwd'] = cwd + return Command(*args, **kwargs) + + return Benchmark(pattern=pat, commands=[ + mkcmd('rg', ['rg', '-n', '-i', pat]), + # sift tries to search a bunch of PDF files and clutters up the + # results, even though --binary-skip is provided. They are excluded + # here explicitly, but don't have a measurable impact on performance. + mkcmd('sift', [ + 'sift', '-n', '--binary-skip', + '--exclude-files', '.*', + '--exclude-files', '*.pdf', + pat, + ]), + ]) + + +def bench_linux_unicode_word(suite_dir): + ''' + Benchmark Unicode aware \w character class. + + Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get + this right. Everything else uses the standard ASCII interpretation + of \w. 
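+    (The ASCII interpretation of \w is just [0-9A-Za-z_]; the Unicode aware
+    version also matches letters outside that range, such as Cyrillic or
+    accented characters.)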
+ ''' + require(suite_dir, 'linux') + cwd = path.join(suite_dir, LINUX_DIR) + pat = r'\wAh' + + def mkcmd(*args, **kwargs): + kwargs['cwd'] = cwd + return Command(*args, **kwargs) + + return Benchmark(pattern=pat, commands=[ + mkcmd('rg', ['rg', '-n', pat]), + mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]), + mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]), + mkcmd('rg-novcs-mmap', [ + 'rg', '--mmap', '--no-ignore', '-n', pat, + ]), + mkcmd('ag (no Unicode)', ['ag', '-s', pat]), + mkcmd('ag-novcs (no Unicode)', [ + 'ag', '--skip-vcs-ignores', '-s', pat, + ]), + mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]), + mkcmd( + 'git grep', + ['git', 'grep', '-E', '-I', '-n', pat], + env={'LC_ALL': 'en_US.UTF-8'}, + ), + mkcmd( + 'git grep (no Unicode)', + ['git', 'grep', '-E', '-I', '-n', pat], + env={'LC_ALL': 'C'}, + ), + mkcmd('sift (no Unicode)', [ + 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat, + ]), + ]) + + +def bench_linux_no_literal(suite_dir): + ''' + Benchmark a regex that defeats all literal optimizations. + + Most search patterns have some kind of literal in them, which + typically permits searches to take some shortcuts. Therefore, the + applicability of this benchmark is somewhat suspicious, but the + suite wouldn't feel complete without it. + ''' + require(suite_dir, 'linux') + cwd = path.join(suite_dir, LINUX_DIR) + pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}' + + def mkcmd(*args, **kwargs): + kwargs['cwd'] = cwd + return Command(*args, **kwargs) + + return Benchmark(pattern=pat, commands=[ + mkcmd('rg', ['rg', '-n', pat]), + mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]), + mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]), + mkcmd('rg-novcs (no Unicode)', [ + 'rg', '--no-ignore', '-n', '(?-u)' + pat, + ]), + mkcmd('ag (no Unicode)', ['ag', '-s', pat]), + mkcmd('ag-novcs (no Unicode)', [ + 'ag', '--skip-vcs-ignores', '-s', pat, + ]), + mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]), + mkcmd( + 'git grep', + ['git', 'grep', '-E', '-I', '-n', pat], + env={'LC_ALL': 'en_US.UTF-8'}, + ), + mkcmd( + 'git grep (no Unicode)', + ['git', 'grep', '-E', '-I', '-n', pat], + env={'LC_ALL': 'C'}, + ), + mkcmd('sift (no Unicode)', [ + 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat, + ]), + ]) + + +def bench_linux_alternates(suite_dir): + ''' + Benchmark a small alternation of literals. + + sift doesn't make the cut. It's more than 10x slower than the next + fastest result. The slowdown is likely because the Go regexp engine + doesn't do any literal optimizations for this case (there is no + common leading byte). + ''' + require(suite_dir, 'linux') + cwd = path.join(suite_dir, LINUX_DIR) + pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT' + + def mkcmd(*args, **kwargs): + kwargs['cwd'] = cwd + return Command(*args, **kwargs) + + return Benchmark(pattern=pat, commands=[ + mkcmd('rg', ['rg', '-n', pat]), + mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]), + mkcmd('rg-novcs-mmap', [ + 'rg', '--mmap', '--no-ignore', '-n', pat, + ]), + mkcmd('ag', ['ag', '-s', pat]), + mkcmd('ag-novcs', [ + 'ag', '--skip-vcs-ignores', '-s', pat, + ]), + mkcmd('ucg', ['ucg', '--nosmart-case', pat]), + mkcmd( + 'git grep', + ['git', 'grep', '-E', '-I', '-n', pat], + env={'LC_ALL': 'C'}, + ), + ]) + + +def bench_linux_alternates_casei(suite_dir): + 'Benchmark a small alternation of literals case insensitively.' 
+ require(suite_dir, 'linux') + cwd = path.join(suite_dir, LINUX_DIR) + pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT' + + def mkcmd(*args, **kwargs): + kwargs['cwd'] = cwd + return Command(*args, **kwargs) + + return Benchmark(pattern=pat, commands=[ + mkcmd('rg', ['rg', '-n', '-i', pat]), + mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]), + mkcmd('rg-novcs-mmap', [ + 'rg', '--mmap', '--no-ignore', '-n', '-i', pat, + ]), + mkcmd('ag', ['ag', '-i', pat]), + mkcmd('ag-novcs', [ + 'ag', '--skip-vcs-ignores', '-i', pat, + ]), + mkcmd('ucg', ['ucg', '-i', pat]), + mkcmd( + 'git grep', + ['git', 'grep', '-E', '-I', '-n', '-i', pat], + env={'LC_ALL': 'C'}, + ), + ]) + + +# BREADCRUMBS(burntsushi): We should benchmark an alternation for `linux` as +# well. + +def bench_sherlock(suite_dir): + 'TODO: Fix this and add more single file benchmarks.' + require(suite_dir, 'subtitles-en') + en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME) + pat = 'Sherlock' + + return Benchmark(pattern=pat, commands=[ + Command('rg', ['rg', pat, en]), + Command('grep', ['grep', '-a', pat, en]) + ]) + + +class MissingDependencies(Exception): + ''' + A missing dependency exception. + + This exception occurs when running a benchmark that requires a + particular corpus that isn't available. + + :ivar list(str) missing_names: + A list of missing dependency names. These names correspond to + names that can be used with the --download flag. + ''' + def __init__(self, missing_names): + self.missing_names = missing_names + + def __str__(self): + return 'MissingDependency(%s)' % repr(self.missing_names) + + +class Benchmark(object): + ''' + A single benchmark corresponding to a grouping of commands. + + The main purpose of a benchmark is to compare the performance + characteristics of a group of commands. + ''' + + def __init__(self, name=None, pattern=None, commands=None, + warmup_count=1, count=3, line_count=True): + ''' + Create a single benchmark. + + A single benchmark is composed of a set of commands that are + benchmarked and compared against one another. A benchmark may + have multiple commands that use the same search tool (but + probably should have something differentiating them). + + The grouping of commands is a purely human driven process. + + By default, the output of every command is sent to /dev/null. + Other types of behavior are available via the methods defined + on this benchmark. + + :param str name: + A human readable string denoting the name of this + benchmark. + :param str pattern: + The pattern that is used in search. + :param list(Command) commands: + A list of commands to initialize this benchmark with. More + commands may be added before running the benchmark. + :param int warmup_count: + The number of times to run each command before recording + samples. + :param int count: + The number of samples to collect from each command. + :param bool line_count: + When set, the lines of each search are counted and included + in the samples produced. + ''' + self.name = name + self.pattern = pattern + self.commands = commands or [] + self.warmup_count = warmup_count + self.count = count + self.line_count = line_count + + def run(self): + ''' + Runs this benchmark and returns the results. + + :rtype: Result + ''' + result = Result(self) + for cmd in self.commands: + # Do a warmup first. 
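+            # The warmup runs are discarded; they mostly serve to pull the
+            # corpus into the OS page cache so that the timed runs below
+            # measure search time rather than disk I/O.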
+            for _ in range(self.warmup_count):
+                self.run_one(cmd)
+            for _ in range(self.count):
+                result.add(cmd, **self.run_one(cmd))
+        return result
+
+    def run_one(self, cmd):
+        '''
+        Runs the given command exactly once.
+
+        Returns an object that includes the time taken by the command.
+        If this benchmark was configured to count the number of lines
+        returned, then the line count is also returned.
+
+        :param Command cmd: The command to run.
+        :returns:
+            A dict with two fields, duration and line_count.
+            The duration is in seconds, as a float, and is guaranteed
+            to be available. The line_count is set to None unless line
+            counting is enabled, in which case, it is the number of
+            lines in the search output.
+        :rtype: dict
+        '''
+        cmd.kwargs['stderr'] = subprocess.DEVNULL
+        if self.line_count:
+            cmd.kwargs['stdout'] = subprocess.PIPE
+        else:
+            cmd.kwargs['stdout'] = subprocess.DEVNULL
+
+        start = time.time()
+        completed = cmd.run()
+        end = time.time()
+
+        line_count = None
+        if self.line_count:
+            line_count = completed.stdout.count(b'\n')
+        return {
+            'duration': end - start,
+            'line_count': line_count,
+        }
+
+
+class Result(object):
+    '''
+    The result of running a benchmark.
+
+    Benchmark results consist of a set of samples, where each sample
+    corresponds to a single run of a single command in the benchmark.
+    Various statistics can be computed from these samples such as mean
+    and standard deviation.
+    '''
+    def __init__(self, benchmark):
+        '''
+        Create a new set of results, initially empty.
+
+        :param Benchmark benchmark:
+            The benchmark that produced these results.
+        '''
+        self.benchmark = benchmark
+        self.samples = []
+
+    def add(self, cmd, duration, line_count=None):
+        '''
+        Add a new sample to this result set.
+
+        :param Command cmd:
+            The command that produced this sample.
+        :param float duration:
+            The duration, in seconds, that the command took to run.
+        :param int line_count:
+            The number of lines in the search output. This is optional.
+        '''
+        self.samples.append({
+            'cmd': cmd,
+            'duration': duration,
+            'line_count': line_count,
+        })
+
+    def fastest_sample(self):
+        '''
+        Returns the fastest recorded sample.
+        '''
+        return min(self.samples, key=lambda s: s['duration'])
+
+    def fastest_cmd(self):
+        '''
+        Returns the command with the smallest mean duration.
+        '''
+        means = []
+        for cmd in self.benchmark.commands:
+            mean, _ = self.distribution_for(cmd)
+            means.append((cmd, mean))
+        return min(means, key=lambda tup: tup[1])[0]
+
+    def samples_for(self, cmd):
+        'Returns an iterable of samples for cmd.'
+        yield from (s for s in self.samples if s['cmd'].name == cmd.name)
+
+    def line_counts_for(self, cmd):
+        '''
+        Returns the distinct line counts recorded for the given command.
+
+        :returns:
+            A set of the line counts recorded for cmd. The set is empty
+            if line counting was disabled.
+        '''
+        return {s['line_count'] for s in self.samples_for(cmd)
+                if s['line_count'] is not None}
+
+    def distribution_for(self, cmd):
+        '''
+        Returns the distribution (mean +/- std) of the given command.
+
+        :rtype: (float, float)
+        :returns:
+            A tuple containing the mean and standard deviation, in that
+            order.
+        '''
+        mean = statistics.mean(
+            s['duration'] for s in self.samples_for(cmd))
+        stdev = statistics.stdev(
+            s['duration'] for s in self.samples_for(cmd))
+        return mean, stdev
+
+
+class Command(object):
+    def __init__(self, name, cmd, *args, **kwargs):
+        '''
+        Create a new command that is run as part of a benchmark.
+
+        *args and **kwargs are passed directly to ``subprocess.run``.
+ An exception to this is stdin/stdout/stderr. Output + redirection is completely controlled by the benchmark harness. + Trying to set them here will trigger an assert. + + :param str name: + The human readable name of this command. This is + particularly useful if the same search tool is used + multiple times in the same benchmark with different + arguments. + :param list(str) cmd: + The command to run as a list of arguments (including the + command name itself). + ''' + assert 'stdin' not in kwargs + assert 'stdout' not in kwargs + assert 'stderr' not in kwargs + self.name = name + self.cmd = cmd + self.args = args + self.kwargs = kwargs + + def run(self): + ''' + Runs this command and returns its status. + + :rtype: subprocess.CompletedProcess + ''' + return subprocess.run(self.cmd, *self.args, **self.kwargs) + + +def eprint(*args, **kwargs): + 'Like print, but to stderr.' + kwargs['file'] = sys.stderr + print(*args, **kwargs) + + +def run_cmd(cmd, *args, **kwargs): + ''' + Print the command to stderr and run it. + + If the command fails, throw a traceback. + ''' + eprint('# %s' % ' '.join(cmd)) + kwargs['check'] = True + return subprocess.run(cmd, *args, **kwargs) + + +def require(suite_dir, *names): + ''' + Declare a dependency on the given names for a benchmark. + + If any dependency doesn't exist, then fail with an error message. + ''' + errs = [] + for name in names: + fun_name = name.replace('-', '_') + if not globals()['has_%s' % fun_name](suite_dir): + errs.append(name) + if len(errs) > 0: + raise MissingDependencies(errs) + + +def download_linux(suite_dir): + 'Download and build the Linux kernel.' + checkout_dir = path.join(suite_dir, LINUX_DIR) + if not os.path.isdir(checkout_dir): + # Clone from my fork so that we always get the same corpus *and* still + # do a shallow clone. Shallow clones are much much cheaper than full + # clones. + run_cmd(['git', 'clone', '--depth', '1', LINUX_CLONE, checkout_dir]) + # We want to build the kernel because the process of building it produces + # a lot of junk in the repository that a search tool probably shouldn't + # touch. + if not os.path.exists(path.join(checkout_dir, 'vmlinux')): + eprint('# Building Linux kernel...') + run_cmd(['make', 'defconfig'], cwd=checkout_dir) + run_cmd(['make', '-j', str(cpu_count())], cwd=checkout_dir) + + +def has_linux(suite_dir): + 'Returns true if we believe the Linux kernel is built.' + checkout_dir = path.join(suite_dir, LINUX_DIR) + return path.exists(path.join(checkout_dir, 'vmlinux')) + + +def download_subtitles_en(suite_dir): + 'Download and decompress English subtitles.' + subtitle_dir = path.join(suite_dir, SUBTITLES_DIR) + en_path_gz = path.join(subtitle_dir, SUBTITLES_EN_NAME_GZ) + en_path = path.join(subtitle_dir, SUBTITLES_EN_NAME) + + if not os.path.isdir(subtitle_dir): + os.makedirs(subtitle_dir) + if not os.path.exists(en_path): + if not os.path.exists(en_path_gz): + run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir) + run_cmd(['gunzip', en_path_gz], cwd=subtitle_dir) + + +def has_subtitles_en(suite_dir): + 'Returns true if English subtitles have been downloaded.' + subtitle_dir = path.join(suite_dir, SUBTITLES_DIR) + return path.exists(path.join(subtitle_dir, SUBTITLES_EN_NAME)) + + +def download_subtitles_ru(suite_dir): + 'Download and decompress Russian subtitles.' 
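+    # This mirrors download_subtitles_en. Nothing in this initial cut of the
+    # suite requires the Russian corpus yet, but the download/has_* plumbing
+    # is kept symmetric with the English corpus.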
+    subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
+    ru_path_gz = path.join(subtitle_dir, SUBTITLES_RU_NAME_GZ)
+    ru_path = path.join(subtitle_dir, SUBTITLES_RU_NAME)
+
+    if not os.path.isdir(subtitle_dir):
+        os.makedirs(subtitle_dir)
+    if not os.path.exists(ru_path):
+        if not os.path.exists(ru_path_gz):
+            run_cmd(['curl', '-LO', SUBTITLES_RU_URL], cwd=subtitle_dir)
+        run_cmd(['gunzip', ru_path_gz], cwd=subtitle_dir)
+
+
+def has_subtitles_ru(suite_dir):
+    'Returns true if Russian subtitles have been downloaded.'
+    subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
+    return path.exists(path.join(subtitle_dir, SUBTITLES_RU_NAME))
+
+
+def download(suite_dir, choices):
+    '''
+    Download choices into suite_dir.
+
+    Specifically, choices specifies a list of corpora to fetch.
+
+    :param str suite_dir:
+        The directory in which to download corpora.
+    :param list(str) choices:
+        A list of corpora to download. Available choices are:
+        all, linux, subtitles-en, subtitles-ru.
+    '''
+    for choice in choices:
+        if choice == 'linux':
+            download_linux(suite_dir)
+        elif choice == 'subtitles-en':
+            download_subtitles_en(suite_dir)
+        elif choice == 'subtitles-ru':
+            download_subtitles_ru(suite_dir)
+        elif choice == 'all':
+            download_linux(suite_dir)
+            download_subtitles_en(suite_dir)
+            download_subtitles_ru(suite_dir)
+        else:
+            eprint('Unrecognized download choice: %s' % choice)
+            sys.exit(1)
+
+
+def collect_benchmarks(suite_dir, filter_pat=None):
+    '''
+    Return an iterable of all runnable benchmarks.
+
+    :param str suite_dir:
+        The directory containing corpora.
+    :param str filter_pat:
+        A single regular expression that is used to filter benchmarks
+        by their name. When not specified, all benchmarks are run.
+    :returns:
+        An iterable over all runnable benchmarks. If a benchmark
+        requires corpora that are missing, then a log message is
+        emitted to stderr and it is not yielded.
+    '''
+    for fun in sorted(globals()):
+        if not fun.startswith('bench_'):
+            continue
+        name = re.sub('^bench_', '', fun)
+        if filter_pat is not None and not re.search(filter_pat, name):
+            continue
+        try:
+            benchmark = globals()[fun](suite_dir)
+        except MissingDependencies as e:
+            eprint(
+                'missing: %s, skipping benchmark %s (try running with: %s)' % (
+                    ', '.join(e.missing_names),
+                    name,
+                    ' '.join(['--download %s' % n for n in e.missing_names]),
+                ))
+            continue
+        benchmark.name = name
+        yield benchmark
+
+
+def main():
+    p = argparse.ArgumentParser(
+        description='Command line search tool benchmark suite.')
+    p.add_argument(
+        '--dir', metavar='PATH', default=os.getcwd(),
+        help='The directory in which to download data and perform searches.')
+    p.add_argument(
+        '--download', metavar='CORPUS', action='append',
+        choices=['all', 'linux', 'subtitles-en', 'subtitles-ru'],
+        help='Download and prepare corpus data, then exit without running '
+             'any benchmarks. Note that this command is intended to be '
+             'idempotent. WARNING: This downloads over a gigabyte of data, '
+             'and also includes building the Linux kernel. '
+             'If "all" is used then the total uncompressed size is '
+             'around 13 GB.')
+    p.add_argument(
+        '-f', '--force', action='store_true',
+        help='Overwrite existing files if there is a conflict.')
+    p.add_argument(
+        '--list', action='store_true',
+        help='List available benchmarks by name.')
+    p.add_argument(
+        '--raw', metavar='PATH',
+        help='Dump raw data (all samples collected) in CSV format to the '
+             'file path provided.')
+    p.add_argument(
+        'bench', metavar='PAT', nargs='?',
+        help='A regex pattern that will only run benchmarks that match.')
+    args = p.parse_args()
+
+    if args.download is not None and len(args.download) > 0:
+        download(args.dir, args.download)
+        sys.exit(0)
+
+    if not path.isdir(args.dir):
+        os.makedirs(args.dir)
+    if args.raw is not None and path.exists(args.raw) and not args.force:
+        eprint('File %s already exists (delete it or use --force)' % args.raw)
+        sys.exit(1)
+    raw_handle, raw_csv_wtr = None, None
+    if args.raw is not None:
+        fields = [
+            'benchmark', 'warmup_iter', 'iter',
+            'name', 'command', 'duration', 'lines', 'env',
+        ]
+        raw_handle = open(args.raw, 'w+')
+        raw_csv_wtr = csv.DictWriter(raw_handle, fields)
+        raw_csv_wtr.writerow({x: x for x in fields})
+
+    benchmarks = collect_benchmarks(args.dir, filter_pat=args.bench)
+    for i, b in enumerate(benchmarks):
+        result = b.run()
+        fastest_cmd = result.fastest_cmd()
+        fastest_sample = result.fastest_sample()
+        max_name_len = max(len(cmd.name) for cmd in b.commands)
+
+        if i > 0:
+            print()
+        header = '%s (pattern: %s)' % (b.name, b.pattern)
+        print('%s\n%s' % (header, '-' * len(header)))
+        for cmd in b.commands:
+            name = cmd.name
+            mean, stdev = result.distribution_for(cmd)
+            line_counts = result.line_counts_for(cmd)
+            show_fast_cmd, show_line_counts = '', ''
+            if fastest_cmd.name == cmd.name:
+                show_fast_cmd = '*'
+            if fastest_sample['cmd'].name == cmd.name:
+                name += '*'
+            if len(line_counts) > 0:
+                counts = map(str, line_counts)
+                show_line_counts = ' (lines: %s)' % ', '.join(counts)
+            fmt = '{name:{pad}} {mean:0.3f} +/- {stdev:0.3f}{lines}{fast_cmd}'
+            print(fmt.format(
+                name=name, pad=max_name_len + 2, fast_cmd=show_fast_cmd,
+                mean=mean, stdev=stdev, lines=show_line_counts))
+        sys.stdout.flush()
+
+        if raw_csv_wtr is not None:
+            for sample in result.samples:
+                cmd, duration = sample['cmd'], sample['duration']
+                env = ' '.join(['%s=%s' % (k, v)
+                                for k, v in cmd.kwargs.get('env', {}).items()])
+                raw_csv_wtr.writerow({
+                    'benchmark': b.name,
+                    'warmup_iter': b.warmup_count,
+                    'iter': b.count,
+                    'name': sample['cmd'].name,
+                    'command': ' '.join(cmd.cmd),
+                    'duration': duration,
+                    'lines': sample['line_count'] or '',
+                    'env': env,
+                })
+            raw_handle.flush()
+
+
+if __name__ == '__main__':
+    main()
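
For reference, a hypothetical invocation sequence based on the flags defined
above might look like this (the directory and output paths are illustrative):

    $ ./benchsuite --dir /tmp/benchsuite --download subtitles-en
    $ ./benchsuite --dir /tmp/benchsuite --raw raw.csv sherlock

The positional PAT argument is a regex over benchmark names, so 'sherlock'
runs only the bench_sherlock benchmark; omit it to run every benchmark whose
corpus is present.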