ripgrep/benchsuite

#!/usr/bin/env python

'''
benchsuite is a benchmark runner for comparing command line search tools.
'''

import argparse
import csv
import os
import os.path as path
from multiprocessing import cpu_count
import re
import statistics
import subprocess
import sys
import time

# Some constants for identifying the corpora we use to run tests.
# We establish two very different kinds of corpora: a small number of large
# files and a large number of small files. These are vastly different use cases
# not only because of their performance characteristics, but also the
# strategies used to increase the relevance of results returned.

# Directory (inside the suite directory) holding the subtitle corpora, followed
# by the file name, compressed file name and download URL of each corpus.
SUBTITLES_DIR = 'subtitles'
SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en'
SUBTITLES_EN_NAME_SAMPLE = 'OpenSubtitles2016.raw.sample.en'
SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME
SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz'
SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru'
SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME
SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz'

# Directory (inside the suite directory) holding the Linux kernel checkout.
LINUX_DIR = 'linux'
# GitHub no longer serves the unauthenticated git:// protocol, so the clone
# has to go over HTTPS.
LINUX_CLONE = 'https://github.com/BurntSushi/linux'

# Grep takes locale settings from the environment. There is a *substantial*
# performance impact for enabling Unicode, so we need to handle this explicitly
# in our benchmarks.
GREP_ASCII = {'LC_ALL': 'C'}
GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'}


def bench_linux_literal_default(suite_dir):
    '''
    Benchmark a literal search with every tool left at its defaults.

    No attempt is made to level the playing field, so this comparison
    is deliberately unfair. It exists to illustrate how the default
    modes of the search tools differ.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    needle = 'PM_RESUME'

    # Every command runs from inside the kernel checkout.
    commands = [
        Command('rg', ['rg', needle], cwd=linux_dir),
        Command('ag', ['ag', needle], cwd=linux_dir),
        # ucg reports the exact same matches as ag and rg even though it
        # doesn't read gitignore files. Instead, it has a file whitelist
        # that happens to match up exactly with the gitignores for this
        # search.
        Command('ucg', ['ucg', needle], cwd=linux_dir),
        # LC_ALL=en_US.UTF-8 probably isn't necessarily the default, but it
        # is a reasonable guess for most desktop systems.
        Command(
            'git grep', ['git', 'grep', needle],
            env={'LC_ALL': 'en_US.UTF-8'}, cwd=linux_dir,
        ),
        Command('pt', ['pt', needle], cwd=linux_dir),
        # sift reports an extra line here for a binary file matched.
        Command('sift', ['sift', needle], cwd=linux_dir),
    ]
    return Benchmark(pattern=needle, commands=commands)


def bench_linux_literal(suite_dir):
    '''
    Benchmark a literal search while attempting to be fair to each tool.

    The intent is to use the minimum set of options available in all
    tools: no case insensitive matching, and line numbers computed.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    needle = 'PM_RESUME'

    # Every command runs from inside the kernel checkout.
    commands = [
        Command('rg', ['rg', '-n', needle], cwd=linux_dir),
        Command('rg (mmap)', ['rg', '-n', '--mmap', needle], cwd=linux_dir),
        Command(
            'rg (whitelist)',
            ['rg', '-n', '--no-ignore', '-tall', needle],
            cwd=linux_dir,
        ),
        Command('ag (mmap)', ['ag', '-s', needle], cwd=linux_dir),
        Command(
            'ucg (whitelist)', ['ucg', '--nosmart-case', needle],
            cwd=linux_dir,
        ),
        Command(
            'git grep', ['git', 'grep', '-I', '-n', needle],
            env={'LC_ALL': 'C'}, cwd=linux_dir,
        ),
        Command('pt', ['pt', needle], cwd=linux_dir),
        Command(
            'sift',
            ['sift', '-n', '--binary-skip', '--exclude-files', '.*', needle],
            cwd=linux_dir,
        ),
    ]
    return Benchmark(pattern=needle, commands=commands)


def bench_linux_literal_casei(suite_dir):
    '''
    Benchmark the speed of a case insensitive literal search.

    This is like the linux_literal benchmark, except we ask the
    search tools to do case insensitive search.

    :param str suite_dir:
        The directory containing corpora.
    :rtype: Benchmark
    :raises MissingDependencies:
        Propagated from ``require`` when the linux corpus is missing.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = 'PM_RESUME'

    def mkcmd(*args, **kwargs):
        # Every command in this benchmark runs inside the kernel checkout.
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', '-i', pat]),
        # The mmap variant must actually pass --mmap; without it this entry
        # would just measure the plain `rg` command a second time.
        mkcmd('rg (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
        mkcmd('rg (whitelist)', [
            'rg', '-n', '-i', '--no-ignore', '-tall', pat,
        ]),
        mkcmd('ag (mmap)', ['ag', '-i', pat]),
        mkcmd('ucg', ['ucg', '-i', pat]),
        # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
        # since that is certainly what ripgrep is doing, but this is for an
        # ASCII literal, so we should give `git grep` all the opportunity to
        # do its best.
        mkcmd('git grep', [
            'git', 'grep', '-I', '-n', '-i', pat,
        ], env={'LC_ALL': 'C'}),
        # sift yields more matches than it should here. Specifically, it gets
        # matches in Module.symvers and System.map in the repo root. Both of
        # those files show up in the repo root's .gitignore file.
        mkcmd('sift', [
            'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-i', pat,
        ]),
    ])


def bench_linux_re_literal_suffix(suite_dir):
    '''
    Benchmark a regex whose literal part comes at the end.

    Putting the literal after a character class inhibits, for example,
    a prefix byte optimization used inside of Go's regex engine
    (relevant for sift and pt).
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    regex = '[A-Z]+_RESUME'

    # Every command runs from inside the kernel checkout.
    commands = [
        Command('rg', ['rg', '-n', regex], cwd=linux_dir),
        Command(
            'rg-novcs', ['rg', '--no-ignore', '-n', regex], cwd=linux_dir,
        ),
        Command(
            'rg-novcs-mmap',
            ['rg', '--mmap', '--no-ignore', '-n', regex],
            cwd=linux_dir,
        ),
        Command('ag', ['ag', '-s', regex], cwd=linux_dir),
        Command(
            'ag-novcs', ['ag', '--skip-vcs-ignores', '-s', regex],
            cwd=linux_dir,
        ),
        Command('ucg', ['ucg', '--nosmart-case', regex], cwd=linux_dir),
        Command(
            'git grep', ['git', 'grep', '-E', '-I', '-n', regex],
            env={'LC_ALL': 'C'}, cwd=linux_dir,
        ),
        Command(
            'sift',
            ['sift', '-n', '--binary-skip', '--exclude-files', '.*', regex],
            cwd=linux_dir,
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)


def bench_linux_word(suite_dir):
    r'''
    Benchmark the -w ("match word") flag of each tool.

    sift has a lot of trouble with this because the flag forces it
    into Go's regex engine by surrounding the pattern with \b
    assertions.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    needle = 'PM_RESUME'

    # Every command runs from inside the kernel checkout.
    commands = [
        Command('rg', ['rg', '-n', '-w', needle], cwd=linux_dir),
        Command(
            'rg-novcs', ['rg', '--no-ignore', '-n', '-w', needle],
            cwd=linux_dir,
        ),
        Command(
            'rg-novcs-mmap',
            ['rg', '--mmap', '--no-ignore', '-n', '-w', needle],
            cwd=linux_dir,
        ),
        Command('ag', ['ag', '-s', '-w', needle], cwd=linux_dir),
        Command(
            'ag-novcs', ['ag', '--skip-vcs-ignores', '-s', '-w', needle],
            cwd=linux_dir,
        ),
        Command(
            'ucg', ['ucg', '--nosmart-case', '-w', needle], cwd=linux_dir,
        ),
        Command(
            'git grep', ['git', 'grep', '-E', '-I', '-n', '-w', needle],
            env={'LC_ALL': 'C'}, cwd=linux_dir,
        ),
        Command(
            'sift',
            [
                'sift', '-n', '--binary-skip', '--exclude-files', '.*',
                '-w', needle,
            ],
            cwd=linux_dir,
        ),
    ]
    return Benchmark(pattern=needle, commands=commands)


def bench_linux_unicode_greek(suite_dir):
    '''
    Benchmark matching of a Unicode category.

    Only three tools (ripgrep, sift and pt) are noted as supporting
    this; ripgrep and sift are the ones run here.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    regex = r'\p{Greek}'

    # sift tries to search a bunch of PDF files and clutters up the
    # results, even though --binary-skip is provided. They are excluded
    # explicitly, but don't have a measurable impact on performance.
    sift_argv = [
        'sift', '-n', '--binary-skip',
        '--exclude-files', '.*',
        '--exclude-files', '*.pdf',
        regex,
    ]
    commands = [
        Command('rg', ['rg', '-n', regex], cwd=linux_dir),
        Command('sift', sift_argv, cwd=linux_dir),
    ]
    return Benchmark(pattern=regex, commands=commands)


def bench_linux_unicode_greek_casei(suite_dir):
    '''
    Benchmark matching of a Unicode category, case insensitively.

    Only ripgrep gets this right (and it's still fast).

    :param str suite_dir:
        The directory containing corpora.
    :rtype: Benchmark
    :raises MissingDependencies:
        Propagated from ``require`` when the linux corpus is missing.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = r'\p{Greek}'

    def mkcmd(*args, **kwargs):
        # Every command in this benchmark runs inside the kernel checkout.
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', '-i', pat]),
        # sift tries to search a bunch of PDF files and clutters up the
        # results, even though --binary-skip is provided. They are excluded
        # here explicitly, but don't have a measurable impact on performance.
        #
        # -i is passed so that sift is asked for the same case insensitive
        # search as rg; without it this entry merely repeats the case
        # sensitive unicode_greek benchmark.
        mkcmd('sift', [
            'sift', '-n', '--binary-skip',
            '--exclude-files', '.*',
            '--exclude-files', '*.pdf',
            '-i', pat,
        ]),
    ])


def bench_linux_unicode_word(suite_dir):
    r'''
    Benchmark a Unicode aware \w character class.

    Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get
    this right. Everything else uses the standard ASCII interpretation
    of \w.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    regex = r'\wAh'
    git_grep_argv = ['git', 'grep', '-E', '-I', '-n', regex]

    # Every command runs from inside the kernel checkout.
    commands = [
        Command('rg', ['rg', '-n', regex], cwd=linux_dir),
        Command(
            'rg (no Unicode)', ['rg', '-n', '(?-u)' + regex], cwd=linux_dir,
        ),
        Command(
            'rg-novcs', ['rg', '--no-ignore', '-n', regex], cwd=linux_dir,
        ),
        Command(
            'rg-novcs-mmap',
            ['rg', '--mmap', '--no-ignore', '-n', regex],
            cwd=linux_dir,
        ),
        Command('ag (no Unicode)', ['ag', '-s', regex], cwd=linux_dir),
        Command(
            'ag-novcs (no Unicode)',
            ['ag', '--skip-vcs-ignores', '-s', regex],
            cwd=linux_dir,
        ),
        Command(
            'ucg (no Unicode)', ['ucg', '--nosmart-case', regex],
            cwd=linux_dir,
        ),
        Command(
            'git grep', list(git_grep_argv),
            env={'LC_ALL': 'en_US.UTF-8'}, cwd=linux_dir,
        ),
        Command(
            'git grep (no Unicode)', list(git_grep_argv),
            env={'LC_ALL': 'C'}, cwd=linux_dir,
        ),
        Command(
            'sift (no Unicode)',
            ['sift', '-n', '--binary-skip', '--exclude-files', '.*', regex],
            cwd=linux_dir,
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)


def bench_linux_no_literal(suite_dir):
    '''
    Benchmark a regex that defeats all literal optimizations.

    Most search patterns contain some kind of literal, which typically
    lets a search take shortcuts. That makes the applicability of this
    benchmark somewhat suspicious, but the suite wouldn't feel
    complete without it.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    regex = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
    # The same pattern with ripgrep's Unicode mode switched off.
    ascii_regex = '(?-u)' + regex
    git_grep_argv = ['git', 'grep', '-E', '-I', '-n', regex]

    # Every command runs from inside the kernel checkout.
    commands = [
        Command('rg', ['rg', '-n', regex], cwd=linux_dir),
        Command(
            'rg-whitelist', ['rg', '-tall', '--no-ignore', '-n', regex],
            cwd=linux_dir,
        ),
        Command('rg (no Unicode)', ['rg', '-n', ascii_regex], cwd=linux_dir),
        Command(
            'rg-whitelist (no Unicode)',
            ['rg', '-tall', '--no-ignore', '-n', ascii_regex],
            cwd=linux_dir,
        ),
        Command('ag (no Unicode)', ['ag', '-s', regex], cwd=linux_dir),
        Command(
            'ag-novcs (no Unicode)',
            ['ag', '--skip-vcs-ignores', '-s', regex],
            cwd=linux_dir,
        ),
        Command(
            'ucg (no Unicode)', ['ucg', '--nosmart-case', regex],
            cwd=linux_dir,
        ),
        Command(
            'git grep', list(git_grep_argv),
            env={'LC_ALL': 'en_US.UTF-8'}, cwd=linux_dir,
        ),
        Command(
            'git grep (no Unicode)', list(git_grep_argv),
            env={'LC_ALL': 'C'}, cwd=linux_dir,
        ),
        Command(
            'sift (no Unicode)',
            ['sift', '-n', '--binary-skip', '--exclude-files', '.*', regex],
            cwd=linux_dir,
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)


def bench_linux_alternates(suite_dir):
    '''
    Benchmark a small alternation of literals.

    sift doesn't make the cut. It's more than 10x slower than the next
    fastest result. The slowdown is likely because the Go regexp engine
    doesn't do any literal optimizations for this case (there is no
    common leading byte).
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    regex = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'

    # Every command runs from inside the kernel checkout.
    commands = [
        Command('rg', ['rg', '-n', regex], cwd=linux_dir),
        Command(
            'rg-novcs', ['rg', '--no-ignore', '-n', regex], cwd=linux_dir,
        ),
        Command(
            'rg-novcs-mmap',
            ['rg', '--mmap', '--no-ignore', '-n', regex],
            cwd=linux_dir,
        ),
        Command('ag', ['ag', '-s', regex], cwd=linux_dir),
        Command(
            'ag-novcs', ['ag', '--skip-vcs-ignores', '-s', regex],
            cwd=linux_dir,
        ),
        Command('ucg', ['ucg', '--nosmart-case', regex], cwd=linux_dir),
        Command(
            'git grep', ['git', 'grep', '-E', '-I', '-n', regex],
            env={'LC_ALL': 'C'}, cwd=linux_dir,
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)


def bench_linux_alternates_casei(suite_dir):
    '''
    Benchmark a small alternation of literals, case insensitively.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    regex = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'

    # Every command runs from inside the kernel checkout.
    commands = [
        Command('rg', ['rg', '-n', '-i', regex], cwd=linux_dir),
        Command(
            'rg-novcs', ['rg', '--no-ignore', '-n', '-i', regex],
            cwd=linux_dir,
        ),
        Command(
            'rg-novcs-mmap',
            ['rg', '--mmap', '--no-ignore', '-n', '-i', regex],
            cwd=linux_dir,
        ),
        Command('ag', ['ag', '-i', regex], cwd=linux_dir),
        Command(
            'ag-novcs', ['ag', '--skip-vcs-ignores', '-i', regex],
            cwd=linux_dir,
        ),
        Command('ucg', ['ucg', '-i', regex], cwd=linux_dir),
        Command(
            'git grep', ['git', 'grep', '-E', '-I', '-n', '-i', regex],
            env={'LC_ALL': 'C'}, cwd=linux_dir,
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)


def bench_subtitles_en_literal(suite_dir):
    '''
    Benchmark an ASCII string literal against the English sample.
    '''
    require(suite_dir, 'subtitles-en')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
    needle = 'Sherlock Holmes'
    # Every tool takes the pattern followed by the file to search.
    target = [needle, corpus]

    commands = [
        Command('rg', ['rg', '-n'] + target),
        Command('rg (no line numbers)', ['rg'] + target),
        Command('ag', ['ag', '-s'] + target),
        Command('ucg', ['ucg', '--nosmart-case'] + target),
        Command('grep', ['grep', '-an'] + target, env=GREP_ASCII),
        Command(
            'grep (no line numbers)', ['grep', '-a'] + target,
            env=GREP_ASCII,
        ),
        Command('pt', ['pt'] + target),
        Command('pt (no line numbers)', ['pt', '-N'] + target),
        Command('sift', ['sift', '-n'] + target),
        Command('sift (no line numbers)', ['sift'] + target),
    ]
    return Benchmark(pattern=needle, commands=commands)


def bench_subtitles_ru_literal(suite_dir):
    '''
    Benchmark a non-ASCII string literal against the Russian corpus.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    needle = 'Шерлок Холмс'  # Sherlock Holmes
    # Every tool takes the pattern followed by the file to search.
    target = [needle, corpus]

    commands = [
        Command('rg', ['rg', '-n'] + target),
        Command('rg (no line numbers)', ['rg'] + target),
        Command('ag', ['ag', '-s'] + target),
        Command('ucg', ['ucg', '--nosmart-case'] + target),
        Command('grep', ['grep', '-an'] + target, env=GREP_ASCII),
        Command(
            'grep (no line numbers)', ['grep', '-a'] + target,
            env=GREP_ASCII,
        ),
        Command('pt', ['pt'] + target),
        Command('pt (no line numbers)', ['pt', '-N'] + target),
        Command('sift', ['sift', '-n'] + target),
        Command('sift (no line numbers)', ['sift'] + target),
    ]
    return Benchmark(pattern=needle, commands=commands)


def bench_subtitles_ru_literal_casei(suite_dir):
    '''
    Benchmark a non-ASCII string literal, case insensitively.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    needle = 'Шерлок Холмс'  # Sherlock Holmes
    # Every tool takes the pattern followed by the file to search.
    target = [needle, corpus]

    commands = [
        Command('rg', ['rg', '-n', '-i'] + target),
        Command('ag (not Unicode)', ['ag', '-i'] + target),
        Command('ucg (not Unicode)', ['ucg', '-i'] + target),
        Command('grep', ['grep', '-ani'] + target, env=GREP_UNICODE),
        Command(
            'grep (not Unicode)', ['grep', '-E', '-ani'] + target,
            env=GREP_ASCII,
        ),
    ]
    return Benchmark(pattern=needle, commands=commands)


def bench_subtitles_ru_literal_word(suite_dir):
    '''
    Benchmark the speed of finding a literal inside word boundaries.

    :param str suite_dir:
        The directory containing corpora.
    :rtype: Benchmark
    :raises MissingDependencies:
        Propagated from ``require`` when the subtitles-ru corpus is
        missing.
    '''
    require(suite_dir, 'subtitles-ru')
    ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    pat = 'Шерлок Холмс'  # Sherlock Holmes

    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', '-nw', pat, ru]),
        # Wrapping the pattern in (?-u:\b) asks ripgrep for non-Unicode word
        # boundaries.
        Command('rg (not Unicode)', [
            'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru,
        ]),
        Command('ag (not Unicode)', ['ag', '-sw', pat, ru]),
        # -w was missing here, so ucg was running a plain literal search
        # rather than the word search every other tool performs.
        Command('ucg (not Unicode)', [
            'ucg', '--nosmart-case', '-w', pat, ru,
        ]),
        Command('grep (not Unicode)', [
            'grep', '-anw', pat, ru,
        ], env=GREP_ASCII),
        Command('grep', ['grep', '-anw', pat, ru], env=GREP_UNICODE),
    ])


def bench_subtitles_ru_alternate(suite_dir):
    '''
    Benchmark an alternation of several non-ASCII literals.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    names = (
        'Шерлок Холмс',  # Sherlock Holmes
        'Джон Уотсон',  # John Watson
        'Ирен Адлер',  # Irene Adler
        'инспектор Лестрейд',  # Inspector Lestrade
        'профессор Мориарти',  # Professor Moriarty
    )
    regex = '|'.join(names)
    # Every tool takes the pattern followed by the file to search.
    target = [regex, corpus]

    commands = [
        Command('rg', ['rg', '-n'] + target),
        Command('rg (no line numbers)', ['rg'] + target),
        Command('ucg', ['ucg', '--nosmart-case'] + target),
        Command('grep', ['grep', '-E', '-an'] + target, env=GREP_ASCII),
        Command(
            'grep (no line numbers)', ['grep', '-E', '-a'] + target,
            env=GREP_ASCII,
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)


def bench_subtitles_ru_alternate_casei(suite_dir):
    '''
    Benchmark an alternation of non-ASCII literals, case insensitively.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    names = (
        'Шерлок Холмс',  # Sherlock Holmes
        'Джон Уотсон',  # John Watson
        'Ирен Адлер',  # Irene Adler
        'инспектор Лестрейд',  # Inspector Lestrade
        'профессор Мориарти',  # Professor Moriarty
    )
    regex = '|'.join(names)
    # Every tool takes the pattern followed by the file to search.
    target = [regex, corpus]

    commands = [
        Command('rg', ['rg', '-n', '-i'] + target),
        Command('ucg (not Unicode)', ['ucg', '-i'] + target),
        Command('grep', ['grep', '-E', '-ani'] + target, env=GREP_UNICODE),
        Command(
            'grep (not Unicode)', ['grep', '-E', '-ani'] + target,
            env=GREP_ASCII,
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)


def bench_subtitles_ru_no_literal(suite_dir):
    '''
    Benchmark a regex that contains no literals at all.

    grep with Unicode support is deliberately left out of this one.
    While it should eventually get the right answer, it was killed
    after running for two minutes with no sign of finishing soon.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    regex = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
    # Every tool takes the pattern followed by the file to search.
    target = [regex, corpus]

    commands = [
        Command('rg', ['rg', '-n'] + target),
        Command('rg (no line numbers)', ['rg'] + target),
        Command('ucg (no Unicode)', ['ucg', '--nosmart-case'] + target),
        Command(
            'grep (no Unicode)', ['grep', '-E', '-an'] + target,
            env=GREP_ASCII,
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)


class MissingDependencies(Exception):
    '''
    Raised when a benchmark needs a corpus that isn't available.

    :ivar list(str) missing_names:
        The names of the dependencies that are missing. These names
        correspond to names that can be used with the --download flag.
    '''
    def __init__(self, missing_names):
        # Kept as given so callers can report exactly what is missing.
        self.missing_names = missing_names

    def __str__(self):
        rendered = repr(self.missing_names)
        return 'MissingDependency(%s)' % rendered


class Benchmark(object):
    '''
    A single benchmark corresponding to a grouping of commands.

    The main purpose of a benchmark is to compare the performance
    characteristics of a group of commands.
    '''

    def __init__(self, name=None, pattern=None, commands=None,
                 warmup_count=1, count=3, line_count=True):
        '''
        Create a single benchmark.

        A single benchmark is composed of a set of commands that are
        benchmarked and compared against one another. A benchmark may
        have multiple commands that use the same search tool (but
        probably should have something differentiating them).

        The grouping of commands is a purely human driven process.

        The stderr of every command is discarded. The stdout of a
        command is captured when line counting is enabled and discarded
        otherwise.

        :param str name:
            A human readable string denoting the name of this
            benchmark.
        :param str pattern:
            The pattern that is used in search.
        :param list(Command) commands:
            A list of commands to initialize this benchmark with. More
            commands may be added before running the benchmark.
        :param int warmup_count:
            The number of times to run each command before recording
            samples.
        :param int count:
            The number of samples to collect from each command.
        :param bool line_count:
            When set, the lines of each search are counted and included
            in the samples produced.
        '''
        self.name = name
        self.pattern = pattern
        self.commands = commands or []
        self.warmup_count = warmup_count
        self.count = count
        self.line_count = line_count

    def run(self):
        '''
        Runs this benchmark and returns the results.

        Each command is first run ``warmup_count`` times with the
        outcome thrown away, and then ``count`` more times with every
        run recorded as a sample.

        :rtype: Result
        '''
        result = Result(self)
        for cmd in self.commands:
            # Do a warmup first.
            for _ in range(self.warmup_count):
                self.run_one(cmd)
            for _ in range(self.count):
                result.add(cmd, **self.run_one(cmd))
        return result

    def run_one(self, cmd):
        '''
        Runs the given command exactly once.

        Returns a dict that includes the time taken by the command.
        If this benchmark was configured to count the number of lines
        returned, then the line count is also included.

        Note that the output redirection settings are written into
        ``cmd.kwargs``, so the command object is modified.

        :param Command cmd: The command to run.
        :returns:
            A dict with two fields, duration and line_count.
            The duration is in (fractional) seconds and is always
            available. The line_count is set to None unless line
            counting is enabled, in which case it is the number of
            newline bytes in the search output.
        :rtype: dict
        '''
        cmd.kwargs['stderr'] = subprocess.DEVNULL
        if self.line_count:
            cmd.kwargs['stdout'] = subprocess.PIPE
        else:
            cmd.kwargs['stdout'] = subprocess.DEVNULL

        # perf_counter is monotonic and high resolution. time.time() follows
        # the wall clock, which may be adjusted while a command is running.
        start = time.perf_counter()
        completed = cmd.run()
        end = time.perf_counter()

        line_count = None
        if self.line_count:
            line_count = completed.stdout.count(b'\n')
        return {
            'duration': end - start,
            'line_count': line_count,
        }


class Result(object):
    '''
    The result of running a benchmark.

    Benchmark results consist of a set of samples, where each sample
    corresponds to a single run of a single command in the benchmark.
    Various statistics can be computed from these samples such as mean
    and standard deviation.
    '''
    def __init__(self, benchmark):
        '''
        Create a new set of results, initially empty.

        :param Benchmark benchmark:
            The benchmark that produced these results.
        '''
        self.benchmark = benchmark
        self.samples = []

    def add(self, cmd, duration, line_count=None):
        '''
        Add a new sample to this result set.

        :param Command cmd:
            The command that produced this sample.
        :param float duration:
            How long the command took to run. ``Benchmark.run_one``
            reports this in seconds.
        :param int line_count:
            The number of lines in the search output. This is optional.
        '''
        self.samples.append({
            'cmd': cmd,
            'duration': duration,
            'line_count': line_count,
        })

    def fastest_sample(self):
        '''
        Returns the fastest recorded sample.

        :raises ValueError: If no samples have been recorded.
        '''
        return min(self.samples, key=lambda s: s['duration'])

    def fastest_cmd(self):
        '''
        Returns the command with the smallest mean duration.

        :raises ValueError: If the benchmark has no commands.
        '''
        means = []
        for cmd in self.benchmark.commands:
            mean, _ = self.distribution_for(cmd)
            means.append((cmd, mean))
        return min(means, key=lambda tup: tup[1])[0]

    def samples_for(self, cmd):
        'Returns an iterable of samples for cmd'
        # Samples are matched by command *name*, not by object identity.
        yield from (s for s in self.samples if s['cmd'].name == cmd.name)

    def line_counts_for(self, cmd):
        '''
        Returns the distinct line counts recorded for the given command.

        :returns:
            A set of the line counts recorded for cmd. Samples without
            a line count are ignored.
        :rtype: set(int)
        '''
        return {s['line_count'] for s in self.samples_for(cmd)
                                if s['line_count'] is not None}

    def distribution_for(self, cmd):
        '''
        Returns the distribution (mean +/- std) of the given command.

        :rtype: (float, float)
        :returns:
            A tuple containing the mean and standard deviation, in that
            order. With a single sample the standard deviation is
            reported as 0.0.
        :raises statistics.StatisticsError:
            If there are no samples for cmd.
        '''
        durations = [s['duration'] for s in self.samples_for(cmd)]
        mean = statistics.mean(durations)
        # The sample standard deviation is undefined for fewer than two data
        # points (statistics.stdev raises), which would otherwise break any
        # benchmark configured with count=1.
        stdev = statistics.stdev(durations) if len(durations) > 1 else 0.0
        return mean, stdev


class Command(object):
    def __init__(self, name, cmd, *args, **kwargs):
        '''
        Create a new command that is run as part of a benchmark.

        Any extra positional and keyword arguments are stored and
        later handed to ``subprocess.run`` unchanged. The exception is
        stdin/stdout/stderr: redirection is owned by the benchmark
        harness, so passing any of them here trips an assert.

        :param str name:
            The human readable name of this command. This matters
            most when one search tool appears several times in the
            same benchmark with different arguments.
        :param list(str) cmd:
            The argument list to execute, starting with the program
            name itself.
        '''
        for stream in ('stdin', 'stdout', 'stderr'):
            assert stream not in kwargs
        self.name, self.cmd = name, cmd
        self.args, self.kwargs = args, kwargs

    def run(self):
        '''
        Execute this command via ``subprocess.run``.

        :rtype: subprocess.CompletedProcess
        '''
        completed = subprocess.run(self.cmd, *self.args, **self.kwargs)
        return completed


def eprint(*args, **kwargs):
    'Print to stderr; any file argument given by the caller is replaced.'
    print(*args, **dict(kwargs, file=sys.stderr))


def run_cmd(cmd, *args, **kwargs):
    '''
    Echo the command to stderr, then execute it.

    The command is always run with ``check=True``, so a non-zero exit
    status surfaces as an exception from ``subprocess.run``.
    '''
    banner = '# %s' % ' '.join(cmd)
    eprint(banner)
    # Any check value supplied by the caller is replaced.
    return subprocess.run(cmd, *args, **dict(kwargs, check=True))


def require(suite_dir, *names):
    '''
    Declare that a benchmark depends on the given corpus names.

    Each name is checked by calling the module-level ``has_<name>``
    function (with dashes turned into underscores). If any of them
    report false, MissingDependencies is raised with all of the names
    that failed.
    '''
    missing = [
        name for name in names
        if not globals()['has_%s' % name.replace('-', '_')](suite_dir)
    ]
    if missing:
        raise MissingDependencies(missing)


def download_linux(suite_dir):
    'Clone the Linux kernel and build it, skipping steps already done.'
    linux_dir = path.join(suite_dir, LINUX_DIR)
    built_marker = path.join(linux_dir, 'vmlinux')

    if not path.isdir(linux_dir):
        # Cloning from a fork means we always get the same corpus *and* can
        # still do a shallow clone, which is far cheaper than a full one.
        run_cmd(['git', 'clone', '--depth', '1', LINUX_CLONE, linux_dir])

    if path.exists(built_marker):
        return
    # The kernel is built because building it leaves a lot of junk in the
    # repository that a search tool probably shouldn't touch.
    eprint('# Building Linux kernel...')
    run_cmd(['make', 'defconfig'], cwd=linux_dir)
    run_cmd(['make', '-j', str(cpu_count())], cwd=linux_dir)


def has_linux(suite_dir):
    'Report whether a vmlinux file exists in the Linux checkout.'
    built_marker = path.join(suite_dir, LINUX_DIR, 'vmlinux')
    return path.exists(built_marker)


def download_subtitles_en(suite_dir):
    '''
    Download and decompress English subtitles, then cut a sample.

    Steps whose output already exists are skipped: the download when
    either the compressed or decompressed corpus is present, the
    decompression when the decompressed corpus is present, and the
    sampling when the sample file is present.

    :param str suite_dir:
        The directory in which to download corpora. It may be relative
        to the current working directory.
    '''
    subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
    en_path_gz = path.join(subtitle_dir, SUBTITLES_EN_NAME_GZ)
    en_path = path.join(subtitle_dir, SUBTITLES_EN_NAME)
    en_path_sample = path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE)

    os.makedirs(subtitle_dir, exist_ok=True)
    if not path.exists(en_path):
        if not path.exists(en_path_gz):
            # curl -O saves into the working directory of the child.
            run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir)
        # The child runs inside subtitle_dir, so the file must be named
        # relative to it. A suite-relative path would be resolved against
        # the wrong directory whenever suite_dir itself is relative.
        run_cmd(['gunzip', SUBTITLES_EN_NAME_GZ], cwd=subtitle_dir)
    if not path.exists(en_path_sample):
        # Get a sample roughly the same size as the Russian corpus so that
        # benchmarks finish in a reasonable time.
        #
        # en_path_sample already includes subtitle_dir, so it is opened
        # directly; joining it onto subtitle_dir again doubles the prefix
        # for a relative suite_dir.
        with open(en_path_sample, 'wb') as f:
            run_cmd(
                ['head', '-n', '32722372', SUBTITLES_EN_NAME],
                cwd=subtitle_dir, stdout=f)


def has_subtitles_en(suite_dir):
    'Report whether the English subtitle sample file exists.'
    sample = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
    return path.exists(sample)


def download_subtitles_ru(suite_dir):
    '''
    Download and decompress Russian subtitles.

    Nothing is done when the decompressed corpus already exists, and
    the download is skipped when the compressed file is present.

    :param str suite_dir:
        The directory in which to download corpora. It may be relative
        to the current working directory.
    '''
    subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
    ru_path_gz = path.join(subtitle_dir, SUBTITLES_RU_NAME_GZ)
    ru_path = path.join(subtitle_dir, SUBTITLES_RU_NAME)

    os.makedirs(subtitle_dir, exist_ok=True)
    if not path.exists(ru_path):
        if not path.exists(ru_path_gz):
            # curl -O saves into the working directory of the child.
            run_cmd(['curl', '-LO', SUBTITLES_RU_URL], cwd=subtitle_dir)
        # The child runs inside subtitle_dir, so the file must be named
        # relative to it. A suite-relative path would be resolved against
        # the wrong directory whenever suite_dir itself is relative.
        run_cmd(['gunzip', SUBTITLES_RU_NAME_GZ], cwd=subtitle_dir)


def has_subtitles_ru(suite_dir):
    'Report whether the decompressed Russian subtitle corpus exists.'
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    return path.exists(corpus)


def download(suite_dir, choices):
    '''
    Fetch each requested corpus into suite_dir.

    Choices are handled in the order given. An unknown choice prints an
    error and exits with status 1; corpora named before it have already
    been fetched by then.

    :param str suite_dir:
        The directory in which to download corpora.
    :param list(str) choices:
        A list of corpora to download. Available choices are:
        all, linux, subtitles-en, subtitles-ru.
    '''
    known = ('all', 'linux', 'subtitles-en', 'subtitles-ru')
    for corpus in choices:
        if corpus not in known:
            eprint('Unrecognized download choice: %s' % corpus)
            sys.exit(1)
        # 'all' expands to every corpus, fetched in this fixed order.
        everything = corpus == 'all'
        if everything or corpus == 'linux':
            download_linux(suite_dir)
        if everything or corpus == 'subtitles-en':
            download_subtitles_en(suite_dir)
        if everything or corpus == 'subtitles-ru':
            download_subtitles_ru(suite_dir)


def collect_benchmarks(suite_dir, filter_pat=None):
    '''
    Lazily produce every benchmark that can be run.

    Benchmarks are the module-level names beginning with ``bench_``,
    visited in sorted order. Each one is called with suite_dir and the
    object it returns is given a ``name`` attribute (the function name
    without the ``bench_`` prefix) before being yielded.

    :param str suite_dir:
        The directory containing corpora.
    :param str filter_pat:
        A single regular expression that is searched for in each
        benchmark name. When not specified, no benchmark is filtered out.
    :returns:
        An iterable over all runnable benchmarks. If constructing a
        benchmark raises MissingDependencies, a message is written to
        stderr and that benchmark is not yielded.
    '''
    prefix = 'bench_'
    module_scope = globals()
    for fun_name in sorted(module_scope):
        if not fun_name.startswith(prefix):
            continue
        short_name = fun_name[len(prefix):]
        if filter_pat is not None:
            if re.search(filter_pat, short_name) is None:
                continue
        try:
            bench = module_scope[fun_name](suite_dir)
        except MissingDependencies as err:
            # Tell the user which --download flags would provide the
            # missing corpora.
            missing = ', '.join(err.missing_names)
            hint = ' '.join(
                ['--download %s' % n for n in err.missing_names])
            eprint(
                'missing: %s, skipping benchmark %s (try running with: %s)'
                % (missing, short_name, hint))
            continue
        bench.name = short_name
        yield bench


def _parse_args():
    'Build the command line parser and return the parsed arguments.'
    # The text must be passed as description=; the first positional
    # parameter of ArgumentParser is prog, which would put this sentence
    # into the usage line in place of the program name.
    p = argparse.ArgumentParser(
        description='Command line search tool benchmark suite.')
    p.add_argument(
        '--dir', metavar='PATH', default=os.getcwd(),
        help='The directory in which to download data and perform searches.')
    p.add_argument(
        '--download', metavar='CORPUS', action='append',
        choices=['all', 'linux', 'subtitles-en', 'subtitles-ru'],
        help='Download and prepare corpus data, then exit without running '
             'any benchmarks. Note that this command is intended to be '
             'idempotent. WARNING: This downloads over a gigabyte of data, '
             'and also includes building the Linux kernel. If "all" is used '
             'then the total uncompressed size is around 13 GB.')
    p.add_argument(
        '-f', '--force', action='store_true',
        help='Overwrite existing files if there is a conflict.')
    p.add_argument(
        '--list', action='store_true',
        help='List available benchmarks by name.')
    p.add_argument(
        '--raw', metavar='PATH',
        help='Dump raw data (all samples collected) in CSV format to the '
             'file path provided.')
    p.add_argument(
        'bench', metavar='PAT', nargs='?',
        help='A regex pattern that will only run benchmarks that match.')
    return p.parse_args()


def _print_result(b, result, leading_blank):
    'Print the summary table for one benchmark result to stdout.'
    fastest_cmd = result.fastest_cmd()
    fastest_sample = result.fastest_sample()
    max_name_len = max(len(cmd.name) for cmd in b.commands)

    if leading_blank:
        print()
    header = '%s (pattern: %s)' % (b.name, b.pattern)
    print('%s\n%s' % (header, '-' * len(header)))
    fmt = '{name:{pad}} {mean:0.3f} +/- {stdev:0.3f}{lines}{fast_cmd}'
    for cmd in b.commands:
        name = cmd.name
        mean, stdev = result.distribution_for(cmd)
        line_counts = result.line_counts_for(cmd)
        show_fast_cmd, show_line_counts = '', ''
        # A trailing '*' marks the command result.fastest_cmd() picked;
        # a '*' on the name marks the one owning result.fastest_sample().
        if fastest_cmd.name == cmd.name:
            show_fast_cmd = '*'
        if fastest_sample['cmd'].name == cmd.name:
            name += '*'
        if len(line_counts) > 0:
            counts = map(str, line_counts)
            show_line_counts = ' (lines: %s)' % ', '.join(counts)
        print(fmt.format(
            name=name, pad=max_name_len + 2, fast_cmd=show_fast_cmd,
            mean=mean, stdev=stdev, lines=show_line_counts))


def _write_raw_samples(raw_csv_wtr, b, result):
    'Write one CSV row per collected sample of a benchmark result.'
    for sample in result.samples:
        cmd = sample['cmd']
        env = ' '.join(['%s=%s' % (k, v)
                        for k, v in cmd.kwargs.get('env', {}).items()])
        raw_csv_wtr.writerow({
            'benchmark': b.name,
            'warmup_iter': b.warmup_count,
            'iter': b.count,
            'name': cmd.name,
            'command': ' '.join(cmd.cmd),
            'duration': sample['duration'],
            # NOTE(review): a line count of 0 is written as an empty
            # field here, same as a missing count -- confirm intended.
            'lines': sample['line_count'] or '',
            'env': env,
        })


def main():
    '''
    Entry point: parse arguments, then download, list or run benchmarks.

    With --download, the requested corpora are fetched and the process
    exits with status 0. With --list, the names of the benchmarks yielded
    by collect_benchmarks are printed and the process exits with status 0.
    Otherwise each collected benchmark is run and summarized on stdout,
    and every sample is also written as CSV to the --raw path if given.

    Exits with status 1 if the --raw file already exists and --force was
    not given.
    '''
    args = _parse_args()

    if args.download is not None and len(args.download) > 0:
        download(args.dir, args.download)
        sys.exit(0)

    if args.list:
        # Only print names; previously this flag was accepted but never
        # read, so the benchmarks were run instead of listed.
        for b in collect_benchmarks(args.dir, filter_pat=args.bench):
            print(b.name)
        sys.exit(0)

    if not path.isdir(args.dir):
        os.makedirs(args.dir)
    if args.raw is not None and path.exists(args.raw) and not args.force:
        eprint('File %s already exists (delete it or use --force)' % args.raw)
        sys.exit(1)

    raw_handle, raw_csv_wtr = None, None
    try:
        if args.raw is not None:
            fields = [
                'benchmark', 'warmup_iter', 'iter',
                'name', 'command', 'duration', 'lines', 'env',
            ]
            # The csv module asks for files opened with newline='' so
            # that it alone controls the line endings it writes.
            raw_handle = open(args.raw, 'w', newline='')
            raw_csv_wtr = csv.DictWriter(raw_handle, fields)
            raw_csv_wtr.writeheader()

        benchmarks = collect_benchmarks(args.dir, filter_pat=args.bench)
        for i, b in enumerate(benchmarks):
            result = b.run()
            _print_result(b, result, leading_blank=i > 0)
            sys.stdout.flush()

            if raw_csv_wtr is not None:
                _write_raw_samples(raw_csv_wtr, b, result)
                # Flush after each benchmark so that samples already
                # collected are on disk if a later benchmark fails.
                raw_handle.flush()
    finally:
        if raw_handle is not None:
            raw_handle.close()


# Run the benchmark suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()