Fixing, polishing and adding benchmarks.

2025-08-04 21:52:54 +02:00 · 2016-09-16 21:02:46 -04:00
parent 65fec147d6
commit 5a0c873f61
1 changed files with 285 additions and 168 deletions
--- a/benchsuite/benchsuite
+++ b/benchsuite/benchsuite
@ -39,13 +39,23 @@ LINUX_CLONE = 'git://github.com/BurntSushi/linux'
 GREP_ASCII = {'LC_ALL': 'C'}
 GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'}

+# Sift tries really hard to search everything by default. In our code search
+# benchmarks, we don't want that, so those benchmarks build their sift
+# commands from this shared prefix and append their own flags and pattern.
+SIFT = [
+    'sift',
+    '--binary-skip',
+    # Presumably a glob for dot-prefixed (hidden) files -- TODO confirm.
+    '--exclude-files', '.*',
+    # sift searched PDF files and cluttered the results even with
+    # --binary-skip, so they are excluded explicitly.
+    '--exclude-files', '*.pdf',
+]
+

 def bench_linux_literal_default(suite_dir):
    '''
    Benchmark the speed of a literal using *default* settings.

    This is a purposefully unfair benchmark for use in performance
-    analysis, but it is pedagogically useful.
+    analysis, but it is pedagogically useful to demonstrate how
+    default behaviors differ.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
@ -55,8 +65,6 @@ def bench_linux_literal_default(suite_dir):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

-    # N.B. This is a purposefully unfair benchmark for illustrative purposes
-    # of how the default modes for each search tool differ.
    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', pat]),
        mkcmd('ag', ['ag', pat]),
@ -66,10 +74,10 @@ def bench_linux_literal_default(suite_dir):
        mkcmd('ucg', ['ucg', pat]),
        # I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the
        # default, but I'd guess it to be on most desktop systems.
-        mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}),
        mkcmd('pt', ['pt', pat]),
        # sift reports an extra line here for a binary file matched.
        mkcmd('sift', ['sift', pat]),
+        mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}),
    ])


@ -78,8 +86,9 @@ def bench_linux_literal(suite_dir):
    Benchmark the speed of a literal, attempting to be fair.

    This tries to use the minimum set of options available in all tools
-    to test how fast they are. For example, it makes sure there is no
-    case insensitive matching and that line numbers are computed.
+    to test how fast they are. For example, it makes sure there is
+    no case insensitive matching and that line numbers are computed
+    (because some tools don't permit disabling line numbers).
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
@ -90,18 +99,16 @@ def bench_linux_literal(suite_dir):
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg', ['rg', '-n', pat]),
-        mkcmd('rg (mmap)', ['rg', '-n', '--mmap', pat]),
-        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
-        mkcmd('ag (mmap)', ['ag', '-s', pat]),
-        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
-        mkcmd('git grep', [
+        mkcmd('rg (ignore)', ['rg', '-n', pat]),
+        mkcmd('rg (ignore) (mmap)', ['rg', '-n', '--mmap', pat]),
+        mkcmd('ag (ignore) (mmap)', ['ag', '-s', pat]),
+        mkcmd('pt (ignore)', ['pt', pat]),
+        mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
+        mkcmd('git grep (ignore)', [
            'git', 'grep', '-I', '-n', pat,
        ], env={'LC_ALL': 'C'}),
-        mkcmd('pt', ['pt', pat]),
-        mkcmd('sift', [
-            'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
-        ]),
+        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
+        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
    ])


@ -121,26 +128,21 @@ def bench_linux_literal_casei(suite_dir):
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg', ['rg', '-n', '-i', pat]),
-        mkcmd('rg (mmap)', ['rg', '-n', '-i', pat]),
-        mkcmd('rg (whitelist)', [
-            'rg', '-n', '-i', '--no-ignore', '-tall', pat,
-        ]),
-        mkcmd('ag (mmap)', ['ag', '-i', pat]),
-        mkcmd('ucg', ['ucg', '-i', pat]),
+        mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
+        mkcmd('rg (ignore) (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
+        mkcmd('ag (ignore) (mmap)', ['ag', '-i', pat]),
+        mkcmd('sift (ignore)', SIFT + ['-n', '-i', '--git', pat]),
        # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
        # since that is certainly what ripgrep is doing, but this is for an
        # ASCII literal, so we should give `git grep` all the opportunity to
        # do its best.
-        mkcmd('git grep', [
+        mkcmd('git grep (ignore)', [
            'git', 'grep', '-I', '-n', '-i', pat,
        ], env={'LC_ALL': 'C'}),
-        # sift yields more matches than it should here. Specifically, it gets
-        # matches in Module.symvers and System.map in the repo root. Both of
-        # those files show up in the repo root's .gitignore file.
-        mkcmd('sift', [
-            'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-i', pat,
+        mkcmd('rg (whitelist)', [
+            'rg', '-n', '-i', '--no-ignore', '-tall', pat,
        ]),
+        mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
    ])


@ -160,20 +162,16 @@ def bench_linux_re_literal_suffix(suite_dir):
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg', ['rg', '-n', pat]),
-        mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
-        mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]),
-        mkcmd('ag', ['ag', '-s', pat]),
-        mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
-        mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
+        mkcmd('rg (ignore)', ['rg', '-n', pat]),
+        mkcmd('ag (ignore)', ['ag', '-s', pat]),
+        mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
        mkcmd(
-            'git grep',
+            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
-        mkcmd('sift', [
-            'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
-        ]),
+        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
+        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
    ])


@ -193,22 +191,18 @@ def bench_linux_word(suite_dir):
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg', ['rg', '-n', '-w', pat]),
-        mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-w', pat]),
-        mkcmd('rg-novcs-mmap', [
-            'rg', '--mmap', '--no-ignore', '-n', '-w', pat,
-        ]),
-        mkcmd('ag', ['ag', '-s', '-w', pat]),
-        mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', '-w', pat]),
-        mkcmd('ucg', ['ucg', '--nosmart-case', '-w', pat]),
+        mkcmd('rg (ignore)', ['rg', '-n', '-w', pat]),
+        mkcmd('ag (ignore)', ['ag', '-s', '-w', pat]),
+        mkcmd('sift (ignore)', SIFT + ['-n', '-w', '--git', pat]),
        mkcmd(
-            'git grep',
+            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', '-w', pat],
            env={'LC_ALL': 'C'},
        ),
-        mkcmd('sift', [
-            'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-w', pat,
+        mkcmd('rg (whitelist)', [
+            'rg', '-n', '-w', '--no-ignore', '-tall', pat,
        ]),
+        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', '-w', pat]),
    ])


@ -216,7 +210,8 @@ def bench_linux_unicode_greek(suite_dir):
    '''
    Benchmark matching of a Unicode category.

-    Only three tools (ripgrep, sift and pt) support this.
+    Only three tools (ripgrep, sift and pt) support this. We omit
+    pt because it is too slow.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
@ -228,15 +223,7 @@ def bench_linux_unicode_greek(suite_dir):

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', pat]),
-        # sift tries to search a bunch of PDF files and clutters up the
-        # results, even though --binary-skip is provided. They are excluded
-        # here explicitly, but don't have a measurable impact on performance.
-        mkcmd('sift', [
-            'sift', '-n', '--binary-skip',
-            '--exclude-files', '.*',
-            '--exclude-files', '*.pdf',
-            pat,
-        ]),
+        mkcmd('sift', SIFT + ['-n', '--git', pat]),
    ])


@ -256,15 +243,7 @@ def bench_linux_unicode_greek_casei(suite_dir):

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', '-i', pat]),
-        # sift tries to search a bunch of PDF files and clutters up the
-        # results, even though --binary-skip is provided. They are excluded
-        # here explicitly, but don't have a measurable impact on performance.
-        mkcmd('sift', [
-            'sift', '-n', '--binary-skip',
-            '--exclude-files', '.*',
-            '--exclude-files', '*.pdf',
-            pat,
-        ]),
+        mkcmd('sift', SIFT + ['-n', '-i', '--git', pat]),
    ])


@ -285,30 +264,25 @@ def bench_linux_unicode_word(suite_dir):
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg', ['rg', '-n', pat]),
-        mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]),
-        mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
-        mkcmd('rg-novcs-mmap', [
-            'rg', '--mmap', '--no-ignore', '-n', pat,
-        ]),
-        mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
-        mkcmd('ag-novcs (no Unicode)', [
-            'ag', '--skip-vcs-ignores', '-s', pat,
-        ]),
-        mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
+        mkcmd('rg (ignore)', ['rg', '-n', pat]),
+        mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
+        mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
+        mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', pat]),
        mkcmd(
-            'git grep',
+            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'en_US.UTF-8'},
        ),
        mkcmd(
-            'git grep (no Unicode)',
+            'git grep (ignore) (ASCII)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
-        mkcmd('sift (no Unicode)', [
-            'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
+        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
+        mkcmd('rg (whitelist) (ASCII)', [
+            'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
        ]),
+        mkcmd('ucg (ASCII)', ['ucg', '--nosmart-case', pat]),
    ])


@ -330,30 +304,25 @@ def bench_linux_no_literal(suite_dir):
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg', ['rg', '-n', pat]),
-        mkcmd('rg-whitelist', ['rg', '-tall', '--no-ignore', '-n', pat]),
-        mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]),
-        mkcmd('rg-whitelist (no Unicode)', [
-            'rg', '-tall', '--no-ignore', '-n', '(?-u)' + pat,
-        ]),
-        mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
-        mkcmd('ag-novcs (no Unicode)', [
-            'ag', '--skip-vcs-ignores', '-s', pat,
-        ]),
-        mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
+        mkcmd('rg (ignore)', ['rg', '-n', pat]),
+        mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
+        mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
+        mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', pat]),
        mkcmd(
-            'git grep',
+            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'en_US.UTF-8'},
        ),
        mkcmd(
-            'git grep (no Unicode)',
+            'git grep (ignore) (ASCII)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
-        mkcmd('sift (no Unicode)', [
-            'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
+        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
+        mkcmd('rg (whitelist) (ASCII)', [
+            'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
        ]),
+        mkcmd('ucg (whitelist) (ASCII)', ['ucg', '--nosmart-case', pat]),
    ])


@ -375,21 +344,15 @@ def bench_linux_alternates(suite_dir):
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg', ['rg', '-n', pat]),
-        mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
-        mkcmd('rg-novcs-mmap', [
-            'rg', '--mmap', '--no-ignore', '-n', pat,
-        ]),
-        mkcmd('ag', ['ag', '-s', pat]),
-        mkcmd('ag-novcs', [
-            'ag', '--skip-vcs-ignores', '-s', pat,
-        ]),
-        mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
+        mkcmd('rg (ignore)', ['rg', '-n', pat]),
+        mkcmd('ag (ignore)', ['ag', '-s', pat]),
        mkcmd(
-            'git grep',
+            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
+        mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', pat]),
+        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
    ])


@ -404,21 +367,15 @@ def bench_linux_alternates_casei(suite_dir):
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg', ['rg', '-n', '-i', pat]),
-        mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]),
-        mkcmd('rg-novcs-mmap', [
-            'rg', '--mmap', '--no-ignore', '-n', '-i', pat,
-        ]),
-        mkcmd('ag', ['ag', '-i', pat]),
-        mkcmd('ag-novcs', [
-            'ag', '--skip-vcs-ignores', '-i', pat,
-        ]),
-        mkcmd('ucg', ['ucg', '-i', pat]),
+        mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
+        mkcmd('ag (ignore)', ['ag', '-i', pat]),
        mkcmd(
-            'git grep',
+            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', '-i', pat],
            env={'LC_ALL': 'C'},
        ),
+        mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', '-i', pat]),
+        mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
    ])


@ -427,22 +384,159 @@ def bench_subtitles_en_literal(suite_dir):
    Benchmark the speed of an ASCII string literal.
    '''
    require(suite_dir, 'subtitles-en')
-    ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
+    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
    pat = 'Sherlock Holmes'

    return Benchmark(pattern=pat, commands=[
-        Command('rg', ['rg', '-n', pat, ru]),
-        Command('rg (no line numbers)', ['rg', pat, ru]),
-        Command('ag', ['ag', '-s', pat, ru]),
-        Command('ucg', ['ucg', '--nosmart-case', pat, ru]),
-        Command('grep', ['grep', '-an', pat, ru], env=GREP_ASCII),
-        Command('grep (no line numbers)', [
-            'grep', '-a', pat, ru,
+        Command('rg', ['rg', pat, en]),
+        Command('pt', ['pt', '-N', pat, en]),
+        Command('sift', ['sift', pat, en]),
+        Command('grep', ['grep', '-a', pat, en], env=GREP_ASCII),
+        Command('rg (lines)', ['rg', '-n', pat, en]),
+        Command('ag (lines)', ['ag', '-s', pat, en]),
+        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
+        Command('pt (lines)', ['pt', pat, en]),
+        Command('sift (lines)', ['sift', '-n', pat, en]),
+        Command('grep (lines)', ['grep', '-an', pat, en], env=GREP_ASCII),
+    ])
+
+
+def bench_subtitles_en_literal_casei(suite_dir):
+    '''
+    Benchmark the speed of an ASCII string literal case insensitively.
+
+    grep is run under both the Unicode and the ASCII locale settings.
+    '''
+    require(suite_dir, 'subtitles-en')
+    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
+    pat = 'Sherlock Holmes'
+
+    return Benchmark(pattern=pat, commands=[
+        Command('rg', ['rg', '-i', pat, en]),
+        Command('grep', ['grep', '-ai', pat, en], env=GREP_UNICODE),
+        Command('grep (ASCII)', [
+            'grep', '-E', '-ai', pat, en,
+        ], env=GREP_ASCII),
+        Command('rg (lines)', ['rg', '-n', '-i', pat, en]),
+        Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]),
+        Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, en]),
+    ])
+
+
+def bench_subtitles_en_literal_word(suite_dir):
+    '''
+    Benchmark the speed of finding a literal inside word boundaries.
+
+    Every tool is asked for whole-word matches: rg either via -w or by
+    wrapping the pattern in ASCII-only word boundary assertions, and
+    every other tool via its -w flag.
+    '''
+    require(suite_dir, 'subtitles-en')
+    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
+    pat = 'Sherlock Holmes'
+
+    return Benchmark(pattern=pat, commands=[
+        Command('rg (ASCII)', [
+            'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en,
+        ]),
+        Command('ag (ASCII)', ['ag', '-sw', pat, en]),
+        # -w keeps ucg on the same whole-word task as the other tools.
+        Command('ucg (ASCII)', ['ucg', '--nosmart-case', '-w', pat, en]),
+        Command('grep (ASCII)', [
+            'grep', '-anw', pat, en,
+        ], env=GREP_ASCII),
+        Command('rg', ['rg', '-nw', pat, en]),
+        Command('grep', ['grep', '-anw', pat, en], env=GREP_UNICODE),
+    ])
+
+
+def bench_subtitles_en_alternate(suite_dir):
+    '''
+    Benchmark the speed of a set of alternate literals.
+
+    The "(lines)" commands are the line-numbered runs; rg and grep
+    are additionally run without line numbers.
+    '''
+    require(suite_dir, 'subtitles-en')
+    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
+    # Five names joined into a single alternation.
+    pat = '|'.join([
+        'Sherlock Holmes',
+        'John Watson',
+        'Irene Adler',
+        'Inspector Lestrade',
+        'Professor Moriarty',
+    ])
+
+    return Benchmark(pattern=pat, commands=[
+        Command('rg (lines)', ['rg', '-n', pat, en]),
+        Command('ag (lines)', ['ag', '-s', pat, en]),
+        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
+        Command('grep (lines)', [
+            'grep', '-E', '-an', pat, en,
+        ], env=GREP_ASCII),
+        Command('rg', ['rg', pat, en]),
+        Command('grep', [
+            'grep', '-E', '-a', pat, en,
+        ], env=GREP_ASCII),
+    ])
+
+
+def bench_subtitles_en_alternate_casei(suite_dir):
+    '''
+    Benchmark the speed of a set of alternate literals, case insensitively.
+
+    grep is run under both the ASCII and the Unicode locale settings.
+    '''
+    require(suite_dir, 'subtitles-en')
+    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
+    # Five names joined into a single alternation.
+    pat = '|'.join([
+        'Sherlock Holmes',
+        'John Watson',
+        'Irene Adler',
+        'Inspector Lestrade',
+        'Professor Moriarty',
+    ])
+
+    return Benchmark(pattern=pat, commands=[
+        # -i only: also passing -s (case sensitive) would contradict it.
+        Command('ag (ASCII)', ['ag', '-i', pat, en]),
+        Command('ucg (ASCII)', ['ucg', '-i', pat, en]),
+        Command('grep (ASCII)', [
+            'grep', '-E', '-ani', pat, en,
+        ], env=GREP_ASCII),
+        Command('rg', ['rg', '-n', '-i', pat, en]),
+        Command('grep', ['grep', '-E', '-ani', pat, en], env=GREP_UNICODE),
+    ])
+
+
+def bench_subtitles_en_surrounding_words(suite_dir):
+    '''
+    Benchmark a more complex regex with an inner literal.
+
+    The pattern is the literal "Holmes" with a whitespace-separated
+    word required on each side. rg and grep are run in both a Unicode
+    and an ASCII configuration; ag and ucg only in an ASCII one.
+    '''
+    require(suite_dir, 'subtitles-en')
+    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
+    pat = r'\w+\s+Holmes\s+\w+'
+
+    return Benchmark(pattern=pat, commands=[
+        Command('rg', ['rg', '-n', pat, en]),
+        Command('grep', ['grep', '-E', '-an', pat, en], env=GREP_UNICODE),
+        # The (?-u) prefix switches off Unicode mode for rg's pattern.
+        Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
+        Command('ag (ASCII)', ['ag', '-s', pat, en]),
+        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
+        Command('grep (ASCII)', [
+            'grep', '-E', '-an', pat, en,
+        ], env=GREP_ASCII),
+    ])
+
+
+def bench_subtitles_en_no_literal(suite_dir):
+    '''
+    Benchmark the speed of a regex with no literals.
+
+    Note that we don't even try to run grep with Unicode support
+    on this one. While it should eventually get the right answer,
+    I killed it after it had already been running for two minutes
+    and showed no signs of finishing soon.
+    '''
+    require(suite_dir, 'subtitles-en')
+    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
+    # Seven five-character words separated by whitespace.
+    pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
+
+    return Benchmark(pattern=pat, commands=[
+        Command('rg', ['rg', '-n', pat, en]),
+        # The (?-u) prefix switches off Unicode mode for rg's pattern.
+        Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
+        Command('ag (ASCII)', ['ag', '-s', pat, en]),
+        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
+        Command('grep (ASCII)', [
+            'grep', '-E', '-an', pat, en,
        ], env=GREP_ASCII),
-        Command('pt', ['pt', pat, ru]),
-        Command('pt (no line numbers)', ['pt', '-N', pat, ru]),
-        Command('sift', ['sift', '-n', pat, ru]),
-        Command('sift (no line numbers)', ['sift', pat, ru]),
    ])


@ -455,18 +549,16 @@ def bench_subtitles_ru_literal(suite_dir):
    pat = 'Шерлок Холмс'  # Sherlock Holmes

    return Benchmark(pattern=pat, commands=[
-        Command('rg', ['rg', '-n', pat, ru]),
-        Command('rg (no line numbers)', ['rg', pat, ru]),
-        Command('ag', ['ag', '-s', pat, ru]),
-        Command('ucg', ['ucg', '--nosmart-case', pat, ru]),
-        Command('grep', ['grep', '-an', pat, ru], env=GREP_ASCII),
-        Command('grep (no line numbers)', [
-            'grep', '-a', pat, ru,
-        ], env=GREP_ASCII),
-        Command('pt', ['pt', pat, ru]),
-        Command('pt (no line numbers)', ['pt', '-N', pat, ru]),
-        Command('sift', ['sift', '-n', pat, ru]),
-        Command('sift (no line numbers)', ['sift', pat, ru]),
+        Command('rg', ['rg', pat, ru]),
+        Command('pt', ['pt', '-N', pat, ru]),
+        Command('sift', ['sift', pat, ru]),
+        Command('grep', ['grep', '-a', pat, ru], env=GREP_ASCII),
+        Command('rg (lines)', ['rg', '-n', pat, ru]),
+        Command('ag (lines)', ['ag', '-s', pat, ru]),
+        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
+        Command('pt (lines)', ['pt', pat, ru]),
+        Command('sift (lines)', ['sift', '-n', pat, ru]),
+        Command('grep (lines)', ['grep', '-an', pat, ru], env=GREP_ASCII),
    ])


@ -479,13 +571,14 @@ def bench_subtitles_ru_literal_casei(suite_dir):
    pat = 'Шерлок Холмс'  # Sherlock Holmes

    return Benchmark(pattern=pat, commands=[
-        Command('rg', ['rg', '-n', '-i', pat, ru]),
-        Command('ag (not Unicode)', ['ag', '-i', pat, ru]),
-        Command('ucg (not Unicode)', ['ucg', '-i', pat, ru]),
-        Command('grep', ['grep', '-ani', pat, ru], env=GREP_UNICODE),
-        Command('grep (not Unicode)', [
-            'grep', '-E', '-ani', pat, ru,
+        Command('rg', ['rg', '-i', pat, ru]),
+        Command('grep', ['grep', '-ai', pat, ru], env=GREP_UNICODE),
+        Command('grep (ASCII)', [
+            'grep', '-E', '-ai', pat, ru,
        ], env=GREP_ASCII),
+        Command('rg (lines)', ['rg', '-n', '-i', pat, ru]),
+        Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]),
+        Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, ru]),
    ])


@ -498,15 +591,15 @@ def bench_subtitles_ru_literal_word(suite_dir):
    pat = 'Шерлок Холмс'  # Sherlock Holmes

    return Benchmark(pattern=pat, commands=[
-        Command('rg', ['rg', '-nw', pat, ru]),
-        Command('rg (not Unicode)', [
+        Command('rg (ASCII)', [
            'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru,
        ]),
-        Command('ag (not Unicode)', ['ag', '-sw', pat, ru]),
-        Command('ucg (not Unicode)', ['ucg', '--nosmart-case', pat, ru]),
-        Command('grep (not Unicode)', [
+        Command('ag (ASCII)', ['ag', '-sw', pat, ru]),
+        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
+        Command('grep (ASCII)', [
            'grep', '-anw', pat, ru,
        ], env=GREP_ASCII),
+        Command('rg', ['rg', '-nw', pat, ru]),
        Command('grep', ['grep', '-anw', pat, ru], env=GREP_UNICODE),
    ])

@ -526,11 +619,14 @@ def bench_subtitles_ru_alternate(suite_dir):
    ])

    return Benchmark(pattern=pat, commands=[
-        Command('rg', ['rg', '-n', pat, ru]),
-        Command('rg (no line numbers)', ['rg', pat, ru]),
-        Command('ucg', ['ucg', '--nosmart-case', pat, ru]),
-        Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_ASCII),
-        Command('grep (no line numbers)', [
+        Command('rg (lines)', ['rg', '-n', pat, ru]),
+        Command('ag (lines)', ['ag', '-s', pat, ru]),
+        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
+        Command('grep (lines)', [
+            'grep', '-E', '-an', pat, ru,
+        ], env=GREP_ASCII),
+        Command('rg', ['rg', pat, ru]),
+        Command('grep', [
            'grep', '-E', '-a', pat, ru,
        ], env=GREP_ASCII),
    ])
@ -551,12 +647,32 @@ def bench_subtitles_ru_alternate_casei(suite_dir):
    ])

    return Benchmark(pattern=pat, commands=[
-        Command('rg', ['rg', '-n', '-i', pat, ru]),
-        Command('ucg (not Unicode)', ['ucg', '-i', pat, ru]),
-        Command('grep', ['grep', '-E', '-ani', pat, ru], env=GREP_UNICODE),
-        Command('grep (not Unicode)', [
+        Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]),
+        Command('ucg (ASCII)', ['ucg', '-i', pat, ru]),
+        Command('grep (ASCII)', [
            'grep', '-E', '-ani', pat, ru,
        ], env=GREP_ASCII),
+        Command('rg', ['rg', '-n', '-i', pat, ru]),
+        Command('grep', ['grep', '-E', '-ani', pat, ru], env=GREP_UNICODE),
+    ])
+
+
+def bench_subtitles_ru_surrounding_words(suite_dir):
+    '''
+    Benchmark a more complex regex with an inner literal.
+
+    The pattern is a Cyrillic literal with a whitespace-separated
+    word required on each side, searched in the Russian subtitles.
+    '''
+    # This benchmark reads the Russian corpus, so that is the one to require.
+    require(suite_dir, 'subtitles-ru')
+    ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
+    pat = r'\w+\s+Холмс\s+\w+'
+
+    return Benchmark(pattern=pat, commands=[
+        Command('rg', ['rg', '-n', pat, ru]),
+        Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_UNICODE),
+        Command('ag (ASCII)', ['ag', '-s', pat, ru]),
+        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
+        Command('grep (ASCII)', [
+            'grep', '-E', '-an', pat, ru,
+        ], env=GREP_ASCII),
    ])


@ -575,9 +691,10 @@ def bench_subtitles_ru_no_literal(suite_dir):

    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', '-n', pat, ru]),
-        Command('rg (no line numbers)', ['rg', pat, ru]),
-        Command('ucg (no Unicode)', ['ucg', '--nosmart-case', pat, ru]),
-        Command('grep (no Unicode)', [
+        Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]),
+        Command('ag (ASCII)', ['ag', '-s', pat, ru]),
+        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
+        Command('grep (ASCII)', [
            'grep', '-E', '-an', pat, ru,
        ], env=GREP_ASCII),
    ])