From b0066274cbb36e2cf4a76aded5f8a98d1f79e61a Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Wed, 14 Oct 2020 14:17:23 -0400
Subject: [PATCH] benchsuite: update subtitle URLs

Since the English subtitle file actually changed its content, we tweak
the benchmark to use a slightly bigger sample that more closely matches
the file size of the Russian subtitle file.

Also, the BurntSushi/linux repo has been updated and I've confirmed that
it builds on my Linux machine.

Fixes #1257
---
 benchsuite/benchsuite | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/benchsuite/benchsuite b/benchsuite/benchsuite
index b849b454..9353cf49 100755
--- a/benchsuite/benchsuite
+++ b/benchsuite/benchsuite
@@ -23,13 +23,15 @@ import time
 # strategies used to increase the relevance of results returned.
 
 SUBTITLES_DIR = 'subtitles'
-SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en'
-SUBTITLES_EN_NAME_SAMPLE = 'OpenSubtitles2016.raw.sample.en'
+SUBTITLES_EN_NAME = 'en.txt'
+SUBTITLES_EN_NAME_SAMPLE = 'en.sample.txt'
 SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME
-SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz'  # noqa
-SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru'
+# Previous SUBTITLES_EN_URL (superseded): http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz  # noqa
+SUBTITLES_EN_URL = 'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/en.txt.gz'  # noqa
+SUBTITLES_RU_NAME = 'ru.txt'
 SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME
-SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz'  # noqa
+# Previous SUBTITLES_RU_URL (superseded): http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz  # noqa
+SUBTITLES_RU_URL = 'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/ru.txt.gz'  # noqa
 
 LINUX_DIR = 'linux'
 LINUX_CLONE = 'git://github.com/BurntSushi/linux'
@@ -255,11 +257,11 @@ def bench_linux_unicode_greek_casei(suite_dir):
 
 def bench_linux_unicode_word(suite_dir):
     '''
-    Benchmark Unicode aware \w character class.
+    Benchmark Unicode aware \\w character class.
 
     Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get
     this right. Everything else uses the standard ASCII interpretation
-    of \w.
+    of \\w.
     '''
     require(suite_dir, 'linux')
     cwd = path.join(suite_dir, LINUX_DIR)
@@ -1088,7 +1090,7 @@ def download_subtitles_en(suite_dir):
         # benchmarks finish in a reasonable time.
         with open(path.join(subtitle_dir, en_path_sample), 'wb+') as f:
             run_cmd(
-                ['head', '-n', '32722372', en_path],
+                ['head', '-n', '55000000', en_path],
                 cwd=subtitle_dir, stdout=f)