Add support for translations

This implements a translation pipeline using the industry-standard Gettext[1] system. I picked Gettext for the reasons described in [2] and [3]: * It’s widely used in open source software. This means that there are graphical editors which will help you in editing the `.po` files. An example is Poedit[4], which is available for all major platforms. There are also many online systems for doing translations. An example is Pontoon[5], which is used for the Rust website itself. We can consider setting up such an instance ourselves. * It is a light-weight yet structured format. This means that nothing changes with regards to how you update the original English text. We can still accept fixes and PRs like normal. The structure means that translators can see exactly which part of the course they need to update after a change. This is completely lost if you simply copy over the original text and translate it in-place in the Markdown files. The code here only adds support for translations. They are not yet tested, published or used for anything. Next steps will be: * Add support for switching languages via a bit of JavaScript on each page. * Update the speaker notes feature to support translations (right now “Speaker Notes” is hard-coded into the generated HTML). I think we should turn it into a mdbook preprocessor instead. * Add testing: We should test that the `.po` files are well-formed. We should also run `mdbook test` on each language since the translations can alter the embedded code. Fixes #115. [1]: https://www.gnu.org/software/gettext/manual/html_node/index.html [2]: https://github.com/rust-lang/mdBook/pull/1864 [3]: https://github.com/rust-lang/mdBook/issues/5#issuecomment-1144887806 [4]: https://poedit.net/ [5]: https://pontoon.rust-lang.org/
2025-11-30 09:08:45 +02:00 · 2023-01-08 13:45:19 +01:00
parent 8e3941d2e6
commit 48ec773052
9 changed files with 2593 additions and 33 deletions
--- a/i18n-helpers/src/bin/mdbook-gettext.rs
+++ b/i18n-helpers/src/bin/mdbook-gettext.rs
@@ -0,0 +1,230 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! `gettext` for `mdbook`
+//!
+//! This program works like `gettext`, meaning it will translate
+//! strings in your book.
+//!
+//! The translations come from GNU Gettext `xx.po` files. You must set
+//! `preprocessor.gettext.po-file` to the PO file to use. If it is
+//! unset, the preprocessor exits with an error instead of translating.
+//!
+//! See `TRANSLATIONS.md` in the repository root for more information.
+
+use anyhow::{anyhow, Context};
+use i18n_helpers::extract_paragraphs;
+use mdbook::book::Book;
+use mdbook::preprocess::{CmdPreprocessor, PreprocessorContext};
+use mdbook::BookItem;
+use polib::catalog::Catalog;
+use polib::po_file;
+use semver::{Version, VersionReq};
+use std::io;
+use std::path::Path;
+use std::process;
+
+/// Translate `text` paragraph by paragraph using `catalog`.
+///
+/// Each paragraph found by `extract_paragraphs` is looked up in the
+/// catalog. A paragraph without a message, or whose `msgstr` is
+/// empty, is copied through unchanged. Newlines before, between and
+/// after the paragraphs are reproduced from the original text.
+fn translate(text: &str, catalog: &Catalog) -> String {
+    let mut result = String::with_capacity(text.len());
+    // Line number (in terms of the original text) that the end of
+    // `result` currently corresponds to.
+    let mut current_line = 1;
+
+    for (start_line, paragraph) in extract_paragraphs(text) {
+        // Emit the newlines separating this paragraph from whatever
+        // came before it. This matters for code blocks where blank
+        // lines are significant.
+        for _ in current_line..start_line {
+            result.push('\n');
+        }
+        current_line = current_line.max(start_line);
+
+        // The paragraph has no final '\n' (it was split off by
+        // `extract_paragraphs`), so the cursor stays on the last line
+        // of the paragraph: advance by one less than its line count.
+        current_line += paragraph.lines().count() - 1;
+
+        let msgstr = catalog
+            .find_message(paragraph)
+            .and_then(|message| message.get_msgstr().ok())
+            .filter(|candidate| !candidate.is_empty());
+        match msgstr {
+            Some(msgstr) => result.push_str(msgstr),
+            None => result.push_str(paragraph),
+        }
+    }
+
+    // Everything after the last non-newline character is the run of
+    // trailing newlines, which is copied over verbatim.
+    let content_len = text.trim_end_matches('\n').len();
+    result.push_str(&text[content_len..]);
+    result
+}
+
+/// Translate every chapter and part title of `book`.
+///
+/// The PO file to use is read from the `po-file` key of the
+/// `gettext` preprocessor configuration.
+///
+/// # Errors
+///
+/// Returns an error if the preprocessor configuration is missing, if
+/// `po-file` is absent or not a string, or if the PO file cannot be
+/// parsed.
+fn translate_book(ctx: &PreprocessorContext, mut book: Book) -> anyhow::Result<Book> {
+    let gettext_cfg = match ctx.config.get_preprocessor("gettext") {
+        Some(table) => table,
+        None => return Err(anyhow!("Could not read preprocessor.gettext configuration")),
+    };
+    let po_file_value = gettext_cfg
+        .get("po-file")
+        .ok_or_else(|| anyhow!("Missing preprocessor.gettext.po-file config value"))?;
+    let path = po_file_value
+        .as_str()
+        .ok_or_else(|| anyhow!("Expected a string for preprocessor.gettext.po-file"))?;
+
+    // The parse error is re-wrapped via its `Display` output before
+    // the file name is attached as context.
+    let parsed = po_file::parse(Path::new(path)).map_err(|err| anyhow!("{err}"));
+    let catalog = parsed.with_context(|| format!("Could not parse {path} as PO file"))?;
+
+    book.for_each_mut(|item| match item {
+        BookItem::Separator => {}
+        BookItem::PartTitle(part_title) => {
+            *part_title = translate(part_title, &catalog);
+        }
+        BookItem::Chapter(chapter) => {
+            chapter.content = translate(&chapter.content, &catalog);
+            chapter.name = translate(&chapter.name, &catalog);
+        }
+    });
+
+    Ok(book)
+}
+
+/// Run the preprocessor: read the book from stdin, translate it and
+/// write the result to stdout as JSON.
+///
+/// A warning is printed to stderr when the calling mdbook version
+/// does not satisfy the version this program was built against;
+/// processing continues regardless.
+fn preprocess() -> anyhow::Result<()> {
+    let (ctx, book) = CmdPreprocessor::parse_input(io::stdin())?;
+
+    let caller_version = Version::parse(&ctx.mdbook_version)?;
+    let built_against = VersionReq::parse(mdbook::MDBOOK_VERSION)?;
+    let compatible = built_against.matches(&caller_version);
+    if !compatible {
+        eprintln!(
+            "Warning: The gettext preprocessor was built against \
+             mdbook version {}, but we're being called from version {}",
+            mdbook::MDBOOK_VERSION,
+            ctx.mdbook_version
+        );
+    }
+
+    let book = translate_book(&ctx, book)?;
+    serde_json::to_writer(io::stdout(), &book)?;
+    Ok(())
+}
+
+/// Entry point of the `gettext` preprocessor.
+///
+/// When called with exactly two arguments, the first must be
+/// `supports`; the program then exits successfully to signal that
+/// every renderer is supported. Otherwise the book is read from
+/// stdin and translated.
+///
+/// # Errors
+///
+/// Returns an error for an unexpected two-argument invocation, or
+/// when preprocessing fails.
+fn main() -> anyhow::Result<()> {
+    let args: Vec<String> = std::env::args().collect();
+    if let [_, subcommand, _renderer] = args.as_slice() {
+        // Report a bad invocation as a regular error rather than
+        // panicking on user-supplied input.
+        if subcommand.as_str() != "supports" {
+            return Err(anyhow!(
+                "Unexpected subcommand {subcommand:?}, expected \"supports\""
+            ));
+        }
+        // Signal that we support all renderers.
+        process::exit(0);
+    }
+
+    preprocess()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use polib::message::Message;
+
+    /// Build a catalog with one singular message per `(msgid, msgstr)` pair.
+    fn create_catalog(translations: &[(&str, &str)]) -> Catalog {
+        let mut catalog = Catalog::new();
+        translations.iter().for_each(|&(msgid, msgstr)| {
+            catalog.add_message(Message::new_singular("", "", "", "", msgid, msgstr));
+        });
+        catalog
+    }
+
+    #[test]
+    fn test_translate_single_line() {
+        let catalog = create_catalog(&[("foo bar", "FOO BAR")]);
+        let translated = translate("foo bar", &catalog);
+        assert_eq!(translated, "FOO BAR");
+    }
+
+    #[test]
+    fn test_translate_single_paragraph() {
+        let catalog = create_catalog(&[("foo bar", "FOO BAR")]);
+        let translated = translate("foo bar\n", &catalog);
+        assert_eq!(translated, "FOO BAR\n");
+    }
+
+    #[test]
+    fn test_translate_paragraph_with_leading_newlines() {
+        let catalog = create_catalog(&[("foo bar", "FOO BAR")]);
+        let translated = translate("\n\n\nfoo bar\n", &catalog);
+        assert_eq!(translated, "\n\n\nFOO BAR\n");
+    }
+
+    #[test]
+    fn test_translate_paragraph_with_trailing_newlines() {
+        let catalog = create_catalog(&[("foo bar", "FOO BAR")]);
+        let translated = translate("foo bar\n\n\n", &catalog);
+        assert_eq!(translated, "FOO BAR\n\n\n");
+    }
+
+    #[test]
+    fn test_translate_multiple_paragraphs() {
+        // Only the middle paragraph has a translation.
+        let catalog = create_catalog(&[("foo bar", "FOO BAR")]);
+        let source = "first paragraph\n\nfoo bar\n\nlast paragraph\n";
+        let expected = "first paragraph\n\nFOO BAR\n\nlast paragraph\n";
+        assert_eq!(translate(source, &catalog), expected);
+    }
+
+    #[test]
+    fn test_translate_multiple_paragraphs_extra_newlines() {
+        // Notice how the translated paragraphs have more lines.
+        let catalog = create_catalog(&[
+            ("first\nparagraph", "FIRST\nTRANSLATED\nPARAGRAPH"),
+            ("last\nparagraph", "LAST\nTRANSLATED\nPARAGRAPH"),
+        ]);
+        let source = "\nfirst\nparagraph\n\n\n\nlast\nparagraph\n\n\n";
+        let expected =
+            "\nFIRST\nTRANSLATED\nPARAGRAPH\n\n\n\nLAST\nTRANSLATED\nPARAGRAPH\n\n\n";
+        // Paragraph separation is kept intact while translating.
+        assert_eq!(translate(source, &catalog), expected);
+    }
+}
--- a/i18n-helpers/src/bin/mdbook-xgettext.rs
+++ b/i18n-helpers/src/bin/mdbook-xgettext.rs
@@ -0,0 +1,128 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! `xgettext` for `mdbook`
+//!
+//! This program works like `xgettext`, meaning it will extract
+//! translatable strings from your book. The strings are saved in a
+//! GNU Gettext `messages.pot` file in your build directory (typically
+//! `po/messages.pot`).
+//!
+//! See `TRANSLATIONS.md` in the repository root for more information.
+
+use anyhow::{anyhow, Context};
+use mdbook::renderer::RenderContext;
+use mdbook::BookItem;
+use polib::catalog::Catalog;
+use polib::message::Message;
+use std::fs;
+use std::io;
+
+/// Record `msgid` in `catalog` as coming from `source`.
+///
+/// If the catalog already holds a message with this `msgid`, the new
+/// source is appended on its own line to the existing sources and the
+/// message is replaced. Otherwise a new message is added. The
+/// `msgstr` of the stored message is always empty.
+fn add_message(catalog: &mut Catalog, msgid: &str, source: &str) {
+    let combined_sources = catalog.find_message(msgid).map_or_else(
+        || String::from(source),
+        |existing| format!("{}\n{}", existing.source, source),
+    );
+    let message = Message::new_singular("", &combined_sources, "", "", msgid, "");
+
+    // Carefully update the existing message or add a new one. It's an
+    // error to create a catalog with duplicate msgids.
+    if let Some(&idx) = catalog.find_message_index(msgid) {
+        catalog.update_message_by_index(idx, message).unwrap();
+    } else {
+        catalog.add_message(message);
+    }
+}
+
+/// Build a message catalog from the book in `ctx`.
+///
+/// The catalog metadata is filled in from the book title and
+/// language (when set). Messages are then added for every chapter
+/// name and part title, with a source location in `SUMMARY.md`,
+/// followed by every paragraph of every chapter that has a path.
+///
+/// # Errors
+///
+/// Returns an error if `SUMMARY.md` cannot be read, or if a chapter
+/// name or part title cannot be found verbatim in it.
+fn create_catalog(ctx: &RenderContext) -> anyhow::Result<Catalog> {
+    let mut catalog = Catalog::new();
+    if let Some(title) = &ctx.config.book.title {
+        catalog.metadata.project_id_version = String::from(title);
+    }
+    if let Some(lang) = &ctx.config.book.language {
+        catalog.metadata.language = String::from(lang);
+    }
+    catalog.metadata.mime_version = String::from("1.0");
+    catalog.metadata.content_type = String::from("text/plain; charset=UTF-8");
+    catalog.metadata.content_transfer_encoding = String::from("8bit");
+
+    let summary_path = ctx.config.book.src.join("SUMMARY.md");
+    let summary_file = ctx.root.join(&summary_path);
+    let summary = fs::read_to_string(&summary_file)
+        .with_context(|| format!("Could not read {}", summary_file.display()))?;
+
+    // First, add all chapter names and part titles from SUMMARY.md.
+    // The book items are in order of the summary, so we can assign
+    // correct line numbers for duplicate lines by tracking the index
+    // of our last search.
+    let mut last_idx = 0;
+    for item in ctx.book.iter() {
+        let line = match item {
+            BookItem::Chapter(chapter) => &chapter.name,
+            BookItem::PartTitle(title) => title,
+            BookItem::Separator => continue,
+        };
+
+        let idx = summary[last_idx..].find(line).ok_or_else(|| {
+            anyhow!(
+                "Could not find {line:?} in SUMMARY.md after line {} -- \
+                 please remove any formatting from SUMMARY.md",
+                summary[..last_idx].lines().count()
+            )
+        })?;
+        let start = last_idx + idx;
+        // 1-based line number of the match: one more than the number
+        // of newlines before it. Unlike `lines().count()`, this is
+        // also right when the match starts at the beginning of a line.
+        let lineno = summary[..start].matches('\n').count() + 1;
+        let source = format!("{}:{}", summary_path.display(), lineno);
+        add_message(&mut catalog, line, &source);
+
+        // Continue searching *after* this match. Otherwise the next
+        // item would find the same occurrence again whenever its name
+        // equals, or is contained in, the name we just located.
+        last_idx = start + line.len();
+    }
+
+    // Next, we add the chapter contents.
+    for item in ctx.book.iter() {
+        if let BookItem::Chapter(chapter) = item {
+            // Chapters without a path are skipped.
+            let path = match &chapter.path {
+                Some(path) => ctx.config.book.src.join(path),
+                None => continue,
+            };
+            for (lineno, paragraph) in i18n_helpers::extract_paragraphs(&chapter.content)
+            {
+                let source = format!("{}:{}", path.display(), lineno);
+                add_message(&mut catalog, paragraph, &source);
+            }
+        }
+    }
+
+    Ok(catalog)
+}
+
+/// Entry point of the `xgettext` renderer.
+///
+/// Reads the render context from stdin, extracts all messages from
+/// the book and writes them to the file named by the `pot-file` key
+/// of the `xgettext` renderer configuration, inside the destination
+/// directory of the render context.
+fn main() -> anyhow::Result<()> {
+    let ctx = RenderContext::from_json(&mut io::stdin()).context("Parsing stdin")?;
+
+    let renderer_cfg = ctx
+        .config
+        .get_renderer("xgettext")
+        .ok_or_else(|| anyhow!("Could not read output.xgettext configuration"))?;
+    let pot_file_value = renderer_cfg
+        .get("pot-file")
+        .ok_or_else(|| anyhow!("Missing output.xgettext.pot-file config value"))?;
+    let pot_file = pot_file_value
+        .as_str()
+        .ok_or_else(|| anyhow!("Expected a string for output.xgettext.pot-file"))?;
+
+    let destination = &ctx.destination;
+    fs::create_dir_all(destination)
+        .with_context(|| format!("Could not create {}", destination.display()))?;
+
+    // Remove any existing output file before writing the new one.
+    let output_path = destination.join(pot_file);
+    if output_path.exists() {
+        fs::remove_file(&output_path)
+            .with_context(|| format!("Removing {}", output_path.display()))?;
+    }
+
+    let catalog = create_catalog(&ctx).context("Extracting messages")?;
+    polib::po_file::write(&catalog, &output_path)
+        .with_context(|| format!("Writing messages to {}", output_path.display()))?;
+
+    Ok(())
+}
--- a/i18n-helpers/src/lib.rs
+++ b/i18n-helpers/src/lib.rs
@@ -0,0 +1,123 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use once_cell::sync::Lazy;
+use regex::Regex;
+
+/// Matches a run of two or more consecutive newlines.
+static PARAGRAPH_SEPARATOR: Lazy<Regex> = Lazy::new(|| Regex::new(r"\n\n+").unwrap());
+
+/// Split text into paragraphs.
+///
+/// A paragraph boundary is a run of two or more newlines. The
+/// returned iterator yields `(line number, paragraph)` pairs, where
+/// the line number is 1-based and refers to the first line of the
+/// paragraph in `text`. Leading newlines are skipped (but counted),
+/// and the yielded paragraphs carry no trailing newline.
+pub fn extract_paragraphs(text: &str) -> impl Iterator<Item = (usize, &str)> {
+    // TODO: This could be made more sophisticated by parsing the
+    // Markdown and stripping off the markup characters.
+    //
+    // As an example, a header like "## My heading" could become just
+    // "My heading" in the `.pot` file. Similarly, paragraphs could be
+    // unfolded and list items could be translated one-by-one.
+
+    // Drop leading empty lines; each removed byte is one newline, so
+    // the length difference is the number of lines skipped.
+    let body = text.trim_start_matches('\n');
+    let skipped_lines = text.len() - body.len();
+    let mut separators = PARAGRAPH_SEPARATOR.find_iter(body);
+    let mut next_lineno = 1 + skipped_lines;
+    let mut offset = 0;
+
+    std::iter::from_fn(move || {
+        if let Some(separator) = separators.next() {
+            let item = (next_lineno, &body[offset..separator.start()]);
+            // Count the lines of the paragraph together with the
+            // separator that follows it.
+            next_lineno += body[offset..separator.end()].lines().count();
+            offset = separator.end();
+            return Some(item);
+        }
+
+        // No separator left: whatever remains is the final paragraph.
+        if offset >= body.len() {
+            return None;
+        }
+        let item = (next_lineno, body[offset..].trim_end_matches('\n'));
+        offset = body.len();
+        Some(item)
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Collect the output of `extract_paragraphs` for easy comparison.
+    fn paragraphs(text: &str) -> Vec<(usize, &str)> {
+        extract_paragraphs(text).collect()
+    }
+
+    #[test]
+    fn test_extract_paragraphs_empty() {
+        assert_eq!(paragraphs(""), vec![]);
+    }
+
+    #[test]
+    fn test_extract_paragraphs_single_line() {
+        let found = paragraphs("This is a paragraph.");
+        assert_eq!(found, vec![(1, "This is a paragraph.")]);
+    }
+
+    #[test]
+    fn test_extract_paragraphs_simple() {
+        let found = paragraphs("This is\nthe first\nparagraph.\n\nSecond paragraph.");
+        let expected = vec![
+            (1, "This is\nthe first\nparagraph."),
+            (5, "Second paragraph."),
+        ];
+        assert_eq!(found, expected);
+    }
+
+    #[test]
+    fn test_extract_paragraphs_leading_newlines() {
+        let found = paragraphs("\n\n\nThis is the\nfirst paragraph.");
+        assert_eq!(found, vec![(4, "This is the\nfirst paragraph.")]);
+    }
+
+    #[test]
+    fn test_extract_paragraphs_trailing_newlines() {
+        let found = paragraphs("This is\na paragraph.\n\n\n");
+        assert_eq!(found, vec![(1, "This is\na paragraph.")]);
+    }
+}