infra: remove extra leading > in simple preprocessor output

Preprocessors using `pulldown-cmark-to-cmark` do not yet perform round trips 100% correctly, and insert leading spaces and an extra initial `>` for block quotes, so strip those. Once that is fixed upstream, this will become a no-op, and can be removed then.
rust-lang · Dec 5, 2024 · 1882cfb · 1882cfb
1 parent 2e8392d
commit 1882cfb
Show file tree

Hide file tree

Showing 3 changed files with 158 additions and 3 deletions.
diff --git a/packages/tools/Cargo.toml b/packages/tools/Cargo.toml
@@ -36,6 +36,11 @@ path = "src/bin/remove_links.rs"
 name = "remove_markup"
 path = "src/bin/remove_markup.rs"
 
+[[bin]]
+name = "cleanup_blockquotes"
+path = "src/bin/cleanup_blockquotes.rs"
+
+
 [dependencies]
 walkdir = { workspace = true }
 docopt = { workspace = true }

diff --git a/packages/tools/src/bin/cleanup_blockquotes.rs b/packages/tools/src/bin/cleanup_blockquotes.rs
@@ -0,0 +1,147 @@
+//! Fix incorrect round-tripping of block quotes in `pulldown-cmark-to-cmark`:
+//!
+//! - Eliminate extraneous leading `>`
+//! - Eliminate extraneous indent.
+//!
+//! Note: later versions of `pulldown-cmark-to-cmark` will likely fix this, so
+//! check when upgrading it if it is still necessary!
+
+use std::io::{self, Read};
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+fn main() {
+    let input = {
+        let mut buffer = String::new();
+        io::stdin()
+            .read_to_string(&mut buffer)
+            .unwrap_or_else(|e| panic!("{e}"));
+        buffer
+    };
+
+    let fixed = cleanup_blockquotes(input);
+    print!("{fixed}");
+}
+
+fn cleanup_blockquotes(input: String) -> String {
+    let normal_start = EXTRA_SPACE.replace_all(&input, ">");
+    let sans_empty_leading = EMPTY_LEADING.replace_all(&normal_start, "\n\n");
+    sans_empty_leading.to_string()
+}
+
+lazy_static! {
+    static ref EXTRA_SPACE: Regex = Regex::new("(?m)^ >").unwrap();
+    static ref EMPTY_LEADING: Regex = Regex::new("\n\n> ?\n").unwrap();
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn extra_space() {
+        let input = " > Hello".to_string();
+        let actual = cleanup_blockquotes(input);
+        assert_eq!(actual, "> Hello");
+    }
+
+    #[test]
+    fn empty_leading() {
+        let input = "\n\n>\n> Hello".into();
+        let actual = cleanup_blockquotes(input);
+        assert_eq!(actual, "\n\n> Hello");
+    }
+
+    #[test]
+    fn leading_after_extra_space_cleaned_up() {
+        let input = r#"Start
+
+>
+> Note: Hey.
+
+Wrap."#
+            .into();
+
+        let actual = cleanup_blockquotes(input);
+        assert_eq!(
+            actual,
+            r#"Start
+
+> Note: Hey.
+
+Wrap."#
+        );
+    }
+
+    /// This particular input was the result of running any of the mdbook
+    /// preprocessors which use `pulldown-cmark-to-cmark@<=18.0.0`.
+    #[test]
+    fn regression_ch17_example() {
+        //  This is an example of the original motivating input which we are fixing.
+        let input = r#"
+We have to explicitly await both of these futures, because futures in Rust are
+*lazy*: they don’t do anything until you ask them to with `await`. (In fact,
+Rust will show a compiler warning if you don’t use a future.) This should
+remind you of our discussion of iterators [back in Chapter 13][iterators-lazy].
+Iterators do nothing unless you call their `next` method—whether directly, or
+using `for` loops or methods such as `map` which use `next` under the hood. With
+futures, the same basic idea applies: they do nothing unless you explicitly ask
+them to. This laziness allows Rust to avoid running async code until it’s
+actually needed.
+
+ >
+ > Note: This is different from the behavior we saw when using `thread::spawn` in
+ > the previous chapter, where the closure we passed to another thread started
+ > running immediately. It’s also different from how many other languages
+ > approach async! But it’s important for Rust. We’ll see why that is later.
+
+Once we have `response_text`, we can then parse it into an instance of the
+`Html` type using `Html::parse`. Instead of a raw string, we now have a data
+type we can use to work with the HTML as a richer data structure. In particular,
+we can use the `select_first` method to find the first instance of a given CSS
+selector. By passing the string `"title"`, we’ll get the first `<title>`
+element in the document, if there is one. Because there may not be any matching
+element, `select_first` returns an `Option<ElementRef>`. Finally, we use the
+`Option::map` method, which lets us work with the item in the `Option` if it’s
+present, and do nothing if it isn’t. (We could also use a `match` expression
+here, but `map` is more idiomatic.) In the body of the function we supply to
+`map`, we call `inner_html` on the `title_element` to get its content, which is
+a `String`. When all is said and done, we have an `Option<String>`.
+"#.to_string();
+
+        let actual = cleanup_blockquotes(input);
+        assert_eq!(
+            actual,
+            r#"
+We have to explicitly await both of these futures, because futures in Rust are
+*lazy*: they don’t do anything until you ask them to with `await`. (In fact,
+Rust will show a compiler warning if you don’t use a future.) This should
+remind you of our discussion of iterators [back in Chapter 13][iterators-lazy].
+Iterators do nothing unless you call their `next` method—whether directly, or
+using `for` loops or methods such as `map` which use `next` under the hood. With
+futures, the same basic idea applies: they do nothing unless you explicitly ask
+them to. This laziness allows Rust to avoid running async code until it’s
+actually needed.
+
+> Note: This is different from the behavior we saw when using `thread::spawn` in
+> the previous chapter, where the closure we passed to another thread started
+> running immediately. It’s also different from how many other languages
+> approach async! But it’s important for Rust. We’ll see why that is later.
+
+Once we have `response_text`, we can then parse it into an instance of the
+`Html` type using `Html::parse`. Instead of a raw string, we now have a data
+type we can use to work with the HTML as a richer data structure. In particular,
+we can use the `select_first` method to find the first instance of a given CSS
+selector. By passing the string `"title"`, we’ll get the first `<title>`
+element in the document, if there is one. Because there may not be any matching
+element, `select_first` returns an `Option<ElementRef>`. Finally, we use the
+`Option::map` method, which lets us work with the item in the `Option` if it’s
+present, and do nothing if it isn’t. (We could also use a `match` expression
+here, but `map` is more idiomatic.) In the body of the function we supply to
+`map`, we call `inner_html` on the `title_element` to get its content, which is
+a `String`. When all is said and done, we have an `Option<String>`.
+"#
+        );
+    }
+}
diff --git a/tools/nostarch.sh b/tools/nostarch.sh
@@ -4,7 +4,7 @@ set -eu
 
 cargo build --release
 
-cargo install --locked --path ./packages/mdbook_trpl
+cargo install --locked --path ./packages/mdbook_trpl --offline
 
 mkdir -p tmp
 rm -rf tmp/*.md
@@ -14,7 +14,9 @@ rm -rf tmp/markdown
 MDBOOK_OUTPUT__MARKDOWN=1 mdbook build nostarch
 
 # Get all the Markdown files
-find tmp/markdown -name "${1:-\"\"}*.md" -print0 | \
+# TODO: what was this doing and why?!?
+# find tmp/markdown -name "${1:-\"\"}*.md" -print0 | \
+find tmp/markdown -name "*.md" -print0 | \
 # Extract just the filename so we can reuse it easily.
 xargs -0 basename | \
 # Remove all links followed by `<!-- ignore -->``, then
@@ -23,7 +25,8 @@ while IFS= read -r filename; do
   < "tmp/markdown/$filename" ./target/release/remove_links \
     | ./target/release/link2print \
     | ./target/release/remove_markup \
-    | ./target/release/remove_hidden_lines > "tmp/$filename"
+    | ./target/release/remove_hidden_lines \
+    | ./target/release/cleanup_blockquotes > "tmp/$filename"
 done
 # Concatenate the files into the `nostarch` dir.
 ./target/release/concat_chapters tmp nostarch