Implement excluding code blocks (lycheeverse#523)
This is done in the extractor to avoid unnecessary allocations.
mre authored Mar 26, 2022
1 parent 5a77209 commit d616177
Showing 12 changed files with 432 additions and 60 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -217,6 +217,7 @@ FLAGS:
--exclude-private Exclude private IP address ranges from checking
--glob-ignore-case Ignore case when expanding filesystem path glob inputs
--help Prints help information
--include-verbatim Find links in verbatim sections like `pre`- and `code` blocks
-i, --insecure Proceed for server connections considered insecure (invalid TLS)
-n, --no-progress Do not show progress bar.
This is recommended for non-interactive shells (e.g. for continuous integration)
3 changes: 2 additions & 1 deletion benches/src/extract.rs
@@ -6,7 +6,8 @@ use std::path::PathBuf;
fn extract(paths: &[PathBuf]) {
for path in paths {
let content: InputContent = path.try_into().unwrap();
let extracted = Extractor::extract(&content);
let extractor = Extractor::default();
let extracted = extractor.extract(&content);
println!("{}", extracted.len());
}
}
3 changes: 2 additions & 1 deletion examples/extract/extract.rs
@@ -6,7 +6,8 @@ use std::fs;
#[tokio::main]
async fn main() -> Result<()> {
let input = fs::read_to_string("fixtures/elvis.html").unwrap();
let links = Extractor::extract(&InputContent::from_string(&input, FileType::Html));
let extractor = Extractor::default();
let links = extractor.extract(&InputContent::from_string(&input, FileType::Html));
println!("{links:#?}");

Ok(())
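Both call sites above switch from the old associated function `Extractor::extract(&content)` to an instance method on a configurable extractor. The `Extractor` definition itself is in one of the changed files this page does not show; the following is a minimal sketch of the surface implied by these diffs, with the field layout assumed from the `Extractor::new(self.use_html5ever, self.include_verbatim)` call in `collector.rs` further down.

```rust
// Hypothetical reconstruction of the new Extractor surface; the real
// definition lives in lychee-lib's extract module, which this page omits.
#[derive(Clone, Default)]
pub struct Extractor {
    // Use the html5ever tokenizer instead of the default HTML parser.
    use_html5ever: bool,
    // Also extract links from verbatim sections (`pre`/`code`, code fences).
    include_verbatim: bool,
}

impl Extractor {
    // Both flags default to false, matching the `Extractor::default()`
    // calls in the benchmark and example above.
    pub const fn new(use_html5ever: bool, include_verbatim: bool) -> Self {
        Self { use_html5ever, include_verbatim }
    }
}
```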
11 changes: 11 additions & 0 deletions fixtures/TEST_CODE_BLOCKS.md
@@ -0,0 +1,11 @@
# Test Links In Code

```
http://127.0.0.1/block
```

```bash
http://127.0.0.1/bash
```

`http://127.0.0.1/inline` will also be excluded by default
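
This fixture can also be exercised against the CLI directly; a minimal sketch using `std::process::Command`, assuming a built `lychee` binary on `PATH` and the repository root as the working directory:

```rust
use std::process::Command;

fn main() {
    // With --include-verbatim, all three URLs in the fixture are dumped;
    // dropping the flag yields empty output (see the cli.rs tests below).
    let out = Command::new("lychee")
        .args(["--include-verbatim", "--dump", "fixtures/TEST_CODE_BLOCKS.md"])
        .output()
        .expect("failed to run lychee");
    print!("{}", String::from_utf8_lossy(&out.stdout));
}
```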
1 change: 1 addition & 0 deletions lychee-bin/src/main.rs
@@ -223,6 +223,7 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
let inputs = opts.inputs();
let requests = Collector::new(opts.config.base.clone())
.skip_missing_inputs(opts.config.skip_missing)
.include_verbatim(opts.config.include_verbatim)
// File a bug if you rely on this envvar! It's going to go away eventually.
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
.collect_links(inputs)
6 changes: 6 additions & 0 deletions lychee-bin/src/options.rs
@@ -300,6 +300,11 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) skip_missing: bool,

/// Find links in verbatim sections like `pre`- and `code` blocks
#[structopt(long)]
#[serde(default)]
pub(crate) include_verbatim: bool,

/// Ignore case when expanding filesystem path glob inputs
#[structopt(long)]
#[serde(default)]
@@ -375,6 +380,7 @@ impl Config {
base: None;
basic_auth: None;
skip_missing: false;
include_verbatim: false;
glob_ignore_case: false;
output: None;
require_https: false;
33 changes: 32 additions & 1 deletion lychee-bin/tests/cli.rs
@@ -9,7 +9,7 @@ mod cli {

use assert_cmd::Command;
use http::StatusCode;
use predicates::str::contains;
use predicates::str::{contains, is_empty};
use pretty_assertions::assert_eq;
use uuid::Uuid;

@@ -603,6 +603,37 @@ mod cli {
Ok(())
}

#[test]
fn test_include_verbatim() -> Result<()> {
let mut cmd = main_command();
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");

cmd.arg("--include-verbatim")
.arg(input)
.arg("--dump")
.assert()
.success()
.stdout(contains("http://127.0.0.1/block"))
.stdout(contains("http://127.0.0.1/inline"))
.stdout(contains("http://127.0.0.1/bash"));

Ok(())
}

#[test]
fn test_exclude_verbatim() -> Result<()> {
let mut cmd = main_command();
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");

cmd.arg(input)
.arg("--dump")
.assert()
.success()
.stdout(is_empty());

Ok(())
}

#[test]
fn test_require_https() -> Result<()> {
let mut cmd = main_command();
16 changes: 11 additions & 5 deletions lychee-lib/src/collector.rs
@@ -13,6 +13,7 @@ use par_stream::ParStreamExt;
pub struct Collector {
base: Option<Base>,
skip_missing_inputs: bool,
include_verbatim: bool,
use_html5ever: bool,
}

@@ -24,6 +25,7 @@ impl Collector {
base,
skip_missing_inputs: false,
use_html5ever: false,
include_verbatim: false,
}
}

@@ -41,6 +43,13 @@ impl Collector {
self
}

/// Include links found in verbatim sections (like Markdown code blocks)
#[must_use]
pub const fn include_verbatim(mut self, yes: bool) -> Self {
self.include_verbatim = yes;
self
}

/// Fetch all unique links from inputs
/// All relative URLs get prefixed with `base` (if given).
/// (This can be a directory or a base URL)
@@ -63,11 +72,8 @@ impl Collector {
let base = base.clone();
async move {
let content = content?;
let uris: Vec<RawUri> = if self.use_html5ever {
Extractor::extract_html5ever(&content)
} else {
Extractor::extract(&content)
};
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
let uris: Vec<RawUri> = extractor.extract(&content);
let requests = request::create(uris, &content, &base)?;
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
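For library users, the new builder method composes like the existing ones; a usage sketch mirroring the wiring in `lychee-bin/src/main.rs` above (building the `inputs` vector and consuming the resulting links are elided):

```rust
use lychee_lib::{Collector, Input};

// Sketch only: collect requests, including links found inside code blocks.
// Assumes `inputs: Vec<Input>` was built elsewhere (e.g. from CLI args).
async fn collect_with_verbatim(inputs: Vec<Input>) {
    let links = Collector::new(None) // no base URL
        .skip_missing_inputs(false)
        .include_verbatim(true) // opt in to links in verbatim sections
        .collect_links(inputs)
        .await;
    // `links` yields the collected requests; consume them as needed.
    let _ = links;
}
```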
101 changes: 90 additions & 11 deletions lychee-lib/src/extract/html.rs → lychee-lib/src/extract/html5ever.rs
@@ -1,15 +1,17 @@
use html5ever::{
buffer_queue::BufferQueue,
tendril::StrTendril,
tokenizer::{Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
};

use super::plaintext::extract_plaintext;
use super::{is_verbatim_elem, plaintext::extract_plaintext};
use crate::types::raw_uri::RawUri;

#[derive(Clone, Default)]
struct LinkExtractor {
links: Vec<RawUri>,
include_verbatim: bool,
inside_excluded_element: bool,
}

impl TokenSink for LinkExtractor {
@@ -18,20 +20,30 @@ impl TokenSink for LinkExtractor {
#[allow(clippy::match_same_arms)]
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
match token {
Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)),
Token::CharacterTokens(raw) => {
if self.inside_excluded_element {
return TokenSinkResult::Continue;
}
self.links.extend(extract_plaintext(&raw));
}
Token::TagToken(tag) => {
let Tag {
kind: _kind,
kind,
name,
self_closing: _self_closing,
attrs,
} = tag;
if !self.include_verbatim && is_verbatim_elem(&name) {
// Skip content inside excluded elements until we see the end tag.
self.inside_excluded_element = matches!(kind, TagKind::StartTag);
return TokenSinkResult::Continue;
}

for attr in attrs {
let urls = LinkExtractor::extract_urls_from_elem_attr(
attr.name.local.as_ref(),
name.as_ref(),
attr.value.as_ref(),
&attr.name.local,
&name,
&attr.value,
);

let new_urls = match urls {
@@ -61,8 +73,12 @@ impl TokenSink for LinkExtractor {
}

impl LinkExtractor {
pub(crate) fn new() -> Self {
LinkExtractor::default()
pub(crate) const fn new(include_verbatim: bool) -> Self {
Self {
links: vec![],
include_verbatim,
inside_excluded_element: false,
}
}

/// Extract all semantically known links from a given html attribute.
@@ -75,6 +91,7 @@ impl LinkExtractor {
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1

match (elem_name, attr_name) {
// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
@@ -115,13 +132,75 @@
}

/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
let mut input = BufferQueue::new();
input.push_back(StrTendril::from(buf));

let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default());
let mut tokenizer = Tokenizer::new(
LinkExtractor::new(include_verbatim),
TokenizerOpts::default(),
);
let _handle = tokenizer.feed(&mut input);
tokenizer.end();

tokenizer.sink.links
}

#[cfg(test)]
mod tests {
use super::*;

const HTML_INPUT: &str = r#"
<html>
<body>
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<pre>
Some random text
https://foo.com and http://bar.com/some/path
Something else
</pre>
<p><b>bold</b></p>
</body>
</html>"#;

#[test]
fn test_skip_verbatim() {
let expected = vec![RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
}];

let uris = extract_html(HTML_INPUT, false);
assert_eq!(uris, expected);
}

#[test]
fn test_include_verbatim() {
let expected = vec![
RawUri {
text: "https://example.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
RawUri {
text: "https://foo.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "http://bar.com/some/path".to_string(),
element: None,
attribute: None,
},
];

let uris = extract_html(HTML_INPUT, true);
assert_eq!(uris, expected);
}
}
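
The tokenizer above delegates the actual element check to `is_verbatim_elem`, imported from the parent `extract` module; that file is among the remaining diffs this page never loaded. A hypothetical sketch of such a predicate — the signature and element set are assumptions, not taken from the commit:

```rust
// Hypothetical sketch; the real predicate ships in the extract module,
// which is not shown on this page, and likely covers more elements.
pub(crate) fn is_verbatim_elem(name: &str) -> bool {
    matches!(name, "code" | "kbd" | "pre" | "samp" | "script" | "style")
}
```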
(Remaining file diffs not loaded on this page.)
