Implement excluding code blocks (lycheeverse#523)
This is done in the extractor to avoid unnecessary allocations.
mre authored Mar 26, 2022
1 parent 5a77209 commit d616177
Showing 12 changed files with 432 additions and 60 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -217,6 +217,7 @@ FLAGS:
--exclude-private Exclude private IP address ranges from checking
--glob-ignore-case Ignore case when expanding filesystem path glob inputs
--help Prints help information
--include-verbatim Find links in verbatim sections like `pre`- and `code` blocks
-i, --insecure Proceed for server connections considered insecure (invalid TLS)
-n, --no-progress Do not show progress bar.
This is recommended for non-interactive shells (e.g. for continuous integration)
3 changes: 2 additions & 1 deletion benches/src/extract.rs
@@ -6,7 +6,8 @@ use std::path::PathBuf;
fn extract(paths: &[PathBuf]) {
for path in paths {
let content: InputContent = path.try_into().unwrap();
let extracted = Extractor::extract(&content);
let extractor = Extractor::default();
let extracted = extractor.extract(&content);
println!("{}", extracted.len());
}
}
3 changes: 2 additions & 1 deletion examples/extract/extract.rs
@@ -6,7 +6,8 @@ use std::fs;
#[tokio::main]
async fn main() -> Result<()> {
let input = fs::read_to_string("fixtures/elvis.html").unwrap();
let links = Extractor::extract(&InputContent::from_string(&input, FileType::Html));
let extractor = Extractor::default();
let links = extractor.extract(&InputContent::from_string(&input, FileType::Html));
println!("{links:#?}");

Ok(())
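Both call sites above switch from the old associated function `Extractor::extract(&content)` to an instance method on a configurable extractor. The `Extractor` definition itself is in one of the changed files this page does not show; the following is a minimal sketch of the surface implied by these diffs, with the field layout assumed from the `Extractor::new(self.use_html5ever, self.include_verbatim)` call in `collector.rs` further down.

```rust
// Hypothetical reconstruction of the new Extractor surface; the real
// definition lives in lychee-lib's extract module, which this page omits.
#[derive(Clone, Default)]
pub struct Extractor {
    // Use the html5ever tokenizer instead of the default HTML parser.
    use_html5ever: bool,
    // Also extract links from verbatim sections (`pre`/`code`, code fences).
    include_verbatim: bool,
}

impl Extractor {
    // Both flags default to false, matching the `Extractor::default()`
    // calls in the benchmark and example above.
    pub const fn new(use_html5ever: bool, include_verbatim: bool) -> Self {
        Self { use_html5ever, include_verbatim }
    }
}
```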
11 changes: 11 additions & 0 deletions fixtures/TEST_CODE_BLOCKS.md
@@ -0,0 +1,11 @@
# Test Links In Code

```
http://127.0.0.1/block
```

```bash
http://127.0.0.1/bash
```

`http://127.0.0.1/inline` will also be excluded by default
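
This fixture can also be exercised against the CLI directly; a minimal sketch using `std::process::Command`, assuming a built `lychee` binary on `PATH` and the repository root as the working directory:

```rust
use std::process::Command;

fn main() {
    // With --include-verbatim, all three URLs in the fixture are dumped;
    // dropping the flag yields empty output (see the cli.rs tests below).
    let out = Command::new("lychee")
        .args(["--include-verbatim", "--dump", "fixtures/TEST_CODE_BLOCKS.md"])
        .output()
        .expect("failed to run lychee");
    print!("{}", String::from_utf8_lossy(&out.stdout));
}
```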
1 change: 1 addition & 0 deletions lychee-bin/src/main.rs
@@ -223,6 +223,7 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
let inputs = opts.inputs();
let requests = Collector::new(opts.config.base.clone())
.skip_missing_inputs(opts.config.skip_missing)
.include_verbatim(opts.config.include_verbatim)
// File a bug if you rely on this envvar! It's going to go away eventually.
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").map_or(false, |x| x == "1"))
.collect_links(inputs)
6 changes: 6 additions & 0 deletions lychee-bin/src/options.rs
@@ -300,6 +300,11 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) skip_missing: bool,

/// Find links in verbatim sections like `pre`- and `code` blocks
#[structopt(long)]
#[serde(default)]
pub(crate) include_verbatim: bool,

/// Ignore case when expanding filesystem path glob inputs
#[structopt(long)]
#[serde(default)]
@@ -375,6 +380,7 @@ impl Config {
base: None;
basic_auth: None;
skip_missing: false;
include_verbatim: false;
glob_ignore_case: false;
output: None;
require_https: false;
33 changes: 32 additions & 1 deletion lychee-bin/tests/cli.rs
@@ -9,7 +9,7 @@ mod cli {

use assert_cmd::Command;
use http::StatusCode;
use predicates::str::contains;
use predicates::str::{contains, is_empty};
use pretty_assertions::assert_eq;
use uuid::Uuid;

@@ -603,6 +603,37 @@ mod cli {
Ok(())
}

#[test]
fn test_include_verbatim() -> Result<()> {
let mut cmd = main_command();
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");

cmd.arg("--include-verbatim")
.arg(input)
.arg("--dump")
.assert()
.success()
.stdout(contains("http://127.0.0.1/block"))
.stdout(contains("http://127.0.0.1/inline"))
.stdout(contains("http://127.0.0.1/bash"));

Ok(())
}

#[test]
fn test_exclude_verbatim() -> Result<()> {
let mut cmd = main_command();
let input = fixtures_path().join("TEST_CODE_BLOCKS.md");

cmd.arg(input)
.arg("--dump")
.assert()
.success()
.stdout(is_empty());

Ok(())
}

#[test]
fn test_require_https() -> Result<()> {
let mut cmd = main_command();
16 changes: 11 additions & 5 deletions lychee-lib/src/collector.rs
@@ -13,6 +13,7 @@ use par_stream::ParStreamExt;
pub struct Collector {
base: Option<Base>,
skip_missing_inputs: bool,
include_verbatim: bool,
use_html5ever: bool,
}

@@ -24,6 +25,7 @@ impl Collector {
base,
skip_missing_inputs: false,
use_html5ever: false,
include_verbatim: false,
}
}

@@ -41,6 +43,13 @@ impl Collector {
self
}

/// Include links found in verbatim sections (like Markdown code blocks)
#[must_use]
pub const fn include_verbatim(mut self, yes: bool) -> Self {
self.include_verbatim = yes;
self
}

/// Fetch all unique links from inputs
/// All relative URLs get prefixed with `base` (if given).
/// (This can be a directory or a base URL)
@@ -63,11 +72,8 @@ impl Collector {
let base = base.clone();
async move {
let content = content?;
let uris: Vec<RawUri> = if self.use_html5ever {
Extractor::extract_html5ever(&content)
} else {
Extractor::extract(&content)
};
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
let uris: Vec<RawUri> = extractor.extract(&content);
let requests = request::create(uris, &content, &base)?;
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
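For library users, the new builder method composes like the existing ones; a usage sketch mirroring the wiring in `lychee-bin/src/main.rs` above (building the `inputs` vector and consuming the resulting links are elided):

```rust
use lychee_lib::{Collector, Input};

// Sketch only: collect requests, including links found inside code blocks.
// Assumes `inputs: Vec<Input>` was built elsewhere (e.g. from CLI args).
async fn collect_with_verbatim(inputs: Vec<Input>) {
    let links = Collector::new(None) // no base URL
        .skip_missing_inputs(false)
        .include_verbatim(true) // opt in to links in verbatim sections
        .collect_links(inputs)
        .await;
    // `links` yields the collected requests; consume them as needed.
    let _ = links;
}
```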
101 changes: 90 additions & 11 deletions lychee-lib/src/extract/html.rs → lychee-lib/src/extract/html5ever.rs
@@ -1,15 +1,17 @@
use html5ever::{
buffer_queue::BufferQueue,
tendril::StrTendril,
tokenizer::{Tag, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
};

use super::plaintext::extract_plaintext;
use super::{is_verbatim_elem, plaintext::extract_plaintext};
use crate::types::raw_uri::RawUri;

#[derive(Clone, Default)]
struct LinkExtractor {
links: Vec<RawUri>,
include_verbatim: bool,
inside_excluded_element: bool,
}

impl TokenSink for LinkExtractor {
@@ -18,20 +20,30 @@ impl TokenSink for LinkExtractor {
#[allow(clippy::match_same_arms)]
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
match token {
Token::CharacterTokens(raw) => self.links.extend(extract_plaintext(&raw)),
Token::CharacterTokens(raw) => {
if self.inside_excluded_element {
return TokenSinkResult::Continue;
}
self.links.extend(extract_plaintext(&raw));
}
Token::TagToken(tag) => {
let Tag {
kind: _kind,
kind,
name,
self_closing: _self_closing,
attrs,
} = tag;
if !self.include_verbatim && is_verbatim_elem(&name) {
// Skip content inside excluded elements until we see the end tag.
self.inside_excluded_element = matches!(kind, TagKind::StartTag);
return TokenSinkResult::Continue;
}

for attr in attrs {
let urls = LinkExtractor::extract_urls_from_elem_attr(
attr.name.local.as_ref(),
name.as_ref(),
attr.value.as_ref(),
&attr.name.local,
&name,
&attr.value,
);

let new_urls = match urls {
@@ -61,8 +73,12 @@ impl TokenSink for LinkExtractor {
}

impl LinkExtractor {
pub(crate) fn new() -> Self {
LinkExtractor::default()
pub(crate) const fn new(include_verbatim: bool) -> Self {
Self {
links: vec![],
include_verbatim,
inside_excluded_element: false,
}
}

/// Extract all semantically known links from a given html attribute.
@@ -75,6 +91,7 @@ impl LinkExtractor {
// For a comprehensive list of elements that might contain URLs/URIs
// see https://www.w3.org/TR/REC-html40/index/attributes.html
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1

match (elem_name, attr_name) {
// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
@@ -115,13 +132,75 @@
}

/// Extract unparsed URL strings from an HTML string.
pub(crate) fn extract_html(buf: &str) -> Vec<RawUri> {
pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
let mut input = BufferQueue::new();
input.push_back(StrTendril::from(buf));

let mut tokenizer = Tokenizer::new(LinkExtractor::new(), TokenizerOpts::default());
let mut tokenizer = Tokenizer::new(
LinkExtractor::new(include_verbatim),
TokenizerOpts::default(),
);
let _handle = tokenizer.feed(&mut input);
tokenizer.end();

tokenizer.sink.links
}

#[cfg(test)]
mod tests {
use super::*;

const HTML_INPUT: &str = r#"
<html>
<body>
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<pre>
Some random text
https://foo.com and http://bar.com/some/path
Something else
</pre>
<p><b>bold</b></p>
</body>
</html>"#;

#[test]
fn test_skip_verbatim() {
let expected = vec![RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
}];

let uris = extract_html(HTML_INPUT, false);
assert_eq!(uris, expected);
}

#[test]
fn test_include_verbatim() {
let expected = vec![
RawUri {
text: "https://example.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "https://example.org".to_string(),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
},
RawUri {
text: "https://foo.com".to_string(),
element: None,
attribute: None,
},
RawUri {
text: "http://bar.com/some/path".to_string(),
element: None,
attribute: None,
},
];

let uris = extract_html(HTML_INPUT, true);
assert_eq!(uris, expected);
}
}
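
The tokenizer above delegates the actual element check to `is_verbatim_elem`, imported from the parent `extract` module; that file is among the remaining diffs this page never loaded. A hypothetical sketch of such a predicate — the signature and element set are assumptions, not taken from the commit:

```rust
// Hypothetical sketch; the real predicate ships in the extract module,
// which is not shown on this page, and likely covers more elements.
pub(crate) fn is_verbatim_elem(name: &str) -> bool {
    matches!(name, "code" | "kbd" | "pre" | "samp" | "script" | "style")
}
```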
(Remaining file diffs not loaded on this page.)
