diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c60724d..7c26d3d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,7 @@ jobs: target key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - name: Run - run: cargo run -- ./README.md -d + run: cargo run -- ./README.md -d --disable-raw-link-check formatting: runs-on: ubuntu-latest diff --git a/README.md b/README.md index 1ea1cfa..5e5ed79 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ The following arguments are available: | `--markup-types` | `-t` | Comma separated list list of markup types which shall be checked. Possible values: `md`, `html` | | `--root-dir` | `-r` | All links to the file system starting with a slash on linux or backslash on windows will use another virtual root dir. For example the link in a file `[link](/dir/other/file.md)` checked with the cli arg `--root-dir /env/another/dir` will let *mlc* check the existence of `/env/another/dir/dir/other/file.md`. | | `--throttle` | `-T` | Number of milliseconds to wait in between web requests to the same host. Default is zero which means no throttling. Set this if you need to slow down the web request frequency to avoid `429 - Too Many Requests` responses. For example with `--throttle 15`, between each http check to the same host, 15 ms will be waited. Note that this setting can slow down the link checker. | +| `--disable-raw-link-check` | `-c` | Disable checking of raw links in code blocks and other text. By default, raw HTTP(S) URLs are extracted and checked from code blocks and inline code. | | `--csv` | | Path to csv file which contains all failed requests and warnings in the format `source,line,column,target,severity`. The severity column contains `ERR` for errors and `WARN` for warnings. | | `--files` | `-f` | Comma separated list of files which shall be checked. For example `--files "./README.md,./docs/README.md"` will check only the specified files. This is useful for checking specific files in a monorepo without having to exclude many directories. | | `--http-headers` | `-H` | Comma separated list of custom HTTP headers in the format `'Name: Value'`. This is useful for setting custom user agents or other headers required by specific websites. For example `--http-headers "User-Agent: Mozilla/5.0,X-Custom-Header: value"` will set both a custom user agent and an additional header. | @@ -216,6 +217,8 @@ markup-types=["Markdown","Html"] throttle= 100 # Path to the root folder used to resolve all relative paths root-dir="./" +# Disable checking of raw links in code blocks (enabled by default) +disable-raw-link-check = false # Path to csv file which contains all failed requests and warnings csv="output.csv" # List of specific files to check diff --git a/src/cli.rs b/src/cli.rs index f15cf3a..f5cb3f3 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -132,6 +132,14 @@ pub fn parse_args() -> Config { .action(ArgAction::SetTrue) .required(false), ) + .arg( + Arg::new("disable-raw-link-check") + .long("disable-raw-link-check") + .short('c') + .action(ArgAction::SetTrue) + .help("Disable checking of raw links in code blocks and other text. By default, raw HTTP(S) URLs are extracted and checked.") + .required(false), + ) .arg( Arg::new("files") .long("files") @@ -228,6 +236,10 @@ pub fn parse_args() -> Config { opt.gituntracked = Some(true); } + if matches.get_flag("disable-raw-link-check") { + opt.disable_raw_link_check = Some(true); + } + if let Some(files) = matches.get_many::("files") { let mut file_paths: Vec<_> = files .map(|x| Path::new(&normalize_path_separators(x)).to_path_buf()) diff --git a/src/lib.rs b/src/lib.rs index 503e2f8..3e84c48 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -62,6 +62,8 @@ pub struct OptionalConfig { #[serde(rename(deserialize = "gituntracked"))] pub gituntracked: Option, pub throttle: Option, + #[serde(rename(deserialize = "disable-raw-link-check"))] + pub disable_raw_link_check: Option, #[serde(rename(deserialize = "files"))] pub files: Option>, #[serde(rename(deserialize = "http-headers"))] @@ -120,6 +122,7 @@ IgnoreLinks: {} IgnorePath: {:?} Throttle: {} ms CSVFile: {:?} +DisableRawLinkCheck: {} Files: {:?} HttpHeaders: {:?}", self.optional.debug.unwrap_or(false), @@ -135,6 +138,7 @@ HttpHeaders: {:?}", ignore_path_str, self.optional.throttle.unwrap_or(0), csv_file_str, + self.optional.disable_raw_link_check.unwrap_or_default(), files_str, http_headers_str ) @@ -158,7 +162,9 @@ fn find_all_links(config: &Config) -> Vec Vec> { + fn find_links( + &self, + text: &str, + _config: &Config, + ) -> Vec> { let mut result: Vec> = Vec::new(); let mut state: ParserState = ParserState::Text; let mut link_column = 0; @@ -121,13 +126,22 @@ impl LinkExtractor for HtmlLinkExtractor { #[cfg(test)] mod tests { use super::*; + use crate::OptionalConfig; use ntest::test_case; + use std::path::PathBuf; + + fn default_config() -> Config { + Config { + directory: PathBuf::from("."), + optional: OptionalConfig::default(), + } + } #[test] fn no_link() { let le = HtmlLinkExtractor(); let input = "]This is not a no link

Bla

attribute."; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } @@ -135,14 +149,17 @@ mod tests { fn commented() { let le = HtmlLinkExtractor(); let input = "df "; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } #[test] fn space() { let le = HtmlLinkExtractor(); - let result = le.find_links("blah foo."); + let result = le.find_links( + "blah foo.", + &default_config(), + ); let expected = Ok(MarkupLink { target: "some file.html".to_string(), line: 1, @@ -155,7 +172,10 @@ mod tests { #[test] fn url_encoded_path() { let le = HtmlLinkExtractor(); - let result = le.find_links("blah foo."); + let result = le.find_links( + "blah foo.", + &default_config(), + ); let expected = Ok(MarkupLink { target: "some file.html".to_string(), line: 1, @@ -183,7 +203,7 @@ mod tests { )] fn links(input: &str, line: usize, column: usize) { let le = HtmlLinkExtractor(); - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); let expected = Ok(MarkupLink { target: "https://www.w3schools.com".to_string(), line, @@ -197,7 +217,7 @@ mod tests { fn ignore_disable_line() { let le = HtmlLinkExtractor(); let input = " link"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } @@ -205,7 +225,7 @@ mod tests { fn ignore_disable_next_line() { let le = HtmlLinkExtractor(); let input = "\nlink"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } @@ -213,7 +233,7 @@ mod tests { fn ignore_disable_block() { let le = HtmlLinkExtractor(); let input = "\nlink1\n\nlink2"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert_eq!(1, result.len()); assert_eq!(result[0].as_ref().unwrap().target, "http://example.com/"); assert_eq!(result[0].as_ref().unwrap().line, 4); @@ -223,7 +243,7 @@ mod tests { fn ignore_multiple_blocks() { let le = HtmlLinkExtractor(); let input = "1\n\n2\n\n3"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert_eq!(2, result.len()); assert_eq!(result[0].as_ref().unwrap().target, "http://a.com/"); assert_eq!(result[1].as_ref().unwrap().target, "http://c.com/"); diff --git a/src/link_extractors/link_extractor.rs b/src/link_extractors/link_extractor.rs index 8cbc8a9..8300ea4 100644 --- a/src/link_extractors/link_extractor.rs +++ b/src/link_extractors/link_extractor.rs @@ -1,6 +1,7 @@ use super::html_link_extractor::HtmlLinkExtractor; use super::markdown_link_extractor::MarkdownLinkExtractor; use crate::markup::{MarkupFile, MarkupType}; +use crate::Config; use std::env; use std::fmt; use std::fs; @@ -57,14 +58,17 @@ impl MarkupLink { } #[must_use] -pub fn find_links(file: &MarkupFile) -> Vec> { +pub fn find_links( + file: &MarkupFile, + config: &Config, +) -> Vec> { let path = &file.path; let link_extractor = link_extractor_factory(file.markup_type); info!("Scan file at path '{path}' for links."); match fs::read_to_string(path) { Ok(text) => { - let mut links = link_extractor.find_links(&text); + let mut links = link_extractor.find_links(&text, config); for l in &mut links { match l { Ok(link) => { @@ -92,5 +96,9 @@ fn link_extractor_factory(markup_type: MarkupType) -> Box { } pub trait LinkExtractor { - fn find_links(&self, text: &str) -> Vec>; + fn find_links( + &self, + text: &str, + config: &Config, + ) -> Vec>; } diff --git a/src/link_extractors/markdown_link_extractor.rs b/src/link_extractors/markdown_link_extractor.rs index 5d017a8..29fca5a 100644 --- a/src/link_extractors/markdown_link_extractor.rs +++ b/src/link_extractors/markdown_link_extractor.rs @@ -3,12 +3,26 @@ use super::ignore_comments::IgnoreRegions; use super::link_extractor::BrokenExtractedLink; use crate::link_extractors::link_extractor::LinkExtractor; use crate::link_extractors::link_extractor::MarkupLink; -use pulldown_cmark::{BrokenLink, Event, Options, Parser, Tag}; +use crate::Config; +use pulldown_cmark::{BrokenLink, Event, Options, Parser, Tag, TagEnd}; +use regex::Regex; + +lazy_static! { + // Regex to match HTTP(S) URLs in code blocks + // Matches URLs including those with parentheses, brackets, and query parameters + // The pattern matches greedily but we trim trailing punctuation in code + static ref CODE_BLOCK_URL_REGEX: Regex = + Regex::new(r"https?://[^\s<>]+").unwrap(); +} pub struct MarkdownLinkExtractor(); impl LinkExtractor for MarkdownLinkExtractor { - fn find_links(&self, text: &str) -> Vec> { + fn find_links( + &self, + text: &str, + config: &Config, + ) -> Vec> { use std::cell::RefCell; let result: RefCell>> = RefCell::new(Vec::new()); @@ -42,9 +56,13 @@ impl LinkExtractor for MarkdownLinkExtractor { let parser = Parser::new_with_broken_link_callback(text, Options::ENABLE_TASKLISTS, Some(callback)); + let check_code_blocks = !config.optional.disable_raw_link_check.unwrap_or(false); + let mut inside_link = false; + for (evt, range) in parser.into_offset_iter() { match evt { Event::Start(Tag::Link { dest_url, .. } | Tag::Image { dest_url, .. }) => { + inside_link = true; let line_col = converter.line_column_from_idx(range.start); // Skip if line is ignored @@ -59,9 +77,12 @@ impl LinkExtractor for MarkdownLinkExtractor { target: dest_url.to_string(), })); } + Event::End(TagEnd::Link | TagEnd::Image) => { + inside_link = false; + } Event::Html(html) | Event::InlineHtml(html) => { let line_col = converter.line_column_from_idx(range.start); - let html_result = html_extractor.find_links(html.as_ref()); + let html_result = html_extractor.find_links(html.as_ref(), config); let mut parsed_html = html_result .iter() .filter_map(|res| res.as_ref().ok()) @@ -90,6 +111,37 @@ impl LinkExtractor for MarkdownLinkExtractor { .collect(); result.borrow_mut().append(&mut parsed_html); } + Event::Text(code_text) | Event::Code(code_text) + if check_code_blocks && !inside_link => + { + // Extract HTTP(S) URLs from code blocks using the pre-compiled regex + for url_match in CODE_BLOCK_URL_REGEX.find_iter(code_text.as_ref()) { + let mut url = url_match.as_str(); + + // Trim common trailing punctuation that's likely not part of the URL + // But keep parentheses and brackets if they appear to be balanced + while let Some(last_char) = url.chars().last() { + if matches!(last_char, '.' | ',' | ';' | '!' | '?') { + url = &url[..url.len() - last_char.len_utf8()]; + } else { + break; + } + } + + if url.is_empty() { + continue; + } + + let url_start_idx = range.start + url_match.start(); + let url_line_col = converter.line_column_from_idx(url_start_idx); + result.borrow_mut().push(Ok(MarkupLink { + line: url_line_col.0, + column: url_line_col.1, + source: String::new(), + target: url.to_string(), + })); + } + } _ => (), }; } @@ -132,13 +184,32 @@ impl LineColumnConverter { #[cfg(test)] mod tests { use super::*; + use crate::OptionalConfig; use ntest::test_case; + use std::path::PathBuf; + + fn default_config() -> Config { + Config { + directory: PathBuf::from("."), + optional: OptionalConfig::default(), + } + } + + fn config_with_raw_link_check_disabled() -> Config { + Config { + directory: PathBuf::from("."), + optional: OptionalConfig { + disable_raw_link_check: Some(true), + ..OptionalConfig::default() + }, + } + } #[test] fn inline_no_link() { let le = MarkdownLinkExtractor(); let input = "]This is not a () link](! has no title attribute."; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } @@ -146,7 +217,7 @@ mod tests { fn commented_link() { let le = MarkdownLinkExtractor(); let input = "]This is not a () ."; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } @@ -155,7 +226,7 @@ mod tests { let le = MarkdownLinkExtractor(); let input = "\n\r\t\n[![](http://meritbadge.herokuapp.com/mlc)](https://crates.io/crates/mlc)"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); let img = Ok(MarkupLink { target: "http://meritbadge.herokuapp.com/mlc".to_string(), line: 3, @@ -175,7 +246,7 @@ mod tests { fn link_escaped() { let le = MarkdownLinkExtractor(); let input = "This is not a \\[link\\](random_link)."; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } @@ -183,7 +254,7 @@ mod tests { fn link_in_headline() { let le = MarkdownLinkExtractor(); let input = " # This is a [link](http://example.net/)."; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert_eq!(result[0].as_ref().unwrap().column, 15); } @@ -191,7 +262,7 @@ mod tests { fn no_link_colon() { let le = MarkdownLinkExtractor(); let input = "This is not a [link:bla."; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } @@ -199,7 +270,7 @@ mod tests { fn broken_reference_link() { let le = MarkdownLinkExtractor(); let input = "This is not a [link]:bla."; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); let expected = Err(BrokenExtractedLink { source: "".to_string(), @@ -215,7 +286,18 @@ mod tests { fn inline_code() { let le = MarkdownLinkExtractor(); let input = " `[code](http://example.net/)`, no link!."; - let result = le.find_links(input); + // With raw link checking enabled (default), raw URL in inline code is extracted + let result = le.find_links(input, &default_config()); + assert_eq!(1, result.len()); + assert_eq!(result[0].as_ref().unwrap().target, "http://example.net/)"); + } + + #[test] + fn inline_code_with_raw_link_check_disabled() { + let le = MarkdownLinkExtractor(); + let input = " `[code](http://example.net/)`, no link!."; + // With raw link checking disabled, inline code is ignored + let result = le.find_links(input, &config_with_raw_link_check_disabled()); assert!(result.is_empty()); } @@ -223,7 +305,7 @@ mod tests { fn link_near_inline_code() { let le = MarkdownLinkExtractor(); let input = " `bug` [code](http://example.net/), link!."; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); let expected = Ok(MarkupLink { target: "http://example.net/".to_string(), line: 1, @@ -237,7 +319,7 @@ mod tests { fn link_very_near_inline_code() { let le = MarkdownLinkExtractor(); let input = "`bug`[code](http://example.net/)"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); let expected = Ok(MarkupLink { target: "http://example.net/".to_string(), line: 1, @@ -250,8 +332,19 @@ mod tests { #[test] fn code_block() { let le = MarkdownLinkExtractor(); - let input = " ``` js\n[code](http://example.net/)```, no link!."; - let result = le.find_links(input); + let input = "```js\n[code](http://example.net/)\n```\nno link!."; + // With raw link checking enabled (default), the raw URL in the code block is extracted + let result = le.find_links(input, &default_config()); + assert_eq!(1, result.len()); + assert_eq!(result[0].as_ref().unwrap().target, "http://example.net/)"); + } + + #[test] + fn code_block_with_raw_link_check_disabled() { + let le = MarkdownLinkExtractor(); + let input = "```js\n[code](http://example.net/)\n```\nno link!."; + // With raw link checking disabled, code blocks are ignored + let result = le.find_links(input, &config_with_raw_link_check_disabled()); assert!(result.is_empty()); } @@ -259,7 +352,7 @@ mod tests { fn html_code_block() { let le = MarkdownLinkExtractor(); let input = ", no link!."; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } @@ -267,7 +360,7 @@ mod tests { fn escaped_code_block() { let le = MarkdownLinkExtractor(); let input = " klsdjf \\`[escape](http://example.net/)\\`, no link!."; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); let expected = Ok(MarkupLink { target: "http://example.net/".to_string(), line: 1, @@ -281,7 +374,18 @@ mod tests { fn link_in_code_block() { let le = MarkdownLinkExtractor(); let input = "```\n[only code](http://example.net/)\n```."; - let result = le.find_links(input); + // With raw link checking enabled (default), raw URL in code block is extracted + let result = le.find_links(input, &default_config()); + assert_eq!(1, result.len()); + assert_eq!(result[0].as_ref().unwrap().target, "http://example.net/)"); + } + + #[test] + fn link_in_code_block_with_raw_link_check_disabled() { + let le = MarkdownLinkExtractor(); + let input = "```\n[only code](http://example.net/)\n```."; + // With raw link checking disabled, code blocks are ignored + let result = le.find_links(input, &config_with_raw_link_check_disabled()); assert!(result.is_empty()); } @@ -290,7 +394,7 @@ mod tests { let le = MarkdownLinkExtractor(); let link_str = "http://example.net/"; let input = format!("\n\nBla ![This is an image link]({link_str})"); - let result = le.find_links(&input); + let result = le.find_links(&input, &default_config()); let expected = Ok(MarkupLink { target: link_str.to_string(), line: 3, @@ -305,7 +409,7 @@ mod tests { let le = MarkdownLinkExtractor(); let link_str = "http://example.net/"; let input = format!("[This link]({link_str}) has no title attribute."); - let result = le.find_links(&input); + let result = le.find_links(&input, &default_config()); let expected = Ok(MarkupLink { target: link_str.to_string(), line: 1, @@ -320,7 +424,7 @@ mod tests { let le = MarkdownLinkExtractor(); let link_str = "http://example.net/"; let input = format!("\n123[This is a link]({link_str} \"with title\") oh yea."); - let result = le.find_links(&input); + let result = le.find_links(&input, &default_config()); let expected = Ok(MarkupLink { target: link_str.to_string(), line: 2, @@ -337,7 +441,7 @@ mod tests { #[test_case("This is a short link ", 22)] fn inline_link(input: &str, column: usize) { let le = MarkdownLinkExtractor(); - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); let expected = Ok(MarkupLink { target: "http://example.net/".to_string(), line: 1, @@ -357,7 +461,7 @@ mod tests { )] fn html_link(input: &str) { let le = MarkdownLinkExtractor(); - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); let expected = Ok(MarkupLink { target: "http://example.net/".to_string(), line: 1, @@ -370,7 +474,10 @@ mod tests { #[test] fn html_link_ident() { let le = MarkdownLinkExtractor(); - let result = le.find_links("123 link text"); + let result = le.find_links( + "123 link text", + &default_config(), + ); let expected = Ok(MarkupLink { target: "http://example.net/".to_string(), line: 1, @@ -383,7 +490,10 @@ mod tests { #[test] fn html_link_new_line() { let le = MarkdownLinkExtractor(); - let result = le.find_links("\n123 link text"); + let result = le.find_links( + "\n123 link text", + &default_config(), + ); let expected = Ok(MarkupLink { target: "http://example.net/".to_string(), line: 2, @@ -396,7 +506,10 @@ mod tests { #[test] fn raw_html_issue_31() { let le = MarkdownLinkExtractor(); - let result = le.find_links("Some text link text more text."); + let result = le.find_links( + "Some text link text more text.", + &default_config(), + ); let expected = Ok(MarkupLink { target: "some_url".to_string(), line: 1, @@ -413,7 +526,7 @@ mod tests { let input = format!( "This is [an example][arbitrary case-insensitive reference text] reference-style link.\n\n[Arbitrary CASE-insensitive reference text]: {link_str}" ); - let result = le.find_links(&input); + let result = le.find_links(&input, &default_config()); let expected = Ok(MarkupLink { target: link_str.to_string(), line: 1, @@ -428,7 +541,7 @@ mod tests { let le = MarkdownLinkExtractor(); let link_str = "http://example.net/"; let input = format!("Foo Bar\n\n[Arbitrary CASE-insensitive reference text]: {link_str}"); - let result = le.find_links(&input); + let result = le.find_links(&input, &default_config()); assert_eq!(0, result.len()); } @@ -436,15 +549,116 @@ mod tests { fn referenced_link_no_tag_only() { let le = MarkdownLinkExtractor(); let input = "[link][reference]"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); + assert_eq!(1, result.len()); + } + + // Tests for code block link checking feature + #[test] + fn code_block_with_url_checked_by_default() { + let le = MarkdownLinkExtractor(); + let input = "```bash\nwget https://raw.githubusercontent.com/example/file.txt\n```"; + let result = le.find_links(input, &default_config()); + assert_eq!(1, result.len()); + let link = result[0].as_ref().unwrap(); + assert_eq!( + link.target, + "https://raw.githubusercontent.com/example/file.txt" + ); + } + + #[test] + fn code_block_with_url_not_checked_when_disabled() { + let le = MarkdownLinkExtractor(); + let input = "```bash\nwget https://raw.githubusercontent.com/example/file.txt\n```"; + let result = le.find_links(input, &config_with_raw_link_check_disabled()); + assert!(result.is_empty()); + } + + #[test] + fn code_block_with_url_checked_when_enabled() { + let le = MarkdownLinkExtractor(); + let input = "```bash\nwget https://raw.githubusercontent.com/example/file.txt\n```"; + let result = le.find_links(input, &default_config()); assert_eq!(1, result.len()); + let link = result[0].as_ref().unwrap(); + assert_eq!( + link.target, + "https://raw.githubusercontent.com/example/file.txt" + ); + } + + #[test] + fn inline_code_with_url_checked_when_enabled() { + let le = MarkdownLinkExtractor(); + let input = "Use `wget https://example.com/file.txt` to download."; + let result = le.find_links(input, &default_config()); + assert_eq!(1, result.len()); + let link = result[0].as_ref().unwrap(); + assert_eq!(link.target, "https://example.com/file.txt"); + } + + #[test] + fn code_block_with_multiple_urls() { + let le = MarkdownLinkExtractor(); + let input = + "```\nwget https://example.com/file1.txt\ncurl http://example.org/file2.txt\n```"; + let result = le.find_links(input, &default_config()); + assert_eq!(2, result.len()); + assert_eq!( + result[0].as_ref().unwrap().target, + "https://example.com/file1.txt" + ); + assert_eq!( + result[1].as_ref().unwrap().target, + "http://example.org/file2.txt" + ); + } + + #[test] + fn code_block_url_with_special_chars() { + let le = MarkdownLinkExtractor(); + let input = "```\nhttps://example.com/path?param=value&other=123\n```"; + let result = le.find_links(input, &default_config()); + assert_eq!(1, result.len()); + assert_eq!( + result[0].as_ref().unwrap().target, + "https://example.com/path?param=value&other=123" + ); + } + + #[test] + fn code_block_url_with_parentheses() { + let le = MarkdownLinkExtractor(); + let input = + "```\nSee https://en.wikipedia.org/wiki/Markdown_(markup_language) for details\n```"; + let result = le.find_links(input, &default_config()); + assert_eq!(1, result.len()); + // URL should include the parentheses in the path + assert_eq!( + result[0].as_ref().unwrap().target, + "https://en.wikipedia.org/wiki/Markdown_(markup_language)" + ); + } + + #[test] + fn code_block_url_ending_with_punctuation() { + let le = MarkdownLinkExtractor(); + let input = "```\nVisit https://example.com/page. Then do something.\n```"; + let result = le.find_links(input, &default_config()); + assert_eq!(1, result.len()); + // URL should NOT include the trailing period + assert_eq!( + result[0].as_ref().unwrap().target, + "https://example.com/page" + ); } #[test] fn ignore_disable_line() { let le = MarkdownLinkExtractor(); let input = " [link](http://example.net/)"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } @@ -452,7 +666,7 @@ mod tests { fn ignore_disable_next_line() { let le = MarkdownLinkExtractor(); let input = "\n[link](http://example.net/)"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } @@ -460,7 +674,7 @@ mod tests { fn ignore_disable_block() { let le = MarkdownLinkExtractor(); let input = "\n[link1](http://example.net/)\n\n[link2](http://example.com/)"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert_eq!(1, result.len()); assert_eq!(result[0].as_ref().unwrap().target, "http://example.com/"); assert_eq!(result[0].as_ref().unwrap().line, 4); @@ -470,7 +684,7 @@ mod tests { fn ignore_multiple_blocks() { let le = MarkdownLinkExtractor(); let input = "[link1](http://a.com/)\n\n[link2](http://b.com/)\n\n[link3](http://c.com/)\n\n[link4](http://d.com/)\n\n[link5](http://e.com/)"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert_eq!(3, result.len()); assert_eq!(result[0].as_ref().unwrap().target, "http://a.com/"); assert_eq!(result[1].as_ref().unwrap().target, "http://c.com/"); @@ -481,7 +695,7 @@ mod tests { fn ignore_html_link_in_markdown() { let le = MarkdownLinkExtractor(); let input = "\nlink"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert!(result.is_empty()); } @@ -489,7 +703,7 @@ mod tests { fn ignore_mixed_types() { let le = MarkdownLinkExtractor(); let input = "[link1](http://a.com/)\n [link2](http://b.com/)\n[link3](http://c.com/)"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); assert_eq!(2, result.len()); assert_eq!(result[0].as_ref().unwrap().target, "http://a.com/"); assert_eq!(result[1].as_ref().unwrap().target, "http://c.com/"); @@ -499,7 +713,7 @@ mod tests { fn gfm_checkbox_not_link() { let le = MarkdownLinkExtractor(); let input = "- [x] checked task\n- [ ] unchecked task"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); // GitHub-flavored markdown task list checkboxes should NOT be treated as links assert!( result.is_empty(), @@ -512,7 +726,7 @@ mod tests { fn gfm_checkbox_with_link() { let le = MarkdownLinkExtractor(); let input = "- [x] [actual link](http://example.com/)\n- [ ] unchecked task"; - let result = le.find_links(input); + let result = le.find_links(input, &default_config()); // Only the actual link should be detected, not the checkboxes assert_eq!(1, result.len()); assert_eq!(result[0].as_ref().unwrap().target, "http://example.com/"); diff --git a/tests/end_to_end.rs b/tests/end_to_end.rs index 890478f..88347bd 100644 --- a/tests/end_to_end.rs +++ b/tests/end_to_end.rs @@ -16,7 +16,7 @@ async fn end_to_end() { debug: None, do_not_warn_for_redirect_to: None, markup_types: Some(vec![MarkupType::Markdown]), - offline: Some(true), // Use offline mode to avoid checking external URLs + offline: Some(true), // Set to offline mode to avoid external network calls match_file_extension: None, throttle: None, ignore_links: Some(vec!["./doc/broken-local-link.doc".to_string()]), @@ -28,6 +28,7 @@ async fn end_to_end() { gitignore: None, gituntracked: None, csv_file: None, + disable_raw_link_check: None, files: None, http_headers: None, }, @@ -56,6 +57,7 @@ async fn end_to_end_different_root() { gitignore: None, gituntracked: None, csv_file: Some(csv_output.clone()), + disable_raw_link_check: None, files: None, http_headers: None, }, @@ -89,6 +91,7 @@ async fn end_to_end_write_csv_file() { gitignore: None, gituntracked: None, csv_file: Some(csv_output.clone()), + disable_raw_link_check: None, files: None, http_headers: None, }, @@ -131,6 +134,7 @@ async fn end_to_end_csv_include_warnings() { csv_file: Some(csv_output.clone()), files: None, http_headers: None, + disable_raw_link_check: None, }, }; // Run the check - should succeed because we're offline @@ -177,3 +181,71 @@ async fn end_to_end_csv_include_warnings() { // Also verify the test would pass assert!(result.is_ok(), "Should succeed with warnings only"); } + +#[tokio::test] +async fn end_to_end_code_block_links_enabled() { + // Test that raw links in code blocks are checked by default + let test_file = "tests/test_files/code_block_links.md"; + let config = Config { + directory: test_file.into(), + optional: OptionalConfig { + debug: None, + do_not_warn_for_redirect_to: None, + markup_types: Some(vec![MarkupType::Markdown]), + offline: Some(true), // Offline mode to avoid actual HTTP calls + match_file_extension: None, + throttle: None, + ignore_links: None, + ignore_path: None, + root_dir: None, + gitignore: None, + gituntracked: None, + csv_file: None, + disable_raw_link_check: None, // Default: enabled (checks raw links in code blocks) + files: None, + http_headers: None, + }, + }; + + // Run the check - should succeed in offline mode (links are skipped but counted) + let result = mlc::run(&config).await; + assert!( + result.is_ok(), + "Should succeed in offline mode: {:?}", + result + ); +} + +#[tokio::test] +async fn end_to_end_code_block_links_disabled() { + // Test that raw links in code blocks can be disabled + let test_file = "tests/test_files/code_block_links.md"; + let config = Config { + directory: test_file.into(), + optional: OptionalConfig { + debug: None, + do_not_warn_for_redirect_to: None, + markup_types: Some(vec![MarkupType::Markdown]), + offline: Some(true), // Offline mode to avoid actual HTTP calls + match_file_extension: None, + throttle: None, + ignore_links: None, + ignore_path: None, + root_dir: None, + gitignore: None, + gituntracked: None, + csv_file: None, + disable_raw_link_check: Some(true), // Disable raw link checking + files: None, + http_headers: None, + }, + }; + + // Run the check - should succeed + let result = mlc::run(&config).await; + assert!( + result.is_ok(), + "Should succeed with raw link checking disabled: {:?}", + result + ); +} diff --git a/tests/end_to_end_mock.rs b/tests/end_to_end_mock.rs index 694d1f4..98571f9 100644 --- a/tests/end_to_end_mock.rs +++ b/tests/end_to_end_mock.rs @@ -105,6 +105,7 @@ async fn end_to_end_with_mock_servers() { csv_file: None, files: None, http_headers: None, + disable_raw_link_check: None, }, }; @@ -160,6 +161,7 @@ async fn end_to_end_with_mock_server_failure() { csv_file: None, files: None, http_headers: None, + disable_raw_link_check: None, }, }; @@ -229,6 +231,7 @@ async fn end_to_end_with_mock_server_redirect() { csv_file: None, files: None, http_headers: None, + disable_raw_link_check: None, }, }; diff --git a/tests/gitignore_recursive.rs b/tests/gitignore_recursive.rs index 56aa68a..872f0a2 100644 --- a/tests/gitignore_recursive.rs +++ b/tests/gitignore_recursive.rs @@ -106,6 +106,7 @@ async fn gitignore_is_recursive_nested_gitignore_is_respected() { throttle: None, files: None, http_headers: None, + disable_raw_link_check: None, }, }; diff --git a/tests/markdown_files.rs b/tests/markdown_files.rs index 7107de0..07ef062 100644 --- a/tests/markdown_files.rs +++ b/tests/markdown_files.rs @@ -1,6 +1,15 @@ #[cfg(test)] use mlc::link_extractors::link_extractor::find_links; use mlc::markup::{MarkupFile, MarkupType}; +use mlc::{Config, OptionalConfig}; +use std::path::PathBuf; + +fn default_config() -> Config { + Config { + directory: PathBuf::from("."), + optional: OptionalConfig::default(), + } +} #[test] fn no_links() { @@ -9,7 +18,7 @@ fn no_links() { path, markup_type: MarkupType::Markdown, }; - let result = find_links(&file); + let result = find_links(&file, &default_config()); assert!(result.is_empty()); } @@ -20,6 +29,6 @@ fn some_links() { path, markup_type: MarkupType::Markdown, }; - let result = find_links(&file); + let result = find_links(&file, &default_config()); assert_eq!(result.len(), 12); } diff --git a/tests/test_files/code_block_links.md b/tests/test_files/code_block_links.md new file mode 100644 index 0000000..768f1c9 --- /dev/null +++ b/tests/test_files/code_block_links.md @@ -0,0 +1,22 @@ +# Test Code Block Links + +This file tests the raw link checking feature in code blocks. + +## Regular link (should always be checked) +[Regular link](http://example.com/) + +## Code block with raw URLs (checked by default, can be disabled) + +```bash +# Download config files +wget http://example.com/config.yml +curl https://example.org/data.json +``` + +## Inline code with URL +Use `curl http://example.com/api` to fetch data. + +## Another code block +``` +http://example.net/path +```