Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
be5d5ed
Initial plan
Copilot Dec 8, 2025
f495c71
Add --check-links-in-code-blocks option to enable link checking in co…
Copilot Dec 8, 2025
5dd788f
Update documentation and fix integration tests
Copilot Dec 8, 2025
220a47b
Address code review feedback: improve regex performance and URL patte…
Copilot Dec 8, 2025
ba6c5f8
Changes before error encountered
Copilot Dec 9, 2025
ff90290
Resolve merge conflicts with master branch
Copilot Dec 9, 2025
dc01dcf
Merge branch 'master' into copilot/add-link-check-option
becheran Dec 9, 2025
97227e5
Merge branch 'master' into copilot/add-link-check-option
becheran Dec 9, 2025
d72f93a
Merge branch 'master' into copilot/add-link-check-option
becheran Dec 10, 2025
0231e32
Merge branch 'master' into copilot/add-link-check-option
becheran Dec 10, 2025
1ff7743
Merge branch 'master' into copilot/add-link-check-option
becheran Dec 10, 2025
4f9e254
Merge branch 'master' into copilot/add-link-check-option
becheran Dec 11, 2025
65ab6a2
Merge branch 'master' into copilot/add-link-check-option
becheran Dec 11, 2025
14fda1d
Merge branch 'master' into copilot/add-link-check-option
becheran Dec 13, 2025
6ecd421
Fix failing tests after merge - add missing config parameter and field
Copilot Dec 13, 2025
edb49e2
Merge branch 'master' into copilot/add-link-check-option
becheran Dec 13, 2025
5f567ab
Fix code review feedback: update CSV description and remove extra comma
Copilot Dec 15, 2025
015368d
Merge branch 'master' into copilot/add-link-check-option
becheran Dec 15, 2025
473192e
Fix failing test: add missing disable_raw_link_check field to gitigno…
Copilot Dec 15, 2025
d35363d
Merge branch 'master' into copilot/add-link-check-option
becheran Dec 18, 2025
3e243b6
Add end-to-end tests for raw link checking in code blocks
Copilot Dec 18, 2025
e582698
Disable raw link checking in test_own_readme CI job
Copilot Dec 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
target
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
- name: Run
run: cargo run -- ./README.md -d
run: cargo run -- ./README.md -d --disable-raw-link-check

formatting:
runs-on: ubuntu-latest
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ The following arguments are available:
| `--markup-types` | `-t` | Comma separated list of markup types which shall be checked. Possible values: `md`, `html` |
| `--root-dir` | `-r` | All links to the file system starting with a slash on linux or backslash on windows will use another virtual root dir. For example the link in a file `[link](/dir/other/file.md)` checked with the cli arg `--root-dir /env/another/dir` will let *mlc* check the existence of `/env/another/dir/dir/other/file.md`. |
| `--throttle` | `-T` | Number of milliseconds to wait in between web requests to the same host. Default is zero which means no throttling. Set this if you need to slow down the web request frequency to avoid `429 - Too Many Requests` responses. For example with `--throttle 15`, between each http check to the same host, 15 ms will be waited. Note that this setting can slow down the link checker. |
| `--disable-raw-link-check` | `-c` | Disable checking of raw links in code blocks and other text. By default, raw HTTP(S) URLs are extracted from code blocks and inline code and checked. |
| `--csv` | | Path to csv file which contains all failed requests and warnings in the format `source,line,column,target,severity`. The severity column contains `ERR` for errors and `WARN` for warnings. |
| `--files` | `-f` | Comma separated list of files which shall be checked. For example `--files "./README.md,./docs/README.md"` will check only the specified files. This is useful for checking specific files in a monorepo without having to exclude many directories. |
| `--http-headers` | `-H` | Comma separated list of custom HTTP headers in the format `'Name: Value'`. This is useful for setting custom user agents or other headers required by specific websites. For example `--http-headers "User-Agent: Mozilla/5.0,X-Custom-Header: value"` will set both a custom user agent and an additional header. |
Expand Down Expand Up @@ -216,6 +217,8 @@ markup-types=["Markdown","Html"]
throttle= 100
# Path to the root folder used to resolve all relative paths
root-dir="./"
# Set to true to disable checking of raw links in code blocks (raw link checking is enabled by default)
disable-raw-link-check = false
# Path to csv file which contains all failed requests and warnings
csv="output.csv"
# List of specific files to check
Expand Down
12 changes: 12 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,14 @@ pub fn parse_args() -> Config {
.action(ArgAction::SetTrue)
.required(false),
)
.arg(
Arg::new("disable-raw-link-check")
.long("disable-raw-link-check")
.short('c')
.action(ArgAction::SetTrue)
.help("Disable checking of raw links in code blocks and other text. By default, raw HTTP(S) URLs are extracted and checked.")
.required(false),
)
.arg(
Arg::new("files")
.long("files")
Expand Down Expand Up @@ -228,6 +236,10 @@ pub fn parse_args() -> Config {
opt.gituntracked = Some(true);
}

if matches.get_flag("disable-raw-link-check") {
opt.disable_raw_link_check = Some(true);
}

if let Some(files) = matches.get_many::<String>("files") {
let mut file_paths: Vec<_> = files
.map(|x| Path::new(&normalize_path_separators(x)).to_path_buf())
Expand Down
8 changes: 7 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ pub struct OptionalConfig {
#[serde(rename(deserialize = "gituntracked"))]
pub gituntracked: Option<bool>,
pub throttle: Option<u32>,
#[serde(rename(deserialize = "disable-raw-link-check"))]
pub disable_raw_link_check: Option<bool>,
#[serde(rename(deserialize = "files"))]
pub files: Option<Vec<PathBuf>>,
#[serde(rename(deserialize = "http-headers"))]
Expand Down Expand Up @@ -120,6 +122,7 @@ IgnoreLinks: {}
IgnorePath: {:?}
Throttle: {} ms
CSVFile: {:?}
DisableRawLinkCheck: {}
Files: {:?}
HttpHeaders: {:?}",
self.optional.debug.unwrap_or(false),
Expand All @@ -135,6 +138,7 @@ HttpHeaders: {:?}",
ignore_path_str,
self.optional.throttle.unwrap_or(0),
csv_file_str,
self.optional.disable_raw_link_check.unwrap_or_default(),
files_str,
http_headers_str
)
Expand All @@ -158,7 +162,9 @@ fn find_all_links(config: &Config) -> Vec<Result<MarkupLink, BrokenExtractedLink
file_traversal::find(config, &mut files);
let mut links = vec![];
for file in files {
links.append(&mut link_extractors::link_extractor::find_links(&file));
links.append(&mut link_extractors::link_extractor::find_links(
&file, config,
));
}
links
}
Expand Down
40 changes: 30 additions & 10 deletions src/link_extractors/html_link_extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use crate::link_extractors::link_extractor::LinkExtractor;
use crate::link_extractors::link_extractor::MarkupLink;
use crate::link_validator::link_type::get_link_type;
use crate::link_validator::link_type::LinkType;
use crate::Config;

use super::ignore_comments::IgnoreRegions;
use super::link_extractor::BrokenExtractedLink;
Expand All @@ -17,7 +18,11 @@ enum ParserState {
}

impl LinkExtractor for HtmlLinkExtractor {
fn find_links(&self, text: &str) -> Vec<Result<MarkupLink, BrokenExtractedLink>> {
fn find_links(
&self,
text: &str,
_config: &Config,
) -> Vec<Result<MarkupLink, BrokenExtractedLink>> {
let mut result: Vec<Result<MarkupLink, BrokenExtractedLink>> = Vec::new();
let mut state: ParserState = ParserState::Text;
let mut link_column = 0;
Expand Down Expand Up @@ -121,28 +126,40 @@ impl LinkExtractor for HtmlLinkExtractor {
#[cfg(test)]
mod tests {
use super::*;
use crate::OptionalConfig;
use ntest::test_case;
use std::path::PathBuf;

fn default_config() -> Config {
Config {
directory: PathBuf::from("."),
optional: OptionalConfig::default(),
}
}

#[test]
fn no_link() {
let le = HtmlLinkExtractor();
let input = "]This is not a <has> no link <h1>Bla</h1> attribute.";
let result = le.find_links(input);
let result = le.find_links(input, &default_config());
assert!(result.is_empty());
}

#[test]
fn commented() {
let le = HtmlLinkExtractor();
let input = "df <!-- <a href=\"http://wiki.selfhtml.org\"> haha</a> -->";
let result = le.find_links(input);
let result = le.find_links(input, &default_config());
assert!(result.is_empty());
}

#[test]
fn space() {
let le = HtmlLinkExtractor();
let result = le.find_links("blah <a href=\"some file.html\">foo</a>.");
let result = le.find_links(
"blah <a href=\"some file.html\">foo</a>.",
&default_config(),
);
let expected = Ok(MarkupLink {
target: "some file.html".to_string(),
line: 1,
Expand All @@ -155,7 +172,10 @@ mod tests {
#[test]
fn url_encoded_path() {
let le = HtmlLinkExtractor();
let result = le.find_links("blah <a href=\"some%20file.html\">foo</a>.");
let result = le.find_links(
"blah <a href=\"some%20file.html\">foo</a>.",
&default_config(),
);
let expected = Ok(MarkupLink {
target: "some file.html".to_string(),
line: 1,
Expand Down Expand Up @@ -183,7 +203,7 @@ mod tests {
)]
fn links(input: &str, line: usize, column: usize) {
let le = HtmlLinkExtractor();
let result = le.find_links(input);
let result = le.find_links(input, &default_config());
let expected = Ok(MarkupLink {
target: "https://www.w3schools.com".to_string(),
line,
Expand All @@ -197,23 +217,23 @@ mod tests {
fn ignore_disable_line() {
let le = HtmlLinkExtractor();
let input = "<!-- mlc-disable-line --> <a href=\"http://example.net/\">link</a>";
let result = le.find_links(input);
let result = le.find_links(input, &default_config());
assert!(result.is_empty());
}

#[test]
fn ignore_disable_next_line() {
let le = HtmlLinkExtractor();
let input = "<!-- mlc-disable-next-line -->\n<a href=\"http://example.net/\">link</a>";
let result = le.find_links(input);
let result = le.find_links(input, &default_config());
assert!(result.is_empty());
}

#[test]
fn ignore_disable_block() {
let le = HtmlLinkExtractor();
let input = "<!-- mlc-disable -->\n<a href=\"http://example.net/\">link1</a>\n<!-- mlc-enable -->\n<a href=\"http://example.com/\">link2</a>";
let result = le.find_links(input);
let result = le.find_links(input, &default_config());
assert_eq!(1, result.len());
assert_eq!(result[0].as_ref().unwrap().target, "http://example.com/");
assert_eq!(result[0].as_ref().unwrap().line, 4);
Expand All @@ -223,7 +243,7 @@ mod tests {
fn ignore_multiple_blocks() {
let le = HtmlLinkExtractor();
let input = "<a href=\"http://a.com/\">1</a>\n<!-- mlc-disable -->\n<a href=\"http://b.com/\">2</a>\n<!-- mlc-enable -->\n<a href=\"http://c.com/\">3</a>";
let result = le.find_links(input);
let result = le.find_links(input, &default_config());
assert_eq!(2, result.len());
assert_eq!(result[0].as_ref().unwrap().target, "http://a.com/");
assert_eq!(result[1].as_ref().unwrap().target, "http://c.com/");
Expand Down
14 changes: 11 additions & 3 deletions src/link_extractors/link_extractor.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use super::html_link_extractor::HtmlLinkExtractor;
use super::markdown_link_extractor::MarkdownLinkExtractor;
use crate::markup::{MarkupFile, MarkupType};
use crate::Config;
use std::env;
use std::fmt;
use std::fs;
Expand Down Expand Up @@ -57,14 +58,17 @@ impl MarkupLink {
}

#[must_use]
pub fn find_links(file: &MarkupFile) -> Vec<Result<MarkupLink, BrokenExtractedLink>> {
pub fn find_links(
file: &MarkupFile,
config: &Config,
) -> Vec<Result<MarkupLink, BrokenExtractedLink>> {
let path = &file.path;
let link_extractor = link_extractor_factory(file.markup_type);

info!("Scan file at path '{path}' for links.");
match fs::read_to_string(path) {
Ok(text) => {
let mut links = link_extractor.find_links(&text);
let mut links = link_extractor.find_links(&text, config);
for l in &mut links {
match l {
Ok(link) => {
Expand Down Expand Up @@ -92,5 +96,9 @@ fn link_extractor_factory(markup_type: MarkupType) -> Box<dyn LinkExtractor> {
}

pub trait LinkExtractor {
fn find_links(&self, text: &str) -> Vec<Result<MarkupLink, BrokenExtractedLink>>;
fn find_links(
&self,
text: &str,
config: &Config,
) -> Vec<Result<MarkupLink, BrokenExtractedLink>>;
}
Loading