From 6b992db24d39dabbc05ee2f3f58efdd21dd1a520 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 18 Jun 2026 16:08:16 +0200 Subject: [PATCH] Redact git credentials at remaining URL log/serialization sites Follow-up to #2154/#3689. Closes the remaining credential leak vectors in the git/workspace stack so URL-credential redaction is consistent. - Add `redact_url_credentials_in_text()` to sdk/utils/redact.py: a substring-capable redactor that strips `https?://user:token@` credentials embedded anywhere in a larger string. The existing `redact_url_credentials()` is anchored and only redacts when the entire string is a URL, so it misses credentials inside messages like git stderr. - RepoSource.validate_url: redact the URL before logging the http->https normalization warning (was logging user-supplied creds verbatim). - run_git_command: redact stderr before logging it and before storing it on GitCommandError. - Consolidate repo.py `_mask_url`/`_mask_token` onto the central helpers (`redact_url_credentials` / `redact_url_credentials_in_text`) so there is a single source of truth; the old `_mask_token` only string-replaced the exact token and missed URL-encoded/partial credentials. Tests cover the new helper (embedded-URL strings, multiple URLs, URL-encoded creds, path-`@` non-matches) and each leak site. Co-Authored-By: Claude Opus 4.8 (1M context) --- openhands-sdk/openhands/sdk/git/utils.py | 13 +++- openhands-sdk/openhands/sdk/utils/redact.py | 33 +++++++++ openhands-sdk/openhands/sdk/workspace/repo.py | 37 +++++----- tests/sdk/git/test_url_redaction.py | 26 +++++++ tests/sdk/utils/test_redact.py | 73 +++++++++++++++++++ tests/workspace/test_cloud_workspace_repos.py | 37 ++++++++++ 6 files changed, 196 insertions(+), 23 deletions(-) diff --git a/openhands-sdk/openhands/sdk/git/utils.py b/openhands-sdk/openhands/sdk/git/utils.py index c9f064a600..1bdf7ebdda 100644 --- a/openhands-sdk/openhands/sdk/git/utils.py +++ b/openhands-sdk/openhands/sdk/git/utils.py @@ -5,7 +5,10 @@ from pathlib import Path from openhands.sdk.git.exceptions import GitCommandError, GitRepositoryError -from openhands.sdk.utils.redact import redact_url_credentials +from openhands.sdk.utils.redact import ( + redact_url_credentials, + redact_url_credentials_in_text, +) logger = logging.getLogger(__name__) @@ -48,14 +51,18 @@ def run_git_command( if result.returncode != 0: error_msg = f"Git command failed: {cmd_str}" + # stderr can echo the remote URL (with embedded credentials on some + # git versions / error paths), so redact before logging and storing. + redacted_stderr = redact_url_credentials_in_text(result.stderr) logger.error( - f"{error_msg}. Exit code: {result.returncode}. Stderr: {result.stderr}" + f"{error_msg}. Exit code: {result.returncode}. " + f"Stderr: {redacted_stderr}" ) raise GitCommandError( message=error_msg, command=redacted_args, exit_code=result.returncode, - stderr=result.stderr.strip(), + stderr=redacted_stderr.strip(), ) logger.debug(f"Git command succeeded: {cmd_str}") diff --git a/openhands-sdk/openhands/sdk/utils/redact.py b/openhands-sdk/openhands/sdk/utils/redact.py index 88c820ef78..0d1ee5153a 100644 --- a/openhands-sdk/openhands/sdk/utils/redact.py +++ b/openhands-sdk/openhands/sdk/utils/redact.py @@ -166,6 +166,39 @@ def redact_url_credentials(url: str) -> str: return url +# Matches embedded ``http(s)://user:token@`` credentials anywhere within a +# larger string. The userinfo portion excludes ``/``, ``@``, and whitespace so +# the match stops at the first credential boundary and never spans a path +# segment or another token. +_EMBEDDED_URL_CREDENTIALS_RE = re.compile(r"(https?://)[^/@\s]+@") + + +def redact_url_credentials_in_text(text: str) -> str: + """Redact ``https?://user:token@host`` credentials embedded in arbitrary text. + + Unlike :func:`redact_url_credentials`, which is anchored and only redacts when + the *entire* string is a credential-bearing URL, this scans for embedded URLs + anywhere in a larger string (e.g. git ``stderr`` such as + ``fatal: unable to access 'https://oauth2:SECRET@github.com/o/r.git/': 403``) + and replaces each credential portion with ``****``. + + Args: + text: Arbitrary text that may contain one or more credential-bearing URLs. + + Returns: + The text with every embedded URL credential replaced by ``****``. + + Examples: + >>> redact_url_credentials_in_text( + ... "fatal: unable to access 'https://oauth2:SECRET@github.com/o/r.git/'" + ... ) + "fatal: unable to access 'https://****@github.com/o/r.git/'" + >>> redact_url_credentials_in_text("no credentials here") + 'no credentials here' + """ + return _EMBEDDED_URL_CREDENTIALS_RE.sub(r"\g<1>****@", text) + + def redact_url_params(url: str) -> str: """Redact sensitive query parameter values from a URL string. diff --git a/openhands-sdk/openhands/sdk/workspace/repo.py b/openhands-sdk/openhands/sdk/workspace/repo.py index 2923ddf15d..6dee56fea0 100644 --- a/openhands-sdk/openhands/sdk/workspace/repo.py +++ b/openhands-sdk/openhands/sdk/workspace/repo.py @@ -21,6 +21,10 @@ from openhands.sdk.logger import get_logger from openhands.sdk.utils.path import to_posix_path +from openhands.sdk.utils.redact import ( + redact_url_credentials, + redact_url_credentials_in_text, +) logger = get_logger(__name__) @@ -146,7 +150,10 @@ def validate_url(cls, v: str) -> str: return v # Normalize HTTP to HTTPS for security (token injection requires HTTPS) if v.startswith("http://"): - logger.warning(f"Converting HTTP URL to HTTPS for security: {v}") + logger.warning( + "Converting HTTP URL to HTTPS for security: " + f"{redact_url_credentials_in_text(v)}" + ) v = "https://" + v[7:] # Allow HTTPS, git@, and file:// URLs (file:// for testing) if v.startswith(("https://", "git@", "file://")): @@ -316,20 +323,6 @@ def _build_clone_url(url: str, provider: GitProvider, token: str | None) -> str: return url -def _mask_url(url: str) -> str: - """Remove credentials from URL for display.""" - if "://" not in url: - return url - return url.split("://")[0] + "://" + url.split("://")[-1].split("@")[-1] - - -def _mask_token(text: str, token: str | None) -> str: - """Mask token in text for safe logging.""" - if token: - text = text.replace(token, "***") - return text - - # Type for functions that fetch tokens by name (e.g., "github_token" -> token value) TokenFetcher = Callable[[str], str | None] @@ -381,7 +374,7 @@ def _clone_single_repo(repo: RepoSource, dest: Path, token: str | None) -> bool: clone_url = repo.url provider_str = "local" - display_url = _mask_url(repo.url) + display_url = redact_url_credentials(repo.url) logger.info(f"[clone] Cloning {display_url} ({provider_str}) -> {dest.name}/") cmd = _build_clone_command(clone_url, dest, repo.ref) @@ -395,7 +388,9 @@ def _clone_single_repo(repo: RepoSource, dest: Path, token: str | None) -> bool: return False if result.returncode != 0: - logger.warning(f"[clone] Failed: {_mask_token(result.stderr, token)}") + logger.warning( + f"[clone] Failed: {redact_url_credentials_in_text(result.stderr)}" + ) return False # For SHA refs, we did a full clone and need to checkout the specific commit @@ -454,7 +449,9 @@ def clone_repos( seen_urls.add(repo.url) unique_repos.append(repo) elif repo.url: - logger.warning(f"[clone] Skipping duplicate URL: {_mask_url(repo.url)}") + logger.warning( + f"[clone] Skipping duplicate URL: {redact_url_credentials(repo.url)}" + ) if not unique_repos: logger.info("[clone] No repositories to clone after deduplication") @@ -496,10 +493,10 @@ def clone_repos( ref=repo.ref, ) else: - failed.append(_mask_url(repo.url)) + failed.append(redact_url_credentials(repo.url)) except Exception as e: # Don't let one bad repo stop the entire batch - display_url = _mask_url(repo.url) if repo.url else "" + display_url = redact_url_credentials(repo.url) if repo.url else "" logger.warning(f"[clone] Error processing {display_url}: {e}") failed.append(display_url) diff --git a/tests/sdk/git/test_url_redaction.py b/tests/sdk/git/test_url_redaction.py index bd0b8357f6..084948844b 100644 --- a/tests/sdk/git/test_url_redaction.py +++ b/tests/sdk/git/test_url_redaction.py @@ -1,5 +1,6 @@ """Tests for URL credential redaction utilities.""" +import logging import subprocess from unittest.mock import patch @@ -242,3 +243,28 @@ def test_file_not_found_redacts_command(self): run_git_command(["git-not-on-path", "clone", CREDENTIAL_URL, "/tmp/x"]) assert CREDENTIAL_URL not in exc_info.value.command assert REDACTED_URL in exc_info.value.command + + def test_stderr_credentials_redacted_on_exception(self): + """Credentials echoed in stderr must not leak onto GitCommandError.stderr.""" + leaky_stderr = f"fatal: Authentication failed for '{CREDENTIAL_URL}/'" + completed = subprocess.CompletedProcess( + args=self._args(), returncode=128, stdout="", stderr=leaky_stderr + ) + with patch("subprocess.run", return_value=completed): + with pytest.raises(GitCommandError) as exc_info: + run_git_command(self._args()) + assert "SUPERSECRET" not in exc_info.value.stderr + assert REDACTED_URL in exc_info.value.stderr + + def test_stderr_credentials_redacted_in_log(self, caplog): + """Credentials echoed in stderr must not leak into the error log line.""" + leaky_stderr = f"fatal: Authentication failed for '{CREDENTIAL_URL}/'" + completed = subprocess.CompletedProcess( + args=self._args(), returncode=128, stdout="", stderr=leaky_stderr + ) + with patch("subprocess.run", return_value=completed): + with caplog.at_level(logging.ERROR): + with pytest.raises(GitCommandError): + run_git_command(self._args()) + assert "SUPERSECRET" not in caplog.text + assert REDACTED_URL in caplog.text diff --git a/tests/sdk/utils/test_redact.py b/tests/sdk/utils/test_redact.py index 86b561ff9a..a8ba9c753f 100644 --- a/tests/sdk/utils/test_redact.py +++ b/tests/sdk/utils/test_redact.py @@ -2,6 +2,8 @@ from openhands.sdk.utils.redact import ( SENSITIVE_URL_PARAMS, + redact_url_credentials, + redact_url_credentials_in_text, redact_url_params, ) @@ -174,3 +176,74 @@ def test_url_with_encoded_characters(self): assert "secret" not in result # The non-sensitive param value should be preserved (possibly re-encoded) assert "hello" in result + + +# --------------------------------------------------------------------------- +# redact_url_credentials_in_text +# --------------------------------------------------------------------------- + + +class TestRedactUrlCredentialsInText: + """Tests for redact_url_credentials_in_text() (substring-capable).""" + + def test_redacts_credentials_embedded_in_larger_string(self): + """The key limitation of the anchored helper: creds inside a message.""" + s = "fatal: unable to access 'https://oauth2:SECRET@github.com/o/r.git/': 403" + result = redact_url_credentials_in_text(s) + assert "SECRET" not in result + assert "oauth2" not in result + assert result == ( + "fatal: unable to access 'https://****@github.com/o/r.git/': 403" + ) + + def test_redacts_token_only_credential(self): + s = "Cloning https://ghp_supersecrettoken@github.com/o/r.git failed" + result = redact_url_credentials_in_text(s) + assert "ghp_supersecrettoken" not in result + assert "https://****@github.com/o/r.git" in result + + def test_redacts_http_scheme(self): + s = "warn http://user:pw@internal.example.com/x done" + result = redact_url_credentials_in_text(s) + assert "user:pw" not in result + assert "http://****@internal.example.com/x" in result + + def test_redacts_multiple_embedded_urls(self): + s = "a https://t1@github.com/o/r.git b https://user:t2@gitlab.com/o/r.git c" + result = redact_url_credentials_in_text(s) + assert "t1" not in result + assert "t2" not in result + assert result.count("****@") == 2 + + def test_redacts_url_encoded_credentials(self): + s = "url 'https://user%40domain:p%40ss@github.com/repo.git'" + result = redact_url_credentials_in_text(s) + assert "user%40domain" not in result + assert "p%40ss" not in result + assert "https://****@github.com/repo.git" in result + + def test_leaves_credential_free_url_untouched(self): + s = "Cloning https://github.com/owner/repo.git into ./repo" + assert redact_url_credentials_in_text(s) == s + + def test_does_not_match_at_sign_in_path(self): + """An '@' after a path segment is not userinfo and must be left alone.""" + s = "see https://github.com/owner/repo/blob/main/x@v1.txt" + assert redact_url_credentials_in_text(s) == s + + def test_leaves_ssh_url_untouched(self): + s = "remote git@github.com:owner/repo.git fetched" + assert redact_url_credentials_in_text(s) == s + + def test_empty_string(self): + assert redact_url_credentials_in_text("") == "" + + def test_no_url_string(self): + assert redact_url_credentials_in_text("nothing to redact here") == ( + "nothing to redact here" + ) + + def test_matches_whole_url_like_anchored_helper(self): + """For a bare whole-URL string both helpers agree.""" + url = "https://oauth2:SECRET@gitlab.com/org/repo.git" + assert redact_url_credentials_in_text(url) == redact_url_credentials(url) diff --git a/tests/workspace/test_cloud_workspace_repos.py b/tests/workspace/test_cloud_workspace_repos.py index b33a89f0f6..a555419edc 100644 --- a/tests/workspace/test_cloud_workspace_repos.py +++ b/tests/workspace/test_cloud_workspace_repos.py @@ -1,5 +1,6 @@ """Tests for repository cloning and skill loading in OpenHandsCloudWorkspace.""" +import logging import tempfile from pathlib import Path from unittest.mock import MagicMock, patch @@ -125,6 +126,18 @@ def test_url_with_dashes_allowed(self): repo = RepoSource(url="my-org/my-repo", provider="github") assert repo.url == "my-org/my-repo" + def test_http_url_credentials_redacted_in_warning(self, caplog): + """The http->https normalization warning must not leak embedded creds.""" + with caplog.at_level(logging.WARNING): + repo = RepoSource(url="http://oauth2:SUPERSECRET@github.com/owner/repo.git") + # The credential never reaches the logs... + assert "SUPERSECRET" not in caplog.text + assert "oauth2" not in caplog.text + assert "Converting HTTP URL to HTTPS" in caplog.text + assert "http://****@github.com/owner/repo.git" in caplog.text + # ...but normalization still happens on the stored value. + assert repo.url == "https://oauth2:SUPERSECRET@github.com/owner/repo.git" + class TestProviderDetection: """Tests for provider detection from URLs.""" @@ -367,6 +380,30 @@ def test_clone_failure(self, mock_run): assert len(result.failed_repos) == 1 assert result.repo_mappings == {} + @patch("subprocess.run") + def test_clone_failure_redacts_credentials_in_stderr(self, mock_run, caplog): + """A failing clone must not leak the auth token echoed back in stderr.""" + token = "ghp_supersecrettoken" + # git often echoes the authenticated remote URL back in stderr on failure. + leaky_stderr = ( + f"fatal: Authentication failed for " + f"'https://{token}@github.com/owner/repo.git/'" + ) + mock_run.return_value = MagicMock(returncode=1, stderr=leaky_stderr) + + def token_fetcher(name: str) -> str | None: + return token if name == "github_token" else None + + with tempfile.TemporaryDirectory() as tmpdir: + repos = [RepoSource(url="owner/repo", provider="github")] + with caplog.at_level(logging.WARNING): + result = clone_repos(repos, Path(tmpdir), token_fetcher=token_fetcher) + + assert result.success_count == 0 + assert token not in caplog.text + assert "[clone] Failed:" in caplog.text + assert "https://****@github.com/owner/repo.git" in caplog.text + @patch("subprocess.run") def test_clone_with_token_fetcher(self, mock_run): """Test clone with token fetcher callback."""