Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions openhands-sdk/openhands/sdk/git/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
from pathlib import Path

from openhands.sdk.git.exceptions import GitCommandError, GitRepositoryError
from openhands.sdk.utils.redact import redact_url_credentials
from openhands.sdk.utils.redact import (
redact_url_credentials,
redact_url_credentials_in_text,
)


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -48,14 +51,18 @@ def run_git_command(

if result.returncode != 0:
error_msg = f"Git command failed: {cmd_str}"
# stderr can echo the remote URL (with embedded credentials on some
# git versions / error paths), so redact before logging and storing.
redacted_stderr = redact_url_credentials_in_text(result.stderr)
logger.error(
f"{error_msg}. Exit code: {result.returncode}. Stderr: {result.stderr}"
f"{error_msg}. Exit code: {result.returncode}. "
f"Stderr: {redacted_stderr}"
)
raise GitCommandError(
message=error_msg,
command=redacted_args,
exit_code=result.returncode,
stderr=result.stderr.strip(),
stderr=redacted_stderr.strip(),
)

logger.debug(f"Git command succeeded: {cmd_str}")
Expand Down
33 changes: 33 additions & 0 deletions openhands-sdk/openhands/sdk/utils/redact.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,39 @@ def redact_url_credentials(url: str) -> str:
return url


# Matches embedded ``http(s)://user:token@`` credentials anywhere within a
# larger string. The userinfo portion excludes ``/``, ``@``, and whitespace so
# the match stops at the first credential boundary and never spans a path
# segment or another token.
_EMBEDDED_URL_CREDENTIALS_RE = re.compile(r"(https?://)[^/@\s]+@")


def redact_url_credentials_in_text(text: str) -> str:
"""Redact ``https?://user:token@host`` credentials embedded in arbitrary text.

Unlike :func:`redact_url_credentials`, which is anchored and only redacts when
the *entire* string is a credential-bearing URL, this scans for embedded URLs
anywhere in a larger string (e.g. git ``stderr`` such as
``fatal: unable to access 'https://oauth2:SECRET@github.com/o/r.git/': 403``)
and replaces each credential portion with ``****``.

Args:
text: Arbitrary text that may contain one or more credential-bearing URLs.

Returns:
The text with every embedded URL credential replaced by ``****``.

Examples:
>>> redact_url_credentials_in_text(
... "fatal: unable to access 'https://oauth2:SECRET@github.com/o/r.git/'"
... )
"fatal: unable to access 'https://****@github.com/o/r.git/'"
>>> redact_url_credentials_in_text("no credentials here")
'no credentials here'
"""
return _EMBEDDED_URL_CREDENTIALS_RE.sub(r"\g<1>****@", text)


def redact_url_params(url: str) -> str:
"""Redact sensitive query parameter values from a URL string.

Expand Down
37 changes: 17 additions & 20 deletions openhands-sdk/openhands/sdk/workspace/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@

from openhands.sdk.logger import get_logger
from openhands.sdk.utils.path import to_posix_path
from openhands.sdk.utils.redact import (
redact_url_credentials,
redact_url_credentials_in_text,
)


logger = get_logger(__name__)
Expand Down Expand Up @@ -146,7 +150,10 @@ def validate_url(cls, v: str) -> str:
return v
# Normalize HTTP to HTTPS for security (token injection requires HTTPS)
if v.startswith("http://"):
logger.warning(f"Converting HTTP URL to HTTPS for security: {v}")
logger.warning(
"Converting HTTP URL to HTTPS for security: "
f"{redact_url_credentials_in_text(v)}"
)
v = "https://" + v[7:]
# Allow HTTPS, git@, and file:// URLs (file:// for testing)
if v.startswith(("https://", "git@", "file://")):
Expand Down Expand Up @@ -316,20 +323,6 @@ def _build_clone_url(url: str, provider: GitProvider, token: str | None) -> str:
return url


def _mask_url(url: str) -> str:
"""Remove credentials from URL for display."""
if "://" not in url:
return url
return url.split("://")[0] + "://" + url.split("://")[-1].split("@")[-1]


def _mask_token(text: str, token: str | None) -> str:
"""Mask token in text for safe logging."""
if token:
text = text.replace(token, "***")
return text


# Type for functions that fetch tokens by name (e.g., "github_token" -> token value)
TokenFetcher = Callable[[str], str | None]

Expand Down Expand Up @@ -381,7 +374,7 @@ def _clone_single_repo(repo: RepoSource, dest: Path, token: str | None) -> bool:
clone_url = repo.url
provider_str = "local"

display_url = _mask_url(repo.url)
display_url = redact_url_credentials(repo.url)
logger.info(f"[clone] Cloning {display_url} ({provider_str}) -> {dest.name}/")

cmd = _build_clone_command(clone_url, dest, repo.ref)
Expand All @@ -395,7 +388,9 @@ def _clone_single_repo(repo: RepoSource, dest: Path, token: str | None) -> bool:
return False

if result.returncode != 0:
logger.warning(f"[clone] Failed: {_mask_token(result.stderr, token)}")
logger.warning(
f"[clone] Failed: {redact_url_credentials_in_text(result.stderr)}"
)
return False

# For SHA refs, we did a full clone and need to checkout the specific commit
Expand Down Expand Up @@ -454,7 +449,9 @@ def clone_repos(
seen_urls.add(repo.url)
unique_repos.append(repo)
elif repo.url:
logger.warning(f"[clone] Skipping duplicate URL: {_mask_url(repo.url)}")
logger.warning(
f"[clone] Skipping duplicate URL: {redact_url_credentials(repo.url)}"
)

if not unique_repos:
logger.info("[clone] No repositories to clone after deduplication")
Expand Down Expand Up @@ -496,10 +493,10 @@ def clone_repos(
ref=repo.ref,
)
else:
failed.append(_mask_url(repo.url))
failed.append(redact_url_credentials(repo.url))
except Exception as e:
# Don't let one bad repo stop the entire batch
display_url = _mask_url(repo.url) if repo.url else "<unknown>"
display_url = redact_url_credentials(repo.url) if repo.url else "<unknown>"
logger.warning(f"[clone] Error processing {display_url}: {e}")
failed.append(display_url)

Expand Down
26 changes: 26 additions & 0 deletions tests/sdk/git/test_url_redaction.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Tests for URL credential redaction utilities."""

import logging
import subprocess
from unittest.mock import patch

Expand Down Expand Up @@ -242,3 +243,28 @@ def test_file_not_found_redacts_command(self):
run_git_command(["git-not-on-path", "clone", CREDENTIAL_URL, "/tmp/x"])
assert CREDENTIAL_URL not in exc_info.value.command
assert REDACTED_URL in exc_info.value.command

def test_stderr_credentials_redacted_on_exception(self):
"""Credentials echoed in stderr must not leak onto GitCommandError.stderr."""
leaky_stderr = f"fatal: Authentication failed for '{CREDENTIAL_URL}/'"
completed = subprocess.CompletedProcess(
args=self._args(), returncode=128, stdout="", stderr=leaky_stderr
)
with patch("subprocess.run", return_value=completed):
with pytest.raises(GitCommandError) as exc_info:
run_git_command(self._args())
assert "SUPERSECRET" not in exc_info.value.stderr
assert REDACTED_URL in exc_info.value.stderr

def test_stderr_credentials_redacted_in_log(self, caplog):
"""Credentials echoed in stderr must not leak into the error log line."""
leaky_stderr = f"fatal: Authentication failed for '{CREDENTIAL_URL}/'"
completed = subprocess.CompletedProcess(
args=self._args(), returncode=128, stdout="", stderr=leaky_stderr
)
with patch("subprocess.run", return_value=completed):
with caplog.at_level(logging.ERROR):
with pytest.raises(GitCommandError):
run_git_command(self._args())
assert "SUPERSECRET" not in caplog.text
assert REDACTED_URL in caplog.text
73 changes: 73 additions & 0 deletions tests/sdk/utils/test_redact.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from openhands.sdk.utils.redact import (
SENSITIVE_URL_PARAMS,
redact_url_credentials,
redact_url_credentials_in_text,
redact_url_params,
)

Expand Down Expand Up @@ -174,3 +176,74 @@ def test_url_with_encoded_characters(self):
assert "secret" not in result
# The non-sensitive param value should be preserved (possibly re-encoded)
assert "hello" in result


# ---------------------------------------------------------------------------
# redact_url_credentials_in_text
# ---------------------------------------------------------------------------


class TestRedactUrlCredentialsInText:
"""Tests for redact_url_credentials_in_text() (substring-capable)."""

def test_redacts_credentials_embedded_in_larger_string(self):
"""The key limitation of the anchored helper: creds inside a message."""
s = "fatal: unable to access 'https://oauth2:SECRET@github.com/o/r.git/': 403"
result = redact_url_credentials_in_text(s)
assert "SECRET" not in result
assert "oauth2" not in result
assert result == (
"fatal: unable to access 'https://****@github.com/o/r.git/': 403"
)

def test_redacts_token_only_credential(self):
s = "Cloning https://ghp_supersecrettoken@github.com/o/r.git failed"
result = redact_url_credentials_in_text(s)
assert "ghp_supersecrettoken" not in result
assert "https://****@github.com/o/r.git" in result

def test_redacts_http_scheme(self):
s = "warn http://user:pw@internal.example.com/x done"
result = redact_url_credentials_in_text(s)
assert "user:pw" not in result
assert "http://****@internal.example.com/x" in result

def test_redacts_multiple_embedded_urls(self):
s = "a https://t1@github.com/o/r.git b https://user:t2@gitlab.com/o/r.git c"
result = redact_url_credentials_in_text(s)
assert "t1" not in result
assert "t2" not in result
assert result.count("****@") == 2

def test_redacts_url_encoded_credentials(self):
s = "url 'https://user%40domain:p%40ss@github.com/repo.git'"
result = redact_url_credentials_in_text(s)
assert "user%40domain" not in result
assert "p%40ss" not in result
assert "https://****@github.com/repo.git" in result

def test_leaves_credential_free_url_untouched(self):
s = "Cloning https://github.com/owner/repo.git into ./repo"
assert redact_url_credentials_in_text(s) == s

def test_does_not_match_at_sign_in_path(self):
"""An '@' after a path segment is not userinfo and must be left alone."""
s = "see https://github.com/owner/repo/blob/main/x@v1.txt"
assert redact_url_credentials_in_text(s) == s

def test_leaves_ssh_url_untouched(self):
s = "remote git@github.com:owner/repo.git fetched"
assert redact_url_credentials_in_text(s) == s

def test_empty_string(self):
assert redact_url_credentials_in_text("") == ""

def test_no_url_string(self):
assert redact_url_credentials_in_text("nothing to redact here") == (
"nothing to redact here"
)

def test_matches_whole_url_like_anchored_helper(self):
"""For a bare whole-URL string both helpers agree."""
url = "https://oauth2:SECRET@gitlab.com/org/repo.git"
assert redact_url_credentials_in_text(url) == redact_url_credentials(url)
37 changes: 37 additions & 0 deletions tests/workspace/test_cloud_workspace_repos.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Tests for repository cloning and skill loading in OpenHandsCloudWorkspace."""

import logging
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch
Expand Down Expand Up @@ -125,6 +126,18 @@ def test_url_with_dashes_allowed(self):
repo = RepoSource(url="my-org/my-repo", provider="github")
assert repo.url == "my-org/my-repo"

def test_http_url_credentials_redacted_in_warning(self, caplog):
"""The http->https normalization warning must not leak embedded creds."""
with caplog.at_level(logging.WARNING):
repo = RepoSource(url="http://oauth2:SUPERSECRET@github.com/owner/repo.git")
# The credential never reaches the logs...
assert "SUPERSECRET" not in caplog.text
assert "oauth2" not in caplog.text
assert "Converting HTTP URL to HTTPS" in caplog.text
assert "http://****@github.com/owner/repo.git" in caplog.text
# ...but normalization still happens on the stored value.
assert repo.url == "https://oauth2:SUPERSECRET@github.com/owner/repo.git"


class TestProviderDetection:
"""Tests for provider detection from URLs."""
Expand Down Expand Up @@ -367,6 +380,30 @@ def test_clone_failure(self, mock_run):
assert len(result.failed_repos) == 1
assert result.repo_mappings == {}

@patch("subprocess.run")
def test_clone_failure_redacts_credentials_in_stderr(self, mock_run, caplog):
"""A failing clone must not leak the auth token echoed back in stderr."""
token = "ghp_supersecrettoken"
# git often echoes the authenticated remote URL back in stderr on failure.
leaky_stderr = (
f"fatal: Authentication failed for "
f"'https://{token}@github.com/owner/repo.git/'"
)
mock_run.return_value = MagicMock(returncode=1, stderr=leaky_stderr)

def token_fetcher(name: str) -> str | None:
return token if name == "github_token" else None

with tempfile.TemporaryDirectory() as tmpdir:
repos = [RepoSource(url="owner/repo", provider="github")]
with caplog.at_level(logging.WARNING):
result = clone_repos(repos, Path(tmpdir), token_fetcher=token_fetcher)

assert result.success_count == 0
assert token not in caplog.text
assert "[clone] Failed:" in caplog.text
assert "https://****@github.com/owner/repo.git" in caplog.text

@patch("subprocess.run")
def test_clone_with_token_fetcher(self, mock_run):
"""Test clone with token fetcher callback."""
Expand Down
Loading