Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions src/keboola_agent_cli/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2295,6 +2295,7 @@ def _prepare_sliced_download(
provider = file_detail.get("provider", "")
downloader = _CloudDownloader.create(file_detail)

_assert_safe_download_url(file_detail["url"])
with httpx.Client(timeout=FILE_DOWNLOAD_TIMEOUT) as http:
resp = http.get(file_detail["url"])
resp.raise_for_status()
Expand Down Expand Up @@ -2393,6 +2394,7 @@ def download_file(self, url: str, output_path: str) -> int:
import gzip
import shutil

_assert_safe_download_url(url)
out_path = Path(output_path)
out_path.parent.mkdir(parents=True, exist_ok=True)
is_gzipped = url.rstrip("?").split("?")[0].endswith(".gz")
Expand Down Expand Up @@ -3101,6 +3103,70 @@ def _build_abs_upload_url(abs_params: dict[str, Any]) -> str:
# ---------------------------------------------------------------------------


def _assert_safe_download_url(url: str) -> None:
"""Reject a download URL whose host resolves to a non-public address.

A malicious or compromised Storage API response can return a download URL
pointing at the cloud instance-metadata endpoint (169.254.169.254) or
localhost; because these fetches carry no Storage token and don't follow
redirects, the residual SSRF still writes internal/credential data into the
user's download file (GHSA-hjhx-mx7m-8xx2). We resolve the host and allow
ONLY globally-routable (public) addresses plus the explicit BYOC private
ranges below; everything else -- loopback, link-local (incl. the
169.254.169.254 metadata endpoint), CGNAT 100.64.0.0/10, reserved,
multicast, unspecified -- is refused.

RFC1918 + IPv6-ULA *private* ranges are deliberately ALLOWED: BYOC /
private-tenant Keboola deployments legitimately serve storage from private
endpoints, and the high-value SSRF target (instance metadata) is link-local,
not private. An allow-list (public OR explicit private) rather than a
block-list of `ipaddress` predicates avoids gaps like CGNAT, which none of
`is_loopback/is_link_local/is_reserved/is_multicast/is_unspecified` catch.
"""
import ipaddress
import socket
from urllib.parse import urlparse

byoc_private = (
ipaddress.ip_network("10.0.0.0/8"),
ipaddress.ip_network("172.16.0.0/12"),
ipaddress.ip_network("192.168.0.0/16"),
ipaddress.ip_network("fc00::/7"), # IPv6 unique-local
)

host = urlparse(url).hostname
if not host:
raise KeboolaApiError(
message=f"Refusing to download: URL has no host ({url!r}).",
status_code=0,
error_code=ErrorCode.INVALID_ARGUMENT,
retryable=False,
)
try:
resolved = {info[4][0] for info in socket.getaddrinfo(host, None)}
except socket.gaierror:
# DNS failure is surfaced by the real fetch; don't mask it here.
return
for addr in resolved:
try:
ip = ipaddress.ip_address(addr)
except ValueError:
continue
if ip.is_global or any(ip in net for net in byoc_private):
continue # public host, or a BYOC private endpoint -- allowed
raise KeboolaApiError(
message=(
f"Refusing to download from {host} -> {addr}: non-public address "
f"(not a public host nor a BYOC private range). This indicates a "
f"malicious or compromised Storage API response (possible SSRF, "
f"e.g. the cloud instance-metadata endpoint)."
),
status_code=0,
error_code=ErrorCode.INVALID_ARGUMENT,
retryable=False,
)


class _IterBytesReader:
"""Adapt an httpx iter_bytes() iterator to a .read(n) file-like interface.

Expand Down Expand Up @@ -3299,6 +3365,7 @@ def stream_to_file(self, url: str, dest: "Path | str", decompress_gzip: bool) ->
import gzip
import shutil

_assert_safe_download_url(url)
headers = self._request_headers(url)
dest_path = Path(dest)
with (
Expand Down
44 changes: 44 additions & 0 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3658,3 +3658,47 @@ def test_set_table_metadata_column_convention(self, httpx_mock) -> None:
)
body = httpx_mock.get_request().content.decode().replace("%5B", "[").replace("%5D", "]")
assert "KBC.column.city.description" in body


class TestAssertSafeDownloadUrl:
    """GHSA-hjhx-mx7m-8xx2: a download URL from a (possibly malicious) Storage
    API response must not be fetched when its host resolves to a non-public
    address -- SSRF guard against the cloud metadata endpoint and localhost.
    All inputs are IP literals / localhost so the resolution stays offline."""

    @pytest.mark.parametrize(
        "url",
        [
            "https://169.254.169.254/latest/meta-data/iam/security-credentials/",  # metadata
            "http://127.0.0.1/secret",  # loopback v4
            "http://localhost/secret",  # loopback by name
            "http://[::1]/secret",  # loopback v6
            "http://[fe80::1]/secret",  # link-local v6
            "http://0.0.0.0/x",  # unspecified
            "http://100.64.0.1/file",  # CGNAT (RFC 6598) -- not global, not RFC1918
        ],
    )
    def test_rejects_non_public_hosts(self, url: str) -> None:
        """Every non-public, non-BYOC address class is refused."""
        from keboola_agent_cli.client import _assert_safe_download_url

        with pytest.raises(KeboolaApiError):
            _assert_safe_download_url(url)

    @pytest.mark.parametrize(
        "url",
        [
            "https://93.184.216.34/file.csv",  # public IP literal
            "https://10.1.2.3/file.csv",  # RFC1918 -- ALLOWED for BYOC
            "http://192.168.1.5/file.csv",  # RFC1918 -- ALLOWED for BYOC
            "http://172.16.0.9/file.csv",  # RFC1918 -- ALLOWED for BYOC
            "http://[fd12:3456:789a::1]/file.csv",  # IPv6 ULA -- ALLOWED for BYOC
        ],
    )
    def test_allows_public_and_private_byoc_hosts(self, url: str) -> None:
        """Public hosts and the explicit BYOC private ranges pass the guard."""
        from keboola_agent_cli.client import _assert_safe_download_url

        _assert_safe_download_url(url)  # must not raise

    def test_rejects_url_without_host(self) -> None:
        """A URL with no host component (e.g. file://) is refused outright."""
        from keboola_agent_cli.client import _assert_safe_download_url

        with pytest.raises(KeboolaApiError):
            _assert_safe_download_url("file:///etc/passwd")