From 99f623c85e83f57a14b86889eab8e2b3b9f1344c Mon Sep 17 00:00:00 2001 From: Petr Date: Tue, 23 Jun 2026 00:18:25 +0200 Subject: [PATCH] fix(storage): guard downloads against SSRF to non-public hosts (GHSA-hjhx) A malicious or compromised Storage API response can return a download URL pointing at the cloud instance-metadata endpoint (169.254.169.254) or localhost. The download clients carry no Storage token and don't follow redirects, but the residual SSRF still writes internal/credential data into the user's download file. Add _assert_safe_download_url(): resolve the host and allow only globally-routable (public) addresses plus the explicit private ranges listed below, refusing everything else (loopback / link-local / CGNAT 100.64.0.0/10 / reserved / multicast / unspecified) before each download fetch (download_file, the sliced-manifest fetch in _prepare_sliced_download, and the per-slice stream_to_file). RFC1918 and IPv6 unique-local (fc00::/7) private ranges are ALLOWED so BYOC / private-tenant deployments keep working; the high-value target (instance metadata) is link-local, not private. Code-only (no version bump / changelog entry) to stay conflict-free against the rapid main release cadence; version + changelog added at next release. Private advisory GHSA-hjhx-mx7m-8xx2. 
--- src/keboola_agent_cli/client.py | 67 +++++++++++++++++++++++++++++++++ tests/test_client.py | 44 ++++++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/src/keboola_agent_cli/client.py b/src/keboola_agent_cli/client.py index cb2ec191..23151abc 100644 --- a/src/keboola_agent_cli/client.py +++ b/src/keboola_agent_cli/client.py @@ -2295,6 +2295,7 @@ def _prepare_sliced_download( provider = file_detail.get("provider", "") downloader = _CloudDownloader.create(file_detail) + _assert_safe_download_url(file_detail["url"]) with httpx.Client(timeout=FILE_DOWNLOAD_TIMEOUT) as http: resp = http.get(file_detail["url"]) resp.raise_for_status() @@ -2393,6 +2394,7 @@ def download_file(self, url: str, output_path: str) -> int: import gzip import shutil + _assert_safe_download_url(url) out_path = Path(output_path) out_path.parent.mkdir(parents=True, exist_ok=True) is_gzipped = url.rstrip("?").split("?")[0].endswith(".gz") @@ -3101,6 +3103,70 @@ def _build_abs_upload_url(abs_params: dict[str, Any]) -> str: # --------------------------------------------------------------------------- +def _assert_safe_download_url(url: str) -> None: + """Reject a download URL whose host resolves to a non-public address. + + A malicious or compromised Storage API response can return a download URL + pointing at the cloud instance-metadata endpoint (169.254.169.254) or + localhost; because these fetches carry no Storage token and don't follow + redirects, the residual SSRF still writes internal/credential data into the + user's download file (GHSA-hjhx-mx7m-8xx2). We resolve the host and allow + ONLY globally-routable (public) addresses plus the explicit BYOC private + ranges below; everything else -- loopback, link-local (incl. the + 169.254.169.254 metadata endpoint), CGNAT 100.64.0.0/10, reserved, + multicast, unspecified -- is refused. 
+ + RFC1918 + IPv6-ULA *private* ranges are deliberately ALLOWED: BYOC / + private-tenant Keboola deployments legitimately serve storage from private + endpoints, and the high-value SSRF target (instance metadata) is link-local, + not private. An allow-list (public OR explicit private) rather than a + block-list of `ipaddress` predicates avoids gaps like CGNAT, which none of + `is_loopback/is_link_local/is_reserved/is_multicast/is_unspecified` catch. + """ + import ipaddress + import socket + from urllib.parse import urlparse + + byoc_private = ( + ipaddress.ip_network("10.0.0.0/8"), + ipaddress.ip_network("172.16.0.0/12"), + ipaddress.ip_network("192.168.0.0/16"), + ipaddress.ip_network("fc00::/7"), # IPv6 unique-local + ) + + host = urlparse(url).hostname + if not host: + raise KeboolaApiError( + message=f"Refusing to download: URL has no host ({url!r}).", + status_code=0, + error_code=ErrorCode.INVALID_ARGUMENT, + retryable=False, + ) + try: + resolved = {info[4][0] for info in socket.getaddrinfo(host, None)} + except socket.gaierror: + # DNS failure is surfaced by the real fetch; don't mask it here. + return + for addr in resolved: + try: + ip = ipaddress.ip_address(addr) + except ValueError: + continue + if ip.is_global or any(ip in net for net in byoc_private): + continue # public host, or a BYOC private endpoint -- allowed + raise KeboolaApiError( + message=( + f"Refusing to download from {host} -> {addr}: non-public address " + f"(not a public host nor a BYOC private range). This indicates a " + f"malicious or compromised Storage API response (possible SSRF, " + f"e.g. the cloud instance-metadata endpoint)." + ), + status_code=0, + error_code=ErrorCode.INVALID_ARGUMENT, + retryable=False, + ) + + class _IterBytesReader: """Adapt an httpx iter_bytes() iterator to a .read(n) file-like interface. 
class TestAssertSafeDownloadUrl:
    """Regression tests for the download SSRF guard (GHSA-hjhx-mx7m-8xx2).

    A download URL taken from a (possibly malicious) Storage API response must
    be refused when its host resolves to a non-public address such as the
    cloud metadata endpoint or localhost. Every input is an IP literal or
    ``localhost`` so that name resolution stays offline.
    """

    # Hosts the guard must refuse.
    REFUSED_URLS = [
        "https://169.254.169.254/latest/meta-data/iam/security-credentials/",  # metadata
        "http://127.0.0.1/secret",  # loopback v4
        "http://localhost/secret",  # loopback by name
        "http://[::1]/secret",  # loopback v6
        "http://0.0.0.0/x",  # unspecified
        "http://100.64.0.1/file",  # CGNAT (RFC 6598) -- not global, not RFC1918
    ]

    # Hosts the guard must let through.
    PERMITTED_URLS = [
        "https://93.184.216.34/file.csv",  # public IP literal
        "https://10.1.2.3/file.csv",  # RFC1918 -- ALLOWED for BYOC
        "http://192.168.1.5/file.csv",  # RFC1918 -- ALLOWED for BYOC
        "http://172.16.0.9/file.csv",  # RFC1918 -- ALLOWED for BYOC
    ]

    @staticmethod
    def _load_guard():
        # Imported lazily, inside the test run, exactly like the tests did
        # individually before.
        from keboola_agent_cli.client import _assert_safe_download_url

        return _assert_safe_download_url

    @pytest.mark.parametrize("url", REFUSED_URLS)
    def test_rejects_non_public_hosts(self, url: str) -> None:
        guard = self._load_guard()

        with pytest.raises(KeboolaApiError):
            guard(url)

    @pytest.mark.parametrize("url", PERMITTED_URLS)
    def test_allows_public_and_private_byoc_hosts(self, url: str) -> None:
        guard = self._load_guard()

        guard(url)  # must not raise

    def test_rejects_url_without_host(self) -> None:
        guard = self._load_guard()

        with pytest.raises(KeboolaApiError):
            guard("file:///etc/passwd")