From e94ed1c6ddd424e0340c0a797a483e8e8cb210f3 Mon Sep 17 00:00:00 2001
From: Ruchit Agrawal
Date: Sat, 28 Mar 2026 02:39:24 +0530
Subject: [PATCH] fix: issue 795-download_url from PURL when not provided

Signed-off-by: Ruchit Agrawal
---
 packagedb/models.py                       | 14 ++++
 packagedb/purl_url_utils.py               | 74 ++++++++++++++++++++
 packagedb/tests/test_purl_download_url.py | 83 +++++++++++++++++++++++
 3 files changed, 171 insertions(+)
 create mode 100644 packagedb/purl_url_utils.py
 create mode 100644 packagedb/tests/test_purl_download_url.py

diff --git a/packagedb/models.py b/packagedb/models.py
index a774592e..11150ffa 100644
--- a/packagedb/models.py
+++ b/packagedb/models.py
@@ -38,6 +38,7 @@
 from rest_framework.authtoken.models import Token
 
 from packagedb import schedules
+from packagedb.purl_url_utils import derive_download_url
 
 TRACE = False
 
@@ -574,6 +575,19 @@ class Meta:
     def __str__(self):
         return self.package_url
 
+    def save(self, *args, **kwargs):
+        """
+        Override save to auto-derive download_url from PURL if not provided.
+
+        Packages coming from federatedcode repos may not have a download_url.
+        We use purl2url to infer a real download URL when possible, and fall
+        back to a synthetic unique URL derived from the PURL components.
+        """
+        if not self.download_url and self.purl:
+            self.download_url = derive_download_url(self.purl)
+
+        super().save(*args, **kwargs)
+
     @property
     def purl(self):
         return self.package_url
diff --git a/packagedb/purl_url_utils.py b/packagedb/purl_url_utils.py
new file mode 100644
index 00000000..f3451419
--- /dev/null
+++ b/packagedb/purl_url_utils.py
@@ -0,0 +1,74 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import logging
+
+from packageurl import PackageURL
+from packageurl.contrib import purl2url
+
+logger = logging.getLogger(__name__)
+
+
+def derive_download_url(purl_string, provided_download_url=None):
+    """
+    Return a download URL for the package identified by ``purl_string``.
+
+    If ``provided_download_url`` is given it is returned as-is. Otherwise
+    purl2url is used to infer a real download URL. When that also fails a
+    synthetic URL is built from the PURL components so that the unique
+    constraint on ``Package.download_url`` can still be satisfied.
+    """
+    if provided_download_url:
+        return provided_download_url
+
+    try:
+        download_url = purl2url.get_download_url(purl_string)
+        if download_url:
+            return download_url
+    except Exception as e:
+        logger.debug("purl2url could not infer a URL for %r: %s", purl_string, e)
+
+    # Fall back to a synthetic URL so the uniqueness constraint is satisfied
+    # even when no real download URL is available (e.g. packages from
+    # federatedcode that only carry a PURL).
+    try:
+        purl = PackageURL.from_string(purl_string)
+        return generate_synthetic_download_url(purl)
+    except Exception as e:
+        logger.warning("Could not generate download URL for %r: %s", purl_string, e)
+        return f"purl:{purl_string}"
+
+
+def generate_synthetic_download_url(purl):
+    """
+    Return a synthetic download URL for ``purl`` in the form:
+        purl://{type}/{namespace}/{name}@{version}?{qualifiers}#{subpath}
+
+    All PURL components that affect identity are included so that two
+    packages which differ only by qualifier (e.g. Maven JARs with different
+    classifiers) still receive distinct synthetic URLs.
+    """
+    parts = ["purl://", purl.type]
+
+    if purl.namespace:
+        parts += ["/", purl.namespace]
+
+    parts += ["/", purl.name]
+
+    if purl.version:
+        parts += ["@", purl.version]
+
+    if purl.qualifiers:
+        qual_str = "&".join(f"{k}={v}" for k, v in sorted(purl.qualifiers.items()))
+        parts += ["?", qual_str]
+
+    if purl.subpath:
+        parts += ["#", purl.subpath]
+
+    return "".join(parts)
diff --git a/packagedb/tests/test_purl_download_url.py b/packagedb/tests/test_purl_download_url.py
new file mode 100644
index 00000000..6049ae27
--- /dev/null
+++ b/packagedb/tests/test_purl_download_url.py
@@ -0,0 +1,83 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import unittest
+from unittest.mock import patch
+
+from packageurl import PackageURL
+
+from packagedb.purl_url_utils import derive_download_url
+from packagedb.purl_url_utils import generate_synthetic_download_url
+
+
+class TestDeriveDownloadURL(unittest.TestCase):
+    def test_provided_url_takes_precedence(self):
+        provided = "https://example.com/lodash-4.17.21.tgz"
+        result = derive_download_url("pkg:npm/lodash@4.17.21", provided)
+        self.assertEqual(result, provided)
+
+    @patch("packagedb.purl_url_utils.purl2url.get_download_url")
+    def test_infers_url_from_purl(self, mock_get_download):
+        expected = "https://rubygems.org/downloads/bundler-2.3.23.gem"
+        mock_get_download.return_value = expected
+
+        result = derive_download_url("pkg:gem/bundler@2.3.23")
+
+        mock_get_download.assert_called_once_with("pkg:gem/bundler@2.3.23")
+        self.assertEqual(result, expected)
+
+    @patch("packagedb.purl_url_utils.purl2url.get_download_url")
+    def test_falls_back_to_synthetic_url(self, mock_get_download):
+        mock_get_download.side_effect = Exception("cannot infer")
+
+        result = derive_download_url("pkg:generic/some-package@1.0.0")
+
+        self.assertTrue(result.startswith("purl://"))
+        self.assertIn("generic/some-package@1.0.0", result)
+
+    def test_invalid_purl_does_not_raise(self):
+        # Last-resort fallback: returns a purl:-prefixed string
+        result = derive_download_url("not-a-valid-purl")
+        self.assertIsNotNone(result)
+        self.assertIn("purl:", result)
+
+
+class TestGenerateSyntheticDownloadURL(unittest.TestCase):
+    def test_basic(self):
+        purl = PackageURL.from_string("pkg:npm/express@4.17.1")
+        self.assertEqual(generate_synthetic_download_url(purl), "purl://npm/express@4.17.1")
+
+    def test_includes_namespace(self):
+        purl = PackageURL.from_string("pkg:maven/org.apache.commons/commons-lang3@3.12.0")
+        url = generate_synthetic_download_url(purl)
+        self.assertTrue(url.startswith("purl://maven/org.apache.commons/"))
+        self.assertIn("commons-lang3@3.12.0", url)
+
+    def test_qualifiers_differentiate_packages(self):
+        # Maven JARs with different classifiers must produce different URLs
+        purl1 = PackageURL.from_string("pkg:maven/com.example/lib@1.0.0")
+        purl2 = PackageURL.from_string("pkg:maven/com.example/lib@1.0.0?classifier=sources")
+        self.assertNotEqual(
+            generate_synthetic_download_url(purl1),
+            generate_synthetic_download_url(purl2),
+        )
+
+    def test_no_version(self):
+        purl = PackageURL.from_string("pkg:npm/express")
+        url = generate_synthetic_download_url(purl)
+        self.assertNotIn("@", url)
+
+    def test_includes_subpath(self):
+        purl = PackageURL.from_string("pkg:github/user/repo@v1.0#path/to/file")
+        url = generate_synthetic_download_url(purl)
+        self.assertIn("#path/to/file", url)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)