Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions packagedb/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from rest_framework.authtoken.models import Token

from packagedb import schedules
from packagedb.purl_url_utils import derive_download_url

TRACE = False

Expand Down Expand Up @@ -574,6 +575,19 @@ class Meta:
def __str__(self):
    # The string form of this object is its ``package_url`` attribute.
    return self.package_url

def save(self, *args, **kwargs):
    """
    Save this object, deriving ``download_url`` from the PURL when unset.

    Packages coming from federatedcode repos may not have a download_url.
    When ``download_url`` is empty and a PURL is available, the value
    returned by ``derive_download_url`` is stored on the instance before
    delegating to the parent ``save``.

    If the caller limits the write with a non-empty ``update_fields``
    keyword argument, ``download_url`` is added to it so the derived value
    is persisted together with the requested fields instead of living only
    on the in-memory instance. An empty or missing ``update_fields`` is
    passed through untouched.
    """
    if not self.download_url and self.purl:
        self.download_url = derive_download_url(self.purl)

        # Only extend a non-empty restriction: ``None`` already means
        # "all fields", and an empty iterable is left exactly as given.
        update_fields = kwargs.get("update_fields")
        if update_fields:
            kwargs["update_fields"] = {"download_url", *update_fields}

    super().save(*args, **kwargs)

@property
def purl(self):
    # Read-only alias: returns the ``package_url`` attribute unchanged.
    return self.package_url
Expand Down
74 changes: 74 additions & 0 deletions packagedb/purl_url_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import logging
from urllib.parse import quote

from packageurl import PackageURL
from packageurl.contrib import purl2url

logger = logging.getLogger(__name__)


def derive_download_url(purl_string, provided_download_url=None):
    """
    Return a download URL for the package identified by ``purl_string``.

    Resolution order:

    1. ``provided_download_url``, returned as-is when it is truthy.
    2. The URL inferred by ``purl2url.get_download_url``, when that call
       returns a truthy value.
    3. A synthetic URL built from the parsed PURL components so that the
       unique constraint on ``Package.download_url`` can still be
       satisfied (e.g. packages from federatedcode that only carry a PURL).
    4. ``"purl:"`` followed by ``purl_string`` when the synthetic URL
       cannot be built either.

    Failures in steps 2 and 3 are caught (any ``Exception``) and logged
    rather than propagated.
    """
    if provided_download_url:
        return provided_download_url

    try:
        download_url = purl2url.get_download_url(purl_string)
    except Exception as e:
        # Inference is best-effort: keep a trace of why it failed instead
        # of discarding the error, then continue with the fallbacks.
        logger.debug(
            "purl2url could not infer a download URL for %r: %s", purl_string, e
        )
        download_url = None

    if download_url:
        return download_url

    try:
        purl = PackageURL.from_string(purl_string)
        return generate_synthetic_download_url(purl)
    except Exception as e:
        logger.warning("Could not generate download URL for %r: %s", purl_string, e)
        return f"purl:{purl_string}"


def generate_synthetic_download_url(purl):
    """
    Return a synthetic download URL for ``purl`` in the form:
        purl://<type>/<namespace>/<name>@<version>?<qualifiers>#<subpath>

    ``purl`` must expose ``type``, ``namespace``, ``name``, ``version``,
    ``qualifiers`` (a mapping) and ``subpath`` attributes. Falsy optional
    components are omitted together with their leading delimiter, and
    qualifiers are emitted sorted by key.

    All PURL components that affect identity are included so that two
    packages which differ only by qualifier (e.g. Maven JARs with different
    classifiers) still receive distinct synthetic URLs. Each component is
    percent-encoded before joining, so a delimiter character occurring
    inside a component (such as ``&`` in a qualifier value or ``@`` in a
    name) cannot make two different PURLs produce the same URL. Components
    made only of letters, digits and ``_.-~`` are emitted unchanged.
    """
    parts = ["purl://", quote(purl.type, safe="")]

    if purl.namespace:
        # "/" separates namespace segments. A raw "@" (npm scopes) is kept
        # readable: the name that follows never contains a raw "/", so the
        # version marker is always found after the last "/".
        parts += ["/", quote(purl.namespace, safe="/@")]

    parts += ["/", quote(purl.name, safe="")]

    if purl.version:
        # "+" and ":" are not delimiters of this format; keep them readable.
        parts += ["@", quote(purl.version, safe="+:")]

    if purl.qualifiers:
        qual_str = "&".join(
            f"{quote(str(k), safe='')}={quote(str(v), safe='/:+')}"
            for k, v in sorted(purl.qualifiers.items())
        )
        parts += ["?", qual_str]

    if purl.subpath:
        parts += ["#", quote(purl.subpath, safe="/")]

    return "".join(parts)
83 changes: 83 additions & 0 deletions packagedb/tests/test_purl_download_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import unittest
from unittest.mock import patch

from packageurl import PackageURL

from packagedb.purl_url_utils import derive_download_url
from packagedb.purl_url_utils import generate_synthetic_download_url


class TestDeriveDownloadURL(unittest.TestCase):
    """Cover each resolution step of ``derive_download_url``."""

    def test_provided_url_takes_precedence(self):
        # An explicit URL must be returned untouched.
        provided = "https://example.com/lodash-4.17.21.tgz"
        result = derive_download_url("pkg:npm/lodash@4.17.21", provided)
        self.assertEqual(result, provided)

    @patch("packagedb.purl_url_utils.purl2url.get_download_url")
    def test_infers_url_from_purl(self, mock_get_download):
        expected = "https://rubygems.org/downloads/bundler-2.3.23.gem"
        mock_get_download.return_value = expected

        result = derive_download_url("pkg:gem/bundler@2.3.23")

        mock_get_download.assert_called_once_with("pkg:gem/bundler@2.3.23")
        self.assertEqual(result, expected)

    @patch("packagedb.purl_url_utils.purl2url.get_download_url")
    def test_falls_back_to_synthetic_url(self, mock_get_download):
        # purl2url raising must not propagate; the synthetic URL is used.
        mock_get_download.side_effect = Exception("cannot infer")

        result = derive_download_url("pkg:generic/some-package@1.0.0")

        self.assertEqual(result, "purl://generic/some-package@1.0.0")

    @patch("packagedb.purl_url_utils.purl2url.get_download_url")
    def test_falls_back_when_no_url_inferred(self, mock_get_download):
        # purl2url returning nothing (rather than raising) takes the same
        # synthetic fallback.
        mock_get_download.return_value = None

        result = derive_download_url("pkg:generic/some-package@1.0.0")

        self.assertEqual(result, "purl://generic/some-package@1.0.0")

    def test_invalid_purl_does_not_raise(self):
        # Last-resort fallback: the raw input prefixed with "purl:".
        result = derive_download_url("not-a-valid-purl")
        self.assertEqual(result, "purl:not-a-valid-purl")


class TestGenerateSyntheticDownloadURL(unittest.TestCase):
    """Check the synthetic URLs built from parsed PURL strings."""

    @staticmethod
    def _synthetic_for(purl_string):
        # Parse the PURL string, then build its synthetic download URL.
        parsed = PackageURL.from_string(purl_string)
        return generate_synthetic_download_url(parsed)

    def test_basic(self):
        synthetic = self._synthetic_for("pkg:npm/express@4.17.1")
        self.assertEqual(synthetic, "purl://npm/express@4.17.1")

    def test_includes_namespace(self):
        synthetic = self._synthetic_for(
            "pkg:maven/org.apache.commons/commons-lang3@3.12.0"
        )
        has_namespace_prefix = synthetic.startswith("purl://maven/org.apache.commons/")
        self.assertTrue(has_namespace_prefix)
        self.assertIn("commons-lang3@3.12.0", synthetic)

    def test_qualifiers_differentiate_packages(self):
        # Maven JARs with different classifiers must produce different URLs
        without_classifier = self._synthetic_for("pkg:maven/com.example/lib@1.0.0")
        with_classifier = self._synthetic_for(
            "pkg:maven/com.example/lib@1.0.0?classifier=sources"
        )
        self.assertNotEqual(without_classifier, with_classifier)

    def test_no_version(self):
        # Without a version there must be no "@" separator at all.
        synthetic = self._synthetic_for("pkg:npm/express")
        self.assertNotIn("@", synthetic)

    def test_includes_subpath(self):
        synthetic = self._synthetic_for("pkg:github/user/repo@v1.0#path/to/file")
        self.assertIn("#path/to/file", synthetic)


# Run the tests in this module with verbose output when the file is
# executed directly as a script.
if __name__ == "__main__":
    unittest.main(verbosity=2)