diff --git a/minecode/model_utils.py b/minecode/model_utils.py index bea192cd..fc8d5662 100644 --- a/minecode/model_utils.py +++ b/minecode/model_utils.py @@ -16,6 +16,7 @@ from packagedb.models import PackageSet from packagedb.models import Party from packagedb.models import Resource + from packagedb.serializers import DependentPackageSerializer from packagedb.serializers import PartySerializer @@ -67,6 +68,23 @@ def add_package_to_scan_queue(package, pipelines=DEFAULT_PIPELINES, priority=0, logger.debug(f" + Inserted ScannableURI\t: {uri}") +def _create_vcs_aliases(old_url, new_url): + try: + from purl2vcs.find_source_repo import convert_repo_urls_to_purls + from packagedb.models import VcsAlias + + old_purls = list(convert_repo_urls_to_purls([old_url])) + new_purls = list(convert_repo_urls_to_purls([new_url])) + + for old_purl in old_purls: + for new_purl in new_purls: + VcsAlias.objects.get_or_create( + old_vcs_purl=str(old_purl), new_vcs_purl=str(new_purl) + ) + except Exception as e: + logger.error(f"Failed to create VcsAlias: {e}") + + def merge_packages(existing_package, new_package_data, replace=False): """ Merge the data from the `new_package_data` mapping into the @@ -82,7 +100,6 @@ def merge_packages(existing_package, new_package_data, replace=False): field value is left unchanged in this case. """ existing_mapping = existing_package.to_dict() - # We remove `purl` from `existing_mapping` because we use the other purl # fields (type, namespace, name, version, etc.) to generate the purl. 
existing_mapping.pop("purl") @@ -209,6 +226,10 @@ def merge_packages(existing_package, new_package_data, replace=False): new_value = new_mapping.extra_data.get("package_content") if not new_value: continue + elif existing_field == "vcs_url" or existing_field == "homepage_url": + if existing_value and new_value and existing_value != new_value: + _create_vcs_aliases(existing_value, new_value) + # Continue normally to update the field elif existing_field in fields_to_skip: # Continue to next field continue @@ -243,7 +264,6 @@ def merge_or_create_package(scanned_package, visit_level, override=False, filena merged = False package = None map_error = "" - mining_level = visit_level if override: # this will force the data override @@ -396,6 +416,28 @@ def merge_or_create_package(scanned_package, visit_level, override=False, filena if created: created_package.append_to_history(f"New Package created from URI: {package_uri}") + older_packages = Package.objects.filter( + type=scanned_package.type or "", + namespace=scanned_package.namespace or "", + name=scanned_package.name or "", + ).exclude(version=scanned_package.version) + + if older_packages.exists(): + older_package = older_packages.order_by("-pk").first() + if ( + older_package.vcs_url + and created_package.vcs_url + and older_package.vcs_url != created_package.vcs_url + ): + _create_vcs_aliases(older_package.vcs_url, created_package.vcs_url) + if ( + older_package.homepage_url + and created_package.homepage_url + and older_package.homepage_url != created_package.homepage_url + ): + # Some packages have their homepage url set to their vcs url, so we should create an alias for that too + _create_vcs_aliases(older_package.homepage_url, created_package.homepage_url) + # This is used in the case of Maven packages created from the priority queue for h in history: created_package.append_to_history(h) diff --git a/packagedb/migrations/0095_vcsalias.py b/packagedb/migrations/0095_vcsalias.py new file mode 100644 index 
class VcsAlias(models.Model):
    """
    One directed "old -> new" link between two VCS PURL strings: the PURL
    stored in `old_vcs_purl` has been superseded by the one stored in
    `new_vcs_purl`.

    Use `resolve_purl` to follow these links from a PURL to the one that
    replaces it.
    """

    # NOTE(review): each of these two columns is declared for indexing twice,
    # once with ``db_index=True`` here and once more in ``Meta.indexes``
    # below. Dropping either declaration changes the schema and needs a
    # matching migration, so both are left untouched.
    old_vcs_purl = models.CharField(max_length=2048, db_index=True)
    new_vcs_purl = models.CharField(max_length=2048, db_index=True)
    # Filled in automatically when the row is first created.
    created_date = models.DateTimeField(auto_now_add=True)

    class Meta:
        # A given old -> new link is stored at most once.
        unique_together = ["old_vcs_purl", "new_vcs_purl"]
        indexes = [
            models.Index(fields=["old_vcs_purl"]),
            models.Index(fields=["new_vcs_purl"]),
        ]

    def __str__(self):
        return f"{self.old_vcs_purl} -> {self.new_vcs_purl}"

    @classmethod
    def resolve_purl(cls, vcs_purl_str):
        """
        Return the PURL string reached by following the chain of VcsAlias
        rows that starts at the `vcs_purl_str` PURL string.

        Return `vcs_purl_str` unchanged when it is empty or when no alias has
        it as its old PURL.

        When several aliases share the same old PURL, the most recently
        created one is followed, so that the result reflects the latest known
        location.

        If the aliases form a cycle, the walk stops at the first PURL that
        was already visited and that PURL is returned.

        One database query is made for each link followed.
        """
        if not vcs_purl_str:
            return vcs_purl_str

        current_purl = vcs_purl_str
        # PURLs already walked through, to stop on alias cycles.
        visited = set()

        while current_purl not in visited:
            visited.add(current_purl)

            # Explicit newest-first ordering: without it, ``first()`` orders
            # by primary key and would return the oldest alias.
            next_purl = (
                cls.objects.filter(old_vcs_purl=current_purl)
                .order_by("-created_date", "-pk")
                .values_list("new_vcs_purl", flat=True)
                .first()
            )

            if next_purl is None:
                # End of the chain: nothing supersedes the current PURL.
                break

            current_purl = next_purl

        return current_purl