diff --git a/minecode_pipelines/pipelines/mine_alpine.py b/minecode_pipelines/pipelines/mine_alpine.py index 63b447c2..a2fb1fdf 100644 --- a/minecode_pipelines/pipelines/mine_alpine.py +++ b/minecode_pipelines/pipelines/mine_alpine.py @@ -20,6 +20,9 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +from scanpipe.pipes import federatedcode + +from minecode_pipelines import pipes from minecode_pipelines.pipelines import MineCodeBasePipeline from minecode_pipelines.pipes import alpine @@ -28,16 +31,38 @@ class MineAlpine(MineCodeBasePipeline): """Mine PackageURLs from alpine index and publish them to FederatedCode.""" + pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/" + checkpoint_path = "alpine/checkpoints.json" + @classmethod def steps(cls): return ( cls.check_federatedcode_eligibility, cls.create_federatedcode_working_dir, cls.fetch_federation_config, + cls.fetch_checkpoint, cls.mine_and_publish_alpine_packageurls, + cls.save_checkpoint, cls.delete_working_dir, ) + def fetch_checkpoint(self): + """Fetch the list of already-processed Alpine index URLs.""" + self.checkpoint_config_repo = federatedcode.clone_repository( + repo_url=self.pipeline_config_repo, + clone_path=self.working_path / "minecode-pipelines-config", + logger=self.log, + ) + checkpoint = pipes.get_checkpoint_from_file( + cloned_repo=self.checkpoint_config_repo, + path=self.checkpoint_path, + ) + self.processed_indexes = set(checkpoint.get("processed_indexes", [])) + self.log( + f"Loaded checkpoint with {len(self.processed_indexes)} " + f"already-processed indexes." 
+ ) + def mine_and_publish_alpine_packageurls(self): alpine.mine_and_publish_alpine_packageurls( data_cluster=self.data_cluster, @@ -45,4 +70,22 @@ def mine_and_publish_alpine_packageurls(self): working_path=self.working_path, commit_msg_func=self.commit_message, logger=self.log, + processed_indexes=self.processed_indexes, + checkpoint_func=self._save_checkpoint, + ) + + def _save_checkpoint(self): + """Save current set of processed indexes as a checkpoint.""" + checkpoint = {"processed_indexes": sorted(self.processed_indexes)} + self.log( + f"Saving checkpoint with {len(self.processed_indexes)} processed indexes." + ) + pipes.update_checkpoints_in_github( + checkpoint=checkpoint, + cloned_repo=self.checkpoint_config_repo, + path=self.checkpoint_path, + logger=self.log, ) + + def save_checkpoint(self): + self._save_checkpoint() diff --git a/minecode_pipelines/pipelines/mine_maven.py b/minecode_pipelines/pipelines/mine_maven.py index e11c7b6c..b3d019c2 100644 --- a/minecode_pipelines/pipelines/mine_maven.py +++ b/minecode_pipelines/pipelines/mine_maven.py @@ -65,9 +65,27 @@ def fetch_checkpoint_and_maven_index(self): logger=self.log, ) + # Determine if we can resume from the last processed purl + saved_checksum = checkpoint.get("index_checksum") + current_checksum = self.maven_nexus_collector.index_checksum + self.last_processed_purl = None + + if saved_checksum and saved_checksum == current_checksum: + self.last_processed_purl = checkpoint.get("last_processed_purl") + if self.last_processed_purl: + self.log( + f"Index checksum matches. Resuming from: {self.last_processed_purl}" + ) + elif saved_checksum and saved_checksum != current_checksum: + self.log( + "Index checksum changed. Starting from beginning." 
+ ) + def mine_and_publish_maven_packageurls(self): _mine_and_publish_packageurls( - packageurls=self.maven_nexus_collector.get_packages(), + packageurls=self.maven_nexus_collector.get_packages( + last_processed_purl=self.last_processed_purl, + ), total_package_count=None, data_cluster=self.data_cluster, checked_out_repos=self.checked_out_repos, @@ -75,13 +93,19 @@ def mine_and_publish_maven_packageurls(self): append_purls=self.append_purls, commit_msg_func=self.commit_message, logger=self.log, + checkpoint_func=self._save_checkpoint, ) - def save_check_point(self): + def _save_checkpoint(self): + """Save current progress as a checkpoint.""" last_incremental = self.maven_nexus_collector.index_properties.get( "nexus.index.last-incremental" ) - checkpoint = {"last_incremental": last_incremental} + checkpoint = { + "last_incremental": last_incremental, + "index_checksum": self.maven_nexus_collector.index_checksum, + "last_processed_purl": self.maven_nexus_collector.last_processed_purl, + } self.log(f"Saving checkpoint: {checkpoint}") pipes.update_checkpoints_in_github( checkpoint=checkpoint, @@ -89,3 +113,6 @@ def save_check_point(self): path=self.checkpoint_path, logger=self.log, ) + + def save_check_point(self): + self._save_checkpoint() diff --git a/minecode_pipelines/pipes/alpine.py b/minecode_pipelines/pipes/alpine.py index 009cf637..cfddaf43 100644 --- a/minecode_pipelines/pipes/alpine.py +++ b/minecode_pipelines/pipes/alpine.py @@ -551,10 +551,29 @@ def mine_and_publish_alpine_packageurls( working_path, commit_msg_func, logger, + processed_indexes=None, + checkpoint_func=None, ): """Yield PackageURLs from Alpine index.""" - index_count = len(ALPINE_LINUX_APKINDEX_URLS) + if processed_indexes is None: + processed_indexes = set() + + indexes_to_process = [ + url for url in ALPINE_LINUX_APKINDEX_URLS + if url not in processed_indexes + ] + + index_count = len(indexes_to_process) + total_count = len(ALPINE_LINUX_APKINDEX_URLS) + skipped_count = total_count - 
index_count + + if skipped_count: + logger( + f"Skipping {skipped_count:,d} already-processed indexes. " + f"Processing {index_count:,d} remaining indexes." + ) + progress = LoopProgress( total_iterations=index_count, logger=logger, @@ -563,7 +582,7 @@ def mine_and_publish_alpine_packageurls( logger(f"Mine PackageURL from {index_count:,d} alpine index.") alpine_collector = AlpineCollector() - for index in progress.iter(ALPINE_LINUX_APKINDEX_URLS): + for index in progress.iter(indexes_to_process): logger(f"Mine PackageURL from {index} index.") _mine_and_publish_packageurls( packageurls=alpine_collector.get_package_from_index(index), @@ -575,3 +594,6 @@ def mine_and_publish_alpine_packageurls( commit_msg_func=commit_msg_func, logger=logger, ) + processed_indexes.add(index) + if checkpoint_func: + checkpoint_func() diff --git a/minecode_pipelines/pipes/maven.py b/minecode_pipelines/pipes/maven.py index dd536194..c0c02c7c 100644 --- a/minecode_pipelines/pipes/maven.py +++ b/minecode_pipelines/pipes/maven.py @@ -8,6 +8,7 @@ # import gzip +import hashlib import io import os from collections import namedtuple @@ -574,6 +575,7 @@ def __init__( index_location=None, index_properties_location=None, last_incremental=None, + last_processed_purl=None, logger=None, ): if index_location and last_incremental: @@ -584,6 +586,7 @@ def __init__( ) self.downloads = [] + self.last_processed_purl = last_processed_purl if index_properties_location: self.index_properties_location = index_properties_location @@ -615,6 +618,25 @@ def __init__( self.index_location = index_download.path self.index_increment_locations = [] + self.index_checksum = self._compute_index_checksum() + + def _compute_index_checksum(self): + """Compute SHA-256 checksum of the index file(s) for checkpoint validation.""" + locations = [] + if self.index_location: + locations.append(self.index_location) + locations.extend(self.index_increment_locations) + + if not locations: + return None + + sha256 = hashlib.sha256() + 
for location in sorted(locations): + with open(location, "rb") as f: + for chunk in iter(lambda: f.read(8192), b""): + sha256.update(chunk) + return sha256.hexdigest() + def __del__(self): if self.downloads: for download in self.downloads: @@ -658,8 +680,9 @@ def _fetch_index_increments(self, last_incremental): index_increment_downloads.append(index_increment) return index_increment_downloads - def _get_packages(self, content=None): + def _get_packages(self, content=None, last_processed_purl=None): artifacts = get_artifacts(content, worthyness=is_worthy_artifact) + skipping = bool(last_processed_purl) for artifact in artifacts: # we cannot do much without these @@ -723,17 +746,27 @@ def _get_packages(self, content=None): name=artifact_id, version=version, ) + + if skipping: + if str(current_purl) == last_processed_purl: + skipping = False + continue + + self.last_processed_purl = str(current_purl) yield current_purl, [package.purl] def _get_packages_from_index_increments(self): for index_increment in self.index_increment_locations: yield self._get_packages(content=index_increment) - def get_packages(self): + def get_packages(self, last_processed_purl=None): """Yield Package objects from maven index or index increments""" packages = [] if self.index_increment_locations: packages = chain(self._get_packages_from_index_increments()) elif self.index_location: - packages = self._get_packages(content=self.index_location) + packages = self._get_packages( + content=self.index_location, + last_processed_purl=last_processed_purl, + ) return packages diff --git a/minecode_pipelines/tests/data/alpine/expected_packages.json b/minecode_pipelines/tests/data/alpine/expected_packages.json index 925834bf..0ab7693c 100644 --- a/minecode_pipelines/tests/data/alpine/expected_packages.json +++ b/minecode_pipelines/tests/data/alpine/expected_packages.json @@ -2,7 +2,7 @@ { "type": "apk", "namespace": "alpine", - "name": "prspkt", + "name": "2bwm", "version": "0.3-r2", "qualifiers": { 
"arch": "x86_64", @@ -75,12 +75,12 @@ "repository_download_url": null, "api_data_url": null, "datasource_id": "alpine_metadata", - "purl": "pkg:apk/alpine/prspkt@0.3-r2?arch=x86_64&distro=v3.22" + "purl": "pkg:apk/alpine/2bwm@0.3-r2?arch=x86_64&distro=v3.22" }, { "type": "apk", "namespace": "alpine", - "name": "prspkt", + "name": "2bwm-doc", "version": "0.3-r2", "qualifiers": { "arch": "x86_64", @@ -153,6 +153,6 @@ "repository_download_url": null, "api_data_url": null, "datasource_id": "alpine_metadata", - "purl": "pkg:apk/alpine/prspkt@0.3-r2?arch=x86_64&distro=v3.22" + "purl": "pkg:apk/alpine/2bwm-doc@0.3-r2?arch=x86_64&distro=v3.22" } ] \ No newline at end of file diff --git a/minecode_pipelines/tests/data/maven/index/nexus-maven-repository-index.properties b/minecode_pipelines/tests/data/maven/index/nexus-maven-repository-index.properties new file mode 100644 index 00000000..75a5d3d7 --- /dev/null +++ b/minecode_pipelines/tests/data/maven/index/nexus-maven-repository-index.properties @@ -0,0 +1,6 @@ +#Mon Oct 29 01:28:33 UTC 2018 +nexus.index.id=central +nexus.index.chain-id=1318453614498 +nexus.index.timestamp=20181029012159.470 +0000 +nexus.index.last-incremental=445 +nexus.index.time=20120615133728.952 +0000 diff --git a/minecode_pipelines/tests/pipes/test_alpine.py b/minecode_pipelines/tests/pipes/test_alpine.py index ff0b91b3..9b9b11ea 100644 --- a/minecode_pipelines/tests/pipes/test_alpine.py +++ b/minecode_pipelines/tests/pipes/test_alpine.py @@ -8,6 +8,7 @@ # import os +from unittest.mock import patch from commoncode.testcase import check_against_expected_json_file from commoncode.testcase import FileBasedTesting @@ -27,3 +28,51 @@ def test_parse_apkindex_and_build_package(self): packages.append(pd.to_dict()) expected_loc = self.get_test_loc("alpine/expected_packages.json") check_against_expected_json_file(packages, expected_loc, regen=False) + + def test_mine_and_publish_skips_processed_indexes(self): + """Test that already-processed index URLs are 
skipped.""" + all_urls = alpine.ALPINE_LINUX_APKINDEX_URLS + processed_indexes = set(all_urls[1:]) + calls = [] + + def mock_mine_and_publish(**kwargs): + calls.append(kwargs.get("packageurls")) + + with patch( + "minecode_pipelines.pipes.alpine._mine_and_publish_packageurls", + side_effect=mock_mine_and_publish, + ): + alpine.mine_and_publish_alpine_packageurls( + data_cluster=None, + checked_out_repos={}, + working_path=None, + commit_msg_func=lambda *a, **kw: "test", + logger=lambda msg: None, + processed_indexes=processed_indexes, + ) + self.assertEqual(len(calls), 1) + + def test_mine_and_publish_updates_processed_indexes(self): + """Test that processed_indexes is updated after each index.""" + processed_indexes = set() + + with patch( + "minecode_pipelines.pipes.alpine._mine_and_publish_packageurls", + ): + original_urls = alpine.ALPINE_LINUX_APKINDEX_URLS + alpine.ALPINE_LINUX_APKINDEX_URLS = original_urls[:2] + try: + alpine.mine_and_publish_alpine_packageurls( + data_cluster=None, + checked_out_repos={}, + working_path=None, + commit_msg_func=lambda *a, **kw: "test", + logger=lambda msg: None, + processed_indexes=processed_indexes, + ) + finally: + alpine.ALPINE_LINUX_APKINDEX_URLS = original_urls + + self.assertEqual(len(processed_indexes), 2) + self.assertIn(original_urls[0], processed_indexes) + self.assertIn(original_urls[1], processed_indexes) diff --git a/minecode_pipelines/tests/pipes/test_conan.py b/minecode_pipelines/tests/pipes/test_conan.py index 044fd1ee..8715d5dd 100644 --- a/minecode_pipelines/tests/pipes/test_conan.py +++ b/minecode_pipelines/tests/pipes/test_conan.py @@ -19,7 +19,7 @@ class ConanPipelineTests(TestCase): - def test_collect_packages_from_conan_calls_write(self, mock_write): + def test_collect_packages_from_conan_calls_write(self): packages_file = DATA_DIR / "cairo" / "cairo-config.yml" expected_file = DATA_DIR / "expected-cairo-purls.yml" diff --git a/minecode_pipelines/tests/pipes/test_maven.py 
b/minecode_pipelines/tests/pipes/test_maven.py index 199c8322..3d927121 100644 --- a/minecode_pipelines/tests/pipes/test_maven.py +++ b/minecode_pipelines/tests/pipes/test_maven.py @@ -7,6 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import hashlib import os from commoncode.testcase import check_against_expected_json_file @@ -18,6 +19,82 @@ class MavenMiscTest(FileBasedTesting): test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data") + def test_maven_collector_index_checksum(self): + index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") + props = self.get_test_loc("maven/index/nexus-maven-repository-index.properties") + collector = maven.MavenNexusCollector( + index_location=index, + index_properties_location=props, + ) + self.assertIsNotNone(collector.index_checksum) + self.assertEqual(len(collector.index_checksum), 64) + int(collector.index_checksum, 16) + + collector2 = maven.MavenNexusCollector( + index_location=index, + index_properties_location=props, + ) + self.assertEqual(collector.index_checksum, collector2.index_checksum) + + def test_maven_get_packages_resume(self): + index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") + props = self.get_test_loc("maven/index/nexus-maven-repository-index.properties") + collector = maven.MavenNexusCollector( + index_location=index, + index_properties_location=props, + ) + + all_packages = list(collector.get_packages()) + + if len(all_packages) < 2: + return + + first_purl = str(all_packages[0][0]) + collector2 = maven.MavenNexusCollector( + index_location=index, + index_properties_location=props, + ) + resumed_packages = list(collector2.get_packages(last_processed_purl=first_purl)) + + self.assertEqual(len(resumed_packages), len(all_packages) - 1) + for resumed, original in zip(resumed_packages, all_packages[1:]): + self.assertEqual(str(resumed[0]), str(original[0])) + + def 
test_maven_get_packages_resume_nonexistent_purl(self): + index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") + props = self.get_test_loc("maven/index/nexus-maven-repository-index.properties") + collector = maven.MavenNexusCollector( + index_location=index, + index_properties_location=props, + ) + + all_packages = list(collector.get_packages()) + collector2 = maven.MavenNexusCollector( + index_location=index, + index_properties_location=props, + ) + resumed_packages = list( + collector2.get_packages(last_processed_purl="pkg:maven/nonexistent/package@0.0.0") + ) + + self.assertEqual(len(resumed_packages), 0) + + def test_maven_collector_tracks_last_processed_purl(self): + index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") + props = self.get_test_loc("maven/index/nexus-maven-repository-index.properties") + collector = maven.MavenNexusCollector( + index_location=index, + index_properties_location=props, + ) + self.assertIsNone(collector.last_processed_purl) + + packages = list(collector.get_packages()) + if packages: + self.assertIsNotNone(collector.last_processed_purl) + self.assertEqual( + collector.last_processed_purl, str(packages[-1][0]) + ) + def test_get_entries(self): index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") fields = ( diff --git a/minecode_pipelines/tests/pipes/test_swift.py b/minecode_pipelines/tests/pipes/test_swift.py index ea1eafee..bc374223 100644 --- a/minecode_pipelines/tests/pipes/test_swift.py +++ b/minecode_pipelines/tests/pipes/test_swift.py @@ -28,7 +28,7 @@ def _run_package_test(self, package_repo_url, commits_tags_file, expected_file): git_ls_remote = f.read() with open(expected_file, encoding="utf-8") as f: - expected = saneyaml.load(f) + expected_purls = saneyaml.load(f) tags_and_commits = get_tags_and_commits_from_git_output(git_ls_remote) @@ -38,23 +38,14 @@ def _run_package_test(self, package_repo_url, commits_tags_file, expected_file): logger=logger, ) - assert 
base_purl.to_string() == expected["base_purl"] - assert sorted(generated_purls) == sorted(expected["purls"]) + result_purls = sorted(str(p) for p in generated_purls) + assert result_purls == sorted(expected_purls) - def test_swift_safe_collection_access(self, mock_write): + def test_swift_safe_collection_access(self): self._run_package_test( - mock_write, package_repo_url="https://github.com/RougeWare/Swift-Safe-Collection-Access", commits_tags_file=DATA_DIR / "commits_tags1.txt", expected_file=DATA_DIR / "expected1.yaml", - expected_path_parts=[ - "aboutcode-packages-swift-0", - "swift", - "github.com", - "RougeWare", - "Swift-Safe-Collection-Access", - "purls.yml", - ], ) def test_human_string(self): @@ -62,14 +53,6 @@ def test_human_string(self): package_repo_url="https://github.com/zonble/HumanString.git", commits_tags_file=DATA_DIR / "commits_tags2.txt", expected_file=DATA_DIR / "expected2.yaml", - expected_path_parts=[ - "aboutcode-packages-swift-0", - "swift", - "github.com", - "zonble", - "HumanString", - "purls.yml", - ], ) def test_swift_financial(self): @@ -77,14 +60,6 @@ def test_swift_financial(self): package_repo_url="https://github.com/zrluety/SwiftFinancial.git", commits_tags_file=DATA_DIR / "commits_tags3.txt", expected_file=DATA_DIR / "expected3.yaml", - expected_path_parts=[ - "aboutcode-packages-swift-0", - "swift", - "github.com", - "zrluety", - "SwiftFinancial", - "purls.yml", - ], ) def test_swift_xcf_sodium(self): @@ -92,12 +67,4 @@ def test_swift_xcf_sodium(self): package_repo_url="https://github.com/0xacdc/XCFSodium", commits_tags_file=DATA_DIR / "commits_tags4.txt", expected_file=DATA_DIR / "expected4.yaml", - expected_path_parts=[ - "aboutcode-packages-swift-0", - "swift", - "github.com", - "0xacdc", - "XCFSodium", - "purls.yml", - ], ) diff --git a/purldb_project/settings.py b/purldb_project/settings.py index c6c2c731..75377a9a 100644 --- a/purldb_project/settings.py +++ b/purldb_project/settings.py @@ -65,6 +65,7 @@ "minecode", 
"matchcode",
"packagedb",
+"scanpipe",
# Django built-in
"django.contrib.auth",
"django.contrib.contenttypes",
@@ -74,6 +75,7 @@
"django.contrib.admin",
"django.contrib.humanize",
# Third-party apps
+ "taggit",
"django_filters",
"rest_framework",
"drf_spectacular",
@@ -332,3 +334,32 @@
FEDERATEDCODE_GIT_SERVICE_TOKEN = env.str("FEDERATEDCODE_GIT_SERVICE_TOKEN", default="")
FEDERATEDCODE_GIT_SERVICE_NAME = env.str("FEDERATEDCODE_GIT_SERVICE_NAME", default="")
FEDERATEDCODE_GIT_SERVICE_EMAIL = env.str("FEDERATEDCODE_GIT_SERVICE_EMAIL", default="")
+
+VULNERABLECODE_URL = env.str("VULNERABLECODE_URL", default="")
+VULNERABLECODE_USER = env.str("VULNERABLECODE_USER", default="")
+VULNERABLECODE_PASSWORD = env.str("VULNERABLECODE_PASSWORD", default="")
+VULNERABLECODE_API_KEY = env.str("VULNERABLECODE_API_KEY", default="")
+
+MATCHCODEIO_URL = env.str("MATCHCODEIO_URL", default="")
+MATCHCODEIO_USER = env.str("MATCHCODEIO_USER", default="")
+MATCHCODEIO_PASSWORD = env.str("MATCHCODEIO_PASSWORD", default="")
+MATCHCODEIO_API_KEY = env.str("MATCHCODEIO_API_KEY", default="")
+
+PURLDB_URL = env.str("PURLDB_URL", default="")
+PURLDB_USER = env.str("PURLDB_USER", default="")
+PURLDB_PASSWORD = env.str("PURLDB_PASSWORD", default="")
+PURLDB_API_KEY = env.str("PURLDB_API_KEY", default="")
+
+SCANCODEIO_WORKSPACE_LOCATION = "/tmp/scancodeio-workspace"
+SCANCODEIO_STORAGE_LOCATION = "/tmp/scancodeio-storage"
+SCANCODEIO_MOST_COMMON_LIMIT = 50
+SCANCODEIO_SCAN_FILE_TIMEOUT = 3600
+# NOTE(review): hardcoded test values must not clobber the env-based settings above
+# PURLDB_URL = "http://localhost:8000"
+# PURLDB_USER = "test-user"
+# PURLDB_PASSWORD = "test-password"
+# PURLDB_API_KEY = "test-api-key"
+# MATCHCODEIO_URL = "http://localhost:8001"
+# MATCHCODEIO_USER = "test-user"
+# MATCHCODEIO_PASSWORD = "test-password"
+SCANCODEIO_ASYNC = False