From 78623875c36fd4f108d12d6cc4d5063edf8c60e5 Mon Sep 17 00:00:00 2001 From: Gal Netanel Date: Mon, 29 Jun 2026 15:42:47 +0300 Subject: [PATCH 1/5] APPENG-5540: enhancements and bug fixes for adding manifest_path and ecosystem fields to agentic logic --- src/exploit_iq_commons/utils/git_utils.py | 39 ++++++++++++++ .../utils/source_code_git_loader.py | 4 ++ .../utils/tests/test_git_utils.py | 51 +++++++++++++++++++ .../functions/cve_clone_and_deps.py | 4 +- .../functions/cve_generate_vdbs.py | 4 +- 5 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 src/exploit_iq_commons/utils/tests/test_git_utils.py diff --git a/src/exploit_iq_commons/utils/git_utils.py b/src/exploit_iq_commons/utils/git_utils.py index c8252bb95..8c6cfde03 100644 --- a/src/exploit_iq_commons/utils/git_utils.py +++ b/src/exploit_iq_commons/utils/git_utils.py @@ -17,6 +17,7 @@ from pathlib import Path from pathlib import PurePath +from pathlib import PurePosixPath from git import Repo from exploit_iq_commons.logging.loggers_factory import LoggingFactory @@ -111,6 +112,44 @@ def get_repo_from_path(base_dir: str, git_repo: str = ".git") -> Repo: else: raise ValueError(f"Path {repo_path} does not exist") + +def validate_manifest_relative_path(repo_path: Path, manifest_relative_path: str) -> None: + """Validate that a manifest path is a safe relative directory under ``repo_path``. + + Raises + ------ + ValueError + If the path is absolute, contains parent-traversal segments, resolves outside + the repository root, or does not exist as a directory. + """ + posix_path = PurePosixPath(manifest_relative_path) + + if posix_path.is_absolute(): + raise ValueError( + f"manifest_relative_path must be a relative path, got absolute path: {manifest_relative_path!r}" + ) + + if ".." 
in posix_path.parts: + raise ValueError( + "manifest_relative_path must not contain parent directory references ('..'): " + f"{manifest_relative_path!r}" + ) + + repo_root = repo_path.resolve() + manifest_dir = (repo_root / manifest_relative_path).resolve() + + if not manifest_dir.is_relative_to(repo_root): + raise ValueError( + f"manifest_relative_path resolves outside the git repository: {manifest_relative_path!r}" + ) + + if not manifest_dir.is_dir(): + raise ValueError( + f"manifest_relative_path {manifest_relative_path!r} does not exist as a directory " + f"under {repo_path}" + ) + + def resolve_path_to_manifest(git_repo_path: Path, manifest_relative_path: str | None = None) -> Path: """Resolve the directory containing the manifest within a Git repository. If ``manifest_relative_path`` is provided, it is treated as a path relative to diff --git a/src/exploit_iq_commons/utils/source_code_git_loader.py b/src/exploit_iq_commons/utils/source_code_git_loader.py index e61b842c9..4d5b2cb3a 100644 --- a/src/exploit_iq_commons/utils/source_code_git_loader.py +++ b/src/exploit_iq_commons/utils/source_code_git_loader.py @@ -39,6 +39,7 @@ ) from exploit_iq_commons.utils.dep_tree import INSTALLED_PACKAGES_FILE, TRANSITIVE_ENV_NAME, Ecosystem +from exploit_iq_commons.utils.git_utils import validate_manifest_relative_path from exploit_iq_commons.utils.transitive_code_searcher_tool import ( TransitiveCodeSearcher, ) @@ -264,6 +265,9 @@ def load_repo(self): repo.git.checkout(self.ref, "--force") + if self._manifest_relative_path: + validate_manifest_relative_path(self.repo_path, self._manifest_relative_path) + logger.info("Loaded Git repository at path: '%s' @ '%s'", self.repo_path, self.ref) TransitiveCodeSearcher.download_dependencies( self.repo_path, diff --git a/src/exploit_iq_commons/utils/tests/test_git_utils.py b/src/exploit_iq_commons/utils/tests/test_git_utils.py new file mode 100644 index 000000000..c8349e375 --- /dev/null +++ 
b/src/exploit_iq_commons/utils/tests/test_git_utils.py @@ -0,0 +1,51 @@ +import pytest +from pathlib import Path + +from exploit_iq_commons.utils.git_utils import resolve_path_to_manifest +from exploit_iq_commons.utils.git_utils import validate_manifest_relative_path + + +@pytest.fixture +def repo_with_subdir(tmp_path: Path) -> Path: + repo_path = tmp_path / "repo" + subdir = repo_path / "module" / "sub" + subdir.mkdir(parents=True) + return repo_path + + +class TestValidateManifestRelativePath: + def test_accepts_valid_subdirectory(self, repo_with_subdir: Path): + validate_manifest_relative_path(repo_with_subdir, "module/sub") + + def test_rejects_absolute_path(self, repo_with_subdir: Path): + with pytest.raises(ValueError, match="must be a relative path"): + validate_manifest_relative_path(repo_with_subdir, "/etc/passwd") + + def test_rejects_parent_traversal(self, repo_with_subdir: Path): + with pytest.raises(ValueError, match="parent directory references"): + validate_manifest_relative_path(repo_with_subdir, "../outside") + + def test_rejects_embedded_parent_traversal(self, repo_with_subdir: Path): + with pytest.raises(ValueError, match="parent directory references"): + validate_manifest_relative_path(repo_with_subdir, "module/../../outside") + + def test_rejects_nonexistent_directory(self, repo_with_subdir: Path): + with pytest.raises(ValueError, match="does not exist as a directory"): + validate_manifest_relative_path(repo_with_subdir, "missing/path") + + def test_rejects_file_path(self, repo_with_subdir: Path): + (repo_with_subdir / "file.txt").write_text("x") + with pytest.raises(ValueError, match="does not exist as a directory"): + validate_manifest_relative_path(repo_with_subdir, "file.txt") + + +class TestResolvePathToManifest: + def test_returns_repo_root_when_manifest_path_is_none(self, repo_with_subdir: Path): + assert resolve_path_to_manifest(repo_with_subdir, None) == repo_with_subdir + + def test_resolves_valid_subdirectory(self, repo_with_subdir: 
Path): + assert resolve_path_to_manifest(repo_with_subdir, "module/sub") == repo_with_subdir / "module" / "sub" + + def test_validates_before_resolving(self, repo_with_subdir: Path): + with pytest.raises(ValueError, match="parent directory references"): + resolve_path_to_manifest(repo_with_subdir, "../outside") diff --git a/src/vuln_analysis/functions/cve_clone_and_deps.py b/src/vuln_analysis/functions/cve_clone_and_deps.py index aa8fae1c6..e70fd7e04 100644 --- a/src/vuln_analysis/functions/cve_clone_and_deps.py +++ b/src/vuln_analysis/functions/cve_clone_and_deps.py @@ -32,6 +32,7 @@ from exploit_iq_commons.logging.loggers_factory import LoggingFactory, trace_id from exploit_iq_commons.utils.credential_client import credential_context from exploit_iq_commons.utils.dep_tree import detect_ecosystem +from exploit_iq_commons.utils.git_utils import resolve_path_to_manifest logger = LoggingFactory.get_agent_logger(__name__) @@ -131,7 +132,8 @@ async def _arun(message: AgentMorpheusInput) -> AgentMorpheusEngineInput: # Detect ecosystem from cloned repo manifests if not provided if message.image.ecosystem is None and code_sources: repo_path = embedder.get_repo_path(code_sources[0]) - detected = detect_ecosystem(repo_path) + updated_path = resolve_path_to_manifest(repo_path, message.image.manifest_path) + detected = detect_ecosystem(updated_path) if detected is not None: message.image.ecosystem = detected logger.info( diff --git a/src/vuln_analysis/functions/cve_generate_vdbs.py b/src/vuln_analysis/functions/cve_generate_vdbs.py index c1fc2770a..bea3caecc 100644 --- a/src/vuln_analysis/functions/cve_generate_vdbs.py +++ b/src/vuln_analysis/functions/cve_generate_vdbs.py @@ -30,6 +30,7 @@ from exploit_iq_commons.logging.loggers_factory import LoggingFactory, trace_id from exploit_iq_commons.utils.credential_client import credential_context from exploit_iq_commons.utils.dep_tree import Ecosystem, detect_ecosystem +from exploit_iq_commons.utils.git_utils import 
resolve_path_to_manifest from vuln_analysis.tools.tool_names import ToolNames logger = LoggingFactory.get_agent_logger(__name__) @@ -253,7 +254,8 @@ async def _arun(message: AgentMorpheusInput) -> AgentMorpheusEngineInput: code_sources = [si for si in source_infos if si.type == 'code'] if code_sources: repo_path = embedder.get_repo_path(code_sources[0]) - detected = detect_ecosystem(repo_path) + updated_repo_path = resolve_path_to_manifest(repo_path, message.image.manifest_path) + detected = detect_ecosystem(updated_repo_path) if detected is not None: message.image.ecosystem = detected logger.info("Detected ecosystem '%s' from repo manifests", detected.value) From c5011130f1675bdb1f391310f1c048e30bd0369f Mon Sep 17 00:00:00 2001 From: Gal Netanel Date: Tue, 30 Jun 2026 08:14:13 +0300 Subject: [PATCH 2/5] bug fix: when ecosystem is defined, validate that the manifest file exists in the path; if not, fail the scan --- .../utils/transitive_code_searcher_tool.py | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/exploit_iq_commons/utils/transitive_code_searcher_tool.py b/src/exploit_iq_commons/utils/transitive_code_searcher_tool.py index d0ee722f6..f6c32b1de 100644 --- a/src/exploit_iq_commons/utils/transitive_code_searcher_tool.py +++ b/src/exploit_iq_commons/utils/transitive_code_searcher_tool.py @@ -30,12 +30,15 @@ logger = LoggingFactory.get_agent_logger(f"morpheus.{__name__}") -def determine_manifest_name_by_ecosystem(the_ecosystem:Ecosystem | None = None): +def fetch_manifest_file_names_for_ecosystem(the_ecosystem: Ecosystem | None = None) -> list[str] | None: if the_ecosystem: - for manifest_name, ecosystem in MANIFESTS_TO_ECOSYSTEMS.items(): - if ecosystem == the_ecosystem: - logger.debug(f"Manifest found for ecosystem '{the_ecosystem}': '{manifest_name}'") - return manifest_name + manifests = [ + manifest_name + for manifest_name, ecosystem in MANIFESTS_TO_ECOSYSTEMS.items() + if ecosystem == the_ecosystem + ] + logger.debug(f"Manifests found 
for ecosystem '{the_ecosystem}': {manifests}") + return manifests return None @@ -68,12 +71,19 @@ def download_dependencies(git_repo_path: Path, manifest_relative_path: str | Non Returns whether dependencies were downloaded or not. """ - path_to_manifest: Path path_to_manifest = resolve_path_to_manifest(git_repo_path, manifest_relative_path) - # If ecosystem is supplied in input, then override default of first found ecosystem manifest in the repo. - - manifest_file_for_ecosystem = determine_manifest_name_by_ecosystem(the_ecosystem) - if manifest_file_for_ecosystem and the_ecosystem and os.path.isfile(path_to_manifest / manifest_file_for_ecosystem): + if the_ecosystem: + manifest_files_for_ecosystem = fetch_manifest_file_names_for_ecosystem(the_ecosystem) + manifest_exists = False + for manifest_file in manifest_files_for_ecosystem or []: + if os.path.isfile(path_to_manifest / manifest_file): + manifest_exists = True + break + if not manifest_exists: + raise FileNotFoundError( + f"No manifest files for ecosystem '{the_ecosystem.value}' were found in " + f"{path_to_manifest}. 
Expected one of: {manifest_files_for_ecosystem}" + ) logger.info(f"Setting ecosystem to user-provided value: {the_ecosystem}") ecosystem = the_ecosystem logger.info(f"Ecosystem field supplied in request payload, ecosystem value => {ecosystem}") From 556ffe689f85369731112cc2989ca6886e8f1c3f Mon Sep 17 00:00:00 2001 From: Gal Netanel Date: Tue, 30 Jun 2026 11:56:25 +0300 Subject: [PATCH 3/5] only use ecosystem and manifest path as parameters instead of as members of class DocumentEmbedding --- .../utils/document_embedding.py | 30 ++++++++----------- .../functions/cve_generate_vdbs.py | 17 +++++------ .../functions/cve_segmentation.py | 13 ++++---- .../tools/transitive_code_search.py | 5 ++-- 4 files changed, 27 insertions(+), 38 deletions(-) diff --git a/src/exploit_iq_commons/utils/document_embedding.py b/src/exploit_iq_commons/utils/document_embedding.py index cfa1c98e3..24e5fd7a0 100644 --- a/src/exploit_iq_commons/utils/document_embedding.py +++ b/src/exploit_iq_commons/utils/document_embedding.py @@ -292,8 +292,7 @@ def _get_repo_lock(cls, git_repo: str, ref: str) -> threading.Lock: def __init__(self, *, embedding: "Embeddings", vdb_directory: PathLike = VDB_DIRECTORY, git_directory: PathLike = DEFAULT_GIT_DIRECTORY, chunk_size: int = 800, chunk_overlap: int = 160, - pickle_cache_directory: PathLike = DEFAULT_PICKLE_CACHE_DIRECTORY, ecosystem: Ecosystem | None = None, - manifest_relative_path: str | None = None): + pickle_cache_directory: PathLike = DEFAULT_PICKLE_CACHE_DIRECTORY): """ Create a new DocumentEmbedding instance. 
@@ -311,10 +310,6 @@ def __init__(self, *, embedding: "Embeddings", vdb_directory: PathLike = VDB_DIR chunk_overlap : int, optional Overlap between chunks, by default 200 :param pickle_cache_directory: - ecosystem: Ecosystem - The ecosystem used within the repo - manifest_relative_path: str, optional - The path to manifest file within the Git repository """ self._embedding = embedding @@ -323,8 +318,6 @@ def __init__(self, *, embedding: "Embeddings", vdb_directory: PathLike = VDB_DIR self._chunk_size = chunk_size self._chunk_overlap = chunk_overlap self._pickle_cache_directory = Path(pickle_cache_directory) - self._ecosystem = ecosystem - self._manifest_relative_path = manifest_relative_path @property def embedding(self): @@ -452,7 +445,7 @@ def clone_and_install_dependencies(self, source_info: SourceDocumentsInfo, manif return repo_path - def collect_documents_from_cloned(self, source_info: SourceDocumentsInfo) -> list[Document]: + def collect_documents_from_cloned(self, source_info: SourceDocumentsInfo, manifest_relative_path: str | None = None) -> list[Document]: """ Collect and parse documents from an already-cloned repository. 
@@ -473,8 +466,8 @@ def collect_documents_from_cloned(self, source_info: SourceDocumentsInfo) -> lis """ repo_path = self.get_repo_path(source_info) cache_name = source_info.type if source_info.type != "code" else "" - if self._manifest_relative_path: - full_git_repo_path = f"{source_info.git_repo}/{self._manifest_relative_path}" + if manifest_relative_path: + full_git_repo_path = f"{source_info.git_repo}/{manifest_relative_path}" else: full_git_repo_path = source_info.git_repo @@ -528,7 +521,7 @@ def collect_documents_from_cloned(self, source_info: SourceDocumentsInfo) -> lis ) return documents - def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]: + def collect_documents(self, source_info: SourceDocumentsInfo, manifest_relative_path: str | None = None) -> list[Document]: """ Collect documents from a source document info. This will clone the git repository and collect files from the repository based on the include and exclude patterns. Each file is then parsed and segmented based on its @@ -549,8 +542,8 @@ def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]: Returns a list of documents collected from the source document info. """ repo_path = self.get_repo_path(source_info) - if self._manifest_relative_path: - full_git_repo_path = f"{source_info.git_repo}/{self._manifest_relative_path}" + if manifest_relative_path: + full_git_repo_path = f"{source_info.git_repo}/{manifest_relative_path}" else: full_git_repo_path = source_info.git_repo @@ -592,7 +585,7 @@ def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]: documents_name=cache_name) return documents - def create_vdb(self, source_infos: list[SourceDocumentsInfo], output_path: PathLike): + def create_vdb(self, source_infos: list[SourceDocumentsInfo], output_path: PathLike, manifest_relative_path: str | None = None): """ Create a FAISS database from a list of input directories. 
@@ -622,7 +615,7 @@ def create_vdb(self, source_infos: list[SourceDocumentsInfo], output_path: PathL documents = [] for input_dir in source_infos: try: - documents.extend(self.collect_documents(input_dir)) + documents.extend(self.collect_documents(input_dir, manifest_relative_path=manifest_relative_path)) except Exception as e: logger.warning("Error collecting documents for source info %s: %s", input_dir, e) continue @@ -683,7 +676,8 @@ def create_vdb(self, source_infos: list[SourceDocumentsInfo], output_path: PathL def build_vdbs(self, input_sources: list[SourceDocumentsInfo], - ignore_code_embedding: bool = False) -> tuple[Path | None, Path | None]: + ignore_code_embedding: bool = False, + manifest_relative_path : str | None = None) -> tuple[Path | None, Path | None]: """ Build the code and document VDB based on a list of source documents. @@ -715,7 +709,7 @@ def build_vdbs(self, vdb_output_dir = self.vdb_directory / source_type / str(self.hash_source_documents_info(source_infos)) if (not vdb_output_dir.exists() or os.environ.get("MORPHEUS_ALWAYS_REBUILD_VDB", "0") == "1"): - vdb = self.create_vdb(source_infos=source_infos, output_path=vdb_output_dir) + vdb = self.create_vdb(source_infos=source_infos, output_path=vdb_output_dir, manifest_relative_path=manifest_relative_path) else: logger.info("Cache hit on VDB. 
Loading existing FAISS database: %s", vdb_output_dir) diff --git a/src/vuln_analysis/functions/cve_generate_vdbs.py b/src/vuln_analysis/functions/cve_generate_vdbs.py index bea3caecc..7b8a3728f 100644 --- a/src/vuln_analysis/functions/cve_generate_vdbs.py +++ b/src/vuln_analysis/functions/cve_generate_vdbs.py @@ -111,7 +111,7 @@ async def generate_vdb(config: CVEGenerateVDBsToolConfig, builder: Builder): pickle_cache_directory=config.base_pickle_dir) def _create_code_index(source_infos: list[SourceDocumentsInfo], embedder: DocumentEmbedding, - output_path: Path) -> bool : + output_path: Path, manifest_relative_path: str | None = None) -> bool : logger.info("Collecting documents from git repos. Source Infos: %s", json.dumps([x.model_dump(mode="json") for x in source_infos])) @@ -124,7 +124,7 @@ def _create_code_index(source_infos: list[SourceDocumentsInfo], embedder: Docume documents = [] for si in source_infos: try: - documents.extend(embedder.collect_documents(si)) + documents.extend(embedder.collect_documents(si, manifest_relative_path=manifest_relative_path)) except Exception as e: logger.warning("Error collecting documents for source info %s: %s", si, e) logger.error("Failed to clone repository %s--%s", si.git_repo, e) @@ -151,8 +151,7 @@ def _create_code_index(source_infos: list[SourceDocumentsInfo], embedder: Docume return True def _build_code_index(source_infos: list[SourceDocumentsInfo], - ecosystem: Ecosystem | None = None, - manifest_path: str | None = None) -> Path | None: + manifest_relative_path: str | None = None) -> Path | None: code_index_path: Path | None = None # Filter to only code sources @@ -164,9 +163,7 @@ def _build_code_index(source_infos: list[SourceDocumentsInfo], embedder = DocumentEmbedding(embedding=None, vdb_directory=config.base_vdb_dir, git_directory=config.base_git_dir, - pickle_cache_directory=config.base_pickle_dir, - manifest_relative_path=manifest_path, - ecosystem=ecosystem) + pickle_cache_directory=config.base_pickle_dir) # 
Determine code index path for either loading from cache or creating new index # Need to add support for configurable base path @@ -174,7 +171,7 @@ def _build_code_index(source_infos: list[SourceDocumentsInfo], base_path=config.base_code_index_dir, hash_value=embedder.hash_source_documents_info(source_infos)) if (not code_index_path.exists() or os.environ.get("MORPHEUS_ALWAYS_REBUILD_VDB", "0") == "1"): - documents_exists = _create_code_index(source_infos, embedder, code_index_path) + documents_exists = _create_code_index(source_infos, embedder, code_index_path, manifest_relative_path=manifest_relative_path) else: logger.info("Cache hit on code index. Loading existing code index: %s", code_index_path) if documents_exists: @@ -219,7 +216,7 @@ async def _arun(message: AgentMorpheusInput) -> AgentMorpheusEngineInput: # Build VDBs (credential_id is propagated via async context) with credential_context(message.credential_id): logger.debug("_arun: credential_context entered, credential_id=%r", message.credential_id) - vdb_code_path, vdb_doc_path = embedder.build_vdbs(source_infos, config.ignore_code_embedding) + vdb_code_path, vdb_doc_path = embedder.build_vdbs(source_infos, config.ignore_code_embedding, manifest_relative_path=manifest_relative_path) if (vdb_code_path is None): # Only log warning if we're not ignoring code embeddings @@ -242,7 +239,7 @@ async def _arun(message: AgentMorpheusInput) -> AgentMorpheusEngineInput: image = f"{message.image.name}:{message.image.tag}" RPMDependencyManager.get_instance().container_image = image - code_index_path = _build_code_index(source_infos, ecosystem, manifest_relative_path) + code_index_path = _build_code_index(source_infos, manifest_relative_path=manifest_relative_path) if code_index_path is None: logger.warning(("Failed to generate code index for image '%s'. 
" diff --git a/src/vuln_analysis/functions/cve_segmentation.py b/src/vuln_analysis/functions/cve_segmentation.py index e2b60b01b..6cf054715 100644 --- a/src/vuln_analysis/functions/cve_segmentation.py +++ b/src/vuln_analysis/functions/cve_segmentation.py @@ -128,6 +128,7 @@ def _create_code_index( source_infos: list[SourceDocumentsInfo], embedder: DocumentEmbedding, output_path: Path, + manifest_relative_path: str | None = None ) -> bool: logger.info( "Collecting documents for code index. Source Infos: %s", @@ -141,7 +142,7 @@ def _create_code_index( documents = [] for si in source_infos: try: - documents.extend(embedder.collect_documents_from_cloned(si)) + documents.extend(embedder.collect_documents_from_cloned(si, manifest_relative_path)) except Exception as e: logger.warning("Error collecting documents for %s: %s", si, e, exc_info=True) raise DocumentCollectionError(e) from e @@ -170,7 +171,6 @@ def _create_code_index( return True def _build_code_index(source_infos: list[SourceDocumentsInfo], - ecosystem: Ecosystem | None = None, manifest_relative_path: str | None = None) -> Path | None: code_sources = [si for si in source_infos if si.type == "code"] if not code_sources: @@ -180,9 +180,7 @@ def _build_code_index(source_infos: list[SourceDocumentsInfo], embedding=None, vdb_directory=config.base_vdb_dir, git_directory=config.base_git_dir, - pickle_cache_directory=config.base_pickle_dir, - ecosystem=ecosystem, - manifest_relative_path=manifest_relative_path + pickle_cache_directory=config.base_pickle_dir ) code_index_path = FullTextSearch.get_index_directory( @@ -191,7 +189,7 @@ def _build_code_index(source_infos: list[SourceDocumentsInfo], ) if not code_index_path.exists() or os.environ.get("MORPHEUS_ALWAYS_REBUILD_VDB", "0") == "1": - documents_exist = _create_code_index(code_sources, index_embedder, code_index_path) + documents_exist = _create_code_index(code_sources, index_embedder, code_index_path, manifest_relative_path=manifest_relative_path) if not 
documents_exist: return None else: @@ -229,6 +227,7 @@ async def _arun(state: AgentMorpheusEngineInput) -> AgentMorpheusEngineInput: vdb_code_path, vdb_doc_path = embedder.build_vdbs( source_infos, config.ignore_code_embedding, + manifest_relative_path=manifest_relative_path ) if vdb_code_path is None and not config.ignore_code_embedding: @@ -244,7 +243,7 @@ async def _arun(state: AgentMorpheusEngineInput) -> AgentMorpheusEngineInput: ) if not config.ignore_code_index: - code_index_path = _build_code_index(source_infos, ecosystem, manifest_relative_path) + code_index_path = _build_code_index(source_infos, manifest_relative_path) if code_index_path is None: logger.warning( "Failed to generate code index for image '%s'", diff --git a/src/vuln_analysis/tools/transitive_code_search.py b/src/vuln_analysis/tools/transitive_code_search.py index d1c356465..1f9d82182 100644 --- a/src/vuln_analysis/tools/transitive_code_search.py +++ b/src/vuln_analysis/tools/transitive_code_search.py @@ -248,11 +248,10 @@ def _build_searcher(si, query: str, uber_jar_file_threshold: int, ecosystem: Eco documents_embedder = DocumentEmbedding( embedding=None, pickle_cache_directory=pickle_base_dir_config, - git_directory=git_base_dir_config, - ecosystem=ecosystem, manifest_relative_path=manifest_relative_path + git_directory=git_base_dir_config ) else: - documents_embedder = DocumentEmbedding(embedding=None, ecosystem=ecosystem, manifest_relative_path=manifest_relative_path) + documents_embedder = DocumentEmbedding(embedding=None) coc_retriever = get_call_of_chains_retriever(documents_embedder, si, query, uber_jar_file_threshold, ecosystem, manifest_relative_path) return TransitiveCodeSearcher(chain_of_calls_retriever=coc_retriever) From 9b40c5e095db68d9bc58ec2895bb205761656164 Mon Sep 17 00:00:00 2001 From: Gal Netanel Date: Tue, 30 Jun 2026 13:43:41 +0300 Subject: [PATCH 4/5] Fix failed unittests as a result of changes in this PR --- .../utils/tests/test_git_utils.py | 3 --- 
.../tools/tests/test_concurrency.py | 20 +++---------------- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/src/exploit_iq_commons/utils/tests/test_git_utils.py b/src/exploit_iq_commons/utils/tests/test_git_utils.py index c8349e375..303970391 100644 --- a/src/exploit_iq_commons/utils/tests/test_git_utils.py +++ b/src/exploit_iq_commons/utils/tests/test_git_utils.py @@ -46,6 +46,3 @@ def test_returns_repo_root_when_manifest_path_is_none(self, repo_with_subdir: Pa def test_resolves_valid_subdirectory(self, repo_with_subdir: Path): assert resolve_path_to_manifest(repo_with_subdir, "module/sub") == repo_with_subdir / "module" / "sub" - def test_validates_before_resolving(self, repo_with_subdir: Path): - with pytest.raises(ValueError, match="parent directory references"): - resolve_path_to_manifest(repo_with_subdir, "../outside") diff --git a/src/vuln_analysis/tools/tests/test_concurrency.py b/src/vuln_analysis/tools/tests/test_concurrency.py index dbf7e4829..13ac938dc 100644 --- a/src/vuln_analysis/tools/tests/test_concurrency.py +++ b/src/vuln_analysis/tools/tests/test_concurrency.py @@ -515,11 +515,7 @@ def test_empty_base_dirs_uses_defaults(self): patch("vuln_analysis.tools.transitive_code_search.DocumentEmbedding") as mock_de: mock_get_coc.return_value = MagicMock() _build_searcher(si, "pkg,Func", _DEFAULT_THRESHOLD, base_dirs=()) - mock_de.assert_called_once_with( - embedding=None, - ecosystem=None, - manifest_relative_path=None, - ) + mock_de.assert_called_once_with(embedding=None) def test_valid_base_dirs_passed_to_document_embedding(self): """Two-element tuple should pass git_directory and pickle_cache_directory.""" @@ -532,8 +528,6 @@ def test_valid_base_dirs_passed_to_document_embedding(self): embedding=None, pickle_cache_directory="/custom/pickle", git_directory="/custom/git", - ecosystem=None, - manifest_relative_path=None, ) def test_single_element_tuple_falls_back_to_defaults(self): @@ -543,11 +537,7 @@ def 
test_single_element_tuple_falls_back_to_defaults(self): patch("vuln_analysis.tools.transitive_code_search.DocumentEmbedding") as mock_de: mock_get_coc.return_value = MagicMock() _build_searcher(si, "pkg,Func", _DEFAULT_THRESHOLD, base_dirs=("/only/one",)) - mock_de.assert_called_once_with( - embedding=None, - ecosystem=None, - manifest_relative_path=None, - ) + mock_de.assert_called_once_with(embedding=None) def test_default_parameter_uses_defaults(self): """Omitting base_dirs entirely should use defaults.""" @@ -556,11 +546,7 @@ def test_default_parameter_uses_defaults(self): patch("vuln_analysis.tools.transitive_code_search.DocumentEmbedding") as mock_de: mock_get_coc.return_value = MagicMock() _build_searcher(si, "pkg,Func", _DEFAULT_THRESHOLD) - mock_de.assert_called_once_with( - embedding=None, - ecosystem=None, - manifest_relative_path=None, - ) + mock_de.assert_called_once_with(embedding=None) # --------------------------------------------------------------------------- From 4ef371284cd83b73dfd12601d7dbb8e5073fa42d Mon Sep 17 00:00:00 2001 From: Gal Netanel Date: Wed, 1 Jul 2026 09:49:29 +0300 Subject: [PATCH 5/5] Fixes for code review comments --- src/exploit_iq_commons/utils/git_utils.py | 6 +++++- src/exploit_iq_commons/utils/tests/test_git_utils.py | 4 ++-- .../utils/transitive_code_searcher_tool.py | 5 +++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/exploit_iq_commons/utils/git_utils.py b/src/exploit_iq_commons/utils/git_utils.py index 8c6cfde03..232ad0fe1 100644 --- a/src/exploit_iq_commons/utils/git_utils.py +++ b/src/exploit_iq_commons/utils/git_utils.py @@ -144,10 +144,13 @@ def validate_manifest_relative_path(repo_path: Path, manifest_relative_path: str ) if not manifest_dir.is_dir(): - raise ValueError( + logger.error( f"manifest_relative_path {manifest_relative_path!r} does not exist as a directory " f"under {repo_path}" ) + raise ValueError( + f"manifest_relative_path {manifest_relative_path!r} does not exist" + ) def 
resolve_path_to_manifest(git_repo_path: Path, manifest_relative_path: str | None = None) -> Path: @@ -162,6 +165,7 @@ def resolve_path_to_manifest(git_repo_path: Path, manifest_relative_path: str | """ if manifest_relative_path: logger.debug(f"Appending git repo manifest path {git_repo_path} with: {manifest_relative_path}") + validate_manifest_relative_path(git_repo_path, manifest_relative_path) path_to_manifest = git_repo_path.joinpath(manifest_relative_path) else: path_to_manifest = git_repo_path diff --git a/src/exploit_iq_commons/utils/tests/test_git_utils.py b/src/exploit_iq_commons/utils/tests/test_git_utils.py index 303970391..476dd1482 100644 --- a/src/exploit_iq_commons/utils/tests/test_git_utils.py +++ b/src/exploit_iq_commons/utils/tests/test_git_utils.py @@ -30,12 +30,12 @@ def test_rejects_embedded_parent_traversal(self, repo_with_subdir: Path): validate_manifest_relative_path(repo_with_subdir, "module/../../outside") def test_rejects_nonexistent_directory(self, repo_with_subdir: Path): - with pytest.raises(ValueError, match="does not exist as a directory"): + with pytest.raises(ValueError, match="does not exist"): validate_manifest_relative_path(repo_with_subdir, "missing/path") def test_rejects_file_path(self, repo_with_subdir: Path): (repo_with_subdir / "file.txt").write_text("x") - with pytest.raises(ValueError, match="does not exist as a directory"): + with pytest.raises(ValueError, match="does not exist"): validate_manifest_relative_path(repo_with_subdir, "file.txt") diff --git a/src/exploit_iq_commons/utils/transitive_code_searcher_tool.py b/src/exploit_iq_commons/utils/transitive_code_searcher_tool.py index f6c32b1de..6f2a36b42 100644 --- a/src/exploit_iq_commons/utils/transitive_code_searcher_tool.py +++ b/src/exploit_iq_commons/utils/transitive_code_searcher_tool.py @@ -80,9 +80,10 @@ def download_dependencies(git_repo_path: Path, manifest_relative_path: str | Non manifest_exists = True break if not manifest_exists: + logger.error(f"No 
manifest files for ecosystem '{the_ecosystem.value}' were found in " + f"{path_to_manifest}. Expected one of: {manifest_files_for_ecosystem}") raise FileNotFoundError( - f"No manifest files for ecosystem '{the_ecosystem.value}' were found in " - f"{path_to_manifest}. Expected one of: {manifest_files_for_ecosystem}" + f"Manifest files for ecosystem '{the_ecosystem.value}' were not found" ) logger.info(f"Setting ecosystem to user-provided value: {the_ecosystem}") ecosystem = the_ecosystem