Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
ce4e69c
Rewrite JS parser and Segmenter:
tmihalac Jun 17, 2026
56bde63
Remove parser_threshold parameter from ExtendedLanguageParser creation
tmihalac Jun 21, 2026
d9054f2
Add CCA import-based pre-filter, cycle detection, and log lazy
tmihalac Jun 23, 2026
f96338a
Fix CCA tree dedup, Rule 8 Go subpackage matching, FCF regex
tmihalac Jun 24, 2026
a733aec
Source bugs:
tmihalac Jun 25, 2026
4f44a85
Source bugs:
tmihalac Jun 25, 2026
7af1213
Removed a test
tmihalac Jun 26, 2026
15ee1eb
Added debug logging
tmihalac Jun 26, 2026
3702027
Added debug logging
tmihalac Jun 26, 2026
a220084
Added debug logging
tmihalac Jun 26, 2026
b7ec881
Performance fixes for JS
tmihalac Jun 26, 2026
c5bafb8
- Reorder direct_parents in __find_caller_function_dfs so root-level
tmihalac Jun 26, 2026
8418adf
Fixed tests
tmihalac Jun 28, 2026
bde7996
C CCA argument-count pre-filter, Maven parallelism, checklist
tmihalac Jun 28, 2026
e0cc7c5
Increase cores to 3
tmihalac Jun 28, 2026
e1c04bd
Removed AVOID UNANSWERABLE QUESTIONS from the prompt
tmihalac Jun 29, 2026
d230632
RPM checker Case B: emit TARGET_IN_VULNERABLE_RANGE and add VERSION
tmihalac Jun 29, 2026
8ada59d
Config scanner: allowlist extension filter for config-dir files, add
tmihalac Jun 29, 2026
b808e16
Consolidate 9 duplicate test files into tests/, config scanner
tmihalac Jun 29, 2026
09c0f22
Increased uber-jar threshold to 1000
tmihalac Jun 30, 2026
9e3bdce
Go CCA sub-package granularity fixes, GOCACHE env var, revert
tmihalac Jun 30, 2026
6bebe70
Removed debug logging
tmihalac Jun 30, 2026
d267ff2
Removed logging and set maven local repo in lint-test
tmihalac Jun 30, 2026
2b5d772
Fix Go FL short_name dict collision preserving all packages
tmihalac Jun 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .tekton/on-cm-runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -147,10 +147,10 @@ spec:

resources:
requests:
cpu: "1000m" # CPU request (1 core)
cpu: "3000m" # CPU request (3 cores)
memory: "12Gi" # Memory request (12 GiB)
limits:
cpu: "2000m" # CPU limit (2 cores)
cpu: "3000m" # CPU limit (3 cores)
memory: "32Gi" # Memory limit (32 GiB)

volumeMounts:
Expand Down Expand Up @@ -188,6 +188,10 @@ spec:
value: "$(params.TRIGGER_COMMENT)"
- name: GOMODCACHE
value: "/exploit-iq-data/go/pkg/mod"
- name: GOCACHE
value: "/exploit-iq-data/go/cache"
- name: MAVEN_OPTS
value: "-Dmaven.repo.local=/exploit-iq-data/maven"
- name: UV_CACHE_DIR
value: "/tmp/uv-cache"
- name: SERPAPI_BASE_URL
Expand Down
7 changes: 6 additions & 1 deletion .tekton/on-pull-request.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,8 @@ spec:
export MAVEN_HOME="$HOME/maven-sdk/apache-maven-${MAVEN_VERSION}"
export M2_HOME="$MAVEN_HOME"
export PATH="$MAVEN_HOME/bin:$PATH"

export MAVEN_OPTS="-Dmaven.repo.local=/exploit-iq-data/maven"

echo "Maven version:"
mvn -v

Expand Down Expand Up @@ -368,6 +369,10 @@ spec:
# Pass the raw comment text into the container
- name: GOMODCACHE
value: "/exploit-iq-data/go/pkg/mod"
- name: GOCACHE
value: "/exploit-iq-data/go/cache"
- name: MAVEN_OPTS
value: "-Dmaven.repo.local=/exploit-iq-data/maven"
- name: UV_CACHE_DIR
value: "/tmp/uv-cache"
- name: UV_PYTHON_INSTALL_DIR
Expand Down
4 changes: 4 additions & 0 deletions kustomize/base/exploit_iq_service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ spec:
fieldPath: metadata.namespace
- name: GOMODCACHE
value: /exploit-iq-package-cache/go/pkg/mod
- name: GOCACHE
value: /exploit-iq-package-cache/go/cache
- name: MAVEN_OPTS
value: "-Dmaven.repo.local=/exploit-iq-package-cache/maven"
- name: ENABLE_MLOPS
value: "true"
- name: CREDENTIAL_ENCRYPTION_KEY
Expand Down
6 changes: 6 additions & 0 deletions src/exploit_iq_commons/data_models/checker_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,12 @@ def format_for_prompt(self) -> str:
lines.append(f"AFFECTED_VERSION_RANGE: {self.affected_version_range}")
if self.fixed_version:
lines.append(f"FIXED_VERSION: {self.fixed_version}")
# Emit version-in-range so the L1 agent can apply the VERSION GUARD
# and VERSION-BASED FALLBACK rules defined in the Case B thought instructions.
# Without this, the agent is told to check TARGET_IN_VULNERABLE_RANGE but never sees it.
if self.target_version_in_vulnerable_range is not None:
label = "YES" if self.target_version_in_vulnerable_range else "NO"
lines.append(f"TARGET_IN_VULNERABLE_RANGE: {label}")
return "\n".join(lines)


Expand Down
25 changes: 20 additions & 5 deletions src/exploit_iq_commons/utils/c_segmenter_custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,24 @@
from langchain_community.document_loaders.parsers.language.c import CSegmenter
from typing import List


def _comment_replacer(match):
    """Substitution callback that keeps quoted literals and blanks out comments.

    When capture group 1 took part in the match, the matched text is a quoted
    literal and is returned unchanged. Otherwise the match is a comment and a
    single space is returned so the tokens on either side stay separated.
    """
    literal = match.group(1)
    # Group 1 is None exactly when one of the comment alternatives matched.
    return ' ' if literal is None else match.group(0)


# Single-pass pattern used to strip C/C++ comments without touching quoted
# literals. The alternatives are tried left to right at each position, so a
# literal that starts before a "//" or "/*" is consumed whole (group 1) and the
# comment markers inside it are never seen as comment starts.
# re.DOTALL lets the `\\.` escape alternative also match a backslash followed
# by a newline inside a literal.
_COMMENT_OR_STRING = re.compile(
r'("(?:[^"\\]|\\.)*"|\'(?:[^\'\\]|\\.)*\')' # group 1: double- or single-quoted literal
r'|'
r'(/\*[\s\S]*?\*/)' # group 2: block comment (non-greedy, may span lines)
r'|'
r'(//[^\n]*)', # group 3: line comment up to, not including, the newline
re.DOTALL
)


# Class extending CSegmenter
class CSegmenterExtended(CSegmenter):

Expand All @@ -32,11 +50,8 @@ def __init__(self, code: str):

@staticmethod
def remove_comments(code: str) -> str:
    """Strip C/C++ block and line comments from ``code``.

    Quoted string and character literals are left untouched, so text such as
    ``"http://example"`` or ``"/* not a comment */"`` survives. Each comment
    is replaced by a single space rather than removed outright, so the tokens
    on either side of it do not fuse together.

    Args:
        code: C/C++ source text.

    Returns:
        The source text with every comment replaced by one space.
    """
    # One pass over the text: the shared pattern matches literals and comments,
    # and the callback decides which of the two to keep.
    return _COMMENT_OR_STRING.sub(_comment_replacer, code)

@staticmethod
def remove_macro_blocks(text: str) -> str:
Expand Down
109 changes: 82 additions & 27 deletions src/exploit_iq_commons/utils/chain_of_calls_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ def __init__(self, documents: List[Document], ecosystem: Ecosystem, manifest_pat
used here to build the dependency tree for a more efficient lookup and search.
"""

logger.debug("Creating Chain of Calls Retriever")
logger.debug("Starting building Chain of Calls Retriever")
self.ecosystem = ecosystem
logger.debug("Chain of Calls Retriever - creating dependency tree")
Expand All @@ -113,8 +112,8 @@ def __init__(self, documents: List[Document], ecosystem: Ecosystem, manifest_pat
# Build a dependency tree using the dependency tree builder logic.
tree = self.dependency_tree.builder.build_tree(manifest_path=manifest_path)
for package, parents in tree.items():
parents.extend([package])
self.tree_dict[package] = parents
parents.append(package)
self.tree_dict[package] = list(dict.fromkeys(parents))
self.supported_packages = list(self.tree_dict.keys())
logger.debug("Chain of Calls Retriever - populating functions documents")

Expand Down Expand Up @@ -159,8 +158,18 @@ def __init__(self, documents: List[Document], ecosystem: Ecosystem, manifest_pat
self.functions_local_variables_index = self.language_parser.create_map_of_local_vars(self.documents_of_functions)
logger.debug("Chain of Calls Retriever - after functions_local_variables_index")

if not self.language_parser.is_search_algo_dfs():
self.sort_docs = self.__group_docs_by_pkg()
# Pre-index docs by package name for O(package_size) lookups instead of O(all_docs).
# sort_docs is used by BFS and by get_possible_docs for vendor-package filtering.
self.sort_docs = self.__group_docs_by_pkg()
# Pre-filter root-level docs to avoid scanning all documents in the root-package
# search path (sources_location_packages=False) of get_possible_docs.
self._root_docs = [doc for doc in self.documents if self.language_parser.is_root_package(doc)]
# Pre-index non-root docs by their full source path for fast vendor-package lookups.
# Maps each distinct source path to the list of docs that came from that path.
self._source_path_index: dict[str, list[Document]] = defaultdict(list)
for doc in self.documents:
if not self.language_parser.is_root_package(doc):
self._source_path_index[doc.metadata.get('source', '')].append(doc)

def _resolve_tree_key(self, package: str, ctx: _SearchCtx) -> str | None:
"""Find the canonical tree_dict key for a package name.
Expand Down Expand Up @@ -226,6 +235,17 @@ def __find_caller_function_dfs(self, document_function: Document, function_packa
parents = self._get_parents(package_name, ctx)
if parents:
direct_parents.extend(parents)
# Search root-level parents first so the DFS finds root callers
# before exploring library-internal call chains.
root_first = []
non_root = []
for p in direct_parents:
pp = self._get_parents(p, ctx)
if pp and pp[0] == ROOT_LEVEL_SENTINEL:
root_first.append(p)
else:
non_root.append(p)
direct_parents = root_first + non_root
function_name_to_search = self.language_parser.get_function_name(document_function)
if not function_name_to_search:
return None
Expand Down Expand Up @@ -281,9 +301,6 @@ def __find_caller_function_dfs(self, document_function: Document, function_packa
# match, and add it to exclusions so it will not consider it when backtracking in order to prevent cycles.
if function_is_being_called:
package_exclusions.append(doc)
# update index of last scanned package for backtracking
# hashed_value = calculate_hashable_string_for_function(function_file_name, function_name_to_search)
# self.last_visited_parent_package_indexes[hashed_value] = last_visited_package_index + package_index
return doc

# If didn't find a matching caller function document, returns None.
Expand All @@ -292,38 +309,55 @@ def __find_caller_function_dfs(self, document_function: Document, function_packa
def _is_doc_excluded(self, doc: Document, exclusions: list[Document]) -> bool:
    """
    Report whether ``doc`` matches any document in ``exclusions``.

    Two documents match when their stripped ``source`` metadata and their
    stripped ``page_content`` are both equal. The source path is compared
    first (a cheap string compare, and usually different) so the content
    comparison only runs for documents from the same source.

    Args:
        doc: The candidate document.
        exclusions: Documents that must not be considered again.

    Returns:
        True if an equivalent document is present in ``exclusions``.
    """
    if not exclusions:
        return False
    # A missing or None source is treated as an empty path instead of raising.
    doc_source = (doc.metadata.get('source') or '').strip()
    doc_function_content = doc.page_content.strip()

    for exclusion_doc in exclusions:
        exclusion_source = (exclusion_doc.metadata.get('source') or '').strip()
        if exclusion_source != doc_source:
            continue
        if exclusion_doc.page_content.strip() == doc_function_content:
            return True
    return False


# This helper method filters out functions that cannot be callers: it drops all
# excluded functions and every function whose body does not contain the target function name.
def get_possible_docs(self, function_name_to_search: str, package: str, exclusions: list[Document],
                      sources_location_packages: bool,
                      target_class_names: frozenset[str],
                      method_exclusions: dict) -> list[Document]:
    """Filter documents to those that could be callers of ``function_name_to_search``.

    The cheapest check (substring match on ``"<name>("``) runs first, so the
    ``is_function`` and ``_is_doc_excluded`` checks are only evaluated for
    documents that mention the target at all.

    Args:
        function_name_to_search: Name of the callee; an empty value yields ``[]``.
        package: Package name matched as a substring of each document's source path
            (only used when ``sources_location_packages`` is true).
        exclusions: Documents that must not be returned.
        sources_location_packages: When true, search the non-root documents indexed
            in ``self._source_path_index``; otherwise search ``self._root_docs``.
        target_class_names: Not used by this method.
        method_exclusions: Not used by this method.

    Returns:
        The candidate caller documents, in index order.
    """
    if not function_name_to_search:
        return []
    search_token = f"{function_name_to_search}("
    parser = self.language_parser

    if sources_location_packages:
        # Narrow to docs whose source path contains the package name before
        # running any per-document checks.
        candidates = [doc for path, docs in self._source_path_index.items()
                      if package in path for doc in docs]
        return [doc for doc in candidates
                if search_token in doc.page_content
                and parser.is_function(doc)
                and not self._is_doc_excluded(doc, exclusions)]

    # Root-package search: use the pre-filtered root docs. Script languages
    # may hold calls outside of any function, so non-function docs are kept.
    return [doc for doc in self._root_docs
            if search_token in doc.page_content
            and (parser.is_function(doc) or parser.is_script_language())
            and not self._is_doc_excluded(doc, exclusions)]

def __find_caller_functions_bfs(self, document_function: Document, function_package: str,
ctx: _SearchCtx) -> List[Document]:
Expand Down Expand Up @@ -407,6 +441,7 @@ def __find_caller_functions_bfs(self, document_function: Document, function_pack
documents_of_functions=
self.documents_of_functions)


if found and self.language_parser.is_call_allowed( pkg_docs, doc, document_function):
log_entries.append((file_name, func_name, function_name_to_search))
relevant_docs_to_search_in.append(doc)
Expand Down Expand Up @@ -552,19 +587,32 @@ def get_relevant_documents(self, query: str) -> tuple[List[Document], bool]:
matching_documents = []
standard_libs_cache = StandardLibraryCache.get_instance()
# If it's a standard library package, then skip checking the package in dependency tree.
subpackage_filter = None
if not standard_libs_cache.is_standard_library(package_name, self.ecosystem):
# Check if input package is in dependency tree
for package in self.tree_dict:
if self.language_parser.is_tree_key_match(package_name, package):
package_name = package
found_package = True
break
# Sub-package fallback: query may be a sub-path of a module in tree_dict
if not found_package:
for package in self.tree_dict:
suffix = self.language_parser.resolve_subpackage_to_module(package_name, package)
if suffix is not None:
subpackage_filter = suffix
logger.debug("Sub-package resolved: '%s' → module '%s' (filter='%s')",
package_name, package, subpackage_filter)
package_name = package
found_package = True
break
# If the package was found in the dependency tree, look up the document of the target function.
if found_package:
target_function_doc = self.__find_initial_function(function, package_name=package_name,
documents=self.documents,
ctx=ctx,
class_name=class_name)
class_name=class_name,
subpackage_filter=subpackage_filter)
if not target_function_doc and self.language_parser.get_constructor_method_name():
target_function_doc = self.__find_initial_function(function_name=self.language_parser.get_constructor_method_name(),
package_name=package_name,
Expand Down Expand Up @@ -620,8 +668,6 @@ def get_relevant_documents(self, query: str) -> tuple[List[Document], bool]:
matching_documents, ctx.found_path = self._breadth_first_search(
matching_documents, target_function_doc, current_package_name, ctx)

# When the loop is finished, return list of documents ( path) and boolean indicating whether a path was
# found or not.
return matching_documents, ctx.found_path

def __determine_doc_package_name(self, target_function_doc, ctx: _SearchCtx):
Expand All @@ -645,7 +691,8 @@ def __determine_doc_package_name(self, target_function_doc, ctx: _SearchCtx):
return fallback

def __find_initial_function(self, function_name: str, package_name: str, documents: list[Document],
ctx: _SearchCtx, class_name: str = None) -> Document:
ctx: _SearchCtx, class_name: str = None,
subpackage_filter: str | None = None) -> Document:

if self.language_parser.is_search_algo_dfs():
pkg_docs = documents
Expand All @@ -657,6 +704,14 @@ def __find_initial_function(self, function_name: str, package_name: str, documen
relevant_docs = [doc for doc in relevant_docs if doc.page_content.endswith(
f'{self.language_parser.get_comment_line_notation()}(class: {class_name})')]

if subpackage_filter:
pre_count = len(relevant_docs)
relevant_docs = [
doc for doc in relevant_docs
if subpackage_filter in doc.metadata.get("source", "")
]
logger.debug("Sub-package filter '%s': %d → %d docs", subpackage_filter, pre_count, len(relevant_docs))

package_exclusions = ctx.exclusions[package_name]
#for index, document in enumerate(get_functions_for_package(package_name, relevant_docs, language_parser)):
from itertools import chain
Expand Down
Loading