70 changes: 50 additions & 20 deletions misc/scripts/models-as-data/bulk_generate_mad.py
@@ -44,6 +44,15 @@ def missing_module(module_name: str) -> None:
build_dir = pathlib.Path(gitroot, "mad-generation-build")


def database_dir_for_project(name: str) -> pathlib.Path:
return build_dir / f"{name}-db"


def database_for_project_exists(name: str) -> bool:
path = database_dir_for_project(name)
return path.exists()


# A project to generate models for
Project = TypedDict(
"Project",
@@ -175,7 +184,7 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:

def build_database(
language: str, extractor_options, project: Project, project_dir: str
) -> str | None:
) -> bool:
"""
Build a CodeQL database for a project.

@@ -186,12 +195,12 @@ def build_database(
project_dir: Path to the cloned project source to build the database from.

Returns:
The path to the created database directory.
True if the build was successful, False otherwise.
"""
name = project["name"]

# Create database directory path
database_dir = build_dir / f"{name}-db"
database_dir = database_dir_for_project(name)

# Only build the database if it doesn't already exist
if not database_dir.exists():
@@ -214,13 +223,13 @@
print(f"Successfully created database at {database_dir}")
except subprocess.CalledProcessError as e:
print(f"Failed to create database for {name}: {e}")
return None
return False
else:
print(
f"Skipping database creation for {name} as it already exists at {database_dir}"
)

return database_dir
return True


def generate_models(config, args, project: Project, database_dir: str) -> None:
@@ -251,7 +260,7 @@ def generate_models(config, args, project: Project, database_dir: str) -> None:

def build_databases_from_projects(
language: str, extractor_options, projects: List[Project]
) -> List[tuple[Project, str | None]]:
) -> List[tuple[Project, bool]]:
"""
Build databases for all projects in parallel.

@@ -261,7 +270,7 @@ def build_databases_from_projects(
projects: List of projects to build databases for.

Returns:
List of (project_name, database_dir) pairs, where database_dir is None if the build failed.
List of (project_name, success) pairs, where success is False if the build failed.
"""
# Clone projects in parallel
print("=== Cloning projects ===")
@@ -332,20 +341,22 @@ def download_dca_databases(
language: str,
experiment_names: list[str],
pat: str,
reuse_databases: bool,
projects: List[Project],
) -> List[tuple[Project, str | None]]:
) -> List[tuple[Project, bool]]:
"""
Download databases from a DCA experiment.
Args:
experiment_names: The names of the DCA experiments to download databases from.
pat: Personal Access Token for GitHub API authentication.
reuse_databases: Whether to reuse existing databases instead of downloading them again.
projects: List of projects to download databases for.
Returns:
List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
List of (project_name, success) pairs, where success is False if the download failed.
"""
print("\n=== Finding projects ===")
project_map = {project["name"]: project for project in projects}
analyzed_databases = {n: None for n in project_map}

analyzed_databases = {}
for experiment_name in experiment_names:
response = get_json_from_github(
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
@@ -358,11 +369,11 @@ def download_dca_databases(
artifact_name = analyzed_database["artifact_name"]
pretty_name = pretty_name_from_artifact_name(artifact_name)

if not pretty_name in analyzed_databases:
if pretty_name not in project_map:
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue

if analyzed_databases[pretty_name] is not None:
if pretty_name in analyzed_databases:
print(
f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
)
@@ -376,8 +387,9 @@ def download_dca_databases(
)
sys.exit(1)

def download_and_decompress(analyzed_database: dict) -> str:
def download_and_decompress(analyzed_database: dict) -> bool:
artifact_name = analyzed_database["artifact_name"]
pretty_name = pretty_name_from_artifact_name(artifact_name)
repository = analyzed_database["repository"]
run_id = analyzed_database["run_id"]
print(f"=== Finding artifact: {artifact_name} ===")
@@ -407,15 +419,18 @@ def download_and_decompress(analyzed_database: dict) -> str:
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
# And we just untar it to the same directory as the zip file
tar_ref.extractall(artifact_unzipped_location)
ret = artifact_unzipped_location / language
print(f"Decompression complete: {ret}")
return ret
database_location = database_dir_for_project(pretty_name)
# Move the database to the canonical location
shutil.move(artifact_unzipped_location / language, database_location)

print(f"Decompression complete: {database_location}")
return True

results = run_in_parallel(
download_and_decompress,
list(analyzed_databases.values()),
on_error=lambda db, exc: print(
f"ERROR: Failed to download and decompress {db["artifact_name"]}: {exc}"
f"ERROR: Failed to download and decompress {db['artifact_name']}: {exc}"
),
error_summary=lambda failures: print(
f"ERROR: Failed to download {len(failures)} databases: {', '.join(item[0] for item in failures)}"
@@ -460,6 +475,13 @@ def main(config, args) -> None:
# Create build directory if it doesn't exist
build_dir.mkdir(parents=True, exist_ok=True)

# Check if reusing databases is given and all databases exist
reuse_databases = args.reuse_databases
all_databases_exist = reuse_databases and all(
database_for_project_exists(project["name"]) for project in projects
)

database_results = []
match get_strategy(config):
case "repo":
@@ -487,14 +509,15 @@
language,
experiment_names,
pat,
args.reuse_databases,
projects,
)

# Generate models for all projects
print("\n=== Generating models ===")

failed_builds = [
project["name"] for project, db_dir in database_results if db_dir is None
project["name"] for project, success in database_results if not success
]
if failed_builds:
print(
@@ -506,8 +529,9 @@
for project, _ in database_results:
clean_up_mad_destination_for_project(config, project["name"])

for project, database_dir in database_results:
if database_dir is not None:
for project, success in database_results:
database_dir = database_dir_for_project(project["name"])
if success:
generate_models(config, args, project, database_dir)


@@ -543,6 +567,12 @@ def main(config, args) -> None:
help="What `--threads` value to pass to `codeql` (default %(default)s)",
default=0,
)
parser.add_argument(
"--reuse-databases",
action="store_true",
help="Whether to reuse existing databases instead of rebuilding them",
default=False,
)
args = parser.parse_args()

# Load config file
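The gating logic for `--reuse-databases` in `main` is only partially wired up in this draft. Below is a minimal, self-contained sketch of how the flag could short-circuit database building; it reuses the helper names introduced in this diff (`database_dir_for_project`, `database_for_project_exists`), while `build_database_stub` and the trimmed-down `Project` shape are illustrative stand-ins rather than the script's real implementation.

```python
import pathlib
from typing import List, TypedDict

# Hypothetical stand-in for the script's build directory.
build_dir = pathlib.Path("mad-generation-build")


class Project(TypedDict):
    name: str


def database_dir_for_project(name: str) -> pathlib.Path:
    # Canonical database location, mirroring the helper added by this diff.
    return build_dir / f"{name}-db"


def database_for_project_exists(name: str) -> bool:
    return database_dir_for_project(name).exists()


def build_database_stub(project: Project) -> bool:
    # Stand-in for the real `codeql database create` invocation.
    print(f"building database for {project['name']}")
    return True


def build_or_reuse(
    projects: List[Project], reuse_databases: bool
) -> List[tuple[Project, bool]]:
    # Skip building entirely when --reuse-databases is set and every database exists.
    if reuse_databases and all(
        database_for_project_exists(p["name"]) for p in projects
    ):
        print("Reusing existing databases for all projects")
        return [(p, True) for p in projects]
    return [(p, build_database_stub(p)) for p in projects]


if __name__ == "__main__":
    print(build_or_reuse([Project(name="rust")], reuse_databases=True))
```

With `action="store_true"` on the argument definition above, passing `--reuse-databases` on the command line is enough to enable this path.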
26 changes: 13 additions & 13 deletions rust/bulk_generation_targets.yml
@@ -10,16 +10,16 @@ single-file: true # dump models into a single file per crate (we do not have pr
# if a target has a dependency in this same list, it should be listed after that dependency
targets:
- name: rust
- name: libc
- name: log
- name: memchr
- name: once_cell
- name: rand
- name: smallvec
- name: serde
- name: tokio
- name: reqwest
- name: rocket
- name: actix-web
- name: hyper
- name: clap
# - name: libc
# - name: log
# - name: memchr
# - name: once_cell
# - name: rand
# - name: smallvec
# - name: serde
# - name: tokio
# - name: reqwest
# - name: rocket
# - name: actix-web
# - name: hyper
# - name: clap
20 changes: 18 additions & 2 deletions rust/ql/lib/codeql/rust/dataflow/internal/DataFlowImpl.qll
@@ -286,7 +286,11 @@ predicate indexAssignment(
not index.getResolvedTarget().fromSource()
}

module RustDataFlow implements InputSig<Location> {
signature module RustDataFlowInputSig {
predicate includeDynamicTargets();
}

module RustDataFlowGen<RustDataFlowInputSig Input> implements InputSig<Location> {
private import Aliases
private import codeql.rust.dataflow.DataFlow
private import Node as Node
@@ -441,8 +445,12 @@ module RustDataFlow implements InputSig<Location> {
/** Gets a viable implementation of the target of the given `Call`. */
DataFlowCallable viableCallable(DataFlowCall call) {
exists(Call c | c = call.asCall() |
Input::includeDynamicTargets() and
result.asCfgScope() = c.getARuntimeTarget()
or
not Input::includeDynamicTargets() and
result.asCfgScope() = c.getStaticTarget()
or
exists(SummarizedCallable sc, Function staticTarget |
staticTarget = getStaticTargetExt(c) and
sc = result.asSummarizedCallable() and
@@ -908,6 +916,12 @@ module RustDataFlow implements InputSig<Location> {
class DataFlowSecondLevelScope = Void;
}

module RustDataFlowInput implements RustDataFlowInputSig {
predicate includeDynamicTargets() { any() }
}

module RustDataFlow = RustDataFlowGen<RustDataFlowInput>;

/** Provides logic related to captured variables. */
module VariableCapture {
private import codeql.rust.internal.CachedStages
@@ -1079,7 +1093,7 @@ private module Cached {
}

cached
newtype TParameterPosition =
newtype TParameterPositioni =
TPositionalParameterPosition(int i) {
i in [0 .. max([any(ParamList l).getNumberOfParams(), any(ArgList l).getNumberOfArgs()]) - 1]
or
@@ -1090,6 +1104,8 @@
TClosureSelfParameterPosition() or
TSelfParameterPosition()

final class TParameterPosition = TParameterPositioni;

cached
newtype TReturnKind = TNormalReturnKind()

36 changes: 23 additions & 13 deletions rust/ql/lib/codeql/rust/dataflow/internal/TaintTrackingImpl.qll
@@ -1,8 +1,9 @@
private import rust
private import codeql.dataflow.DataFlow as DF
private import codeql.dataflow.TaintTracking
private import codeql.rust.dataflow.DataFlow
private import codeql.rust.dataflow.DataFlow as RustDataFlow
private import codeql.rust.dataflow.FlowSummary
private import DataFlowImpl
private import DataFlowImpl as DataFlowImpl
private import Node as Node
private import Content
private import FlowSummaryImpl as FlowSummaryImpl
@@ -29,15 +30,19 @@ private predicate excludedTaintStepContent(Content c) {
)
}

module RustTaintTracking implements InputSig<Location, RustDataFlow> {
predicate defaultTaintSanitizer(DataFlow::Node node) { none() }
module RustTaintTrackingGen<DataFlowImpl::RustDataFlowInputSig I> implements
InputSig<Location, DataFlowImpl::RustDataFlowGen<I>>
{
private module TheDataFlow = DataFlowImpl::RustDataFlowGen<I>;

predicate defaultTaintSanitizer(TheDataFlow::Node node) { none() }

/**
* Holds if the additional step from `pred` to `succ` should be included in all
* global taint flow configurations.
*/
cached
predicate defaultAdditionalTaintStep(DataFlow::Node pred, DataFlow::Node succ, string model) {
predicate defaultAdditionalTaintStep(TheDataFlow::Node pred, TheDataFlow::Node succ, string model) {
Stages::DataFlowStage::ref() and
model = "" and
(
@@ -53,14 +58,14 @@ module RustTaintTracking implements InputSig<Location, RustDataFlow> {
// is tainted and an operation reads from `foo` (e.g., `foo.bar`) then
// taint is propagated.
exists(Content c |
RustDataFlow::readContentStep(pred, c, succ) and
TheDataFlow::readContentStep(pred, c, succ) and
not excludedTaintStepContent(c)
)
or
// In addition to the above, for element and reference content we let
// _all_ read steps (including those from flow summaries and those that
// result in small primitive types) give rise to taint steps.
exists(SingletonContentSet cs | RustDataFlow::readStep(pred, cs, succ) |
exists(SingletonContentSet cs | TheDataFlow::readStep(pred, cs, succ) |
cs.getContent() instanceof ElementContent
or
cs.getContent() instanceof ReferenceContent
@@ -79,9 +84,11 @@ module RustTaintTracking implements InputSig<Location, RustDataFlow> {
)
or
succ.(Node::PostUpdateNode).getPreUpdateNode().asExpr() =
getPostUpdateReverseStep(pred.(Node::PostUpdateNode).getPreUpdateNode().asExpr(), false)
DataFlowImpl::getPostUpdateReverseStep(pred.(Node::PostUpdateNode)
.getPreUpdateNode()
.asExpr(), false)
or
indexAssignment(any(CompoundAssignmentExpr cae),
DataFlowImpl::indexAssignment(any(CompoundAssignmentExpr cae),
pred.(Node::PostUpdateNode).getPreUpdateNode().asExpr(), _, succ, _)
)
or
@@ -94,19 +101,22 @@ module RustTaintTracking implements InputSig<Location, RustDataFlow> {
* and inputs to additional taint steps.
*/
bindingset[node]
predicate defaultImplicitTaintRead(DataFlow::Node node, ContentSet cs) {
predicate defaultImplicitTaintRead(TheDataFlow::Node node, ContentSet cs) {
exists(node) and
exists(Content c | c = cs.(SingletonContentSet).getContent() |
c instanceof ElementContent or
c instanceof ReferenceContent
) and
) // and
// Optional steps are added through isAdditionalFlowStep but we don't want the implicit reads
not optionalStep(node, _, _)
// FIXME:
// not optionalStep(node, _, _)
}

/**
* Holds if the additional step from `src` to `sink` should be considered in
* speculative taint flow exploration.
*/
predicate speculativeTaintStep(DataFlow::Node src, DataFlow::Node sink) { none() }
predicate speculativeTaintStep(TheDataFlow::Node src, TheDataFlow::Node sink) { none() }
}

module RustTaintTracking = RustTaintTrackingGen<DataFlowImpl::RustDataFlowInput>;