70 changes: 50 additions & 20 deletions misc/scripts/models-as-data/bulk_generate_mad.py
@@ -44,6 +44,15 @@ def missing_module(module_name: str) -> None:
build_dir = pathlib.Path(gitroot, "mad-generation-build")


def database_dir_for_project(name: str) -> pathlib.Path:
return build_dir / f"{name}-db"


def database_for_project_exists(name: str) -> bool:
path = database_dir_for_project(name)
return path.exists()


# A project to generate models for
Project = TypedDict(
"Project",
@@ -175,7 +184,7 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:

def build_database(
language: str, extractor_options, project: Project, project_dir: str
) -> str | None:
) -> bool:
"""
Build a CodeQL database for a project.

@@ -186,12 +195,12 @@ def build_database(
project_dir: Path to the cloned project source to build the database from.

Returns:
The path to the created database directory.
True if the build was successful, False otherwise.
"""
name = project["name"]

# Create database directory path
database_dir = build_dir / f"{name}-db"
database_dir = database_dir_for_project(name)

# Only build the database if it doesn't already exist
if not database_dir.exists():
@@ -214,13 +223,13 @@
print(f"Successfully created database at {database_dir}")
except subprocess.CalledProcessError as e:
print(f"Failed to create database for {name}: {e}")
return None
return False
else:
print(
f"Skipping database creation for {name} as it already exists at {database_dir}"
)

return database_dir
return True


def generate_models(config, args, project: Project, database_dir: str) -> None:
@@ -251,7 +260,7 @@ def generate_models(config, args, project: Project, database_dir: str) -> None:

def build_databases_from_projects(
language: str, extractor_options, projects: List[Project]
) -> List[tuple[Project, str | None]]:
) -> List[tuple[Project, bool]]:
"""
Build databases for all projects in parallel.

@@ -261,7 +270,7 @@ def build_databases_from_projects(
projects: List of projects to build databases for.

Returns:
List of (project_name, database_dir) pairs, where database_dir is None if the build failed.
List of (project_name, success) pairs, where success is False if the build failed.
"""
# Clone projects in parallel
print("=== Cloning projects ===")
@@ -332,20 +341,22 @@ def download_dca_databases(
language: str,
experiment_names: list[str],
pat: str,
reuse_databases: bool,
projects: List[Project],
) -> List[tuple[Project, str | None]]:
) -> List[tuple[Project, bool]]:
"""
Download databases from a DCA experiment.
Args:
experiment_names: The names of the DCA experiments to download databases from.
pat: Personal Access Token for GitHub API authentication.
reuse_databases: Whether to reuse existing databases instead of downloading them again.
projects: List of projects to download databases for.
Returns:
List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
List of (project_name, success) pairs, where success is False if the download failed.
"""
print("\n=== Finding projects ===")
project_map = {project["name"]: project for project in projects}
analyzed_databases = {n: None for n in project_map}

analyzed_databases = {}
for experiment_name in experiment_names:
response = get_json_from_github(
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
@@ -358,11 +369,11 @@ def download_dca_databases(
artifact_name = analyzed_database["artifact_name"]
pretty_name = pretty_name_from_artifact_name(artifact_name)

if not pretty_name in analyzed_databases:
if pretty_name not in project_map:
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue

if analyzed_databases[pretty_name] is not None:
if pretty_name in analyzed_databases:
print(
f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
)
@@ -376,8 +387,9 @@ def download_dca_databases(
)
sys.exit(1)

def download_and_decompress(analyzed_database: dict) -> str:
def download_and_decompress(analyzed_database: dict) -> bool:
artifact_name = analyzed_database["artifact_name"]
pretty_name = pretty_name_from_artifact_name(artifact_name)
repository = analyzed_database["repository"]
run_id = analyzed_database["run_id"]
print(f"=== Finding artifact: {artifact_name} ===")
@@ -407,15 +419,18 @@ def download_and_decompress(analyzed_database: dict) -> str:
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
# And we just untar it to the same directory as the zip file
tar_ref.extractall(artifact_unzipped_location)
ret = artifact_unzipped_location / language
print(f"Decompression complete: {ret}")
return ret
database_location = database_dir_for_project(pretty_name)
# Move the database to the canonical location
shutil.move(artifact_unzipped_location / language, database_location)

print(f"Decompression complete: {database_location}")
return True

results = run_in_parallel(
download_and_decompress,
list(analyzed_databases.values()),
on_error=lambda db, exc: print(
f"ERROR: Failed to download and decompress {db["artifact_name"]}: {exc}"
f"ERROR: Failed to download and decompress {db['artifact_name']}: {exc}"
),
error_summary=lambda failures: print(
f"ERROR: Failed to download {len(failures)} databases: {', '.join(item[0] for item in failures)}"
@@ -460,6 +475,13 @@ def main(config, args) -> None:
# Create build directory if it doesn't exist
build_dir.mkdir(parents=True, exist_ok=True)

# Check if reusing databases is given and all databases exist
reuse_databases = args.reuse_databases
all_databases_exist = reuse_databases and all(
database_for_project_exists(project["name"]) for project in projects
)

database_results = []
match get_strategy(config):
case "repo":
@@ -487,14 +509,15 @@
language,
experiment_names,
pat,
args.reuse_databases,
projects,
)

# Generate models for all projects
print("\n=== Generating models ===")

failed_builds = [
project["name"] for project, db_dir in database_results if db_dir is None
project["name"] for project, success in database_results if not success
]
if failed_builds:
print(
@@ -506,8 +529,9 @@
for project, _ in database_results:
clean_up_mad_destination_for_project(config, project["name"])

for project, database_dir in database_results:
if database_dir is not None:
for project, success in database_results:
database_dir = database_dir_for_project(project["name"])
if success:
generate_models(config, args, project, database_dir)


@@ -543,6 +567,12 @@ def main(config, args) -> None:
help="What `--threads` value to pass to `codeql` (default %(default)s)",
default=0,
)
parser.add_argument(
"--reuse-databases",
action="store_true",
help="Whether to reuse existing databases instead of rebuilding them",
default=False,
)
args = parser.parse_args()

# Load config file
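The gating logic for `--reuse-databases` in `main` is only partially wired up in this draft. Below is a minimal, self-contained sketch of how the flag could short-circuit database building; it reuses the helper names introduced in this diff (`database_dir_for_project`, `database_for_project_exists`), while `build_database_stub` and the trimmed-down `Project` shape are illustrative stand-ins rather than the script's real implementation.

```python
import pathlib
from typing import List, TypedDict

# Hypothetical stand-in for the script's build directory.
build_dir = pathlib.Path("mad-generation-build")


class Project(TypedDict):
    name: str


def database_dir_for_project(name: str) -> pathlib.Path:
    # Canonical database location, mirroring the helper added by this diff.
    return build_dir / f"{name}-db"


def database_for_project_exists(name: str) -> bool:
    return database_dir_for_project(name).exists()


def build_database_stub(project: Project) -> bool:
    # Stand-in for the real `codeql database create` invocation.
    print(f"building database for {project['name']}")
    return True


def build_or_reuse(
    projects: List[Project], reuse_databases: bool
) -> List[tuple[Project, bool]]:
    # Skip building entirely when --reuse-databases is set and every database exists.
    if reuse_databases and all(
        database_for_project_exists(p["name"]) for p in projects
    ):
        print("Reusing existing databases for all projects")
        return [(p, True) for p in projects]
    return [(p, build_database_stub(p)) for p in projects]


if __name__ == "__main__":
    print(build_or_reuse([Project(name="rust")], reuse_databases=True))
```

With `action="store_true"` on the argument definition above, passing `--reuse-databases` on the command line is enough to enable this path.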
26 changes: 13 additions & 13 deletions rust/bulk_generation_targets.yml
@@ -10,16 +10,16 @@ single-file: true # dump models into a single file per crate (we do not have pr
# if a target has a dependency in this same list, it should be listed after that dependency
targets:
- name: rust
- name: libc
- name: log
- name: memchr
- name: once_cell
- name: rand
- name: smallvec
- name: serde
- name: tokio
- name: reqwest
- name: rocket
- name: actix-web
- name: hyper
- name: clap
# - name: libc
# - name: log
# - name: memchr
# - name: once_cell
# - name: rand
# - name: smallvec
# - name: serde
# - name: tokio
# - name: reqwest
# - name: rocket
# - name: actix-web
# - name: hyper
# - name: clap
20 changes: 18 additions & 2 deletions rust/ql/lib/codeql/rust/dataflow/internal/DataFlowImpl.qll
@@ -286,7 +286,11 @@ predicate indexAssignment(
not index.getResolvedTarget().fromSource()
}

module RustDataFlow implements InputSig<Location> {
signature module RustDataFlowInputSig {
predicate includeDynamicTargets();
}

module RustDataFlowGen<RustDataFlowInputSig Input> implements InputSig<Location> {
private import Aliases
private import codeql.rust.dataflow.DataFlow
private import Node as Node
@@ -441,8 +445,12 @@ module RustDataFlow implements InputSig<Location> {
/** Gets a viable implementation of the target of the given `Call`. */
DataFlowCallable viableCallable(DataFlowCall call) {
exists(Call c | c = call.asCall() |
Input::includeDynamicTargets() and
result.asCfgScope() = c.getARuntimeTarget()
or
not Input::includeDynamicTargets() and
result.asCfgScope() = c.getStaticTarget()
or
exists(SummarizedCallable sc, Function staticTarget |
staticTarget = getStaticTargetExt(c) and
sc = result.asSummarizedCallable() and
@@ -908,6 +916,12 @@ module RustDataFlow implements InputSig<Location> {
class DataFlowSecondLevelScope = Void;
}

module RustDataFlowInput implements RustDataFlowInputSig {
predicate includeDynamicTargets() { any() }
}

module RustDataFlow = RustDataFlowGen<RustDataFlowInput>;

/** Provides logic related to captured variables. */
module VariableCapture {
private import codeql.rust.internal.CachedStages
@@ -1079,7 +1093,7 @@ private module Cached {
}

cached
newtype TParameterPosition =
newtype TParameterPositioni =
TPositionalParameterPosition(int i) {
i in [0 .. max([any(ParamList l).getNumberOfParams(), any(ArgList l).getNumberOfArgs()]) - 1]
or
@@ -1090,6 +1104,8 @@
TClosureSelfParameterPosition() or
TSelfParameterPosition()

final class TParameterPosition = TParameterPositioni;

cached
newtype TReturnKind = TNormalReturnKind()

36 changes: 23 additions & 13 deletions rust/ql/lib/codeql/rust/dataflow/internal/TaintTrackingImpl.qll
@@ -1,8 +1,9 @@
private import rust
private import codeql.dataflow.DataFlow as DF
private import codeql.dataflow.TaintTracking
private import codeql.rust.dataflow.DataFlow
private import codeql.rust.dataflow.DataFlow as RustDataFlow
private import codeql.rust.dataflow.FlowSummary
private import DataFlowImpl
private import DataFlowImpl as DataFlowImpl
private import Node as Node
private import Content
private import FlowSummaryImpl as FlowSummaryImpl
@@ -29,15 +30,19 @@ private predicate excludedTaintStepContent(Content c) {
)
}

module RustTaintTracking implements InputSig<Location, RustDataFlow> {
predicate defaultTaintSanitizer(DataFlow::Node node) { none() }
module RustTaintTrackingGen<DataFlowImpl::RustDataFlowInputSig I> implements
InputSig<Location, DataFlowImpl::RustDataFlowGen<I>>
{
private module TheDataFlow = DataFlowImpl::RustDataFlowGen<I>;

predicate defaultTaintSanitizer(TheDataFlow::Node node) { none() }

/**
* Holds if the additional step from `pred` to `succ` should be included in all
* global taint flow configurations.
*/
cached
predicate defaultAdditionalTaintStep(DataFlow::Node pred, DataFlow::Node succ, string model) {
predicate defaultAdditionalTaintStep(TheDataFlow::Node pred, TheDataFlow::Node succ, string model) {
Stages::DataFlowStage::ref() and
model = "" and
(
@@ -53,14 +58,14 @@ module RustTaintTracking implements InputSig<Location, RustDataFlow> {
// is tainted and an operation reads from `foo` (e.g., `foo.bar`) then
// taint is propagated.
exists(Content c |
RustDataFlow::readContentStep(pred, c, succ) and
TheDataFlow::readContentStep(pred, c, succ) and
not excludedTaintStepContent(c)
)
or
// In addition to the above, for element and reference content we let
// _all_ read steps (including those from flow summaries and those that
// result in small primitive types) give rise to taint steps.
exists(SingletonContentSet cs | RustDataFlow::readStep(pred, cs, succ) |
exists(SingletonContentSet cs | TheDataFlow::readStep(pred, cs, succ) |
cs.getContent() instanceof ElementContent
or
cs.getContent() instanceof ReferenceContent
@@ -79,9 +84,11 @@ module RustTaintTracking implements InputSig<Location, RustDataFlow> {
)
or
succ.(Node::PostUpdateNode).getPreUpdateNode().asExpr() =
getPostUpdateReverseStep(pred.(Node::PostUpdateNode).getPreUpdateNode().asExpr(), false)
DataFlowImpl::getPostUpdateReverseStep(pred.(Node::PostUpdateNode)
.getPreUpdateNode()
.asExpr(), false)
or
indexAssignment(any(CompoundAssignmentExpr cae),
DataFlowImpl::indexAssignment(any(CompoundAssignmentExpr cae),
pred.(Node::PostUpdateNode).getPreUpdateNode().asExpr(), _, succ, _)
)
or
@@ -94,19 +101,22 @@ module RustTaintTracking implements InputSig<Location, RustDataFlow> {
* and inputs to additional taint steps.
*/
bindingset[node]
predicate defaultImplicitTaintRead(DataFlow::Node node, ContentSet cs) {
predicate defaultImplicitTaintRead(TheDataFlow::Node node, ContentSet cs) {
exists(node) and
exists(Content c | c = cs.(SingletonContentSet).getContent() |
c instanceof ElementContent or
c instanceof ReferenceContent
) and
) // and
// Optional steps are added through isAdditionalFlowStep but we don't want the implicit reads
not optionalStep(node, _, _)
// FIXME:
// not optionalStep(node, _, _)
}

/**
* Holds if the additional step from `src` to `sink` should be considered in
* speculative taint flow exploration.
*/
predicate speculativeTaintStep(DataFlow::Node src, DataFlow::Node sink) { none() }
predicate speculativeTaintStep(TheDataFlow::Node src, TheDataFlow::Node sink) { none() }
}

module RustTaintTracking = RustTaintTrackingGen<DataFlowImpl::RustDataFlowInput>;