mlcommons · remg1997 · Jul 11, 2025 · Jul 15, 2025 · Jul 15, 2025
@@ -1,3 +1,7 @@
+# Copyright (c) MLCommons and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
 # Copyright (c) Facebook, Inc. and its affiliates.
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
@@ -42,4 +46,5 @@
     "ssl_org_pem_file_path": os.environ["SSL_ORG_PEM_FILE"],
     "trial_jwtexp": 900,
     "frontend_ip": os.environ["FRONTEND_IP"],
+    "runpod_api_key": os.environ.get("RUNPOD_API_KEY", ""),
 }
@@ -348,9 +348,12 @@ def do_upload_via_train_files(credentials, tid, model_name):
             current_upload = json.loads(upload.file.read().decode("utf-8"))
             upload.file.seek(0)
             payload = {
-                "id_json": current_upload,
-                "bucket_name": task.s3_bucket,
-                "key": name,
+                "input": {
+                    "id_json": current_upload,
+                    "bucket_name": task.s3_bucket,
+                    "key": name,
+                    "model_id": model[1],  # model_id is passed along for backend processing
+                }
             }
             s3_client.upload_fileobj(
                 upload.file,
@@ -359,19 +362,24 @@ def do_upload_via_train_files(credentials, tid, model_name):
             )
 
             light_model_endpoint = task.lambda_model
-            r = requests.post(light_model_endpoint, json=payload)
 
-            try:
-                score = r.json()["score"]
-            except Exception as ex:
-                logger.exception(ex)
-                subject = f"Model {model_name} failed training as {r.json()['detail']}"
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {config['runpod_api_key']}",
+            }
+
+            r = requests.post(light_model_endpoint, json=payload, headers=headers)
+
+            if r.status_code != 200:
+                logger.error(
+                    f"RunPod request failed with status {r.status_code}: {r.text}"
+                )
                 Email().send(
                     contact=user.email,
                     cc_contact="dynabench-site@mlcommons.org",
                     template_name="model_train_failed.txt",
-                    msg_dict={"name": model_name},
-                    subject=subject,
+                    msg_dict={"name": model_name, "model_id": model[1]},
+                    subject=f"Model {model_name} submission failed",
                 )
                 for idx2, (rem_name, rem_upload) in enumerate(train_items[idx + 1 :]):
                     s3_client.upload_fileobj(
@@ -381,47 +389,74 @@ def do_upload_via_train_files(credentials, tid, model_name):
                     )
                 bottle.abort(400)
 
+            score = None  # No score yet; the async evaluation reports it later
+
         did = dm.getByName(name).id
         r_realid = rm.getByTid(tid)[0].rid
         if isinstance(task_config.get("perf_metric"), list):
             metric = task_config.get("perf_metric")[0].get("type")
         elif isinstance(task_config.get("perf_metric"), dict):
             metric = task_config.get("perf_metric").get("type")
-        new_score = {
-            metric: score,
-            "perf": score,
-            "perf_std": 0.0,
-            "perf_by_tag": [
-                {
-                    "tag": str(name),
-                    "pretty_perf": f"{score} %",
-                    "perf": score,
-                    "perf_std": 0.0,
-                    "perf_dict": {metric: score},
-                }
-            ],
-        }
 
-        new_score_string = json.dumps(new_score)
+        if score is not None:
+            new_score = {
+                metric: score,
+                "perf": score,
+                "perf_std": 0.0,
+                "perf_by_tag": [
+                    {
+                        "tag": str(name),
+                        "pretty_perf": f"{score} %",
+                        "perf": score,
+                        "perf_std": 0.0,
+                        "perf_dict": {metric: score},
+                    }
+                ],
+            }
+
+            new_score_string = json.dumps(new_score)
+
+            sm.create(
+                model_id=model[1],
+                r_realid=r_realid,
+                did=did,
+                pretty_perf=f"{score} %",
+                perf=score,
+                metadata_json=new_score_string,
+            )
 
-        sm.create(
-            model_id=model[1],
-            r_realid=r_realid,
-            did=did,
-            pretty_perf=f"{score} %",
-            perf=score,
-            metadata_json=new_score_string,
+    if any(upload.content_type != "text/plain" for upload in train_files.values()):
+        Email().send(
+            contact=user.email,
+            cc_contact="dynabench-site@mlcommons.org",
+            template_name="model_train_successful.txt",
+            msg_dict={"name": model_name, "model_id": model[1]},
+            subject=f"Model {model_name} submitted for evaluation.",
         )
 
-    Email().send(
-        contact=user.email,
-        cc_contact="dynabench-site@mlcommons.org",
-        template_name="model_train_successful.txt",
-        msg_dict={"name": model_name, "model_id": model[1]},
-        subject=f"Model {model_name} training succeeded.",
-    )
+        return util.json_encode(
+            {
+                "success": "ok",
+                "model_id": model[1],
+                "message": "Model submitted for evaluation. You will receive an email when evaluation is complete.",
+            }
+        )
+    else:
+        Email().send(
+            contact=user.email,
+            cc_contact="dynabench-site@mlcommons.org",
+            template_name="model_train_successful.txt",
+            msg_dict={"name": model_name, "model_id": model[1]},
+            subject=f"Model {model_name} evaluation completed.",
+        )
 
-    return util.json_encode({"success": "ok", "model_id": model[1]})
+        return util.json_encode(
+            {
+                "success": "ok",
+                "model_id": model[1],
+                "message": "Model evaluation completed successfully.",
+            }
+        )
 
 
 @bottle.post("/models/upload_predictions/<tid:int>/<model_name>")

@@ -401,7 +401,7 @@ def add_scores_and_update_model(
                 self.email_helper.send(
                     contact=user.email,
                     cc_contact=self.email_sender,
-                    template_name="model_inference_failed.txt",
+                    template_name="model_evaluation_failed.txt",
                     msg_dict={"name": model["name"], "message": message},
                     subject=f"Model {model['name']} evaluation failed.",
                 )
@@ -418,11 +418,57 @@ def add_scores_and_update_model(
                 round_info = self.round_repository.get_round_info_by_round_and_task(
                     model["tid"], round_id
                 )
+
+                # Get task configuration to determine score handling
+                task_config = self.task_repository.get_config_file_by_task_id(
+                    model["tid"]
+                )[0]
+                task_config = yaml.safe_load(task_config)
+
                 metadata_json = dict(scores)
 
+                # Determine the main performance metric based on task configuration
+                perf_metric = task_config.get("perf_metric", {})
+                if isinstance(perf_metric, list):
+                    main_metric = perf_metric[0].get("type", "score")
+                elif isinstance(perf_metric, dict):
+                    main_metric = perf_metric.get("type", "score")
+                else:
+                    main_metric = "score"  # Default fallback
+
+                # Extract the score value - handle different formats
+                score_value = None
+
+                # First, try to extract from nested results (for RunPod format)
+                if "results" in metadata_json and "score" in metadata_json["results"]:
+                    score_value = metadata_json["results"]["score"]
+                elif "score" in metadata_json:
+                    score_value = metadata_json["score"]
+                elif main_metric in metadata_json:
+                    score_value = metadata_json[main_metric]
+                elif "Standard_CER_15_WORSE" in metadata_json:
+                    # Backward compatibility for speech tasks
+                    score_value = metadata_json["Standard_CER_15_WORSE"]
+                    main_metric = "Standard_CER_15_WORSE"
+                else:
+                    raise ValueError(f"No score found in metadata: {metadata_json}")
+
+                # Format the score appropriately based on metric type
+                if main_metric == "Standard_CER_15_WORSE":
+                    # Speech recognition - percentage format
+                    pretty_perf = f"{100 * score_value:.2f}%"
+                else:
+                    # Other tasks - decimal format
+                    pretty_perf = f"{score_value:.4f}"
+
+                # Store the main metric type in metadata for reference
+                metadata_json["main_metric"] = main_metric
+                metadata_json["task_perf_metric"] = perf_metric
+
+                # Build the score structure with proper metric information
                 new_score = {
-                    "perf": metadata_json["Standard_CER_15_WORSE"],
-                    "pretty_perf": f"{100*metadata_json['Standard_CER_15_WORSE']:.2f}%",
+                    "perf": score_value,
+                    "pretty_perf": pretty_perf,
                     "mid": model_id,
                     "r_realid": round_info.id,
                     "did": datasets[0]["id"],
@@ -437,7 +483,7 @@ def add_scores_and_update_model(
                     cc_contact=self.email_sender,
                     template_name="model_evaluation_sucessful.txt",
                     msg_dict={"name": model["name"], "model_id": model["id"]},
-                    subject=f"Model {model['name']} evaluation succeeded.",
+                    subject=f"Model {model['name']} evaluation completed successfully.",
                 )
                 print(
                     f"sent email evaluation sucessful to {user.email} model {model['name']} "