Changes from all commits (97 commits)
0040b97
Add deterministic training functionality to PyTorch LLaMA benchmark
Aishwarya-Tonpe Aug 5, 2025
e103dd0
llama: add periodic checksum logging (deterministic-only, log-only); …
Aishwarya-Tonpe Aug 11, 2025
87ff6d6
deterministic training: enable seeding + deterministic algorithms acr…
Aishwarya-Tonpe Aug 11, 2025
8eee235
tests(pytorch): add strict determinism skip guards and detailed docst…
Aishwarya-Tonpe Aug 11, 2025
fe34247
Refactor LLaMA model tests: align strict, soft determinism, and check…
Aishwarya-Tonpe Aug 11, 2025
c374dfe
examples: add deterministic and strict_determinism flags and docs to …
Aishwarya-Tonpe Aug 11, 2025
614f96c
Deterministic fingerprints: replace checksum with Loss+ActMean across…
Aishwarya-Tonpe Aug 12, 2025
689dc44
Deterministic training + reproducible logging: align GPT-2/LLaMA/LSTM…
Aishwarya-Tonpe Aug 16, 2025
33c3f6a
Adding flag: Checck-frequency
Aishwarya-Tonpe Aug 18, 2025
f35e98b
Add Check frequency flag to tests
Aishwarya-Tonpe Aug 19, 2025
dd7fcbe
Code refactor: Move enable_determinism to base class, add a consolida…
Aishwarya-Tonpe Aug 20, 2025
d439395
Code refactor: Add a new test folder to remove redundant code, remove…
Aishwarya-Tonpe Aug 20, 2025
da9c85a
Code refactor: Move loss and ActMean logging to base class from indiv…
Aishwarya-Tonpe Aug 20, 2025
2635aad
Code refactor: Move _benchmark() method to base class
Aishwarya-Tonpe Aug 20, 2025
4a21990
Code refactor: Add method _finalize_periodic_logging to base class to…
Aishwarya-Tonpe Aug 20, 2025
ddd3f23
Code cleanup: Remove unnecessary imports
Aishwarya-Tonpe Aug 20, 2025
a9cb452
Code cleanup: Remove unnecessary imports
Aishwarya-Tonpe Aug 20, 2025
52c5516
Code cleanup: Remove unnecessary imports
Aishwarya-Tonpe Aug 20, 2025
6623f59
Code cleanup: Remove unnecessary imports
Aishwarya-Tonpe Aug 20, 2025
8853c21
Tescase addition: Add Failure testcase, renameflag
Aishwarya-Tonpe Aug 21, 2025
14be806
Delete extra lines
Aishwarya-Tonpe Aug 21, 2025
8cd1c19
Add Docstrings, align imports, add assertions messages
Aishwarya-Tonpe Aug 26, 2025
99bdc16
Lint Checks
Aishwarya-Tonpe Aug 27, 2025
4bc0445
Lint Checks
Aishwarya-Tonpe Aug 28, 2025
2c8d856
Lint Checks
Aishwarya-Tonpe Aug 28, 2025
d8d9ca0
Failed check: Resolving failed pipeline check for creating temp file …
Aishwarya-Tonpe Aug 28, 2025
8bcd801
Pipeline failure fixes : Fixing Lint failures on test, example and ba…
Aishwarya-Tonpe Aug 28, 2025
315d07f
Pipeline failure fixes : Fixing Lint failures on test, example and ba…
Aishwarya-Tonpe Aug 28, 2025
5ae57f0
Pipeline failure error: Github not reflecting change in base file, at…
Aishwarya-Tonpe Aug 28, 2025
c379c5e
Pipeline failure fixes
Aishwarya-Tonpe Aug 28, 2025
3b186cf
Pipeline failure fixes
Aishwarya-Tonpe Aug 29, 2025
64d7b81
Test file lint fixes
Aishwarya-Tonpe Aug 29, 2025
90a6595
Pipeline Error: Mixtral create Model
Aishwarya-Tonpe Aug 29, 2025
055723c
Modifying test parameters for efficiency
Aishwarya-Tonpe Aug 29, 2025
b47688d
Attempting to skip tests for heavy models in CI
Aishwarya-Tonpe Aug 29, 2025
13ad2fe
Attempting to skip tests for heavy models in CI
Aishwarya-Tonpe Aug 29, 2025
2ed5ae0
Skipping tests for CICD
Aishwarya-Tonpe Aug 29, 2025
10ae1a3
Removing unnecessary code
Aishwarya-Tonpe Sep 3, 2025
fb21a9f
Adding Metadata Overriding logic to fetch metadata from the log file …
Aishwarya-Tonpe Sep 4, 2025
f3bb260
Adding Metadata Overriding logic to fetch metadata from the log file …
Aishwarya-Tonpe Sep 4, 2025
172b02b
Lint Fixes
Aishwarya-Tonpe Sep 4, 2025
de326d5
Pipeline failure fix
Aishwarya-Tonpe Sep 4, 2025
6497bf5
Adding test for coverage
Aishwarya-Tonpe Sep 4, 2025
8a8599e
Pipeline failure fix
Aishwarya-Tonpe Sep 4, 2025
a68b4df
Pipeline failure fix
Aishwarya-Tonpe Sep 4, 2025
e59fc61
Adding Info about deterministic traning to docs
Aishwarya-Tonpe Sep 15, 2025
7c6120d
Adding Info about deterministic traning to docs
Aishwarya-Tonpe Sep 15, 2025
860f0f9
Merge branch 'main' into aishwaryatonpe/deterministic-training
polarG Sep 22, 2025
2892a69
Comments resolve: Add docstrings, Make changes to ensure same lenghts…
Aishwarya-Tonpe Oct 1, 2025
0195d98
COmment resolve : Remove process_info, deprecated
Aishwarya-Tonpe Oct 1, 2025
ea6f7fc
Fixing Lint errors
Aishwarya-Tonpe Oct 1, 2025
d8acbf2
Lint checkes resolve
Aishwarya-Tonpe Oct 2, 2025
8629e8b
Lint checkes resolve
Aishwarya-Tonpe Oct 2, 2025
b15393f
Test case fixes : removing log-path from test-pytorch_determinism_all
Aishwarya-Tonpe Oct 2, 2025
529ab12
Comments removed
Aishwarya-Tonpe Oct 2, 2025
2cb80c0
Merge branch 'main' into aishwaryatonpe/deterministic-training
Aishwarya-Tonpe Oct 2, 2025
54d3449
Fixing test_pytorch_deterministic_all
Aishwarya-Tonpe Oct 2, 2025
e91ec63
Comments address : Removing redundant code
Aishwarya-Tonpe Oct 2, 2025
8fc3d5f
Moving seeding logic to make it centralised to model base
Aishwarya-Tonpe Oct 2, 2025
0848c7a
Moving seeding logic to make it centralised to model base
Aishwarya-Tonpe Oct 2, 2025
42718f0
Merge branch 'main' into aishwaryatonpe/deterministic-training
Aishwarya-Tonpe Oct 8, 2025
615bc94
Comments resolve: removing redundant method, adding loggers
Aishwarya-Tonpe Oct 8, 2025
a2e2e20
Merge branch 'main' into aishwaryatonpe/deterministic-training
Aishwarya-Tonpe Oct 9, 2025
59cfdd1
Resolving merge conflicts
Aishwarya-Tonpe Oct 9, 2025
e893a5a
Merge branch 'main' into aishwaryatonpe/deterministic-training
Aishwarya-Tonpe Oct 23, 2025
d909477
Merge branch 'main' into aishwaryatonpe/deterministic-training
Aishwarya-Tonpe Nov 10, 2025
436890e
Merge branch 'main' of https://github.com/microsoft/superbenchmark in…
Dec 8, 2025
e4d2f5e
Removing check_frequency parameter from is_finished method in train a…
Dec 8, 2025
d0bfd38
Comments resolve : Removing check_frequency assignment to the variable
Dec 8, 2025
197007a
Update superbench/benchmarks/model_benchmarks/pytorch_base.py
Aishwarya-Tonpe Dec 8, 2025
4724815
Update tests/benchmarks/model_benchmarks/test_pytorch_determinism_all.py
Aishwarya-Tonpe Dec 8, 2025
fdc82ad
Update superbench/benchmarks/model_benchmarks/pytorch_base.py
Aishwarya-Tonpe Dec 8, 2025
373fdf3
Logic change to add metrics to resuls_summary file, Logic change to m…
Aishwarya-Tonpe Dec 15, 2025
11e945e
Moving CUBLAS_WORKSPACE_CONFIG=:4096:8 to the code base so that it do…
Aishwarya-Tonpe Dec 15, 2025
4911580
Renaming --deterministic -> --enable-determinism
Aishwarya-Tonpe Dec 15, 2025
67fca5c
Comments resolve: minor deletions
Aishwarya-Tonpe Dec 15, 2025
ce18856
Update superbench/benchmarks/model_benchmarks/pytorch_base.py
Aishwarya-Tonpe Dec 15, 2025
31f46ad
Update superbench/benchmarks/model_benchmarks/pytorch_mixtral_impl.py
Aishwarya-Tonpe Dec 15, 2025
c5895b1
Update docs/user-tutorial/benchmarks/model-benchmarks.md
Aishwarya-Tonpe Dec 15, 2025
e457b83
Refactoring the code: Moving utility functions to model_log_utils
Aishwarya-Tonpe Dec 16, 2025
02d568a
Merge branch 'aishwaryatonpe/deterministic-training' of https://githu…
Aishwarya-Tonpe Dec 16, 2025
a249916
Updating the user docs
Aishwarya-Tonpe Dec 16, 2025
039b17e
Updating the test files and fixing lint errors
Aishwarya-Tonpe Dec 17, 2025
a26518c
Lint error fixes
Aishwarya-Tonpe Dec 17, 2025
c8abf0c
Pipeline erros resolve : Link errors, function complex error
Aishwarya-Tonpe Dec 17, 2025
2f5493a
Resetting the env var cause of failing testcases in the pipeline, tes…
Aishwarya-Tonpe Dec 17, 2025
8398f51
Resolving pipelines errors
Aishwarya-Tonpe Dec 17, 2025
7c5405a
Resolving pipelines errors
Aishwarya-Tonpe Dec 17, 2025
6b51a18
Resolving pipeline issues
Aishwarya-Tonpe Dec 17, 2025
c8ca973
Adding a new test file to cover the code logic in the model_utils file
Aishwarya-Tonpe Dec 17, 2025
7f6bfeb
Resolving pipeline issues
Aishwarya-Tonpe Dec 18, 2025
205934e
Resolving pipeline issues
Aishwarya-Tonpe Dec 18, 2025
3e996f2
resolving pipeline issues
Aishwarya-Tonpe Dec 18, 2025
ea9f6b2
Resolving pipeline failures
Aishwarya-Tonpe Dec 18, 2025
3b31c6a
Fix pipeline issues
Aishwarya-Tonpe Dec 18, 2025
4384412
Minor change
Aishwarya-Tonpe Dec 19, 2025
b5967f7
Merge branch 'main' into aishwaryatonpe/deterministic-training
Aishwarya-Tonpe Jan 6, 2026
13 changes: 13 additions & 0 deletions docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -34,6 +34,19 @@ For inference, supported percentiles include

**New: Support fp8_hybrid and fp8_e4m3 precision for BERT models.**

**New: Deterministic Training Support**
SuperBench now supports deterministic training to make results reproducible across runs, using fixed seeds and deterministic algorithms. Deterministic training is controlled by the following flags and environment variables (a minimal usage sketch follows the list):

- **Flags:**
  - `--enable-determinism`: Enables deterministic computation for reproducible results.
  - `--deterministic_seed <seed>`: Sets the random seed for reproducibility.
  - `--generate_log`: Boolean flag that stores comparison metrics in the results file.
  - `--compare_log <results_file_path>`: Specifies the path to the reference results file to compare against.

- **Environment Variables:**
  - `CUBLAS_WORKSPACE_CONFIG=:4096:8`: Ensures deterministic behavior in cuBLAS (set implicitly when `--enable-determinism` is used).
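
For example, a minimal sketch of enabling determinism through the Python API (the model name, step counts, and seed below are illustrative; flag spellings follow the list above):

```python
from superbench.benchmarks import BenchmarkRegistry, Framework

# Illustrative: run one model benchmark deterministically with a fixed seed.
parameters = (
    '--batch_size 1 --num_steps 100 --precision float32 --model_action train '
    '--enable-determinism --deterministic_seed 42'
)
context = BenchmarkRegistry.create_benchmark_context('lstm', parameters=parameters, framework=Framework.PYTORCH)
benchmark = BenchmarkRegistry.launch_benchmark(context)
```

A second run with the same seed and `--compare_log <results_file>` can then be validated against the first run's results file.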

#### Metrics

| Name | Unit | Description |
137 changes: 137 additions & 0 deletions examples/benchmarks/pytorch_deterministic_example.py
@@ -0,0 +1,137 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Unified PyTorch deterministic training example for all supported models.

Deterministic metrics (loss, activation mean) are automatically stored in results.json
when the --enable-determinism flag is set. Use --compare-log to compare against a reference run.

Commands to run:
Run A (generate reference):

python3 examples/benchmarks/pytorch_deterministic_example.py \
--model <model_from_MODEL_CHOICES> --enable-determinism --deterministic-seed 42

This creates results-0.json with deterministic metrics.

Run B (compare against reference):

python3 examples/benchmarks/pytorch_deterministic_example.py \
--model <model_from_MODEL_CHOICES> --enable-determinism --deterministic-seed 42 --compare-log results-0.json

Note: CUBLAS_WORKSPACE_CONFIG is now automatically set by the code when determinism is enabled.
"""

import argparse
import json
from pathlib import Path
from superbench.benchmarks import BenchmarkRegistry, Framework
from superbench.common.utils import logger

MODEL_CHOICES = [
'bert-large',
'gpt2-small',
'llama2-7b',
'mixtral-8x7b',
'resnet101',
'lstm',
]

DEFAULT_PARAMS = {
'bert-large':
'--batch_size 1 --seq_len 64 --num_warmup 1 --num_steps 200 --precision float32 '
'--model_action train --check_frequency 20',
'gpt2-small':
'--batch_size 1 --num_steps 300 --num_warmup 1 --seq_len 128 --precision float32 '
'--model_action train --check_frequency 20',
'llama2-7b':
'--batch_size 1 --num_steps 300 --num_warmup 1 --seq_len 512 --precision float32 --model_action train '
'--check_frequency 20',
'mixtral-8x7b':
'--hidden_size=4096 --num_hidden_layers=32 --num_attention_heads=32 --intermediate_size=14336 '
'--num_key_value_heads=8 --max_position_embeddings=32768 --router_aux_loss_coef=0.02 '
'--check_frequency 20',
'resnet101':
'--batch_size 1 --precision float32 --num_warmup 1 --num_steps 120 --sample_count 8192 '
'--pin_memory --model_action train --check_frequency 20',
'lstm':
'--batch_size 1 --num_steps 100 --num_warmup 2 --seq_len 64 --precision float16 '
'--model_action train --check_frequency 30',
}


def main():
"""Main function for determinism example file."""
parser = argparse.ArgumentParser(description='Unified PyTorch deterministic training example.')
parser.add_argument('--model', type=str, choices=MODEL_CHOICES, required=True, help='Model to run.')
parser.add_argument(
'--enable-determinism',
'--enable_determinism',
action='store_true',
help='Enable deterministic mode for reproducible results.',
)
parser.add_argument(
'--compare-log',
type=str,
default=None,
help='Path to reference results.json file for deterministic comparison.',
)
parser.add_argument(
'--deterministic-seed',
type=int,
default=None,
help='Seed for deterministic training.',
)
args = parser.parse_args()

parameters = DEFAULT_PARAMS[args.model]
if args.enable_determinism:
parameters += ' --enable-determinism'
if args.deterministic_seed is not None:
parameters += f' --deterministic_seed {args.deterministic_seed}'
if args.compare_log:
parameters += f' --compare-log {args.compare_log}'

context = BenchmarkRegistry.create_benchmark_context(args.model, parameters=parameters, framework=Framework.PYTORCH)
benchmark = BenchmarkRegistry.launch_benchmark(context)
logger.info(f'Benchmark finished. Return code: {benchmark.return_code}')

# Save results to file for comparison
if not args.compare_log:
# Find next available results file name
counter = 0
while Path(f'results-{counter}.json').exists():
counter += 1
results_file = f'results-{counter}.json'

# Parse benchmark results and create nested format like results-summary.json
benchmark_results = json.loads(benchmark.serialized_result)

# Create nested structure: raw_data -> benchmark_name -> metrics
# Extract the benchmark name from the results (e.g., "pytorch-lstm")
benchmark_name = benchmark_results.get('name', args.model)

# Create results in the format expected by comparison logic
nested_results = {
'raw_data': {
f'model-benchmarks:{args.model}/{benchmark_name}': benchmark_results.get('raw_data', {})
}
}

# Write results to file
with open(results_file, 'w') as f:
json.dump(nested_results, f, indent=2)
logger.info(f'Results saved to {results_file}')
logger.info(f'To compare against this run, use: --compare-log {results_file}')
else:
logger.info(f'Comparison completed against {args.compare_log}')

if hasattr(benchmark, '_model_run_metadata'):
logger.info(f'Run metadata: {benchmark._model_run_metadata}')
if hasattr(benchmark, '_model_run_periodic'):
num_checkpoints = len(benchmark._model_run_periodic.get('step', []))
logger.info(f'Periodic fingerprints collected at {num_checkpoints} checkpoints')


if __name__ == '__main__':
main()
64 changes: 60 additions & 4 deletions superbench/benchmarks/base.py
@@ -110,14 +110,66 @@ def parse_args(self, ignore_invalid=False):
logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e)))
return False, None, []

ret = True
if args is not None and 'compare_log' in [a.dest for a in self._parser._actions]:
args = self._override_args_with_compare_log(args)

ret = self._check_unknown_args(unknown)

return ret, args, unknown

def _override_args_with_compare_log(self, args):
"""Override arguments with metadata from a compare log file if available.

This is a legacy method. Metadata override is now handled by benchmark-specific
implementations (e.g., pytorch_base.py for PyTorch models).

Args:
args: Parsed arguments.

Returns:
argparse.Namespace: Arguments (returned unchanged).
"""
return args
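
As an illustration, a framework-specific override might look roughly like the sketch below (hypothetical: the metadata key names and log layout are assumptions, and the actual pytorch_base.py logic may differ):

```python
import json
from pathlib import Path

def _override_args_with_compare_log(self, args):
    """Hypothetical override: reuse run metadata recorded in the reference log."""
    compare_log = getattr(args, 'compare_log', None)
    if not compare_log or not Path(compare_log).is_file():
        return args
    with open(compare_log) as f:
        reference = json.load(f)
    # Illustrative: adopt the reference run's seed so both runs stay comparable.
    seed = reference.get('metadata', {}).get('deterministic_seed')
    if seed is not None:
        args.deterministic_seed = seed
    return args
```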

def _convert_precision_value(self, value, Precision):
"""Convert precision values to the appropriate format.

Args:
value: The precision value to convert.
Precision: The Precision class or type to convert to.

Returns:
list: A list of converted precision values.
"""
if isinstance(value, list):
converted = []
for v in value:
if isinstance(v, Precision):
converted.append(v)
else:
converted.append(Precision(v))
return converted
else:
if isinstance(value, Precision):
return [value]
else:
return [Precision(value)]
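
For instance, a hypothetical call site (assuming the `Precision` enum exported by superbench.benchmarks):

```python
# Mixed string/enum input normalizes to a single list of Precision values.
precisions = self._convert_precision_value(['float16', Precision.FLOAT32], Precision)
```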

def _check_unknown_args(self, unknown):
"""Check for unknown arguments and log an error if any are found.

Args:
unknown (list): List of unknown arguments.

Returns:
bool: False if unknown arguments are found, True otherwise.
"""
if len(unknown) > 0:
logger.error(
'Unknown arguments - benchmark: {}, unknown arguments: {}'.format(self._name, ' '.join(unknown))
)
return False
return True

def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
@@ -263,6 +315,10 @@ def __check_raw_data(self):
instance of List[List[Number]] or List[str] for BenchmarkType.MICRO.
"""
for metric in self._result.raw_data:
# Skip validation for metadata (dict type used for configuration storage)
if metric.startswith('metadata'):
continue

is_valid = True
if self._benchmark_type == BenchmarkType.MODEL:
is_valid = self.__is_list_list_type(self._result.raw_data[metric], numbers.Number)
17 changes: 17 additions & 0 deletions superbench/benchmarks/model_benchmarks/model_base.py
@@ -186,6 +186,17 @@ def _generate_dataset(self):
"""
pass

def set_deterministic_seed(self):
"""Hook to set deterministic RNG state before dataset generation.

Framework-specific subclasses may
override this to apply deterministic RNG settings (for example,
PyTorch benchmarks implement this to call their deterministic setup
when requested). This is called from _preprocess() before
_generate_dataset().
"""
return None
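
For context, a PyTorch-flavored override of this hook might look roughly like the following sketch (assumptions: the argument names mirror the flags documented above; the real pytorch_base.py implementation may differ):

```python
import os
import random

import numpy as np
import torch

def set_deterministic_seed(self):
    """Sketch: seed all RNGs and switch PyTorch to deterministic kernels."""
    if not getattr(self._args, 'enable_determinism', False):
        return
    seed = getattr(self._args, 'deterministic_seed', None) or 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuBLAS needs this workspace config for deterministic GEMMs on CUDA.
    os.environ.setdefault('CUBLAS_WORKSPACE_CONFIG', ':4096:8')
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.benchmark = False
```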

@abstractmethod
def _init_dataloader(self):
"""Initialize the dataloader.
Expand Down Expand Up @@ -221,6 +232,12 @@ def _preprocess(self):
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
return False

# Invoke model-specific deterministic seeding hook before dataset generation
try:
self.set_deterministic_seed()
except Exception:
logger.info('set_deterministic_seed() hook failed or not implemented for model: %s', self._name)

# Set sample_count aligned with batch_size.
self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size
