From 20076ccaad6763425d817fd87d6f1e52d9b8ed28 Mon Sep 17 00:00:00 2001
From: Jonathan Mitchell
Date: Wed, 17 Dec 2025 12:14:45 -0800
Subject: [PATCH 1/2] adds fp8 debug

Signed-off-by: Jonathan Mitchell
---
 .../recipes/esm2_native_te/fp8_stats.yaml     | 28 +++++++++++++++++++
 .../recipes/esm2_native_te/train_fsdp2.py     | 15 ++++++++++
 2 files changed, 43 insertions(+)
 create mode 100644 bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml

diff --git a/bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml b/bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml
new file mode 100644
index 0000000000..090e3ac8be
--- /dev/null
+++ b/bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml
@@ -0,0 +1,28 @@
+example_fp8_tensor_stat_collection:
+  enabled: True
+  layers:
+    layer_types: [layernorm_linear]
+  transformer_engine:
+    LogFp8TensorStats:
+      enabled: True
+      tensors_struct:
+        - tensor: activation
+          stats: [fp8_block_scaling_underflows%]
+          freq: 1
+        - tensor: activation
+          stats: [fp8_block_scaling_overflows%]
+          freq: 1
+        - tensor: activation
+          stats: [fp8_block_scaling_scale_inv_min]
+          freq: 1
+        - tensor: activation
+          stats: [fp8_block_scaling_scale_inv_max]
+          freq: 1
+        - tensor: activation
+          stats: [fp8_block_scaling_mse]
+          freq: 1
+        - tensor: gradient
+          stats: [underflows%]
+          freq: 5
+      start_step: 0
+      end_step: 80
\ No newline at end of file

diff --git a/bionemo-recipes/recipes/esm2_native_te/train_fsdp2.py b/bionemo-recipes/recipes/esm2_native_te/train_fsdp2.py
index 293eafe84d..0ba6e93c56 100644
--- a/bionemo-recipes/recipes/esm2_native_te/train_fsdp2.py
+++ b/bionemo-recipes/recipes/esm2_native_te/train_fsdp2.py
@@ -36,6 +36,8 @@
 from perf_logger import PerfLogger
 from scheduler import get_linear_schedule_with_warmup
 
+import nvdlfw_inspect.api as debug_api
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
@@ -126,6 +128,15 @@ def main(args: DictConfig) -> float | None:
 
     perf_logger = PerfLogger(dist_config, args)
 
+    # TE Debug feature logging
+    debug_api.initialize(
+        config_file="/workspaces/bionemo-framework/bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml",
+        feature_dirs=["/usr/local/lib/python3.12/dist-packages/transformer_engine/debug/features/"],
+        log_dir="./log",
+        default_logging_enabled=True
+    )
+
+
     # Training loop
     step = start_step
     while step < args.num_train_steps:
@@ -146,6 +157,9 @@ def main(args: DictConfig) -> float | None:
             # Step optimizer.
             optimizer.step()
             scheduler.step()
+
+            debug_api.step()
+
             optimizer.zero_grad()
 
             perf_logger.log_step(
@@ -169,6 +183,7 @@ def main(args: DictConfig) -> float | None:
                 max_checkpoints=args.checkpoint.max_checkpoints,
             )
 
+            step += 1
             if step >= args.num_train_steps:
                 break

From 961c96b9afd1dfbf9585bb4af48be4cc002317fa Mon Sep 17 00:00:00 2001
From: Jonathan Mitchell
Date: Wed, 17 Dec 2025 16:05:20 -0800
Subject: [PATCH 2/2] mxfp8

Signed-off-by: Jonathan Mitchell
---
 .../recipes/esm2_native_te/fp8_stats.yaml     | 12 ++++++------
 .../recipes/esm2_native_te/train_fsdp2.py     | 18 +++++++++++-------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml b/bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml
index 090e3ac8be..54b0bd0ba1 100644
--- a/bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml
+++ b/bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml
@@ -1,25 +1,25 @@
 example_fp8_tensor_stat_collection:
   enabled: True
   layers:
-    layer_types: [layernorm_linear]
+    layer_types: [self_attention]
   transformer_engine:
     LogFp8TensorStats:
       enabled: True
       tensors_struct:
         - tensor: activation
-          stats: [fp8_block_scaling_underflows%]
+          stats: [mxfp8_underflows%]
           freq: 1
         - tensor: activation
-          stats: [fp8_block_scaling_overflows%]
+          stats: [mxfp8_overflows%]
           freq: 1
         - tensor: activation
-          stats: [fp8_block_scaling_scale_inv_min]
+          stats: [mxfp8_scale_inv_min]
           freq: 1
         - tensor: activation
-          stats: [fp8_block_scaling_scale_inv_max]
+          stats: [mxfp8_scale_inv_max]
           freq: 1
         - tensor: activation
-          stats: [fp8_block_scaling_mse]
+          stats: [mxfp8_mse]
           freq: 1
         - tensor: gradient
           stats: [underflows%]

diff --git a/bionemo-recipes/recipes/esm2_native_te/train_fsdp2.py b/bionemo-recipes/recipes/esm2_native_te/train_fsdp2.py
index 0ba6e93c56..9a3c72474b 100644
--- a/bionemo-recipes/recipes/esm2_native_te/train_fsdp2.py
+++ b/bionemo-recipes/recipes/esm2_native_te/train_fsdp2.py
@@ -86,6 +86,16 @@ def main(args: DictConfig) -> float | None:
 
     logger.info("Initialized Model:\n%s", model)
 
+    # TE Debug feature logging
+    debug_api.initialize(
+        config_file="/workspaces/bionemo-framework/bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml",
+        feature_dirs=["/usr/local/lib/python3.12/dist-packages/transformer_engine/debug/features/"],
+        log_dir="./log",
+        default_logging_enabled=True
+    )
+
+    debug_api.infer_and_assign_layer_names(model)
+
     # We call the transformer stack "layers" in our TE models, but it's called "layer" in the original ESM-2 models.
     transformer_stack = model.esm.encoder.layers if hasattr(model.esm.encoder, "layers") else model.esm.encoder.layer
     for layer in transformer_stack:
@@ -128,13 +138,7 @@ def main(args: DictConfig) -> float | None:
 
     perf_logger = PerfLogger(dist_config, args)
 
-    # TE Debug feature logging
-    debug_api.initialize(
-        config_file="/workspaces/bionemo-framework/bionemo-recipes/recipes/esm2_native_te/fp8_stats.yaml",
-        feature_dirs=["/usr/local/lib/python3.12/dist-packages/transformer_engine/debug/features/"],
-        log_dir="./log",
-        default_logging_enabled=True
-    )
+
 
 
     # Training loop
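
Taken together, the two patches wire Transformer Engine's precision-debug tooling (nvdlfw_inspect) into the trainer in three steps: initialize the API against the stats YAML once at startup, assign stable layer names so the YAML's layer_types filter can match modules (added in PATCH 2/2, which also moves initialization ahead of model setup), and tick the debug step counter once per optimizer step so each stat's freq/start_step/end_step window applies. Below is a minimal standalone sketch of that wiring, using only the API calls that appear in the diffs above; the relative config path and the stand-in nn.Linear model are placeholders, not the recipe's real values.

    import torch
    import nvdlfw_inspect.api as debug_api

    # Point the inspect API at the stats YAML and at TE's bundled debug features.
    debug_api.initialize(
        config_file="fp8_stats.yaml",  # placeholder; the patch uses an absolute workspace path
        feature_dirs=["/usr/local/lib/python3.12/dist-packages/transformer_engine/debug/features/"],
        log_dir="./log",
        default_logging_enabled=True,
    )

    model = torch.nn.Linear(16, 16)  # stand-in; the recipe uses a TE ESM-2 model
    # Name every submodule so the YAML's layer_types filter can match layers.
    debug_api.infer_and_assign_layer_names(model)

    for step in range(80):
        # ... forward / backward / optimizer.step() ...
        debug_api.step()  # advance the counter that freq/start_step/end_step are checked against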