diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml index ada591aaf4..b193e273e0 100644 --- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml +++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml @@ -62,9 +62,11 @@ model_tag: nvidia/esm2_t48_15B_UR50D num_train_steps: 20_000 # dataset commands micro_batch_size: 8 -load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data +load_dataset_kwargs_path: parquet load_dataset_kwargs_streaming: true load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret +load_dataset_kwargs_data_dir: /data/esm2/cache/hub/datasets--nvidia--esm2_uniref_pretraining_data/snapshots/4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret + num_workers: 1 # lr commands @@ -148,6 +150,10 @@ run_script: | wandb_init_args.name=${wandb_name} \ num_train_steps=${num_train_steps} \ dataset.micro_batch_size=${micro_batch_size} \ + dataset.load_dataset_kwargs.path=${load_dataset_kwargs_path} \ + dataset.load_dataset_kwargs.streaming=${load_dataset_kwargs_streaming} \ + +dataset.load_dataset_kwargs.revision=${load_dataset_kwargs_revision} \ + +dataset.load_dataset_kwargs.data_dir=${load_dataset_kwargs_data_dir} \ use_sequence_packing=${thd_enabled} \ dataset.num_workers=${num_workers} \ lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \ diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml index 731884f24c..7b1525fba7 100644 --- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml +++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml @@ -61,9 +61,10 @@ model_tag: nvidia/esm2_t36_3B_UR50D num_train_steps: 20_000 # dataset commands micro_batch_size: 16 -load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data +load_dataset_kwargs_path: parquet load_dataset_kwargs_streaming: true load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret +load_dataset_kwargs_data_dir: /data/esm2/cache/hub/datasets--nvidia--esm2_uniref_pretraining_data/snapshots/4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret # lr commands num_warmup_steps: 2_000 @@ -133,6 +134,7 @@ run_script: | dataset.load_dataset_kwargs.path=${load_dataset_kwargs_path} \ dataset.load_dataset_kwargs.streaming=${load_dataset_kwargs_streaming} \ +dataset.load_dataset_kwargs.revision=${load_dataset_kwargs_revision} \ + +dataset.load_dataset_kwargs.data_dir=${load_dataset_kwargs_data_dir} \ lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \ checkpoint.ckpt_dir=${ckpt_dir} \ checkpoint.save_final_model=${save_final_model} \ diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml index b3a1d47b1f..8e4dcbe6d8 100644 --- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml +++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml @@ -61,9 +61,11 @@ model_tag: nvidia/esm2_t36_650M_UR50D num_train_steps: 20_000 # dataset commands micro_batch_size: 16 -load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data +load_dataset_kwargs_path: parquet load_dataset_kwargs_streaming: true load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret +load_dataset_kwargs_data_dir: /data/esm2/cache/hub/datasets--nvidia--esm2_uniref_pretraining_data/snapshots/4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret + num_workers: 1 # lr commands @@ -130,6 +132,7 @@ run_script: | dataset.load_dataset_kwargs.path=${load_dataset_kwargs_path} \ dataset.load_dataset_kwargs.streaming=${load_dataset_kwargs_streaming} \ +dataset.load_dataset_kwargs.revision=${load_dataset_kwargs_revision} \ + +dataset.load_dataset_kwargs.data_dir=${load_dataset_kwargs_data_dir} \ dataset.num_workers=${num_workers} \ lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \ checkpoint.ckpt_dir=${ckpt_dir} \