From 9a562c2ff6173de1019e59ffbc552ffc79481a2b Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Thu, 6 Nov 2025 10:52:26 -0600 Subject: [PATCH 01/30] new slurm for score_TRAK and score_logra --- experiments/gpt2_wikitext/score_TRAK.py | 1 + experiments/gpt2_wikitext/score_TRAK_slurm.sh | 35 +++++++++++++++++++ experiments/gpt2_wikitext/score_logra.py | 1 + .../gpt2_wikitext/score_logra_slurm.sh | 35 +++++++++++++++++++ ssh_config_template.txt | 16 +++++++++ 5 files changed, 88 insertions(+) create mode 100644 experiments/gpt2_wikitext/score_TRAK_slurm.sh create mode 100644 experiments/gpt2_wikitext/score_logra_slurm.sh create mode 100644 ssh_config_template.txt diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index aebe5b7e7..6473736f8 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -498,6 +498,7 @@ def main(): config=config, low_cpu_mem_usage=args.low_cpu_mem_usage, trust_remote_code=args.trust_remote_code, + attn_implementation="eager", # Use eager attention for better performance ) model = model.cuda() else: diff --git a/experiments/gpt2_wikitext/score_TRAK_slurm.sh b/experiments/gpt2_wikitext/score_TRAK_slurm.sh new file mode 100644 index 000000000..c843168e4 --- /dev/null +++ b/experiments/gpt2_wikitext/score_TRAK_slurm.sh @@ -0,0 +1,35 @@ +#!/bin/bash +#SBATCH --gres=gpu:A40:1 +#SBATCH --array=0-49 # 50 models with seeds 0-49 +#SBATCH --job-name=gpt2_wikitext_score_TRAK +#SBATCH --output=logs/score_TRAK_%A_%a.out +#SBATCH --error=logs/score_TRAK_%A_%a.err +#SBATCH --mem=64G +#SBATCH --cpus-per-task=8 + +echo "job is starting on `hostname`" +echo "Running TRAK scoring for seed: ${SLURM_ARRAY_TASK_ID}" + +# PyTorch memory management +export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 +export CUDA_LAUNCH_BLOCKING=0 + +# Create logs directory if it doesn't exist +mkdir -p logs + +SEED=${SLURM_ARRAY_TASK_ID:-0} + +# Change to results directory to save score files there +cd results + +python ../score_TRAK.py \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --model_name_or_path openai-community/gpt2 \ + --output_dir ../checkpoints \ + --block_size 512 \ + --method TRAK-5 \ + --seed ${SEED} \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 + diff --git a/experiments/gpt2_wikitext/score_logra.py b/experiments/gpt2_wikitext/score_logra.py index d83e64f80..6b6ed01e6 100644 --- a/experiments/gpt2_wikitext/score_logra.py +++ b/experiments/gpt2_wikitext/score_logra.py @@ -483,6 +483,7 @@ def main(): config=config, low_cpu_mem_usage=args.low_cpu_mem_usage, trust_remote_code=args.trust_remote_code, + attn_implementation="eager", # Use eager attention for better performance ) else: logger.info("Training new model from scratch") diff --git a/experiments/gpt2_wikitext/score_logra_slurm.sh b/experiments/gpt2_wikitext/score_logra_slurm.sh new file mode 100644 index 000000000..3ebdd5e99 --- /dev/null +++ b/experiments/gpt2_wikitext/score_logra_slurm.sh @@ -0,0 +1,35 @@ +#!/bin/bash +#SBATCH --gres=gpu:A40:1 +#SBATCH --array=0-49 # 50 models with seeds 0-49 +#SBATCH --job-name=gpt2_wikitext_score_logra +#SBATCH --output=logs/score_logra_%A_%a.out +#SBATCH --error=logs/score_logra_%A_%a.err +#SBATCH --mem=64G +#SBATCH --cpus-per-task=8 + +echo "job is starting on `hostname`" +echo "Running LoGra scoring for seed: ${SLURM_ARRAY_TASK_ID}" + +# Force use GPU 1 (index 1) +#export CUDA_VISIBLE_DEVICES=1 + +# PyTorch memory management +export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 +export CUDA_LAUNCH_BLOCKING=0 + +# Create logs directory if it doesn't exist +mkdir -p logs + +SEED=${SLURM_ARRAY_TASK_ID:-0} + +# Change to results directory to save score files there +cd results + +python ../score_logra.py \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --model_name_or_path openai-community/gpt2 \ + --output_dir ../checkpoints \ + --block_size 512 \ + --seed ${SEED} + diff --git a/ssh_config_template.txt b/ssh_config_template.txt new file mode 100644 index 000000000..0208585a0 --- /dev/null +++ b/ssh_config_template.txt @@ -0,0 +1,16 @@ +# SSH Configuration Template for Markov Server +# Copy this content to ~/.ssh/config and replace with your actual UIUC NetID + +Host markov.ischool.illinois.edu + HostName markov.ischool.illinois.edu + Port 22 + User + ServerAliveInterval 60 + ServerAliveCountMax 3 + TCPKeepAlive yes + +# Instructions: +# 1. Copy this content to ~/.ssh/config +# 2. Replace with your actual UIUC NetID +# 3. Set proper permissions: chmod 600 ~/.ssh/config +# 4. Test connection: ssh markov.ischool.illinois.edu From bb26f89fafa91b348aca7de5e7017bd17da09342 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Thu, 6 Nov 2025 17:19:38 -0600 Subject: [PATCH 02/30] fix importError of transformer --- experiments/gpt2_wikitext/score_TRAK.py | 12 ++++++++---- experiments/gpt2_wikitext/score_logra.py | 9 +++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index 6473736f8..d90e802b4 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -54,8 +54,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version +try: + from transformers.utils import send_example_telemetry +except ImportError: + send_example_telemetry = None # Not available in newer transformers versions from dattri.benchmark.utils import SubsetSampler from dattri.func.utils import flatten_func, flatten_params @@ -337,9 +341,9 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_clm_no_trainer", args) + #fix the import error in newer transformers versions + if send_example_telemetry is not None: + send_example_telemetry("run_clm_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers diff --git a/experiments/gpt2_wikitext/score_logra.py b/experiments/gpt2_wikitext/score_logra.py index 6b6ed01e6..40a59ea23 100644 --- a/experiments/gpt2_wikitext/score_logra.py +++ b/experiments/gpt2_wikitext/score_logra.py @@ -54,8 +54,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version +try: + from transformers.utils import send_example_telemetry +except ImportError: + send_example_telemetry = None # Not available in newer transformers versions from dattri.benchmark.utils import SubsetSampler @@ -324,7 +328,8 @@ def main(): # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_clm_no_trainer", args) + if send_example_telemetry is not None: + send_example_telemetry("run_clm_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers From 5845c68d7d3a55626fc9f84f142ca00886296c5e Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Thu, 6 Nov 2025 17:30:47 -0600 Subject: [PATCH 03/30] fix huggingface path error --- experiments/gpt2_wikitext/score_TRAK.py | 8 +++++++- experiments/gpt2_wikitext/score_logra.py | 13 +++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index d90e802b4..37c66cd77 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -697,7 +697,13 @@ def loss_tracin(params, batch): ) def checkpoints_load_func(model, checkpoint_path): - new_model = AutoModelForCausalLM.from_pretrained(checkpoint_path).cuda() + # Convert to absolute path to avoid HuggingFace Hub validation error + import os + checkpoint_abs = os.path.abspath(checkpoint_path) + new_model = AutoModelForCausalLM.from_pretrained( + checkpoint_abs, + local_files_only=True, # Force local file loading + ).cuda() new_model.eval() return new_model diff --git a/experiments/gpt2_wikitext/score_logra.py b/experiments/gpt2_wikitext/score_logra.py index 40a59ea23..2054f421a 100644 --- a/experiments/gpt2_wikitext/score_logra.py +++ b/experiments/gpt2_wikitext/score_logra.py @@ -326,8 +326,7 @@ def parse_args(): def main(): args = parse_args() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. + #fix the import error in newer transformers versions if send_example_telemetry is not None: send_example_telemetry("run_clm_no_trainer", args) @@ -599,8 +598,14 @@ def group_texts(examples): model_id = -1 checkpoint = f"{args.output_dir}/{model_id}" - def checkpoints_load_func(model, checkpoint): - model = AutoModelForCausalLM.from_pretrained(checkpoint).cuda() + def checkpoints_load_func(model, checkpoint_path): + # Convert to absolute path to avoid HuggingFace Hub validation error + import os + checkpoint_abs = os.path.abspath(checkpoint_path) + model = AutoModelForCausalLM.from_pretrained( + checkpoint_abs, + local_files_only=True, # Force local file loading + ).cuda() model.eval() return replace_conv1d_modules(model) From a566d7c10443246e94d5f1b6e79e8eca53802c7a Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Thu, 6 Nov 2025 17:39:01 -0600 Subject: [PATCH 04/30] modify the checkpoints_load_func function in two score files --- experiments/gpt2_wikitext/score_TRAK.py | 50 ++++++++++++++++++++--- experiments/gpt2_wikitext/score_logra.py | 51 +++++++++++++++++++++--- 2 files changed, 91 insertions(+), 10 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index 37c66cd77..6e86cb95f 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -697,13 +697,53 @@ def loss_tracin(params, batch): ) def checkpoints_load_func(model, checkpoint_path): - # Convert to absolute path to avoid HuggingFace Hub validation error + # Convert to absolute path and verify it exists import os checkpoint_abs = os.path.abspath(checkpoint_path) - new_model = AutoModelForCausalLM.from_pretrained( - checkpoint_abs, - local_files_only=True, # Force local file loading - ).cuda() + + # Verify checkpoint directory exists + if not os.path.exists(checkpoint_abs): + raise FileNotFoundError( + f"Checkpoint directory not found: {checkpoint_abs}. " + f"Please ensure the checkpoint exists at this path." + ) + + # Try loading with local_files_only first + try: + # Load config and model separately to avoid path validation issues + config = AutoConfig.from_pretrained( + checkpoint_abs, + local_files_only=True, + trust_remote_code=True, + ) + + # Load model using config to bypass path validation + new_model = AutoModelForCausalLM.from_pretrained( + checkpoint_abs, + config=config, + local_files_only=True, # Force local file loading + trust_remote_code=True, # Allow loading from local path + ).cuda() + except Exception as e: + # If path validation fails, try loading from config.json directly + config_path = os.path.join(checkpoint_abs, "config.json") + if os.path.exists(config_path): + config = AutoConfig.from_json_file(config_path) + new_model = AutoModelForCausalLM.from_config(config).cuda() + # Load weights from pytorch_model.bin or model.safetensors + weight_path = os.path.join(checkpoint_abs, "pytorch_model.bin") + if not os.path.exists(weight_path): + weight_path = os.path.join(checkpoint_abs, "model.safetensors") + if os.path.exists(weight_path): + from safetensors.torch import load_file + if weight_path.endswith(".safetensors"): + state_dict = load_file(weight_path) + else: + state_dict = torch.load(weight_path, map_location="cpu") + new_model.load_state_dict(state_dict) + else: + raise e + new_model.eval() return new_model diff --git a/experiments/gpt2_wikitext/score_logra.py b/experiments/gpt2_wikitext/score_logra.py index 2054f421a..bd652cb2d 100644 --- a/experiments/gpt2_wikitext/score_logra.py +++ b/experiments/gpt2_wikitext/score_logra.py @@ -598,14 +598,55 @@ def group_texts(examples): model_id = -1 checkpoint = f"{args.output_dir}/{model_id}" + #modified for huggingface hub validation error def checkpoints_load_func(model, checkpoint_path): - # Convert to absolute path to avoid HuggingFace Hub validation error + # Convert to absolute path and verify it exists import os checkpoint_abs = os.path.abspath(checkpoint_path) - model = AutoModelForCausalLM.from_pretrained( - checkpoint_abs, - local_files_only=True, # Force local file loading - ).cuda() + + # Verify checkpoint directory exists + if not os.path.exists(checkpoint_abs): + raise FileNotFoundError( + f"Checkpoint directory not found: {checkpoint_abs}. " + f"Please ensure the checkpoint exists at this path." + ) + + # Try loading with local_files_only first + try: + # Load config and model separately to avoid path validation issues + config = AutoConfig.from_pretrained( + checkpoint_abs, + local_files_only=True, + trust_remote_code=True, + ) + + # Load model using config to bypass path validation + model = AutoModelForCausalLM.from_pretrained( + checkpoint_abs, + config=config, + local_files_only=True, # Force local file loading + trust_remote_code=True, # Allow loading from local path + ).cuda() + except Exception as e: + # If path validation fails, try loading from config.json directly + config_path = os.path.join(checkpoint_abs, "config.json") + if os.path.exists(config_path): + config = AutoConfig.from_json_file(config_path) + model = AutoModelForCausalLM.from_config(config).cuda() + # Load weights from pytorch_model.bin or model.safetensors + weight_path = os.path.join(checkpoint_abs, "pytorch_model.bin") + if not os.path.exists(weight_path): + weight_path = os.path.join(checkpoint_abs, "model.safetensors") + if os.path.exists(weight_path): + from safetensors.torch import load_file + if weight_path.endswith(".safetensors"): + state_dict = load_file(weight_path) + else: + state_dict = torch.load(weight_path, map_location="cpu") + model.load_state_dict(state_dict) + else: + raise e + model.eval() return replace_conv1d_modules(model) From 07dbb38d6b7608231b8af4f3d223f7de8833f45d Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Thu, 6 Nov 2025 17:50:50 -0600 Subject: [PATCH 05/30] modify the checkpoints_load_func function in two score files --- experiments/gpt2_wikitext/score_logra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/gpt2_wikitext/score_logra.py b/experiments/gpt2_wikitext/score_logra.py index bd652cb2d..46363817c 100644 --- a/experiments/gpt2_wikitext/score_logra.py +++ b/experiments/gpt2_wikitext/score_logra.py @@ -595,7 +595,7 @@ def group_texts(examples): from transformers.pytorch_utils import Conv1D from dattri.task import AttributionTask - model_id = -1 + model_id = 0 # Use checkpoint 0 (final checkpoint) checkpoint = f"{args.output_dir}/{model_id}" #modified for huggingface hub validation error From 7cfca83a1b15df8472568e4c53debec865ea05c8 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Thu, 6 Nov 2025 19:44:50 -0600 Subject: [PATCH 06/30] fix index error for score_TRAK --- experiments/gpt2_wikitext/score_TRAK.py | 32 +++++++++++++++---------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index 6e86cb95f..7c341ab56 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -629,14 +629,15 @@ def f(params, batch): """ input_ids, attention_mask, labels = batch - input_ids = input_ids.cuda() - attention_mask = attention_mask.cuda() - labels = labels.cuda() + # Re-add batch dimension removed by vmap + input_ids = input_ids.unsqueeze(0).cuda() + attention_mask = attention_mask.unsqueeze(0).cuda() + labels = labels.unsqueeze(0).cuda() outputs = torch.func.functional_call( model, params, - input_ids, + (input_ids,), # Pass as tuple to avoid dimension issues kwargs={"attention_mask": attention_mask, "labels": labels}, ) logp = -outputs.loss @@ -648,14 +649,15 @@ def m(params, batch): """ input_ids, attention_mask, labels = batch - input_ids = input_ids.cuda() - attention_mask = attention_mask.cuda() - labels = labels.cuda() + # Re-add batch dimension removed by vmap + input_ids = input_ids.unsqueeze(0).cuda() + attention_mask = attention_mask.unsqueeze(0).cuda() + labels = labels.unsqueeze(0).cuda() outputs = torch.func.functional_call( model, params, - input_ids, + (input_ids,), # Pass as tuple to avoid dimension issues kwargs={"attention_mask": attention_mask, "labels": labels}, ) p = torch.exp(-outputs.loss) @@ -667,13 +669,16 @@ def loss_tracin(params, batch): (TracIn sums over checkpoint updates of gradient dot-products). """ input_ids, attention_mask, labels = batch - input_ids = input_ids.cuda() - attention_mask = attention_mask.cuda() - labels = labels.cuda() + + # Re-add batch dimension removed by vmap + input_ids = input_ids.unsqueeze(0).cuda() + attention_mask = attention_mask.unsqueeze(0).cuda() + labels = labels.unsqueeze(0).cuda() + outputs = torch.func.functional_call( model, params, - input_ids, + (input_ids,), # Pass as tuple to avoid dimension issues kwargs={"attention_mask": attention_mask, "labels": labels}, ) return outputs.loss @@ -695,7 +700,8 @@ def loss_tracin(params, batch): raise ValueError( f"Unknown --method {method}. Try 'TRAK-5', 'TracIn', 'Grad-Dot', or 'Grad-Cos'." ) - + + #modified for huggingface hub validation error def checkpoints_load_func(model, checkpoint_path): # Convert to absolute path and verify it exists import os From 27fe7cd157bbeb8cffa1d1a18887e593fd65ba83 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Thu, 6 Nov 2025 19:53:16 -0600 Subject: [PATCH 07/30] fix the train batch size --- experiments/gpt2_wikitext/score_TRAK_slurm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK_slurm.sh b/experiments/gpt2_wikitext/score_TRAK_slurm.sh index c843168e4..f2538e4e4 100644 --- a/experiments/gpt2_wikitext/score_TRAK_slurm.sh +++ b/experiments/gpt2_wikitext/score_TRAK_slurm.sh @@ -30,6 +30,6 @@ python ../score_TRAK.py \ --block_size 512 \ --method TRAK-5 \ --seed ${SEED} \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 2 From ce2dce97db36e6308a1921908ab3334570e05918 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sun, 9 Nov 2025 01:11:50 -0600 Subject: [PATCH 08/30] add argument for random projection in TRAK and fix memory issue --- experiments/gpt2_wikitext/score_TRAK.py | 29 +++++++++++++++++-- experiments/gpt2_wikitext/score_TRAK_slurm.sh | 5 +++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index 7c341ab56..e35b07ed5 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -220,6 +220,25 @@ def parse_args(): " account special tokens)." ), ) + + # add arguments for random projection and fix memory issues + parser.add_argument( + "--proj_dim", + type=int, + default=512, + help="Output dimension for random projection used by TRAK / TracIn.", + ) + parser.add_argument( + "--proj_max_batch_size", + type=int, + default=16, + help="Maximum batch size to process per projection block (controls memory usage).", + ) + parser.add_argument( + "--use_half_precision", + action="store_true", + help="Use half precision for projection matrices to reduce memory footprint.", + ) parser.add_argument( "--preprocessing_num_workers", type=int, @@ -771,9 +790,12 @@ def checkpoints_load_func(model, checkpoint_path): ) if method.startswith("TRAK"): + # fix memory issues projector_kwargs = { "device": "cuda", - "proj_dim": 2048, + "proj_dim": args.proj_dim, + "proj_max_batch_size": args.proj_max_batch_size, + "use_half_precision": args.use_half_precision, } attributor = TRAKAttributor( task=task, @@ -789,9 +811,12 @@ def checkpoints_load_func(model, checkpoint_path): weight_list = torch.ones(num_checkpoints) * 1e-3 + # fix memory issues projector_kwargs = { "device": "cuda", - "proj_dim": 2048, + "proj_dim": args.proj_dim, + "proj_max_batch_size": args.proj_max_batch_size, + "use_half_precision": args.use_half_precision, } attributor = TracInAttributor( diff --git a/experiments/gpt2_wikitext/score_TRAK_slurm.sh b/experiments/gpt2_wikitext/score_TRAK_slurm.sh index f2538e4e4..a1400c6c8 100644 --- a/experiments/gpt2_wikitext/score_TRAK_slurm.sh +++ b/experiments/gpt2_wikitext/score_TRAK_slurm.sh @@ -31,5 +31,8 @@ python ../score_TRAK.py \ --method TRAK-5 \ --seed ${SEED} \ --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 2 + --per_device_eval_batch_size 2 \ + --proj_dim 512 \ + --proj_max_batch_size 16 \ + --use_half_precision From a8a47a640d03c6da008eaf779b52b8d821d51319 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sun, 9 Nov 2025 01:18:50 -0600 Subject: [PATCH 09/30] fix argument error --- experiments/gpt2_wikitext/score_TRAK.py | 7 ------- experiments/gpt2_wikitext/score_TRAK_slurm.sh | 3 +-- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index e35b07ed5..c2c02a5d7 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -234,11 +234,6 @@ def parse_args(): default=16, help="Maximum batch size to process per projection block (controls memory usage).", ) - parser.add_argument( - "--use_half_precision", - action="store_true", - help="Use half precision for projection matrices to reduce memory footprint.", - ) parser.add_argument( "--preprocessing_num_workers", type=int, @@ -795,7 +790,6 @@ def checkpoints_load_func(model, checkpoint_path): "device": "cuda", "proj_dim": args.proj_dim, "proj_max_batch_size": args.proj_max_batch_size, - "use_half_precision": args.use_half_precision, } attributor = TRAKAttributor( task=task, @@ -816,7 +810,6 @@ def checkpoints_load_func(model, checkpoint_path): "device": "cuda", "proj_dim": args.proj_dim, "proj_max_batch_size": args.proj_max_batch_size, - "use_half_precision": args.use_half_precision, } attributor = TracInAttributor( diff --git a/experiments/gpt2_wikitext/score_TRAK_slurm.sh b/experiments/gpt2_wikitext/score_TRAK_slurm.sh index a1400c6c8..ba0d34acf 100644 --- a/experiments/gpt2_wikitext/score_TRAK_slurm.sh +++ b/experiments/gpt2_wikitext/score_TRAK_slurm.sh @@ -33,6 +33,5 @@ python ../score_TRAK.py \ --per_device_train_batch_size 2 \ --per_device_eval_batch_size 2 \ --proj_dim 512 \ - --proj_max_batch_size 16 \ - --use_half_precision + --proj_max_batch_size 16 From 01e09b0796f1cdd9b9242471505f3c194066aca4 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sun, 9 Nov 2025 01:29:50 -0600 Subject: [PATCH 10/30] update proj_dim and batch size to solve OOM error --- experiments/gpt2_wikitext/score_TRAK_slurm.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK_slurm.sh b/experiments/gpt2_wikitext/score_TRAK_slurm.sh index ba0d34acf..257c154f3 100644 --- a/experiments/gpt2_wikitext/score_TRAK_slurm.sh +++ b/experiments/gpt2_wikitext/score_TRAK_slurm.sh @@ -30,8 +30,8 @@ python ../score_TRAK.py \ --block_size 512 \ --method TRAK-5 \ --seed ${SEED} \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 2 \ - --proj_dim 512 \ - --proj_max_batch_size 16 + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --proj_dim 256 \ + --proj_max_batch_size 8 From 65581fcf46fade4a7cd7ade5e74b301a20689eed Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sun, 9 Nov 2025 01:41:49 -0600 Subject: [PATCH 11/30] add project_type kwarg --- experiments/gpt2_wikitext/score_TRAK.py | 9 +++++++++ experiments/gpt2_wikitext/score_TRAK_slurm.sh | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index c2c02a5d7..1fecd1a4a 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -234,6 +234,13 @@ def parse_args(): default=16, help="Maximum batch size to process per projection block (controls memory usage).", ) + parser.add_argument( + "--proj_type", + type=str, + default="random_mask", + choices=["normal", "rademacher", "random_mask", "sjlt", "grass"], + help="Random projection type used for TRAK/TracIn (default: random_mask).", + ) parser.add_argument( "--preprocessing_num_workers", type=int, @@ -790,6 +797,7 @@ def checkpoints_load_func(model, checkpoint_path): "device": "cuda", "proj_dim": args.proj_dim, "proj_max_batch_size": args.proj_max_batch_size, + "proj_type": args.proj_type, } attributor = TRAKAttributor( task=task, @@ -810,6 +818,7 @@ def checkpoints_load_func(model, checkpoint_path): "device": "cuda", "proj_dim": args.proj_dim, "proj_max_batch_size": args.proj_max_batch_size, + "proj_type": args.proj_type, } attributor = TracInAttributor( diff --git a/experiments/gpt2_wikitext/score_TRAK_slurm.sh b/experiments/gpt2_wikitext/score_TRAK_slurm.sh index 257c154f3..9df6ffd90 100644 --- a/experiments/gpt2_wikitext/score_TRAK_slurm.sh +++ b/experiments/gpt2_wikitext/score_TRAK_slurm.sh @@ -33,5 +33,6 @@ python ../score_TRAK.py \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ --proj_dim 256 \ - --proj_max_batch_size 8 + --proj_max_batch_size 8 \ + --proj_type random_mask From 807c42b0042c5918988fbbec11d3c14b361bc39c Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sun, 9 Nov 2025 10:49:49 -0600 Subject: [PATCH 12/30] fix checkpoint loading error --- experiments/gpt2_wikitext/score_TRAK.py | 43 ++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index 1fecd1a4a..3639e4980 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -705,18 +705,53 @@ def loss_tracin(params, batch): return outputs.loss method = args.method + + #fix checkpoint loading error + checkpoint_root = Path(args.output_dir) + available_checkpoint_dirs = sorted( + [p for p in checkpoint_root.iterdir() if p.is_dir() and p.name.isdigit()], + key=lambda p: int(p.name), + ) + + if not available_checkpoint_dirs: + raise FileNotFoundError( + f"No numeric checkpoint directories found in {checkpoint_root}." + ) + if method.startswith("TRAK-"): parts = method.split("-") if len(parts) == 2 and parts[1].isdigit(): - num_checkpoints = int(parts[1]) + requested_checkpoints = int(parts[1]) else: raise ValueError( "Invalid method name for TRAK, must be like 'TRAK-5' or 'TRAK-10'." ) - checkpoints = [f"{args.output_dir}/{i}" for i in range(num_checkpoints)] + + #fix checkpoint loading error + if len(available_checkpoint_dirs) < requested_checkpoints: + logger.warning( + "Requested %s checkpoints but only found %s in %s. Using available checkpoints instead.", + requested_checkpoints, + len(available_checkpoint_dirs), + checkpoint_root, + ) + requested_checkpoints = len(available_checkpoint_dirs) + + checkpoints = [str(p) for p in available_checkpoint_dirs[:requested_checkpoints]] + elif method in ["TracIn", "Grad-Dot", "Grad-Cos"]: - num_checkpoints = 5 - checkpoints = [f"{args.output_dir}/{i}" for i in range(num_checkpoints)] + requested_checkpoints = min(5, len(available_checkpoint_dirs)) + if requested_checkpoints == 0: + raise FileNotFoundError( + f"No numeric checkpoint directories found in {checkpoint_root}." + ) + if requested_checkpoints < 5: + logger.warning( + "Only %s checkpoint(s) available; using these for %s.", + requested_checkpoints, + method, + ) + checkpoints = [str(p) for p in available_checkpoint_dirs[:requested_checkpoints]] else: raise ValueError( f"Unknown --method {method}. Try 'TRAK-5', 'TracIn', 'Grad-Dot', or 'Grad-Cos'." From 9b20c18b9fc6fffb4ae259856d64afad1d81279c Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sun, 9 Nov 2025 12:28:32 -0600 Subject: [PATCH 13/30] update readme in gpt2_wikitext --- experiments/gpt2_wikitext/readme.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/experiments/gpt2_wikitext/readme.md b/experiments/gpt2_wikitext/readme.md index 84fbe6676..27788bc23 100644 --- a/experiments/gpt2_wikitext/readme.md +++ b/experiments/gpt2_wikitext/readme.md @@ -20,7 +20,7 @@ This experiment could only be run on cuda device. pip install -r requirements.txt ``` -### Troubleshooting: vmap over calling .item() Error in Transformers + + +the troubleshooting can be avoided by setting the attn_implementation paramater to 'eager' in from_pretrained function + +if args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + low_cpu_mem_usage=args.low_cpu_mem_usage, + trust_remote_code=args.trust_remote_code, + attn_implementation="eager", # Use eager attention for better performance + ) + model = model.cuda() ## Training From 1d5fecdff1f63da0b9ebbf8d83b742cb1dfa7631 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Fri, 28 Nov 2025 19:04:28 -0600 Subject: [PATCH 14/30] error message in import --- experiments/gpt2_wikitext/score_TRAK.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index 3639e4980..50e846866 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -54,12 +54,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version +from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -try: - from transformers.utils import send_example_telemetry -except ImportError: - send_example_telemetry = None # Not available in newer transformers versions +# try: +# from transformers.utils import send_example_telemetry +# except ImportError: +# send_example_telemetry = None # Not available in newer transformers versions from dattri.benchmark.utils import SubsetSampler from dattri.func.utils import flatten_func, flatten_params From d42f59916c9257ecfef796d5c3109aa1c51b104c Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Fri, 28 Nov 2025 23:10:40 -0600 Subject: [PATCH 15/30] added arguments --- experiments/gpt2_wikitext/score_TRAK.py | 62 ++++++++++--------- experiments/gpt2_wikitext/score_TRAK_slurm.sh | 6 +- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index 50e846866..01b85cd67 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -54,12 +54,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import check_min_version from transformers.utils.versions import require_version -# try: -# from transformers.utils import send_example_telemetry -# except ImportError: -# send_example_telemetry = None # Not available in newer transformers versions +try: + from transformers.utils import send_example_telemetry +except ImportError: + send_example_telemetry = None # Not available in newer transformers versions from dattri.benchmark.utils import SubsetSampler from dattri.func.utils import flatten_func, flatten_params @@ -222,25 +222,25 @@ def parse_args(): ) # add arguments for random projection and fix memory issues - parser.add_argument( - "--proj_dim", - type=int, - default=512, - help="Output dimension for random projection used by TRAK / TracIn.", - ) - parser.add_argument( - "--proj_max_batch_size", - type=int, - default=16, - help="Maximum batch size to process per projection block (controls memory usage).", - ) - parser.add_argument( - "--proj_type", - type=str, - default="random_mask", - choices=["normal", "rademacher", "random_mask", "sjlt", "grass"], - help="Random projection type used for TRAK/TracIn (default: random_mask).", - ) + # parser.add_argument( + # "--proj_dim", + # type=int, + # default=512, + # help="Output dimension for random projection used by TRAK / TracIn.", + # ) + # parser.add_argument( + # "--proj_max_batch_size", + # type=int, + # default=16, + # help="Maximum batch size to process per projection block (controls memory usage).", + # ) + # parser.add_argument( + # "--proj_type", + # type=str, + # default="random_mask", + # choices=["normal", "rademacher", "random_mask", "sjlt", "grass"], + # help="Random projection type used for TRAK/TracIn (default: random_mask).", + # ) parser.add_argument( "--preprocessing_num_workers", type=int, @@ -830,9 +830,10 @@ def checkpoints_load_func(model, checkpoint_path): # fix memory issues projector_kwargs = { "device": "cuda", - "proj_dim": args.proj_dim, - "proj_max_batch_size": args.proj_max_batch_size, - "proj_type": args.proj_type, + "proj_dim": 2048, + # "proj_dim": args.proj_dim, + # "proj_max_batch_size": args.proj_max_batch_size, + # "proj_type": args.proj_type, } attributor = TRAKAttributor( task=task, @@ -851,9 +852,10 @@ def checkpoints_load_func(model, checkpoint_path): # fix memory issues projector_kwargs = { "device": "cuda", - "proj_dim": args.proj_dim, - "proj_max_batch_size": args.proj_max_batch_size, - "proj_type": args.proj_type, + "proj_dim": 2048, + # "proj_dim": args.proj_dim, + # "proj_max_batch_size": args.proj_max_batch_size, + # "proj_type": args.proj_type, } attributor = TracInAttributor( diff --git a/experiments/gpt2_wikitext/score_TRAK_slurm.sh b/experiments/gpt2_wikitext/score_TRAK_slurm.sh index 9df6ffd90..8c573a7ed 100644 --- a/experiments/gpt2_wikitext/score_TRAK_slurm.sh +++ b/experiments/gpt2_wikitext/score_TRAK_slurm.sh @@ -32,7 +32,7 @@ python ../score_TRAK.py \ --seed ${SEED} \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ - --proj_dim 256 \ - --proj_max_batch_size 8 \ - --proj_type random_mask + # --proj_dim 256 \ + # --proj_max_batch_size 8 \ + # --proj_type random_mask From 7e9a05ec02c025958fc240048cdb316843391b42 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sat, 29 Nov 2025 00:18:34 -0600 Subject: [PATCH 16/30] unsqueeze --- experiments/gpt2_wikitext/score_TRAK.py | 76 ++++++++++--------- experiments/gpt2_wikitext/score_TRAK_slurm.sh | 6 +- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index 01b85cd67..966fe8894 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -222,25 +222,25 @@ def parse_args(): ) # add arguments for random projection and fix memory issues - # parser.add_argument( - # "--proj_dim", - # type=int, - # default=512, - # help="Output dimension for random projection used by TRAK / TracIn.", - # ) - # parser.add_argument( - # "--proj_max_batch_size", - # type=int, - # default=16, - # help="Maximum batch size to process per projection block (controls memory usage).", - # ) - # parser.add_argument( - # "--proj_type", - # type=str, - # default="random_mask", - # choices=["normal", "rademacher", "random_mask", "sjlt", "grass"], - # help="Random projection type used for TRAK/TracIn (default: random_mask).", - # ) + parser.add_argument( + "--proj_dim", + type=int, + default=512, + help="Output dimension for random projection used by TRAK / TracIn.", + ) + parser.add_argument( + "--proj_max_batch_size", + type=int, + default=16, + help="Maximum batch size to process per projection block (controls memory usage).", + ) + parser.add_argument( + "--proj_type", + type=str, + default="random_mask", + choices=["normal", "rademacher", "random_mask", "sjlt", "grass"], + help="Random projection type used for TRAK/TracIn (default: random_mask).", + ) parser.add_argument( "--preprocessing_num_workers", type=int, @@ -651,14 +651,18 @@ def f(params, batch): input_ids, attention_mask, labels = batch # Re-add batch dimension removed by vmap - input_ids = input_ids.unsqueeze(0).cuda() - attention_mask = attention_mask.unsqueeze(0).cuda() - labels = labels.unsqueeze(0).cuda() + input_ids = input_ids.cuda() + attention_mask = attention_mask.cuda() + labels = labels.cuda() + # input_ids = input_ids.unsqueeze(0).cuda() + # attention_mask = attention_mask.unsqueeze(0).cuda() + # labels = labels.unsqueeze(0).cuda() outputs = torch.func.functional_call( model, params, - (input_ids,), # Pass as tuple to avoid dimension issues + input_ids, + #(input_ids,), # Pass as tuple to avoid dimension issues kwargs={"attention_mask": attention_mask, "labels": labels}, ) logp = -outputs.loss @@ -671,14 +675,18 @@ def m(params, batch): input_ids, attention_mask, labels = batch # Re-add batch dimension removed by vmap - input_ids = input_ids.unsqueeze(0).cuda() - attention_mask = attention_mask.unsqueeze(0).cuda() - labels = labels.unsqueeze(0).cuda() + input_ids = input_ids.cuda() + attention_mask = attention_mask.cuda() + labels = labels.cuda() + # input_ids = input_ids.unsqueeze(0).cuda() + # attention_mask = attention_mask.unsqueeze(0).cuda() + # labels = labels.unsqueeze(0).cuda() outputs = torch.func.functional_call( model, params, - (input_ids,), # Pass as tuple to avoid dimension issues + input_ids, + #(input_ids,), # Pass as tuple to avoid dimension issues kwargs={"attention_mask": attention_mask, "labels": labels}, ) p = torch.exp(-outputs.loss) @@ -830,10 +838,9 @@ def checkpoints_load_func(model, checkpoint_path): # fix memory issues projector_kwargs = { "device": "cuda", - "proj_dim": 2048, - # "proj_dim": args.proj_dim, - # "proj_max_batch_size": args.proj_max_batch_size, - # "proj_type": args.proj_type, + "proj_dim": args.proj_dim, + "proj_max_batch_size": args.proj_max_batch_size, + "proj_type": args.proj_type, } attributor = TRAKAttributor( task=task, @@ -852,10 +859,9 @@ def checkpoints_load_func(model, checkpoint_path): # fix memory issues projector_kwargs = { "device": "cuda", - "proj_dim": 2048, - # "proj_dim": args.proj_dim, - # "proj_max_batch_size": args.proj_max_batch_size, - # "proj_type": args.proj_type, + "proj_dim": args.proj_dim, + "proj_max_batch_size": args.proj_max_batch_size, + "proj_type": args.proj_type, } attributor = TracInAttributor( diff --git a/experiments/gpt2_wikitext/score_TRAK_slurm.sh b/experiments/gpt2_wikitext/score_TRAK_slurm.sh index 8c573a7ed..9df6ffd90 100644 --- a/experiments/gpt2_wikitext/score_TRAK_slurm.sh +++ b/experiments/gpt2_wikitext/score_TRAK_slurm.sh @@ -32,7 +32,7 @@ python ../score_TRAK.py \ --seed ${SEED} \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ - # --proj_dim 256 \ - # --proj_max_batch_size 8 \ - # --proj_type random_mask + --proj_dim 256 \ + --proj_max_batch_size 8 \ + --proj_type random_mask From 3d0a859ef175ab4c5f40c2a99ed6e115ce514b03 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sat, 29 Nov 2025 01:30:22 -0600 Subject: [PATCH 17/30] checkpoint --- experiments/gpt2_wikitext/score_TRAK.py | 91 ++++++++++++------------- 1 file changed, 43 insertions(+), 48 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index 966fe8894..f7acda93c 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -651,18 +651,14 @@ def f(params, batch): input_ids, attention_mask, labels = batch # Re-add batch dimension removed by vmap - input_ids = input_ids.cuda() - attention_mask = attention_mask.cuda() - labels = labels.cuda() - # input_ids = input_ids.unsqueeze(0).cuda() - # attention_mask = attention_mask.unsqueeze(0).cuda() - # labels = labels.unsqueeze(0).cuda() + input_ids = input_ids.unsqueeze(0).cuda() + attention_mask = attention_mask.unsqueeze(0).cuda() + labels = labels.unsqueeze(0).cuda() outputs = torch.func.functional_call( model, params, - input_ids, - #(input_ids,), # Pass as tuple to avoid dimension issues + (input_ids,), # Pass as tuple to avoid dimension issues kwargs={"attention_mask": attention_mask, "labels": labels}, ) logp = -outputs.loss @@ -675,18 +671,14 @@ def m(params, batch): input_ids, attention_mask, labels = batch # Re-add batch dimension removed by vmap - input_ids = input_ids.cuda() - attention_mask = attention_mask.cuda() - labels = labels.cuda() - # input_ids = input_ids.unsqueeze(0).cuda() - # attention_mask = attention_mask.unsqueeze(0).cuda() - # labels = labels.unsqueeze(0).cuda() + input_ids = input_ids.unsqueeze(0).cuda() + attention_mask = attention_mask.unsqueeze(0).cuda() + labels = labels.unsqueeze(0).cuda() outputs = torch.func.functional_call( model, params, - input_ids, - #(input_ids,), # Pass as tuple to avoid dimension issues + (input_ids,), # Pass as tuple to avoid dimension issues kwargs={"attention_mask": attention_mask, "labels": labels}, ) p = torch.exp(-outputs.loss) @@ -715,51 +707,54 @@ def loss_tracin(params, batch): method = args.method #fix checkpoint loading error - checkpoint_root = Path(args.output_dir) - available_checkpoint_dirs = sorted( - [p for p in checkpoint_root.iterdir() if p.is_dir() and p.name.isdigit()], - key=lambda p: int(p.name), - ) + # checkpoint_root = Path(args.output_dir) + # available_checkpoint_dirs = sorted( + # [p for p in checkpoint_root.iterdir() if p.is_dir() and p.name.isdigit()], + # key=lambda p: int(p.name), + # ) - if not available_checkpoint_dirs: - raise FileNotFoundError( - f"No numeric checkpoint directories found in {checkpoint_root}." - ) + # if not available_checkpoint_dirs: + # raise FileNotFoundError( + # f"No numeric checkpoint directories found in {checkpoint_root}." + # ) if method.startswith("TRAK-"): parts = method.split("-") if len(parts) == 2 and parts[1].isdigit(): - requested_checkpoints = int(parts[1]) + num_checkpoints = int(parts[1]) + # requested_checkpoints = int(parts[1]) else: raise ValueError( "Invalid method name for TRAK, must be like 'TRAK-5' or 'TRAK-10'." ) - + checkpoints = [f"{args.output_dir}/{i}" for i in range(num_checkpoints)] #fix checkpoint loading error - if len(available_checkpoint_dirs) < requested_checkpoints: - logger.warning( - "Requested %s checkpoints but only found %s in %s. Using available checkpoints instead.", - requested_checkpoints, - len(available_checkpoint_dirs), - checkpoint_root, - ) - requested_checkpoints = len(available_checkpoint_dirs) + # if len(available_checkpoint_dirs) < requested_checkpoints: + # logger.warning( + # "Requested %s checkpoints but only found %s in %s. Using available checkpoints instead.", + # requested_checkpoints, + # len(available_checkpoint_dirs), + # checkpoint_root, + # ) + # requested_checkpoints = len(available_checkpoint_dirs) - checkpoints = [str(p) for p in available_checkpoint_dirs[:requested_checkpoints]] + # checkpoints = [str(p) for p in available_checkpoint_dirs[:requested_checkpoints]] elif method in ["TracIn", "Grad-Dot", "Grad-Cos"]: - requested_checkpoints = min(5, len(available_checkpoint_dirs)) - if requested_checkpoints == 0: - raise FileNotFoundError( - f"No numeric checkpoint directories found in {checkpoint_root}." - ) - if requested_checkpoints < 5: - logger.warning( - "Only %s checkpoint(s) available; using these for %s.", - requested_checkpoints, - method, - ) - checkpoints = [str(p) for p in available_checkpoint_dirs[:requested_checkpoints]] + num_checkpoints = 5 + checkpoints = [f"{args.output_dir}/{i}" for i in range(num_checkpoints)] + # requested_checkpoints = min(5, len(available_checkpoint_dirs)) + # if requested_checkpoints == 0: + # raise FileNotFoundError( + # f"No numeric checkpoint directories found in {checkpoint_root}." + # ) + # if requested_checkpoints < 5: + # logger.warning( + # "Only %s checkpoint(s) available; using these for %s.", + # requested_checkpoints, + # method, + # ) + # checkpoints = [str(p) for p in available_checkpoint_dirs[:requested_checkpoints]] else: raise ValueError( f"Unknown --method {method}. Try 'TRAK-5', 'TracIn', 'Grad-Dot', or 'Grad-Cos'." From 57ac89a528d3bf005845f953070072cfed61878d Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sat, 29 Nov 2025 01:52:32 -0600 Subject: [PATCH 18/30] huggingface --- experiments/gpt2_wikitext/score_TRAK.py | 85 +++++++++++++------------ 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index f7acda93c..c11a7a481 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -762,52 +762,53 @@ def loss_tracin(params, batch): #modified for huggingface hub validation error def checkpoints_load_func(model, checkpoint_path): + new_model = AutoModelForCausalLM.from_pretrained(checkpoint_path).cuda() # Convert to absolute path and verify it exists - import os - checkpoint_abs = os.path.abspath(checkpoint_path) + # import os + # checkpoint_abs = os.path.abspath(checkpoint_path) - # Verify checkpoint directory exists - if not os.path.exists(checkpoint_abs): - raise FileNotFoundError( - f"Checkpoint directory not found: {checkpoint_abs}. " - f"Please ensure the checkpoint exists at this path." - ) + # # Verify checkpoint directory exists + # if not os.path.exists(checkpoint_abs): + # raise FileNotFoundError( + # f"Checkpoint directory not found: {checkpoint_abs}. " + # f"Please ensure the checkpoint exists at this path." + # ) - # Try loading with local_files_only first - try: - # Load config and model separately to avoid path validation issues - config = AutoConfig.from_pretrained( - checkpoint_abs, - local_files_only=True, - trust_remote_code=True, - ) + # # Try loading with local_files_only first + # try: + # # Load config and model separately to avoid path validation issues + # config = AutoConfig.from_pretrained( + # checkpoint_abs, + # local_files_only=True, + # trust_remote_code=True, + # ) - # Load model using config to bypass path validation - new_model = AutoModelForCausalLM.from_pretrained( - checkpoint_abs, - config=config, - local_files_only=True, # Force local file loading - trust_remote_code=True, # Allow loading from local path - ).cuda() - except Exception as e: - # If path validation fails, try loading from config.json directly - config_path = os.path.join(checkpoint_abs, "config.json") - if os.path.exists(config_path): - config = AutoConfig.from_json_file(config_path) - new_model = AutoModelForCausalLM.from_config(config).cuda() - # Load weights from pytorch_model.bin or model.safetensors - weight_path = os.path.join(checkpoint_abs, "pytorch_model.bin") - if not os.path.exists(weight_path): - weight_path = os.path.join(checkpoint_abs, "model.safetensors") - if os.path.exists(weight_path): - from safetensors.torch import load_file - if weight_path.endswith(".safetensors"): - state_dict = load_file(weight_path) - else: - state_dict = torch.load(weight_path, map_location="cpu") - new_model.load_state_dict(state_dict) - else: - raise e + # # Load model using config to bypass path validation + # new_model = AutoModelForCausalLM.from_pretrained( + # checkpoint_abs, + # config=config, + # local_files_only=True, # Force local file loading + # trust_remote_code=True, # Allow loading from local path + # ).cuda() + # except Exception as e: + # # If path validation fails, try loading from config.json directly + # config_path = os.path.join(checkpoint_abs, "config.json") + # if os.path.exists(config_path): + # config = AutoConfig.from_json_file(config_path) + # new_model = AutoModelForCausalLM.from_config(config).cuda() + # # Load weights from pytorch_model.bin or model.safetensors + # weight_path = os.path.join(checkpoint_abs, "pytorch_model.bin") + # if not os.path.exists(weight_path): + # weight_path = os.path.join(checkpoint_abs, "model.safetensors") + # if os.path.exists(weight_path): + # from safetensors.torch import load_file + # if weight_path.endswith(".safetensors"): + # state_dict = load_file(weight_path) + # else: + # state_dict = torch.load(weight_path, map_location="cpu") + # new_model.load_state_dict(state_dict) + # else: + # raise e new_model.eval() return new_model From c995b3bba2a9965e1473f067c7d1f17770a5bb19 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sat, 29 Nov 2025 02:22:23 -0600 Subject: [PATCH 19/30] model_id --- experiments/gpt2_wikitext/score_logra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/gpt2_wikitext/score_logra.py b/experiments/gpt2_wikitext/score_logra.py index 46363817c..1d6cef751 100644 --- a/experiments/gpt2_wikitext/score_logra.py +++ b/experiments/gpt2_wikitext/score_logra.py @@ -595,7 +595,7 @@ def group_texts(examples): from transformers.pytorch_utils import Conv1D from dattri.task import AttributionTask - model_id = 0 # Use checkpoint 0 (final checkpoint) + model_id = -1 # Use checkpoint 0 (final checkpoint) checkpoint = f"{args.output_dir}/{model_id}" #modified for huggingface hub validation error From 27d48a898acd66a1675e028ed55429a06d71d56e Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sat, 29 Nov 2025 02:32:01 -0600 Subject: [PATCH 20/30] huggingface --- experiments/gpt2_wikitext/score_logra.py | 91 ++++++++++++------------ 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/experiments/gpt2_wikitext/score_logra.py b/experiments/gpt2_wikitext/score_logra.py index 1d6cef751..fa088c729 100644 --- a/experiments/gpt2_wikitext/score_logra.py +++ b/experiments/gpt2_wikitext/score_logra.py @@ -599,54 +599,55 @@ def group_texts(examples): checkpoint = f"{args.output_dir}/{model_id}" #modified for huggingface hub validation error - def checkpoints_load_func(model, checkpoint_path): - # Convert to absolute path and verify it exists - import os - checkpoint_abs = os.path.abspath(checkpoint_path) + # def checkpoints_load_func(model, checkpoint_path): + # # Convert to absolute path and verify it exists + # import os + # checkpoint_abs = os.path.abspath(checkpoint_path) - # Verify checkpoint directory exists - if not os.path.exists(checkpoint_abs): - raise FileNotFoundError( - f"Checkpoint directory not found: {checkpoint_abs}. " - f"Please ensure the checkpoint exists at this path." - ) + # # Verify checkpoint directory exists + # if not os.path.exists(checkpoint_abs): + # raise FileNotFoundError( + # f"Checkpoint directory not found: {checkpoint_abs}. " + # f"Please ensure the checkpoint exists at this path." + # ) - # Try loading with local_files_only first - try: - # Load config and model separately to avoid path validation issues - config = AutoConfig.from_pretrained( - checkpoint_abs, - local_files_only=True, - trust_remote_code=True, - ) + # # Try loading with local_files_only first + # try: + # # Load config and model separately to avoid path validation issues + # config = AutoConfig.from_pretrained( + # checkpoint_abs, + # local_files_only=True, + # trust_remote_code=True, + # ) - # Load model using config to bypass path validation - model = AutoModelForCausalLM.from_pretrained( - checkpoint_abs, - config=config, - local_files_only=True, # Force local file loading - trust_remote_code=True, # Allow loading from local path - ).cuda() - except Exception as e: - # If path validation fails, try loading from config.json directly - config_path = os.path.join(checkpoint_abs, "config.json") - if os.path.exists(config_path): - config = AutoConfig.from_json_file(config_path) - model = AutoModelForCausalLM.from_config(config).cuda() - # Load weights from pytorch_model.bin or model.safetensors - weight_path = os.path.join(checkpoint_abs, "pytorch_model.bin") - if not os.path.exists(weight_path): - weight_path = os.path.join(checkpoint_abs, "model.safetensors") - if os.path.exists(weight_path): - from safetensors.torch import load_file - if weight_path.endswith(".safetensors"): - state_dict = load_file(weight_path) - else: - state_dict = torch.load(weight_path, map_location="cpu") - model.load_state_dict(state_dict) - else: - raise e - + # # Load model using config to bypass path validation + # model = AutoModelForCausalLM.from_pretrained( + # checkpoint_abs, + # config=config, + # local_files_only=True, # Force local file loading + # trust_remote_code=True, # Allow loading from local path + # ).cuda() + # except Exception as e: + # # If path validation fails, try loading from config.json directly + # config_path = os.path.join(checkpoint_abs, "config.json") + # if os.path.exists(config_path): + # config = AutoConfig.from_json_file(config_path) + # model = AutoModelForCausalLM.from_config(config).cuda() + # # Load weights from pytorch_model.bin or model.safetensors + # weight_path = os.path.join(checkpoint_abs, "pytorch_model.bin") + # if not os.path.exists(weight_path): + # weight_path = os.path.join(checkpoint_abs, "model.safetensors") + # if os.path.exists(weight_path): + # from safetensors.torch import load_file + # if weight_path.endswith(".safetensors"): + # state_dict = load_file(weight_path) + # else: + # state_dict = torch.load(weight_path, map_location="cpu") + # model.load_state_dict(state_dict) + # else: + # raise e + def checkpoints_load_func(model, checkpoint): + model = AutoModelForCausalLM.from_pretrained(checkpoint).cuda() model.eval() return replace_conv1d_modules(model) From a5503cfd0e9ca4b401cd49ca097781d0ddb2fcb7 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Fri, 19 Dec 2025 16:00:43 -0600 Subject: [PATCH 21/30] fix model id --- experiments/gpt2_wikitext/readme.md | 1 - experiments/gpt2_wikitext/score_TRAK.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/experiments/gpt2_wikitext/readme.md b/experiments/gpt2_wikitext/readme.md index 27788bc23..bc28d3254 100644 --- a/experiments/gpt2_wikitext/readme.md +++ b/experiments/gpt2_wikitext/readme.md @@ -68,7 +68,6 @@ Then, add the following line: return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) ``` --> -the troubleshooting can be avoided by setting the attn_implementation paramater to 'eager' in from_pretrained function if args.model_name_or_path: model = AutoModelForCausalLM.from_pretrained( diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index c11a7a481..dc994fe2b 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -54,12 +54,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version -from transformers.utils.versions import require_version -try: - from transformers.utils import send_example_telemetry -except ImportError: - send_example_telemetry = None # Not available in newer transformers versions +from transformers.utils import check_min_version, send_example_telemetry +# from transformers.utils.versions import require_version +# try: +# from transformers.utils import send_example_telemetry +# except ImportError: +# send_example_telemetry = None # Not available in newer transformers versions from dattri.benchmark.utils import SubsetSampler from dattri.func.utils import flatten_func, flatten_params @@ -361,10 +361,10 @@ def parse_args(): def main(): args = parse_args() - + send_example_telemetry("run_clm_no_trainer", args) #fix the import error in newer transformers versions - if send_example_telemetry is not None: - send_example_telemetry("run_clm_no_trainer", args) + # if send_example_telemetry is not None: + # send_example_telemetry("run_clm_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers From 41593922d7c71c0ae294c30bc93aedb750977b45 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Fri, 19 Dec 2025 20:45:10 -0600 Subject: [PATCH 22/30] fix transformer's version --- experiments/gpt2_wikitext/score_TRAK.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index dc994fe2b..54f2aa4cd 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -55,7 +55,7 @@ get_scheduler, ) from transformers.utils import check_min_version, send_example_telemetry -# from transformers.utils.versions import require_version +from transformers.utils.versions import require_version # try: # from transformers.utils import send_example_telemetry # except ImportError: From 3c9dd6bd7a05a39165745bcc07d6f4fef9f559bf Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Fri, 19 Dec 2025 23:32:31 -0600 Subject: [PATCH 23/30] fix transformer's huggingface_id --- experiments/gpt2_wikitext/score_TRAK.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index 54f2aa4cd..e36e8678b 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -727,7 +727,8 @@ def loss_tracin(params, batch): raise ValueError( "Invalid method name for TRAK, must be like 'TRAK-5' or 'TRAK-10'." ) - checkpoints = [f"{args.output_dir}/{i}" for i in range(num_checkpoints)] + checkpoint_root_abs = Path(args.output_dir).resolve() + checkpoints = [f"{checkpoint_root_abs}/{i}" for i in range(num_checkpoints)] #fix checkpoint loading error # if len(available_checkpoint_dirs) < requested_checkpoints: # logger.warning( @@ -742,7 +743,8 @@ def loss_tracin(params, batch): elif method in ["TracIn", "Grad-Dot", "Grad-Cos"]: num_checkpoints = 5 - checkpoints = [f"{args.output_dir}/{i}" for i in range(num_checkpoints)] + checkpoint_root_abs = Path(args.output_dir).resolve() + checkpoints = [f"{checkpoint_root_abs}/{i}" for i in range(num_checkpoints)] # requested_checkpoints = min(5, len(available_checkpoint_dirs)) # if requested_checkpoints == 0: # raise FileNotFoundError( From ad29c76e4cdcea134ba05e6090bfc1fb49bea88f Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Fri, 19 Dec 2025 23:47:28 -0600 Subject: [PATCH 24/30] fix transformer's huggingface_id --- experiments/gpt2_wikitext/score_TRAK.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index e36e8678b..9ad1b3b47 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -728,7 +728,7 @@ def loss_tracin(params, batch): "Invalid method name for TRAK, must be like 'TRAK-5' or 'TRAK-10'." ) checkpoint_root_abs = Path(args.output_dir).resolve() - checkpoints = [f"{checkpoint_root_abs}/{i}" for i in range(num_checkpoints)] + checkpoints = [f"{checkpoint_root_abs}/{i}" for i in range(-1, num_checkpoints)] #fix checkpoint loading error # if len(available_checkpoint_dirs) < requested_checkpoints: # logger.warning( @@ -744,7 +744,7 @@ def loss_tracin(params, batch): elif method in ["TracIn", "Grad-Dot", "Grad-Cos"]: num_checkpoints = 5 checkpoint_root_abs = Path(args.output_dir).resolve() - checkpoints = [f"{checkpoint_root_abs}/{i}" for i in range(num_checkpoints)] + checkpoints = [f"{checkpoint_root_abs}/{i}" for i in range(-1,num_checkpoints)] # requested_checkpoints = min(5, len(available_checkpoint_dirs)) # if requested_checkpoints == 0: # raise FileNotFoundError( From 28ee7a5f036842a888cd01eacbf3257f5e97813a Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sat, 20 Dec 2025 16:15:57 -0600 Subject: [PATCH 25/30] fix checkpoint id error --- experiments/gpt2_wikitext/score_TRAK.py | 118 +++++++----------------- 1 file changed, 32 insertions(+), 86 deletions(-) diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index 9ad1b3b47..a832d5c65 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -706,17 +706,13 @@ def loss_tracin(params, batch): method = args.method - #fix checkpoint loading error - # checkpoint_root = Path(args.output_dir) - # available_checkpoint_dirs = sorted( - # [p for p in checkpoint_root.iterdir() if p.is_dir() and p.name.isdigit()], - # key=lambda p: int(p.name), - # ) - - # if not available_checkpoint_dirs: - # raise FileNotFoundError( - # f"No numeric checkpoint directories found in {checkpoint_root}." - # ) + #this gets the existing checkpoints in the output directory + checkpoint_root_abs = Path(args.output_dir).resolve() + existing_ckpt_dirs = [p for p in checkpoint_root_abs.iterdir() if p.is_dir()] + existing_names = {p.name for p in existing_ckpt_dirs} + has_minus1 = "-1" in existing_names + numeric_sorted = sorted([int(n) for n in existing_names if n.isdigit()]) + numeric_count = len(numeric_sorted) if method.startswith("TRAK-"): parts = method.split("-") @@ -727,36 +723,31 @@ def loss_tracin(params, batch): raise ValueError( "Invalid method name for TRAK, must be like 'TRAK-5' or 'TRAK-10'." ) - checkpoint_root_abs = Path(args.output_dir).resolve() - checkpoints = [f"{checkpoint_root_abs}/{i}" for i in range(-1, num_checkpoints)] - #fix checkpoint loading error - # if len(available_checkpoint_dirs) < requested_checkpoints: - # logger.warning( - # "Requested %s checkpoints but only found %s in %s. Using available checkpoints instead.", - # requested_checkpoints, - # len(available_checkpoint_dirs), - # checkpoint_root, - # ) - # requested_checkpoints = len(available_checkpoint_dirs) - - # checkpoints = [str(p) for p in available_checkpoint_dirs[:requested_checkpoints]] - + #prevent checkpoint id error when only -1 is present + if has_minus1 and numeric_count == 0: + selected_indices = [-1] + else: + if numeric_count == 0 and not has_minus1: + raise FileNotFoundError( f"No numeric checkpoint directories found in {checkpoint_root_abs}." ) + if numeric_count > 0 and has_minus1: + selected_indices = list(range(-1, min(num_checkpoints, numeric_count))) + if numeric_count > 0 and not has_minus1: + selected_indices = list(range(min(num_checkpoints, numeric_count))) + checkpoints = [str(checkpoint_root_abs / str(i)) for i in selected_indices] + elif method in ["TracIn", "Grad-Dot", "Grad-Cos"]: num_checkpoints = 5 - checkpoint_root_abs = Path(args.output_dir).resolve() - checkpoints = [f"{checkpoint_root_abs}/{i}" for i in range(-1,num_checkpoints)] - # requested_checkpoints = min(5, len(available_checkpoint_dirs)) - # if requested_checkpoints == 0: - # raise FileNotFoundError( - # f"No numeric checkpoint directories found in {checkpoint_root}." - # ) - # if requested_checkpoints < 5: - # logger.warning( - # "Only %s checkpoint(s) available; using these for %s.", - # requested_checkpoints, - # method, - # ) - # checkpoints = [str(p) for p in available_checkpoint_dirs[:requested_checkpoints]] + if has_minus1 and numeric_count == 0: + selected_indices = [-1] + else: #prevent checkpoint id error when only -1 is present + if numeric_count == 0 and not has_minus1: + raise FileNotFoundError( f"No numeric checkpoint directories found in {checkpoint_root_abs}.") + if numeric_count > 0 and has_minus1: + selected_indices = list(range(-1, min(num_checkpoints, numeric_count))) + if numeric_count > 0 and not has_minus1: + selected_indices = list(range(min(num_checkpoints, numeric_count))) + checkpoints = [str(checkpoint_root_abs / str(i)) for i in selected_indices] + else: raise ValueError( f"Unknown --method {method}. Try 'TRAK-5', 'TracIn', 'Grad-Dot', or 'Grad-Cos'." @@ -765,53 +756,6 @@ def loss_tracin(params, batch): #modified for huggingface hub validation error def checkpoints_load_func(model, checkpoint_path): new_model = AutoModelForCausalLM.from_pretrained(checkpoint_path).cuda() - # Convert to absolute path and verify it exists - # import os - # checkpoint_abs = os.path.abspath(checkpoint_path) - - # # Verify checkpoint directory exists - # if not os.path.exists(checkpoint_abs): - # raise FileNotFoundError( - # f"Checkpoint directory not found: {checkpoint_abs}. " - # f"Please ensure the checkpoint exists at this path." - # ) - - # # Try loading with local_files_only first - # try: - # # Load config and model separately to avoid path validation issues - # config = AutoConfig.from_pretrained( - # checkpoint_abs, - # local_files_only=True, - # trust_remote_code=True, - # ) - - # # Load model using config to bypass path validation - # new_model = AutoModelForCausalLM.from_pretrained( - # checkpoint_abs, - # config=config, - # local_files_only=True, # Force local file loading - # trust_remote_code=True, # Allow loading from local path - # ).cuda() - # except Exception as e: - # # If path validation fails, try loading from config.json directly - # config_path = os.path.join(checkpoint_abs, "config.json") - # if os.path.exists(config_path): - # config = AutoConfig.from_json_file(config_path) - # new_model = AutoModelForCausalLM.from_config(config).cuda() - # # Load weights from pytorch_model.bin or model.safetensors - # weight_path = os.path.join(checkpoint_abs, "pytorch_model.bin") - # if not os.path.exists(weight_path): - # weight_path = os.path.join(checkpoint_abs, "model.safetensors") - # if os.path.exists(weight_path): - # from safetensors.torch import load_file - # if weight_path.endswith(".safetensors"): - # state_dict = load_file(weight_path) - # else: - # state_dict = torch.load(weight_path, map_location="cpu") - # new_model.load_state_dict(state_dict) - # else: - # raise e - new_model.eval() return new_model @@ -852,6 +796,8 @@ def checkpoints_load_func(model, checkpoint_path): if method == "Grad-Cos": normalized_grad = True + #get the number of checkpoints + num_checkpoints = len(checkpoints) weight_list = torch.ones(num_checkpoints) * 1e-3 # fix memory issues From 864be74d7fbddd27824adb1056080c7fcedfc078 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sat, 20 Dec 2025 16:40:32 -0600 Subject: [PATCH 26/30] fix checkpoint id error --- experiments/gpt2_wikitext/score_TRAK_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/gpt2_wikitext/score_TRAK_slurm.sh b/experiments/gpt2_wikitext/score_TRAK_slurm.sh index 9df6ffd90..14d9f3402 100644 --- a/experiments/gpt2_wikitext/score_TRAK_slurm.sh +++ b/experiments/gpt2_wikitext/score_TRAK_slurm.sh @@ -12,7 +12,7 @@ echo "Running TRAK scoring for seed: ${SLURM_ARRAY_TASK_ID}" # PyTorch memory management export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 -export CUDA_LAUNCH_BLOCKING=0 +export CUDA_LAUNCH_BLOCKING=2 # Create logs directory if it doesn't exist mkdir -p logs From c3409dc16ac8d32f8027b6cd2966d2349ca1be64 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sat, 20 Dec 2025 16:59:38 -0600 Subject: [PATCH 27/30] change to GPU 2 --- experiments/gpt2_wikitext/score_TRAK_slurm.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/experiments/gpt2_wikitext/score_TRAK_slurm.sh b/experiments/gpt2_wikitext/score_TRAK_slurm.sh index 14d9f3402..696d0b388 100644 --- a/experiments/gpt2_wikitext/score_TRAK_slurm.sh +++ b/experiments/gpt2_wikitext/score_TRAK_slurm.sh @@ -11,8 +11,10 @@ echo "job is starting on `hostname`" echo "Running TRAK scoring for seed: ${SLURM_ARRAY_TASK_ID}" # PyTorch memory management +export CUDA_VISIBLE_DEVICES=2 export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 -export CUDA_LAUNCH_BLOCKING=2 +export TOKENIZERS_PARALLELISM=false +#export CUDA_LAUNCH_BLOCKING=2 # Create logs directory if it doesn't exist mkdir -p logs From 0f06e7eba4a6e3a160596717fd85d5c7ae9a1910 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Sat, 20 Dec 2025 22:48:14 -0600 Subject: [PATCH 28/30] score_logra fix --- experiments/gpt2_wikitext/score_logra.py | 10 +++++----- experiments/gpt2_wikitext/score_logra_slurm.sh | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/experiments/gpt2_wikitext/score_logra.py b/experiments/gpt2_wikitext/score_logra.py index fa088c729..27b2c9518 100644 --- a/experiments/gpt2_wikitext/score_logra.py +++ b/experiments/gpt2_wikitext/score_logra.py @@ -54,12 +54,12 @@ default_data_collator, get_scheduler, ) -from transformers.utils import check_min_version +from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -try: - from transformers.utils import send_example_telemetry -except ImportError: - send_example_telemetry = None # Not available in newer transformers versions +# try: +# from transformers.utils import send_example_telemetry +# except ImportError: +# send_example_telemetry = None # Not available in newer transformers versions from dattri.benchmark.utils import SubsetSampler diff --git a/experiments/gpt2_wikitext/score_logra_slurm.sh b/experiments/gpt2_wikitext/score_logra_slurm.sh index 3ebdd5e99..689b7f4b8 100644 --- a/experiments/gpt2_wikitext/score_logra_slurm.sh +++ b/experiments/gpt2_wikitext/score_logra_slurm.sh @@ -14,8 +14,9 @@ echo "Running LoGra scoring for seed: ${SLURM_ARRAY_TASK_ID}" #export CUDA_VISIBLE_DEVICES=1 # PyTorch memory management +export CUDA_VISIBLE_DEVICES=2 export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 -export CUDA_LAUNCH_BLOCKING=0 +export TOKENIZERS_PARALLELISM=false # Create logs directory if it doesn't exist mkdir -p logs From aa51bd09080463456f55e4c6ac9dd254d059d537 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Wed, 24 Dec 2025 22:15:04 -0600 Subject: [PATCH 29/30] fix: telemetry call guard; sync with upstream; remove slurm scripts and unused ssh_config_template.txt --- experiments/gpt2_wikitext/score_TRAK.py | 4 -- experiments/gpt2_wikitext/score_TRAK_slurm.sh | 40 -------------- experiments/gpt2_wikitext/score_logra.py | 53 ------------------- .../gpt2_wikitext/score_logra_slurm.sh | 36 ------------- ssh_config_template.txt | 16 ------ 5 files changed, 149 deletions(-) delete mode 100644 experiments/gpt2_wikitext/score_TRAK_slurm.sh delete mode 100644 experiments/gpt2_wikitext/score_logra_slurm.sh delete mode 100644 ssh_config_template.txt diff --git a/experiments/gpt2_wikitext/score_TRAK.py b/experiments/gpt2_wikitext/score_TRAK.py index a832d5c65..e41749341 100644 --- a/experiments/gpt2_wikitext/score_TRAK.py +++ b/experiments/gpt2_wikitext/score_TRAK.py @@ -56,10 +56,6 @@ ) from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -# try: -# from transformers.utils import send_example_telemetry -# except ImportError: -# send_example_telemetry = None # Not available in newer transformers versions from dattri.benchmark.utils import SubsetSampler from dattri.func.utils import flatten_func, flatten_params diff --git a/experiments/gpt2_wikitext/score_TRAK_slurm.sh b/experiments/gpt2_wikitext/score_TRAK_slurm.sh deleted file mode 100644 index 696d0b388..000000000 --- a/experiments/gpt2_wikitext/score_TRAK_slurm.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -#SBATCH --gres=gpu:A40:1 -#SBATCH --array=0-49 # 50 models with seeds 0-49 -#SBATCH --job-name=gpt2_wikitext_score_TRAK -#SBATCH --output=logs/score_TRAK_%A_%a.out -#SBATCH --error=logs/score_TRAK_%A_%a.err -#SBATCH --mem=64G -#SBATCH --cpus-per-task=8 - -echo "job is starting on `hostname`" -echo "Running TRAK scoring for seed: ${SLURM_ARRAY_TASK_ID}" - -# PyTorch memory management -export CUDA_VISIBLE_DEVICES=2 -export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 -export TOKENIZERS_PARALLELISM=false -#export CUDA_LAUNCH_BLOCKING=2 - -# Create logs directory if it doesn't exist -mkdir -p logs - -SEED=${SLURM_ARRAY_TASK_ID:-0} - -# Change to results directory to save score files there -cd results - -python ../score_TRAK.py \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --model_name_or_path openai-community/gpt2 \ - --output_dir ../checkpoints \ - --block_size 512 \ - --method TRAK-5 \ - --seed ${SEED} \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --proj_dim 256 \ - --proj_max_batch_size 8 \ - --proj_type random_mask - diff --git a/experiments/gpt2_wikitext/score_logra.py b/experiments/gpt2_wikitext/score_logra.py index 27b2c9518..271983074 100644 --- a/experiments/gpt2_wikitext/score_logra.py +++ b/experiments/gpt2_wikitext/score_logra.py @@ -56,11 +56,6 @@ ) from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -# try: -# from transformers.utils import send_example_telemetry -# except ImportError: -# send_example_telemetry = None # Not available in newer transformers versions - from dattri.benchmark.utils import SubsetSampler @@ -598,54 +593,6 @@ def group_texts(examples): model_id = -1 # Use checkpoint 0 (final checkpoint) checkpoint = f"{args.output_dir}/{model_id}" - #modified for huggingface hub validation error - # def checkpoints_load_func(model, checkpoint_path): - # # Convert to absolute path and verify it exists - # import os - # checkpoint_abs = os.path.abspath(checkpoint_path) - - # # Verify checkpoint directory exists - # if not os.path.exists(checkpoint_abs): - # raise FileNotFoundError( - # f"Checkpoint directory not found: {checkpoint_abs}. " - # f"Please ensure the checkpoint exists at this path." - # ) - - # # Try loading with local_files_only first - # try: - # # Load config and model separately to avoid path validation issues - # config = AutoConfig.from_pretrained( - # checkpoint_abs, - # local_files_only=True, - # trust_remote_code=True, - # ) - - # # Load model using config to bypass path validation - # model = AutoModelForCausalLM.from_pretrained( - # checkpoint_abs, - # config=config, - # local_files_only=True, # Force local file loading - # trust_remote_code=True, # Allow loading from local path - # ).cuda() - # except Exception as e: - # # If path validation fails, try loading from config.json directly - # config_path = os.path.join(checkpoint_abs, "config.json") - # if os.path.exists(config_path): - # config = AutoConfig.from_json_file(config_path) - # model = AutoModelForCausalLM.from_config(config).cuda() - # # Load weights from pytorch_model.bin or model.safetensors - # weight_path = os.path.join(checkpoint_abs, "pytorch_model.bin") - # if not os.path.exists(weight_path): - # weight_path = os.path.join(checkpoint_abs, "model.safetensors") - # if os.path.exists(weight_path): - # from safetensors.torch import load_file - # if weight_path.endswith(".safetensors"): - # state_dict = load_file(weight_path) - # else: - # state_dict = torch.load(weight_path, map_location="cpu") - # model.load_state_dict(state_dict) - # else: - # raise e def checkpoints_load_func(model, checkpoint): model = AutoModelForCausalLM.from_pretrained(checkpoint).cuda() model.eval() diff --git a/experiments/gpt2_wikitext/score_logra_slurm.sh b/experiments/gpt2_wikitext/score_logra_slurm.sh deleted file mode 100644 index 689b7f4b8..000000000 --- a/experiments/gpt2_wikitext/score_logra_slurm.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -#SBATCH --gres=gpu:A40:1 -#SBATCH --array=0-49 # 50 models with seeds 0-49 -#SBATCH --job-name=gpt2_wikitext_score_logra -#SBATCH --output=logs/score_logra_%A_%a.out -#SBATCH --error=logs/score_logra_%A_%a.err -#SBATCH --mem=64G -#SBATCH --cpus-per-task=8 - -echo "job is starting on `hostname`" -echo "Running LoGra scoring for seed: ${SLURM_ARRAY_TASK_ID}" - -# Force use GPU 1 (index 1) -#export CUDA_VISIBLE_DEVICES=1 - -# PyTorch memory management -export CUDA_VISIBLE_DEVICES=2 -export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512 -export TOKENIZERS_PARALLELISM=false - -# Create logs directory if it doesn't exist -mkdir -p logs - -SEED=${SLURM_ARRAY_TASK_ID:-0} - -# Change to results directory to save score files there -cd results - -python ../score_logra.py \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --model_name_or_path openai-community/gpt2 \ - --output_dir ../checkpoints \ - --block_size 512 \ - --seed ${SEED} - diff --git a/ssh_config_template.txt b/ssh_config_template.txt deleted file mode 100644 index 0208585a0..000000000 --- a/ssh_config_template.txt +++ /dev/null @@ -1,16 +0,0 @@ -# SSH Configuration Template for Markov Server -# Copy this content to ~/.ssh/config and replace with your actual UIUC NetID - -Host markov.ischool.illinois.edu - HostName markov.ischool.illinois.edu - Port 22 - User - ServerAliveInterval 60 - ServerAliveCountMax 3 - TCPKeepAlive yes - -# Instructions: -# 1. Copy this content to ~/.ssh/config -# 2. Replace with your actual UIUC NetID -# 3. Set proper permissions: chmod 600 ~/.ssh/config -# 4. Test connection: ssh markov.ischool.illinois.edu From dcad19a4a338de7ef77cfc6799f8b5beaf1c4d96 Mon Sep 17 00:00:00 2001 From: JINGYUAN NI Date: Thu, 25 Dec 2025 23:22:35 -0800 Subject: [PATCH 30/30] update score_logra --- experiments/gpt2_wikitext/score_logra.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/experiments/gpt2_wikitext/score_logra.py b/experiments/gpt2_wikitext/score_logra.py index 271983074..0c0fbc864 100644 --- a/experiments/gpt2_wikitext/score_logra.py +++ b/experiments/gpt2_wikitext/score_logra.py @@ -321,9 +321,7 @@ def parse_args(): def main(): args = parse_args() - #fix the import error in newer transformers versions - if send_example_telemetry is not None: - send_example_telemetry("run_clm_no_trainer", args) + send_example_telemetry("run_clm_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers