From 0946333cb4a4b4c2bb0329e41c5c3931b8eb2489 Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Fri, 21 Mar 2025 12:14:50 -0500
Subject: [PATCH 1/8] Add local data support

---
 data.json                             |  3 ---
 models.json                           | 33 ++++-----------------------
 scripts/vllm/vllm_benchmark_report.sh |  5 ++++
 3 files changed, 10 insertions(+), 31 deletions(-)
 delete mode 100644 data.json

diff --git a/data.json b/data.json
deleted file mode 100644
index 6abaa86..0000000
--- a/data.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "huggingface": {}
-}
diff --git a/models.json b/models.json
index fa50cbd..775b560 100644
--- a/models.json
+++ b/models.json
@@ -33,7 +33,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -51,7 +50,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -69,7 +67,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -87,7 +84,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -105,7 +101,7 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
+        "data": "llama-2-7b-chat-hf",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -123,7 +119,7 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
+        "data": "llama-2-70b-chat-hf",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -141,7 +137,7 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
+        "data": "Mixtral-8x7B-Instruct-v0.1",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -159,7 +155,7 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
+        "data": "Mixtral-8x22B-Instruct-v0.1",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -177,7 +173,7 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
+        "data": "mistral-7b-v0.1",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -195,7 +191,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -213,7 +208,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -231,7 +225,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -249,7 +242,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -267,7 +259,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -285,7 +276,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -303,7 +293,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -321,7 +310,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -339,7 +327,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -357,7 +344,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -375,7 +361,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -393,7 +378,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -411,7 +395,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -429,7 +412,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -447,7 +429,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -465,7 +446,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -483,7 +463,6 @@
         "url": "",
         "dockerfile": "docker/pytorch_train",
         "scripts": "scripts/pytorch_train/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -500,7 +479,6 @@
         "url": "",
         "dockerfile": "docker/pytorch_train",
         "scripts": "scripts/pytorch_train/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -517,7 +495,6 @@
         "url": "",
         "dockerfile": "docker/pytorch_train",
         "scripts": "scripts/pytorch_train/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
diff --git a/scripts/vllm/vllm_benchmark_report.sh b/scripts/vllm/vllm_benchmark_report.sh
index c3b8b21..e4c2dd6 100755
--- a/scripts/vllm/vllm_benchmark_report.sh
+++ b/scripts/vllm/vllm_benchmark_report.sh
@@ -50,6 +50,11 @@ model_org_name=(${model//// })
 model_name=${model_org_name[1]}
 tp=$numgpu
 
+# Use local data if present
+if [ -n $MAD_DATAHOME ]; then
+    model=$MAD_DATAHOME
+fi
+
 # perf configuration
 export VLLM_USE_TRITON_FLASH_ATTN=0
 export NCCL_MIN_NCHANNELS=112

From 9b1aa7724280076a8e2a22741ac9c58fd9484379 Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Fri, 21 Mar 2025 16:08:25 -0500
Subject: [PATCH 2/8] Remove data field

---
 models.json | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/models.json b/models.json
index 775b560..1e9b501 100644
--- a/models.json
+++ b/models.json
@@ -101,7 +101,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "llama-2-7b-chat-hf",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -119,7 +118,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "llama-2-70b-chat-hf",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -137,7 +135,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "Mixtral-8x7B-Instruct-v0.1",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -155,7 +152,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "Mixtral-8x22B-Instruct-v0.1",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -173,7 +169,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "mistral-7b-v0.1",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",

From 1922826f36b1b82eb178f08bf8d61dc16ca685ea Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Mon, 7 Apr 2025 12:40:58 -0500
Subject: [PATCH 3/8] Add echo

---
 scripts/vllm/vllm_benchmark_report.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/vllm/vllm_benchmark_report.sh b/scripts/vllm/vllm_benchmark_report.sh
index e4c2dd6..f18137a 100755
--- a/scripts/vllm/vllm_benchmark_report.sh
+++ b/scripts/vllm/vllm_benchmark_report.sh
@@ -52,6 +52,7 @@ tp=$numgpu
 
 # Use local data if present
 if [ -n $MAD_DATAHOME ]; then
+    echo "Using data from MAD_DATAHOME"
     model=$MAD_DATAHOME
 fi
 

From ab7de7b56585f52a319a687cadb35594034f85f7 Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Mon, 7 Apr 2025 12:42:51 -0500
Subject: [PATCH 4/8] Add echo

---
 scripts/vllm/vllm_benchmark_report.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/vllm/vllm_benchmark_report.sh b/scripts/vllm/vllm_benchmark_report.sh
index f18137a..1f39b2d 100755
--- a/scripts/vllm/vllm_benchmark_report.sh
+++ b/scripts/vllm/vllm_benchmark_report.sh
@@ -52,7 +52,7 @@ tp=$numgpu
 
 # Use local data if present
 if [ -n $MAD_DATAHOME ]; then
-    echo "Using data from MAD_DATAHOME"
+    echo "Using data from $MAD_DATAHOME"
     model=$MAD_DATAHOME
 fi
 

From 9c1d328f75cf2d887112a97b4feb2da7831eaa2b Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Mon, 7 Apr 2025 12:59:11 -0500
Subject: [PATCH 5/8] Add echo

---
 scripts/vllm/vllm_benchmark_report.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/vllm/vllm_benchmark_report.sh b/scripts/vllm/vllm_benchmark_report.sh
index 1f39b2d..bd5f5d2 100755
--- a/scripts/vllm/vllm_benchmark_report.sh
+++ b/scripts/vllm/vllm_benchmark_report.sh
@@ -51,7 +51,7 @@ model_name=${model_org_name[1]}
 tp=$numgpu
 
 # Use local data if present
-if [ -n $MAD_DATAHOME ]; then
+if [ -n "$MAD_DATAHOME" ]; then
     echo "Using data from $MAD_DATAHOME"
     model=$MAD_DATAHOME
 fi

From 99cf154c24041dbc3aca22712336994889b4afad Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Mon, 28 Apr 2025 10:43:05 -0500
Subject: [PATCH 6/8] Add llama2 7b and 70b nas for testing

---
 models.json | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/models.json b/models.json
index c9d75ac..89d7c4e 100644
--- a/models.json
+++ b/models.json
@@ -99,6 +99,7 @@
     {
         "name": "pyt_vllm_llama-2-7b",
         "url": "",
+        "data": "llama-2-7b-chat-hf",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",
@@ -116,6 +117,7 @@
     {
         "name": "pyt_vllm_llama-2-70b",
         "url": "",
+        "data": "llama-2-70b-chat-hf",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",

From b861623eeeb15f0561271226ade9fecbab29fa03 Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Thu, 8 May 2025 16:15:16 -0500
Subject: [PATCH 7/8] Update Llama and Mistral data

---
 models.json | 47 +++++++++++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/models.json b/models.json
index ba686a5..84409e3 100644
--- a/models.json
+++ b/models.json
@@ -29,112 +29,117 @@
         "args": ""
     },
     {
-        "name": "pyt_vllm_llama-3.1-8b",
+        "name": "pyt_vllm_llama-2-7b",
         "url": "",
+        "data": "meta-llama/Llama-2-7b-chat-hf",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-3.1-8B-Instruct.csv",
+        "multiple_results": "perf_Llama-2-7b-chat-hf.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-3.1-8B-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-2-7b-chat-hf --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
     },
     {
-        "name": "pyt_vllm_llama-3.1-70b",
+        "name": "pyt_vllm_llama-2-70b",
         "url": "",
+        "data": "meta-llama/Llama-2-70b-chat-hf",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-3.1-70B-Instruct.csv",
+        "multiple_results": "perf_Llama-2-70b-chat-hf.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-3.1-70B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-2-70b-chat-hf --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
     },
     {
-        "name": "pyt_vllm_llama-3.1-405b",
+        "name": "pyt_vllm_llama-3.1-8b",
         "url": "",
+        "data": "meta-llama/Llama-3.1-8B-Instruct",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-3.1-405B-Instruct.csv",
+        "multiple_results": "perf_Llama-3.1-8B-Instruct.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-3.1-405B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-3.1-8B-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
     },
     {
-        "name": "pyt_vllm_llama-3.2-11b-vision-instruct",
+        "name": "pyt_vllm_llama-3.1-70b",
         "url": "",
+        "data": "meta-llama/Llama-3.1-70B-Instruct",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-3.2-11B-Vision-Instruct.csv",
+        "multiple_results": "perf_Llama-3.1-70B-Instruct.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-3.2-11B-Vision-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-3.1-70B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
     },
     {
-        "name": "pyt_vllm_llama-2-7b",
+        "name": "pyt_vllm_llama-3.1-405b",
         "url": "",
-        "data": "llama-2-7b-chat-hf",
+        "data": "meta-llama/Llama-3.1-405B-Instruct",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-2-7b-chat-hf.csv",
+        "multiple_results": "perf_Llama-3.1-405B-Instruct.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-2-7b-chat-hf --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-3.1-405B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
     },
     {
-        "name": "pyt_vllm_llama-2-70b",
+        "name": "pyt_vllm_llama-3.2-11b-vision-instruct",
         "url": "",
-        "data": "llama-2-70b-chat-hf",
+        "data": "meta-llama/Llama-3.2-11B-Vision-Instruct",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-2-70b-chat-hf.csv",
+        "multiple_results": "perf_Llama-3.2-11B-Vision-Instruct.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-2-70b-chat-hf --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-3.2-11B-Vision-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
     },
     {
         "name": "pyt_vllm_mixtral-8x7b",
         "url": "",
+        "data": "mistralai/Mixtral-8x7B-Instruct-v0.1",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",
@@ -152,6 +157,7 @@
     {
         "name": "pyt_vllm_mixtral-8x22b",
         "url": "",
+        "data": "mistralai/Mixtral-8x22B-Instruct-v0.1",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",
@@ -169,6 +175,7 @@
     {
         "name": "pyt_vllm_mistral-7b",
         "url": "",
+        "data": "mistralai/Mistral-7B-Instruct-v0.1",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",

From 7c9d80fa498105833a142067dbb134250055a9e0 Mon Sep 17 00:00:00 2001
From: Rohan138
Date: Fri, 9 May 2025 01:08:57 -0500
Subject: [PATCH 8/8] Add entries for llama FP8

---
 models.json | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/models.json b/models.json
index 84409e3..7b73ddd 100644
--- a/models.json
+++ b/models.json
@@ -295,6 +295,7 @@
     {
         "name": "pyt_vllm_llama-3.1-8b_fp8",
         "url": "",
+        "data": "amd/Llama-3.1-8B-Instruct-FP8-KV",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",
@@ -312,6 +313,7 @@
     {
         "name": "pyt_vllm_llama-3.1-70b_fp8",
         "url": "",
+        "data": "amd/Llama-3.1-70B-Instruct-FP8-KV",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",
@@ -329,6 +331,7 @@
     {
         "name": "pyt_vllm_llama-3.1-405b_fp8",
         "url": "",
+        "data": "amd/Llama-3.1-405B-Instruct-FP8-KV",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
         "n_gpus": "-1",