pytorch · fadara01 · Dec 16, 2025
diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py
@@ -19,7 +19,7 @@
         "linux.rocm.gpu.gfx942.1",
         "linux.24xl.spr-metal",
         "linux.24xl.gnr",
-        # "linux.arm64.m7g.4xlarge",  # TODO (huydhn): This is not working yet
+        "linux.arm64.m8g.4xlarge",
         "linux.dgx.b200",
         "linux.hpu.gaudi3.8",
     ],
@@ -60,7 +60,7 @@
     "linux.rocm.gpu.gfx942.8": "rocm",
     "linux.24xl.spr-metal": "cpu",
     "linux.24xl.gnr": "cpu",
-    # "linux.arm64.m7g.4xlarge": "arm64-cpu",  # TODO (huydhn): This is not working yet
+    "linux.arm64.m8g.4xlarge": "arm64-cpu",
     "linux.hpu.gaudi3.8": "hpu",
 }
 

diff --git a/.github/scripts/test_generate_vllm_benchmark_matrix.py b/.github/scripts/test_generate_vllm_benchmark_matrix.py
@@ -22,7 +22,7 @@ def test_generate_benchmark_matrix():
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -209,7 +209,7 @@ def test_generate_benchmark_matrix():
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -247,7 +247,7 @@ def test_generate_benchmark_matrix():
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -286,7 +286,7 @@ def test_generate_benchmark_matrix():
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -321,7 +321,7 @@ def test_generate_benchmark_matrix():
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -409,7 +409,7 @@ def test_generate_benchmark_matrix():
 
     # Select multiple runners
     models = []
-    runners = ["h100", "spr", "m7g"]
+    runners = ["h100", "spr", "m8g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -419,7 +419,7 @@ def test_generate_benchmark_matrix():
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {
@@ -624,7 +624,7 @@ def test_generate_benchmark_matrix():
         "meta-llama/meta-llama-3.1-8b-instruct",
         "mistralai/mixtral-8x7b-instruct-v0.1",
     ]
-    runners = ["rocm", "spr", "m7g"]
+    runners = ["rocm", "spr", "m8g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -634,7 +634,7 @@ def test_generate_benchmark_matrix():
 {
   "include": [
     {
-      "runner": "linux.arm64.m7g.4xlarge",
+      "runner": "linux.arm64.m8g.4xlarge",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
     },
     {

diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml
@@ -25,10 +25,7 @@ on:
           A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
         required: true
         type: string
-        # TODO (huydhn): Remove aarch64 CPU benchmark running on m7g until the change
-        # from https://github.com/vllm-project/vllm/pull/26494#issuecomment-3537415441
-        # is resolved and merged
-        default: h100,rocm,spr,gnr,b200,gaudi3
+        default: h100,rocm,spr,gnr,b200,m8g,gaudi3
   pull_request:
     paths:
       - .github/workflows/vllm-benchmark.yml
@@ -306,13 +303,11 @@ jobs:
         run: |
           set -eux
 
-          ON_ARM64_CPU=0
           ON_CPU=0
 
-          case "$DEVICE_NAME" in
-            cpu)       ON_CPU=1 ;;
-            arm64-cpu) ON_ARM64_CPU=1 ;;
-          esac
+          if [[ "${DEVICE_NAME:-}" == "cpu" || "$(uname -m)" == "aarch64" || "$(uname -m)" == "arm64" ]]; then
+            ON_CPU=1  # keep the DEVICE_NAME=cpu path the removed case statement handled; arm64 hosts are matched by architecture
+          fi
 
           container_name=$(docker run \
             ${GPU_FLAG:-} \
@@ -325,7 +320,6 @@ jobs:
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
             -e ON_CPU="${ON_CPU}" \
-            -e ON_ARM64_CPU="${ON_ARM64_CPU}" \
             --ipc=host \
             --tty \
             --detach \

diff --git a/vllm-benchmarks/benchmarks/arm64-cpu/serving-tests-arm64-cpu.json b/vllm-benchmarks/benchmarks/arm64-cpu/serving-tests-arm64-cpu.json
@@ -14,7 +14,6 @@
             "device": "cpu",
             "dtype": "bfloat16",
             "distributed_executor_backend": "mp",
-            "block_size": 16,
             "trust_remote_code": "",
             "disable_log_stats": "",
             "disable_log_requests": "",
@@ -43,7 +42,6 @@
             "device": "cpu",
             "dtype": "bfloat16",
             "distributed_executor_backend": "mp",
-            "block_size": 16,
             "trust_remote_code": "",
             "disable_log_stats": "",
             "disable_log_requests": "",
@@ -72,7 +70,6 @@
             "device": "cpu",
             "dtype": "bfloat16",
             "distributed_executor_backend": "mp",
-            "block_size": 16,
             "trust_remote_code": "",
             "disable_log_stats": "",
             "disable_log_requests": "",
@@ -101,7 +98,6 @@
             "device": "cpu",
             "dtype": "bfloat16",
             "distributed_executor_backend": "mp",
-            "block_size": 16,
             "trust_remote_code": "",
             "enable_chunked_prefill": "",
             "disable_log_stats": "",