zhenwei-intel · zhenwei-intel · Sep 29, 2025 · Sep 29, 2025 · Sep 29, 2025 · Sep 29, 2025
diff --git a/.buildkite/cases/comprehensive-cases.txt b/.buildkite/cases/comprehensive-cases.txt
@@ -3,3 +3,6 @@ local_disk.yaml
 local_cpu_mla.yaml
 pd.yaml
 multi_device.yaml
+async.yaml
+p2p.yaml
+layerwise.yaml
diff --git a/.buildkite/configs/async.yaml b/.buildkite/configs/async.yaml
@@ -0,0 +1,28 @@
+workload:
+  type: long_doc_qa  # the harness deletes this key and adds "model" before building the benchmark workload
+  max-inflight-requests: 20
+  sleep-time-after-warmup: 20
+  expected-latency-gain: 1.5
+  num-documents: 20
+  repeat-count: 1
+  hit-miss-ratio: "2:2"  # quoted: a YAML 1.1 loader reads bare 2:2 as the base-60 integer 122
+
+docker:
+  env:
+    - 'LMCACHE_CHUNK_SIZE=256'
+    - 'LMCACHE_LOCAL_CPU=False'
+    - 'LMCACHE_MAX_LOCAL_CPU_SIZE=70'
+    - 'LMCACHE_MAX_LOCAL_DISK_SIZE=70'
+    - 'LMCACHE_LOCAL_DISK="file:///local/end-to-end-tests/local/"'
+    - 'LMCACHE_ENABLE_ASYNC_LOADING=True'
+    - 'LMCACHE_EXTRA_CONFIG={"lookup_backoff_time": 0.01, "use_odirect": true}'  # JSON boolean is lowercase; NOTE(review): assumes this value is JSON-decoded - confirm
+    - 'LMCACHE_SAVE_UNFULL_CHUNK=False'
+
+vllm:
+  model: 'meta-llama/Llama-3.1-8B-Instruct'
+  args:
+    - '--load-format'
+    - 'dummy'
+    - '--no-enable-prefix-caching'
+    - '--kv-transfer-config'
+    - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'  # JSON value of the preceding flag
diff --git a/.buildkite/configs/layerwise.yaml b/.buildkite/configs/layerwise.yaml
@@ -0,0 +1,22 @@
+workload:
+  type: long_doc_qa  # the harness deletes this key and adds "model" before building the benchmark workload
+  max-inflight-requests: 20
+  expected-latency-gain: 3
+
+docker:
+  env:
+    - 'LMCACHE_CHUNK_SIZE=256'
+    - 'LMCACHE_LOCAL_CPU=True'
+    - 'LMCACHE_MAX_LOCAL_CPU_SIZE=5'
+    - 'LMCACHE_USE_LAYERWISE=true'
+
+vllm:
+  model: 'meta-llama/Llama-3.2-1B-Instruct'
+  args:
+    - '--load-format'
+    - 'dummy'
+    - '--no-enable-prefix-caching'
+    - '--kv-transfer-config'
+    - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'  # JSON value of the preceding flag
+    - '--compilation-config'
+    - '{"cudagraph_mode":"PIECEWISE"}'  # JSON value of the preceding flag
diff --git a/.buildkite/configs/local_cpu.yaml b/.buildkite/configs/local_cpu.yaml
@@ -1,7 +1,7 @@
 workload:
   type: long_doc_qa
   max-inflight-requests: 20
-  expected-latency-gain: 3.7
+  expected-latency-gain: 3.6
 
 docker:
   env:

diff --git a/.buildkite/configs/p2p.yaml b/.buildkite/configs/p2p.yaml
@@ -0,0 +1,63 @@
+workload:
+  type: long_doc_qa
+  num-documents: 20
+  max-inflight-requests: 2
+  repeat-count: 1
+  expected-latency: 4  # dropped for the first benchmark pass (PORT1); only the second pass (PORT2) keeps it
+
+feature:
+  type: p2p  # vllm-integration-tests.sh branches on feature_type == "p2p" (presumably read from this key)
+
+docker1:
+  env:
+    - 'LMCACHE_MAX_LOCAL_CPU_SIZE=60'
+    - 'LMCACHE_ENABLE_ASYNC_LOADING=True'
+    - 'LMCACHE_ENABLE_P2P=True'
+    - 'LMCACHE_P2P_HOST=localhost'
+    - 'LMCACHE_P2P_INIT_PORTS=8200'
+    - 'LMCACHE_P2P_LOOKUP_PORTS=8201'
+    - 'LMCACHE_TRANSFER_CHANNEL=nixl'
+    - 'LMCACHE_ENABLE_CONTROLLER=True'
+    - 'LMCACHE_LMCACHE_INSTANCE_ID=lmcache_instance_1'
+    - 'LMCACHE_LMCACHE_WORKER_PORTS=8500'
+    - 'LMCACHE_EXTRA_CONFIG={"lookup_backoff_time": 0.001}'
+    - 'LMCACHE_SAVE_UNFULL_CHUNK=False'
+    - 'PYTHONHASHSEED=123'
+  pull-port: 8300  # exported as LMCACHE_CONTROLLER_PULL_URL=localhost:<port>; also the controller's "pull" monitor port
+  reply-port: 8400  # exported as LMCACHE_CONTROLLER_REPLY_URL=localhost:<port>; also the controller's "reply" monitor port
+
+docker2:
+  env:
+    - 'LMCACHE_MAX_LOCAL_CPU_SIZE=60'
+    - 'LMCACHE_ENABLE_ASYNC_LOADING=True'
+    - 'LMCACHE_ENABLE_P2P=True'
+    - 'LMCACHE_P2P_HOST=localhost'
+    - 'LMCACHE_P2P_INIT_PORTS=8202'
+    - 'LMCACHE_P2P_LOOKUP_PORTS=8203'
+    - 'LMCACHE_TRANSFER_CHANNEL=nixl'
+    - 'LMCACHE_ENABLE_CONTROLLER=True'
+    - 'LMCACHE_LMCACHE_INSTANCE_ID=lmcache_instance_2'
+    - 'LMCACHE_LMCACHE_WORKER_PORTS=8501'
+    - 'LMCACHE_EXTRA_CONFIG={"lookup_backoff_time": 0.001}'
+    - 'LMCACHE_SAVE_UNFULL_CHUNK=False'
+    - 'PYTHONHASHSEED=123'
+  pull-port: 8300  # exported to the container as LMCACHE_CONTROLLER_PULL_URL=localhost:<port>
+  reply-port: 8400  # exported to the container as LMCACHE_CONTROLLER_REPLY_URL=localhost:<port>
+
+vllm1:
+  model: 'meta-llama/Llama-3.1-8B-Instruct'  # placed right after the image name on the docker command line
+  args:  # appended after the model; the harness adds --port itself
+    - '--load-format'
+    - 'dummy'
+    - '--no-enable-prefix-caching'
+    - '--kv-transfer-config'
+    - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
+
+vllm2:
+  model: 'meta-llama/Llama-3.1-8B-Instruct'  # placed right after the image name on the docker command line
+  args:  # appended after the model; the harness adds --port itself
+    - '--load-format'
+    - 'dummy'
+    - '--no-enable-prefix-caching'
+    - '--kv-transfer-config'
+    - '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
diff --git a/.buildkite/scripts/vllm-integration-tests.sh b/.buildkite/scripts/vllm-integration-tests.sh
@@ -272,6 +272,123 @@ run_pd_lmcache() {
     sleep 10
 }
 
+# Read one controller port from a docker<n> config section.
+#
+# Arguments:
+#   $1  yq output for the section
+#   $2  key to read ("pull-port" or "reply-port")
+# Prints the port on stdout. Prints an error on stderr and returns 1 when the
+# key is absent or its value is not a plain decimal number.
+_p2p_read_port() {
+    local section="$1" key="$2" value
+    # yq -e exits non-zero on a missing key; the regex below is the real check.
+    value=$(yq -er ".\"${key}\"" <<<"$section" 2>/dev/null) || true
+    if [[ ! $value =~ ^[0-9]+$ ]]; then
+        echo "run_p2p_lmcache: '${key}' is missing or not a port number" >&2
+        return 1
+    fi
+    printf '%s\n' "$value"
+}
+
+# Launch one vLLM+LMCache container for the P2P test, wait for its API server
+# via wait_for_openai_api_server, then stream its docker logs into a file.
+#
+# Arguments:
+#   $1  instance number (1 or 2); the container id is stored in CID<n>
+#   $2  GPU index handed to docker as --gpus "device=<idx>"
+#   $3  docker<n> config section (each .env[] entry becomes an --env flag)
+#   $4  vllm<n> config section (.model and optional .args)
+#   $5  port passed to the server as --port
+#   $6  controller pull port
+#   $7  controller reply port
+#   $8  log file that receives the `docker logs -f` output
+_p2p_start_instance() {
+    local idx="$1" gpu="$2" docker_cfg="$3" vllm_cfg="$4"
+    local port="$5" pull="$6" reply="$7" logfile="$8"
+    local env_entry model cid
+    local -a docker_args cli_args cmd_args
+
+    # docker args
+    docker_args=(
+        --runtime nvidia
+        --network host
+        --gpus "device=${gpu}"
+        --volume ~/.cache/huggingface:/root/.cache/huggingface
+        --env VLLM_USE_FLASHINFER_SAMPLER=0
+        --env HF_TOKEN="$HF_TOKEN"
+        --env UCX_TLS=tcp
+        --ipc host
+        --shm-size 4G
+    )
+    while IFS= read -r env_entry; do
+        [[ -n $env_entry ]] && docker_args+=(--env "$env_entry")
+    done < <(yq -r '.env[]?' <<<"$docker_cfg")
+    docker_args+=(--env "LMCACHE_CONTROLLER_PULL_URL=localhost:$pull")
+    docker_args+=(--env "LMCACHE_CONTROLLER_REPLY_URL=localhost:$reply")
+
+    # vllm args: image, model, configured CLI args, then the port
+    model="$(yq -r '.model' <<<"$vllm_cfg")"
+    mapfile -t cli_args < <(yq -r '.args // [] | .[]' <<<"$vllm_cfg")
+    cmd_args=(
+        lmcache/vllm-openai:build-latest
+        "$model"
+    )
+    cmd_args+=("${cli_args[@]}")
+    cmd_args+=("--port" "$port")
+
+    # Start docker
+    cid=$(
+        docker run -d \
+            "${docker_args[@]}" \
+            "${cmd_args[@]}"
+    )
+    # CID1/CID2 stay global, as before, and are set before the health check
+    # so code outside this function can still see the container id.
+    printf -v "CID${idx}" '%s' "$cid"
+
+    # Health check
+    wait_for_openai_api_server "$port" "$model" "$cid"
+
+    # Logging
+    touch "$logfile"
+    docker logs -f "$cid" >>"$logfile" 2>&1 &
+}
+
+# Bring up the two-instance P2P setup: an lmcache_controller on the host plus
+# two vLLM+LMCache containers, instance 1 on GPU 0 and instance 2 on GPU 1.
+#
+# Arguments:
+#   $1  docker1 config section     $2  vllm1 config section
+#   $3  docker2 config section     $4  vllm2 config section
+#   $5  config name, used in the log file names
+# Globals read:    BUILD_ID, PORT, PORT1, PORT2, ORIG_DIR, HF_TOKEN
+# Globals written: LOGFILE1, LOGFILE2, CID1, CID2
+# Returns 1, before starting anything, when a pull-port or reply-port is
+# missing or non-numeric in either docker section.
+run_p2p_lmcache() {
+    local docker1="$1"
+    local vllm1="$2"
+    local docker2="$3"
+    local vllm2="$4"
+    local cfg_name="$5"
+    local pull1 reply1 pull2 reply2
+    LOGFILE1="/tmp/build_${BUILD_ID}_${cfg_name}1.log"
+    LOGFILE2="/tmp/build_${BUILD_ID}_${cfg_name}2.log"
+
+    # Validate every port up front: an empty value would otherwise produce
+    # "localhost:" URLs and a malformed --monitor-ports JSON further down.
+    pull1=$(_p2p_read_port "$docker1" "pull-port") || return 1
+    reply1=$(_p2p_read_port "$docker1" "reply-port") || return 1
+    pull2=$(_p2p_read_port "$docker2" "pull-port") || return 1
+    reply2=$(_p2p_read_port "$docker2" "reply-port") || return 1
+
+    ##### Controller part start #####
+    if [ ! -d ".venv" ]; then
+        UV_PYTHON=python3 uv -q venv
+    fi
+    source .venv/bin/activate
+    uv pip install -r "$ORIG_DIR/requirements/build.txt" > /dev/null 2>&1
+    uv pip install torch==2.7.1 httpx fastapi uvicorn > /dev/null 2>&1
+    uv pip install -e "$ORIG_DIR" --no-build-isolation > /dev/null 2>&1
+    # Start controller in the background; it monitors instance 1's ports.
+    PYTHONHASHSEED=123 lmcache_controller \
+        --host localhost \
+        --port "$PORT" \
+        --monitor-ports "{\"pull\": ${pull1}, \"reply\": ${reply1}}" \
+        > "/tmp/build_${BUILD_ID}_${cfg_name}_controller.log" 2>&1 &
+    # Fixed pause before the containers start.
+    sleep 10
+    ##### Controller part end #####
+
+    ########## Instance 1 ##########
+    _p2p_start_instance 1 0 "$docker1" "$vllm1" "$PORT1" "$pull1" "$reply1" "$LOGFILE1"
+
+    ########## Instance 2 ##########
+    _p2p_start_instance 2 1 "$docker2" "$vllm2" "$PORT2" "$pull2" "$reply2" "$LOGFILE2"
+}
+
 usage() {
     echo "Usage: $0 [OPTIONS]"
     echo " "
@@ -315,6 +432,7 @@ test_vllmopenai_server_with_lmcache_integrated() {
 
 run_long_doc_qa() {
     local workload_config="$1"
+    local port="$2"
 
     echo "→ Running long_doc_qa with customed workload config:"
     printf '%s\n' "$workload_config"
@@ -349,7 +467,7 @@ run_long_doc_qa() {
     uv -q pip install openai pandas matplotlib
     python3 "$ORIG_DIR/benchmarks/long_doc_qa/long_doc_qa.py" \
         "${workload_args[@]}" \
-        --port="$PORT" \
+        --port="$port" \
         --output="response.txt"
 }
 
@@ -433,6 +551,15 @@ for cfg_name in "${CONFIG_NAMES[@]}"; do
         decoder_vllm_args="$(yq '.["vllm-decoder"]' "$cfg_file")"
         run_pd_lmcache "$prefiller_docker_args" "$prefiller_vllm_args" "$decoder_docker_args" "$decoder_vllm_args" "$cfg_name" 
         model="$(yq -r '.["vllm-prefiller"].model' "$cfg_file")"
+    elif [[ "$feature_type" == "p2p" ]]; then
+        PORT1=$(find_available_port 8177)
+        docker1_args="$(yq '.["docker1"]' "$cfg_file")"
+        vllm1_args="$(yq '.["vllm1"]' "$cfg_file")"
+        PORT2=$(find_available_port 8277)
+        docker2_args="$(yq '.["docker2"]' "$cfg_file")"
+        vllm2_args="$(yq '.["vllm2"]' "$cfg_file")"
+        run_p2p_lmcache "$docker1_args" "$vllm1_args" "$docker2_args" "$vllm2_args" "$cfg_name" 
+        model="$(yq -r '.["vllm1"].model' "$cfg_file")"
     elif [[ -z "$feature_type" ]]; then
         docker_args="$(yq '.docker' "$cfg_file")"
         vllm_args="$(yq '.vllm' "$cfg_file")"
@@ -446,7 +573,13 @@ for cfg_name in "${CONFIG_NAMES[@]}"; do
         test_vllmopenai_server_with_lmcache_integrated "$model"
     elif [ "$test_mode" = "long_doc_qa" ]; then
         workload_yaml="$(yq "(.workload * {\"model\": \"$model\"}) | del(.type)" "$cfg_file")"
-        run_long_doc_qa "$workload_yaml"
+        if [[ "$feature_type" == "p2p" ]]; then
+            tmp_workload_yaml=$(jq 'del(."expected-latency")' <<< "$workload_yaml")
+            run_long_doc_qa "$tmp_workload_yaml" "$PORT1"
+            run_long_doc_qa "$workload_yaml" "$PORT2"
+        else
+            run_long_doc_qa "$workload_yaml" "$PORT"
+        fi
     fi
 
     cleanup 0

diff --git a/.github/ISSUE_TEMPLATE/blank_issue.md b/.github/ISSUE_TEMPLATE/blank_issue.md
@@ -6,7 +6,7 @@ labels: ''
 assignees: ''
 ---
 **Label**
-Please label your issue so that it can easily be easily categorized under [LMCache Onboarding](https://github.com/LMCache/LMCache/issues/627)
+Please label your issue so that it can be easily categorized under [LMCache Onboarding](https://github.com/LMCache/LMCache/issues/1882)
 
 **Summary**
 A concise overview of the issue you want to raise.

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -7,7 +7,7 @@ assignees: ''
 
 ---
 **Label**
-Please label your issue with "bug" and any other relevant labels so that it can easily be easily categorized under [LMCache Onboarding](https://github.com/LMCache/LMCache/issues/627)
+Please label your issue with "bug" and any other relevant labels so that it can be easily categorized under [LMCache Onboarding](https://github.com/LMCache/LMCache/issues/1882)
 
 **Describe the bug**
 A clear and concise description of what the bug is.

diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -7,7 +7,7 @@ assignees: ''
 
 ---
 **Label**
-Please label your issue with "new feature" and any other relevant labels so that it can easily be easily categorized under [LMCache Onboarding](https://github.com/LMCache/LMCache/issues/627)
+Please label your issue with "new feature" and any other relevant labels so that it can be easily categorized under [LMCache Onboarding](https://github.com/LMCache/LMCache/issues/1882)
 
 **Is your feature request related to a problem? Please describe.**
 A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,40 +1,12 @@
-FILL IN THE PR DESCRIPTION HERE
+<!--  Thanks for your contribution to LMCache!  Here are some tips for you:
+1. Make sure to read the Contributing Guide before submitting your PR: https://github.com/LMCache/LMCache/blob/dev/CONTRIBUTING.md
+2. If this PR closes another issue, add 'Fixes #<issue number>' somewhere in the PR summary. GitHub will automatically close that issue when this PR gets merged. Alternatively, adding 'Refs #<issue number>' will not close the issue, but help provide the reviewer more context.-->
 
-FIX #xxxx (*link existing issues this PR will resolve*)
+**What this PR does / why we need it**:
 
-**PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**
+**Special notes for your reviewers**:
 
----
+**If applicable**:
 
-<details>
-<!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
-<summary><b> PR Checklist (Click to Expand) </b></summary>
-
-<p>Thank you for your contribution to LMCache! Before submitting the pull request, please ensure the PR meets the following criteria. This helps us maintain the code quality and improve the efficiency of the review process.</p>
-
-<h3>PR Title and Classification</h3>
-<p>Please try to classify PRs for easy understanding of the type of changes. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
-<ul>
-    <li><code>[Bugfix]</code> for bug fixes.</li>
-    <li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
-    <li><code>[Doc]</code> for documentation fixes and improvements.</li>
-    <li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
-    <li><code>[Core]</code> for changes in the core LMCache logic (e.g., <code>LMCacheEngine</code>, <code>Backend</code> etc.)</li>
-    <li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
-</ul>
-<p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>
-
-<h3>Code Quality</h3>
-
-<p>The PR need to meet the following code quality standards:</p>
-
-<ul>
-    <li>The code need to be well-documented to ensure future contributors can easily understand the code.</li>
-    <li> Please include sufficient unit tests to ensure the change is stay correct and robust. The unit and integration tests will always run and our comprehensive test will be triggered after the "full" label is tagged onto a PR.</li>
-</ul>
-
-<h3>What to Expect for the Reviews</h3>
-
-We aim to address all PRs in a timely manner. If no one reviews your PR within 5 days, please @-mention one of KuntaiDu, ApostaC or YaoJiayi.
-
-</details>
+- [ ] this PR contains user-facing changes - docs added
+- [ ] this PR contains unit tests
diff --git a/.github/workflows/automerge-labeler.yml b/.github/workflows/automerge-labeler.yml
@@ -0,0 +1,17 @@
+name: Label auto-merge PRs
+
+# Runs on the two auto-merge toggle events of a pull request.
+on:
+  pull_request_target:
+    types:
+      - auto_merge_enabled
+      - auto_merge_disabled
+
+# The token is limited to write access on pull requests.
+permissions:
+  pull-requests: write
+
+jobs:
+  add_remove_labels:
+    runs-on: ubuntu-latest
+    steps:
+      # NOTE(review): third-party action referenced by a movable tag while
+      # holding a write-scoped token; consider pinning it to a commit SHA.
+      - uses: ubuntudroid/automerge-labeler@v1
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          # Presumably the label that triggers the comprehensive test run - confirm.
+          label: 'full'