双机2*H20(8*96GiB)部署满血DeepSeek-R1(fp8)验证过程

双机2*H20(8*96GiB)部署满血DeepSeek-R1(fp8)验证过程、vllm 与 sglang 双机测试与性能对比

Posted by 陈谭军 on Saturday, April 5, 2025 | 阅读 |,阅读约 8 分钟

环境信息

机器配置

OS:CentOS Linux release 7.6 (Final)
Kernel:4.19.0-1.0.0.9
驱动: Driver Version: 535.216.03   CUDA Version: 12.2
GPU:NVIDIA  H20
vLLM:https://docs.vllm.ai/en/stable/
Docker:v1.5.4
nvidia-container-runtime:1.0.2-dev
vllm 测试镜像:vllm/vllm-openai:v0.8.2
sglang 测试镜像:lmsysorg/sglang:ea52b45be1a7 
配置与模型根目录  /mnt/models

RDMA 网络配置:

[root@node01 models]# ibdev2netdev
mlx5_0 port 1 ==> xgbe0 (Up)
mlx5_1 port 1 ==> xgbe1 (Up)
mlx5_2 port 1 ==> xgbe2 (Up)
mlx5_3 port 1 ==> xgbe3 (Up)
mlx5_4 port 1 ==> xgbe4 (Up)
mlx5_5 port 1 ==> xgbe5 (Up)
mlx5_6 port 1 ==> xgbe6 (Up)
mlx5_7 port 1 ==> xgbe7 (Up)
mlx5_8 port 1 ==> xgbe8 (Up)
[root@node01 models]# show_gids
DEV     PORT    INDEX   GID                                     IPv4            VER     DEV
---     ----    -----   ---                                     ------------    ---     ---
mlx5_0  1       0       fe80:0000:0000:0000:bae9:24ff:fef9:56e0                 v1      xgbe0
mlx5_0  1       1       fe80:0000:0000:0000:bae9:24ff:fef9:56e0                 v2      xgbe0
mlx5_0  1       2       0000:0000:0000:0000:0000:ffff:0a3f:41cf 10.63.65.207    v1      xgbe0
mlx5_0  1       3       0000:0000:0000:0000:0000:ffff:0a3f:41cf 10.63.65.207    v2      xgbe0
mlx5_1  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:b8f8                 v1      xgbe1
mlx5_1  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:b8f8                 v2      xgbe1
mlx5_1  1       2       0000:0000:0000:0000:0000:ffff:2102:a00f 33.2.160.15     v1      xgbe1
mlx5_1  1       3       0000:0000:0000:0000:0000:ffff:2102:a00f 33.2.160.15     v2      xgbe1
mlx5_2  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:f360                 v1      xgbe2
mlx5_2  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:f360                 v2      xgbe2
mlx5_2  1       2       0000:0000:0000:0000:0000:ffff:2102:a08f 33.2.160.143    v1      xgbe2
mlx5_2  1       3       0000:0000:0000:0000:0000:ffff:2102:a08f 33.2.160.143    v2      xgbe2
mlx5_3  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:a6a8                 v1      xgbe3
mlx5_3  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:a6a8                 v2      xgbe3
mlx5_3  1       2       0000:0000:0000:0000:0000:ffff:2102:a10f 33.2.161.15     v1      xgbe3
mlx5_3  1       3       0000:0000:0000:0000:0000:ffff:2102:a10f 33.2.161.15     v2      xgbe3
mlx5_4  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:bc70                 v1      xgbe4
mlx5_4  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:bc70                 v2      xgbe4
mlx5_4  1       2       0000:0000:0000:0000:0000:ffff:2102:a18f 33.2.161.143    v1      xgbe4
mlx5_4  1       3       0000:0000:0000:0000:0000:ffff:2102:a18f 33.2.161.143    v2      xgbe4
mlx5_5  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:b9a0                 v1      xgbe5
mlx5_5  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:b9a0                 v2      xgbe5
mlx5_5  1       2       0000:0000:0000:0000:0000:ffff:2102:a20f 33.2.162.15     v1      xgbe5
mlx5_5  1       3       0000:0000:0000:0000:0000:ffff:2102:a20f 33.2.162.15     v2      xgbe5
mlx5_6  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:baf8                 v1      xgbe6
mlx5_6  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:baf8                 v2      xgbe6
mlx5_6  1       2       0000:0000:0000:0000:0000:ffff:2102:a28f 33.2.162.143    v1      xgbe6
mlx5_6  1       3       0000:0000:0000:0000:0000:ffff:2102:a28f 33.2.162.143    v2      xgbe6
mlx5_7  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:f118                 v1      xgbe7
mlx5_7  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:f118                 v2      xgbe7
mlx5_7  1       2       0000:0000:0000:0000:0000:ffff:2102:a30f 33.2.163.15     v1      xgbe7
mlx5_7  1       3       0000:0000:0000:0000:0000:ffff:2102:a30f 33.2.163.15     v2      xgbe7
mlx5_8  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:c800                 v1      xgbe8
mlx5_8  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:c800                 v2      xgbe8
mlx5_8  1       2       0000:0000:0000:0000:0000:ffff:2102:a38f 33.2.163.143    v1      xgbe8
mlx5_8  1       3       0000:0000:0000:0000:0000:ffff:2102:a38f 33.2.163.143    v2      xgbe8

[root@node02 models]# ibdev2netdev 
mlx5_0 port 1 ==> xgbe0 (Up)
mlx5_1 port 1 ==> xgbe1 (Up)
mlx5_2 port 1 ==> xgbe2 (Up)
mlx5_3 port 1 ==> xgbe3 (Up)
mlx5_4 port 1 ==> xgbe4 (Up)
mlx5_5 port 1 ==> xgbe5 (Up)
mlx5_6 port 1 ==> xgbe6 (Up)
mlx5_7 port 1 ==> xgbe7 (Up)
mlx5_8 port 1 ==> xgbe8 (Up)
[root@node02 models]# show_gids
DEV     PORT    INDEX   GID                                     IPv4            VER     DEV
---     ----    -----   ---                                     ------------    ---     ---
mlx5_0  1       0       fe80:0000:0000:0000:e29d:73ff:fe25:effc                 v1      xgbe0
mlx5_0  1       1       fe80:0000:0000:0000:e29d:73ff:fe25:effc                 v2      xgbe0
mlx5_0  1       2       0000:0000:0000:0000:0000:ffff:0a3f:41d0 10.63.65.208    v1      xgbe0
mlx5_0  1       3       0000:0000:0000:0000:0000:ffff:0a3f:41d0 10.63.65.208    v2      xgbe0
mlx5_1  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:d868                 v1      xgbe1
mlx5_1  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:d868                 v2      xgbe1
mlx5_1  1       2       0000:0000:0000:0000:0000:ffff:2102:a010 33.2.160.16     v1      xgbe1
mlx5_1  1       3       0000:0000:0000:0000:0000:ffff:2102:a010 33.2.160.16     v2      xgbe1
mlx5_2  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:f710                 v1      xgbe2
mlx5_2  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:f710                 v2      xgbe2
mlx5_2  1       2       0000:0000:0000:0000:0000:ffff:2102:a090 33.2.160.144    v1      xgbe2
mlx5_2  1       3       0000:0000:0000:0000:0000:ffff:2102:a090 33.2.160.144    v2      xgbe2
mlx5_3  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:ee30                 v1      xgbe3
mlx5_3  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:ee30                 v2      xgbe3
mlx5_3  1       2       0000:0000:0000:0000:0000:ffff:2102:a110 33.2.161.16     v1      xgbe3
mlx5_3  1       3       0000:0000:0000:0000:0000:ffff:2102:a110 33.2.161.16     v2      xgbe3
mlx5_4  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:c850                 v1      xgbe4
mlx5_4  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:c850                 v2      xgbe4
mlx5_4  1       2       0000:0000:0000:0000:0000:ffff:2102:a190 33.2.161.144    v1      xgbe4
mlx5_4  1       3       0000:0000:0000:0000:0000:ffff:2102:a190 33.2.161.144    v2      xgbe4
mlx5_5  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:ef38                 v1      xgbe5
mlx5_5  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:ef38                 v2      xgbe5
mlx5_5  1       2       0000:0000:0000:0000:0000:ffff:2102:a210 33.2.162.16     v1      xgbe5
mlx5_5  1       3       0000:0000:0000:0000:0000:ffff:2102:a210 33.2.162.16     v2      xgbe5
mlx5_6  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:c7e8                 v1      xgbe6
mlx5_6  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:c7e8                 v2      xgbe6
mlx5_6  1       2       0000:0000:0000:0000:0000:ffff:2102:a290 33.2.162.144    v1      xgbe6
mlx5_6  1       3       0000:0000:0000:0000:0000:ffff:2102:a290 33.2.162.144    v2      xgbe6
mlx5_7  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:b580                 v1      xgbe7
mlx5_7  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:b580                 v2      xgbe7
mlx5_7  1       2       0000:0000:0000:0000:0000:ffff:2102:a310 33.2.163.16     v1      xgbe7
mlx5_7  1       3       0000:0000:0000:0000:0000:ffff:2102:a310 33.2.163.16     v2      xgbe7
mlx5_8  1       0       fe80:0000:0000:0000:a288:c2ff:fe59:b908                 v1      xgbe8
mlx5_8  1       1       fe80:0000:0000:0000:a288:c2ff:fe59:b908                 v2      xgbe8
mlx5_8  1       2       0000:0000:0000:0000:0000:ffff:2102:a390 33.2.163.144    v1      xgbe8
mlx5_8  1       3       0000:0000:0000:0000:0000:ffff:2102:a390 33.2.163.144    v2      xgbe8

⚠️ 注意:模型文件可能分为多个部分,一定要验证所有文件的完整性,避免因文件损坏导致的启动失败

下载权重

DeepSeek-R1 下载 DeepSeek-R1 FP8 模型文件;

vllm 双机测试

测试脚本

在主节点 head 机器启动前确保 8379 端口没有占用。

启动 ray 集群的脚本如下所示:

#!/bin/bash

# Check for minimum number of required arguments
if [ $# -lt 4 ]; then
    echo "Usage: $0 docker_image head_node_address --head|--worker path_to_hf_home [additional_args...]"
    exit 1
fi

# Assign the first three arguments and shift them away
DOCKER_IMAGE="$1"
HEAD_NODE_ADDRESS="$2"
NODE_TYPE="$3"  # Should be --head or --worker
PATH_TO_HF_HOME="$4"
NODE_NAME="$5"
shift 5

# Additional arguments are passed directly to the Docker command
ADDITIONAL_ARGS=("$@")

# Validate node type
if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
    echo "Error: Node type must be --head or --worker"
    exit 1
fi

# Define a function to cleanup on EXIT signal
cleanup() {
    docker stop "${NODE_NAME}"
    docker rm "${NODE_NAME}"
}
trap cleanup EXIT

# Command setup for head or worker node
RAY_START_CMD="ray start --block"
if [ "${NODE_TYPE}" == "--head" ]; then
    RAY_START_CMD+=" --head --port=8379 --dashboard-host=0.0.0.0"
else
    RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:8379"
fi

# Run the docker command with the user specified parameters and additional arguments
docker run \
    --entrypoint /bin/bash \
    --privileged \
    --network host \
    --ipc host \
    --name "${NODE_NAME}" \
    --shm-size 128G \
    --gpus all \
    -v "${PATH_TO_HF_HOME}:/mnt/models" \
    "${ADDITIONAL_ARGS[@]}" \
    "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"

在每个节点执行如下所示:

# 主节点 IP
export IP_OF_HEAD_NODE=10.63.65.207
export MODEL_PATH=/ssd1/models

export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
export HOST_IP=$(hostname -I | awk '{print $1}')

export GLOO_SOCKET_IFNAME=xgbe0
export TP_SOCKET_IFNAME=xgbe0
export NCCL_SOCKET_IFNAME=xgbe0
export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1 
# 以下配置也可
#export NCCL_IB_HCA=xgbe1,xgbe2,xgbe3,xgbe4,xgbe5,xgbe6,xgbe7,xgbe8
export NCCL_IB_GID_INDEX=3
export NCCL_IB_DISABLE=0

启动 head 节点

启动主节点 head:

nohup bash run_cluster.sh  vllm/vllm-openai:v0.7.2 $IP_OF_HEAD_NODE --head $MODEL_PATH  ray-node-head -e  NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME -e NCCL_DEBUG=$NCCL_DEBUG -e NCCL_IB_DISABLE=$NCCL_IB_DISABLE -e NCCL_IB_HCA=$NCCL_IB_HCA -e  NCCL_IB_GID_INDEX=$NCCL_IB_GID_INDEX -e VLLM_HOST_IP=$VLLM_HOST_IP  -e HOST_IP=$HOST_IP > nohup-head.log 2>&1 & 

启动 worker 节点

启动从节点 worker:

nohup bash run_cluster.sh  vllm/vllm-openai:v0.7.2 $IP_OF_HEAD_NODE --worker $MODEL_PATH  ray-node-worker -e NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME -e NCCL_DEBUG=$NCCL_DEBUG -e NCCL_IB_DISABLE=$NCCL_IB_DISABLE -e NCCL_IB_HCA=$NCCL_IB_HCA -e  NCCL_IB_GID_INDEX=$NCCL_IB_GID_INDEX -e VLLM_HOST_IP=$VLLM_HOST_IP  -e HOST_IP=$HOST_IP > nohup-worker.log 2>&1 & 

启动 deepseek r1 模型

注:如果机器开启ipv6双栈网络,强烈建议配置该参数;

export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
export HOST_IP=$(hostname -I | awk '{print $1}')
python3 -m vllm.entrypoints.openai.api_server --model /mnt/models/DeepSeek-R1 --gpu-memory-utilization 0.90 --tensor-parallel-size 16 --host 0.0.0.0 --trust-remote-code --port 8000  --max_num_seqs 256  --max-num-batched-tokens 16384  --max_model_len 16384  --max_seq_len_to_capture 16384   --tokenizer-mode auto  --enable-chunked-prefill=False  --enable-reasoning --reasoning-parser deepseek_r1  --enforce-eager 

参数说明:

  • –model 模型路径,加载 /workspace/DeepSeek-R1 目录下的模型。
  • –host 0.0.0.0监听所有网络接口的请求(允许外部访问)。
  • –port 服务端口号设为 8000。
  • –tokenizer-mode auto 自动选择分词器模式(优先使用 HuggingFace 的 auto 配置)。
  • –gpu-memory-utilization 0.90 GPU 显存利用率上限设为 90%(避免 OOM)。如果高并发服务运行异常,建议调低此值。
  • –tensor-parallel-size 16 使用 16 路张量并行(需 GPU 数量支持,适合超大模型)。
  • –enforce-eager 强制启用 PyTorch 的 eager 模式(可能牺牲性能以提升兼容性)。
  • –max_num_seqs 256 单次批处理的最大序列数(并发请求数上限)。vLLM 会动态调整,一般默认值设置为256。
  • –max-num-batched-tokens 16384 批处理的最大总 token 数(影响吞吐量)。最大输入长度要小于这个数值。
  • –max_model_len 16384 模型支持的最大上下文长度(16K tokens)。
  • –max_seq_len_to_capture 16384 捕获的序列最大长度(用于优化 CUDA 内核)。框架默认超参,当前版本需要与 max_model_len 保持一致。
  • –enable-chunked-prefill=False 禁用分块预填充(可能减少延迟但增加显存压力)。
  • –enable-reasoning 启用推理模式(需配合 –reasoning-parser)。
  • –reasoning-parser deepseek_r1 指定推理解析器为 deepseek_r1(自定义逻辑)。
  • –trust-remote-code 允许加载远程代码(如自定义模型或分词器)。

如果 NET/IB 出现如下字段表示双机 RDMA 通信正常:

NET/TB: Using ..... OOB xgbe0:10.63.65.208<0>

如果出现如下文字,则表示 DeepSeek R1 启动成功:

Starting vLLM API server on http://0.0.0.0:8000

测试服务是否启动成功:

root@node01 curl http://localhost:8000/v1/models -H "Content-Type: application/json"

{"object":"list","data":[{"id":"/mnt/models/DeepSeek-R1","object":"model","created":1743226823,"owned_by":"vllm","root":"/mnt/models/DeepSeek-R1","parent":null,"max_model_len":16384,"permission":[{"id":"modelperm-84fad4c5fa5748aeaf39e39fbe56236e","object":"model_permission","created":1743226823,"allow_create_engine":false,"allow_sampling":true,"allow_logprobs":true,"allow_search_indices":false,"allow_view":true,"allow_fine_tuning":false,"organization":"*","group":null,"is_blocking":false}]}]}

参考

sglang 双机测试

配置环境

启动节点 node1:

docker run --gpus all \
    --network=host \
    --ipc=host \
    --security-opt=seccomp=unconfined \
    --cap-add=SYS_PTRACE \
    --ulimit=memlock=-1 --ulimit=nofile=120000 --ulimit=stack=67108864 \
    --shm-size=128G \
    --privileged \
    -v /ssd1/models:/mnt/models \
    --name sglang_node1 \
    -itd \
    lmsysorg/sglang:481f2edc18b5

启动节点 node2:

docker run --gpus all \
    --network=host \
    --ipc=host \
    --security-opt=seccomp=unconfined \
    --cap-add=SYS_PTRACE \
    --ulimit=memlock=-1 --ulimit=nofile=120000 --ulimit=stack=67108864 \
    --shm-size=128G \
    --privileged \
    -v /ssd1/models:/mnt/models \
    --name sglang_node2 \
    -itd \
    --ipc=host \
    lmsysorg/sglang:481f2edc18b5

构建 sglang 双机环境

在主节点 node1 服务启动前确保机器 30000 端口没有被占用。

在每个节点的容器里面执行以下操作:

# 主节点 IP
export IP_OF_HEAD_NODE=10.63.65.207
export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}')
export HOST_IP=$(hostname -I | awk '{print $1}')

export GLOO_SOCKET_IFNAME=xgbe0
export TP_SOCKET_IFNAME=xgbe0
export NCCL_SOCKET_IFNAME=xgbe0
export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1 
# 以下配置也可以
#export NCCL_IB_HCA=xgbe1,xgbe2,xgbe3,xgbe4,xgbe5,xgbe6,xgbe7,xgbe8
export NCCL_IB_GID_INDEX=3
export NCCL_IB_DISABLE=0

注:如果机器开启ipv6双栈网络,并且 ipv6 网络不通,强烈建议配置该参数;

export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}')
export HOST_IP=$(hostname -I | awk '{print $1}')

这个地方非常非常非常非常有点坑,详见代码 https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/utils.py#L1601

docker exec -it sglang_node1 bash

# 节点1
python3 -m sglang.launch_server \
    --model-path /mnt/models/DeepSeek-R1 \
    --dist-init-addr $IP_OF_HEAD_NODE:30000 \
    --nnodes 2 \
    --node-rank 0 \
    --tp 16 \
    --trust-remote-code \
    --mem-fraction-static 0.90 \
    --tokenizer-mode auto \
    --chunked-prefill-size  4096 \
    --context-length  65536 \
    --quantization fp8  \
    --reasoning-parser deepseek-r1 \
    --host 0.0.0.0 \
    --port 8000 

docker exec -it sglang_node2 bash

# 节点2
python3 -m sglang.launch_server \
    --model-path /mnt/models/DeepSeek-R1 \
    --dist-init-addr $IP_OF_HEAD_NODE:30000 \
    --nnodes 2 \
    --node-rank 1 \
    --tp 16 \
    --trust-remote-code \
    --mem-fraction-static 0.90 \
    --tokenizer-mode auto \
    --chunked-prefill-size  4096 \
    --context-length  65536 \
    --quantization fp8  \
    --reasoning-parser deepseek-r1 \
    --host 0.0.0.0 \
    --port 8000 

参数说明

  • –model-path 模型路径,加载 /mnt/models/DeepSeek-R1 目录下的模型。
  • –host 0.0.0.0监听所有网络接口的请求(允许外部访问)。
  • –port 服务端口号设为 8000。
  • –tokenizer-mode auto 自动选择分词器模式(优先使用 HuggingFace 的 auto 配置)。
  • –tp 16 使用 16 路张量并行(需 GPU 数量支持,适合超大模型)。
  • –reasoning-parser deepseek-r1 指定推理解析器为 deepseek-r1(自定义逻辑)。
  • –trust-remote-code 允许加载远程代码(如自定义模型或分词器)。

启动 deepseek r1 模型

docker exec -it sglang_node1 bash

# 节点1
python3 -m sglang.launch_server \
    --model-path /mnt/models/DeepSeek-R1 \
    --dist-init-addr $IP_OF_HEAD_NODE:30000 \
    --nnodes 2 \
    --node-rank 0 \
    --tp 16 \
    --trust-remote-code \
    --mem-fraction-static 0.90 \
    --tokenizer-mode auto \
    --chunked-prefill-size  4096 \
    --context-length  65536 \
    --quantization fp8  \
    --reasoning-parser deepseek-r1 \
    --host 0.0.0.0 \
    --port 8000 

docker exec -it sglang_node2 bash

# 节点2
python3 -m sglang.launch_server \
    --model-path /mnt/models/DeepSeek-R1 \
    --dist-init-addr $IP_OF_HEAD_NODE:30000 \
    --nnodes 2 \
    --node-rank 1 \
    --tp 16 \
    --trust-remote-code \
    --mem-fraction-static 0.90 \
    --tokenizer-mode auto \
    --chunked-prefill-size  4096 \
    --context-length  65536 \
    --quantization fp8  \
    --reasoning-parser deepseek-r1 \
    --host 0.0.0.0 \
    --port 8000 

如果 NET/IB 出现如下字段表示双机 RDMA 通信正常:

NET/TB: Using ..... OOB xgbe0:10.63.65.208

如果出现如下文字,则表示 DeepSeek R1 启动成功:

The server is fired up and ready to roll!

测试服务是否启动成功:

root@node01 curl http://localhost:8000/v1/models -H "Content-Type: application/json"

{"object":"list","data":[{"id":"/mnt/models/DeepSeek-R1","object":"model","created":1743226823,"owned_by":"vllm","root":"/mnt/models/DeepSeek-R1","parent":null,"max_model_len":16384,"permission":[{"id":"modelperm-84fad4c5fa5748aeaf39e39fbe56236e","object":"model_permission","created":1743226823,"allow_create_engine":false,"allow_sampling":true,"allow_logprobs":true,"allow_search_indices":false,"allow_view":true,"allow_fine_tuning":false,"organization":"*","group":null,"is_blocking":false}]}]}
  • 性能优化选项:默认已启用 MLA(内存布局优化)加速。以下为可选优化项,可按需开启:
  • 数据并行注意力(Data Parallelism Attention):适用于高 QPS(每秒查询量)场景,添加 --enable-dp-attention 参数可显著提升吞吐量。
  • Torch.compile 优化:添加 --enable-torch-compile 参数以启用。服务启动时将消耗额外时间进行编译。可通过 --torch-compile-max-bs 控制优化后的最大批处理大小,建议设为 1 至 8 之间(例如:--torch-compile-max-bs 8)。

开启 --enable-torch-compile --torch-compile-max-bs 8 参数后启动耗时比较长,大约需要 20 多分钟;

参考

构建 sglang 双机环境(默认参数)

构建环境

在主节点 node1 服务启动前确保机器 30000 端口没有被占用

启动节点 node1:

docker run --gpus all \
    --network=host \
    --ipc=host \
    --security-opt=seccomp=unconfined \
    --cap-add=SYS_PTRACE \
    --ulimit=memlock=-1 --ulimit=nofile=120000 --ulimit=stack=67108864 \
    --shm-size=128G \
    --privileged \
    -v /ssd1/models:/mnt/models \
    --name sglang_node1 \
    -itd \
    lmsysorg/sglang:ea52b45be1a7

启动节点 node2:

docker run --gpus all \
    --network=host \
    --ipc=host \
    --security-opt=seccomp=unconfined \
    --cap-add=SYS_PTRACE \
    --ulimit=memlock=-1 --ulimit=nofile=120000 --ulimit=stack=67108864 \
    --shm-size=128G \
    --privileged \
    -v /ssd1/models:/mnt/models \
    --name sglang_node2 \
    -itd \
    --ipc=host \
    lmsysorg/sglang:ea52b45be1a7

启动 deepseek r1 服务,在每个节点的容器里面执行以下操作:

# 主节点 IP
export IP_OF_HEAD_NODE=10.63.65.207
export SGLANG_HOST_IP=$(hostname -I | awk '{print $1}')
export HOST_IP=$(hostname -I | awk '{print $1}')

export GLOO_SOCKET_IFNAME=xgbe0
export TP_SOCKET_IFNAME=xgbe0
export NCCL_SOCKET_IFNAME=xgbe0
export NCCL_DEBUG=INFO
export NCCL_IB_HCA=xgbe1,xgbe2,xgbe3,xgbe4,xgbe5,xgbe6,xgbe7,xgbe8
export NCCL_IB_GID_INDEX=3
export NCCL_IB_DISABLE=0
docker exec -it sglang_node1 bash

python3 -m sglang.launch_server --model-path /mnt/models/DeepSeek-R1 --tp 16 --dist-init-addr $IP_OF_HEAD_NODE:30000 --nnodes 2 --node-rank 0 --trust-remote-code --host 0.0.0.0 --port 8000

docker exec -it sglang_node2 bash
    
python3 -m sglang.launch_server --model-path /mnt/models/DeepSeek-R1 --tp 16 --dist-init-addr $IP_OF_HEAD_NODE:30000 --nnodes 2 --node-rank 1 --trust-remote-code --host 0.0.0.0 --port 8000

如果启动成功,则出现如下日志:

The server is fired up and ready to roll!

性能测试

测试脚本:

#!/bin/bash

output_dir=$1

declare -A test_cases=(
    # 格式: "输入长度 输出长度 numprt"="描述"
    ["128 128 1"]="128"
    ["128 128 2"]="128"
    ["128 128 4"]="128"
    ["128 128 8"]="128"
    ["128 128 16"]="128"
    ["128 128 32"]="128"
    ["128 128 64"]="128"
    ["128 128 128"]="128"
    ["128 128 256"]="128"
    ["128 128 512"]="128"
    ["128 128 1024"]="128"

    ["256 256 1"]="256"
    ["256 256 2"]="256"
    ["256 256 4"]="256"
    ["256 256 8"]="256"
    ["256 256 16"]="256"
    ["256 256 32"]="256"
    ["256 256 64"]="256"
    ["256 256 128"]="256"
    ["256 256 256"]="256"
    ["256 256 512"]="256"
    ["256 256 1024"]="256"
)

random_range_ratios=(1.0)

# 循环遍历所有测试用例
for case in "${!test_cases[@]}"; do
    # 分割输入长度、输出长度和numprt
    IFS=' ' read inlen outlen numprt <<< "${case}"
    
    # 构建日志文件名和结果文件名
    result_file="${output_dir}/serving_benchmark_${numprt}_${inlen}_${outlen}.json"

     for ratio in "${random_range_ratios[@]}"
        do
            python3 -m sglang.bench_serving \
                --backend sglang \
                --dataset-name random \
                --dataset-path /mnt/models/sglang-perf/datasets/ShareGPT_V3_unfiltered_cleaned_split.json \
                --model /mnt/models/DeepSeek-R1 \
                --random-input ${inlen} \
                --random-output ${outlen} \
                --random-range-ratio ${ratio} \
                --num-prompts ${numprt} \
                --host 0.0.0.0 \
                --port 8000 \
                --output-file "$result_file" \
                --max-concurrency ${numprt}
        done
        echo "Executed command for num-prompts=$numprt, inlen=$input_len, outlen=$output_len. Output saved to $result_file"

done

收集测试结果脚本:

import json
import os
import csv
import re
import argparse

#fieldnames = ["tp", "concurrency", "mean TTFT(ms)", "p90_tpot_ms", 
#              "output_throughput(tokens/s)", "total_throughput(tokens/s)", 
#              "mean TPOT(ms)", "mean e2e(ms)", "mean_input_len", "mean_output_len"]

fieldnames = ["concurrency", "mean TTFT(ms)", "input_throughput(tokens/s)", 
              "output_throughput(tokens/s)", "mean TPOT(ms)", "mean_input_len", 
              "mean_output_len"]


def main(args):
    data = []

    for _, _, files in os.walk(args.dir):
        for file in files:
            if file.endswith(".json"):
                # tp${TENSOR_PARALLEL}_CON${i}_in${INPUT_LENGTH}_out${OUTPUT_LENGTH}.json
                # pattern = r'tp(\d+)_CON(\d+)_in(\d+)_out(\d+)\.json'
                pattern = r'serving_benchmark_(\d+)_(\d+)_(\d+).json'
                # 使用正则表达式匹配文件名
                match = re.match(pattern, file)
                if match:
                    # tp = int(match.group(1))
                    # con = int(match.group(2))
                    # input_length = int(match.group(3))
                    # output_length = int(match.group(4))
                    con = int(match.group(1))
                    input_length = int(match.group(2))
                    output_length = int(match.group(3))
                else:
                    print("not match")

                with open(os.path.join(args.dir, file), 'r') as f:
                    result = json.load(f)
                    data.append(
                        {
                            # "tp": tp, 
                            "concurrency": con, 
                            "mean TTFT(ms)": result['mean_ttft_ms'],
                            # "p90_tpot_ms": result["p90_tpot_ms"],
                            "input_throughput(tokens/s)": result["input_throughput"],
                            "output_throughput(tokens/s)": result['output_throughput'],
                            # "total_throughput(tokens/s)": result["total_token_throughput"],
                            "mean TPOT(ms)": result['mean_tpot_ms'],
                            # "mean e2e(ms)": result['mean_e2el_ms'],
                            "mean_input_len": sum(result['input_lens']) / len(result['input_lens']),
                            "mean_output_len": sum(result['output_lens']) / len(result['output_lens'])
                        }
                    )

    with open(os.path.join(args.dir, 'summary.csv'), 'w') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dir", type=str, help="")

    args = parser.parse_args()
    main(args)

测试结果

ConcurrencyMean TTFT (ms)Input Throughput (tokens/s)Output Throughput (tokens/s)Mean TPOT (ms)Mean Input LenMean Output Len
1316.5217.2017.2057.14256256
1285.8417.4217.4255.60128128
21126.9131.7831.7858.76256256
2321.5633.0333.0358.49128128
4342.9659.2459.2466.44256256
4454.8259.3559.3564.34128128
8631.9389.4089.4085.20128128
81349.7691.1491.1482.82256256
16681.99137.47137.47114.16256256
16608.29136.20136.20113.59128128
32501.92219.70219.70144.24256256
32838.25198.94198.94155.46128128
641062.61290.74290.74213.41128128
641046.21291.33291.33216.41256256
1001008.45413.92413.92238.54256256
1001441.36435.11435.11220.18128128
1281263.11506.19506.19244.77128128
1281205.51505.17505.17249.58256256
2002316.10357.58357.58552.32256256
2001395.16436.23436.23450.87128128
2561695.48378.55378.55672.14256256
2561991.52375.48375.48671.24128128
5124446.39459.66459.661100.49256256
5123023.79436.38436.381158.17128128
10243088.611092.911092.91918.67128128
10242798.731090.291090.29931.18256256

参考

总结

  • 与 vllm 单机相比,vllm 双机性能存在下降,推测双机通信存在瓶颈,双机之间的通讯开销较大。
  • 与 sglang 单机相比,sglang 双机性能存在下降,推测双机通信存在瓶颈,双机之间的通讯开销较大。