Pro(1)

## 四、推理服务启停

### 4.1 创建部署目录

```shell
mkdir -p /home/deepseek
mkdir -p /home/deepseek/logs
mkdir -p /home/deepseek/ascendlog
```

### 4.2 准备slurm启动脚本

将以下脚本保存至/home/deepseek/srun.sh

```shell
#!/bin/bash
#SBATCH -N 4
#SBATCH --partition=batch
#SBATCH -J deepseek
#SBATCH -o logs/log%J.out
#SBATCH -e logs/log%J.err
#SBATCH --gres=gpu:8
#SBATCH --cpus-per-task=190
#SBATCH --nodelist=master0001,compute0001,compute0002,compute0003

# Slurm batch entry point: exports the settings shared by all nodes, then
# launches one ./node.sh task per allocated node.
#
# All paths below (logs/, ./node.sh, the image file) are relative, so the job
# must be submitted from the deployment directory /home/deepseek.
# The --nodelist host names are site-specific; they must match the names
# Slurm knows for your nodes.

export LC_CTYPE=C.UTF-8

# Address that every node uses to reach the data-parallel head (node 0).
# NOTE(review): `hostname -i` does not read stdin, so this yields the address
# of the node running this batch script rather than resolving the host name
# piped into it — confirm that node is the intended head node.
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1 | hostname -i)

# NOTE(review): the separators inside the three values below were
# reconstructed; verify them against the real model directory, image file and
# the model name clients are expected to send.
export MODEL_NAME=DeepSeek-V4
export MODEL_PORT=11025
export MODEL_DIR=/mnt/nvme1n1/model/DeepSeek-V4-Pro-w4a8-mtp
export VLLM_IMG=vllm-ascend-deepseekv4.sif

# NOTE(review): the URL suffix was lost in the source text; "/v1" is assumed.
echo "模型推理服务API为: http://$MASTER_ADDR:$MODEL_PORT/v1"
echo "模型名称为:$MODEL_NAME"

# One task per node; %t in the file name is the task id, giving per-node logs.
srun --ntasks-per-node=1 -o logs/log%J.%t.out -e logs/log%J.%t.err ./node.sh
```

### 4.3 准备节点部署脚本

将以下脚本保存至/home/deepseek/node.sh

```shell
#!/bin/sh
# Per-node launcher, started by srun.sh once on every allocated node.
# Reads MASTER_ADDR, MODEL_NAME, MODEL_PORT, MODEL_DIR and VLLM_IMG from the
# environment exported by srun.sh, plus SLURM_NODEID / SLURM_NNODES from Slurm.
# Starts an Apptainer instance and runs `vllm serve` inside it: node 0 serves
# the API, every other node joins as a headless data-parallel worker.

# NIC name: elastic bare-metal uses "eno0", standard bare-metal uses "bond0".
nic_name="eno0"
# `hostname -i` may print several addresses; keep only the first one.
local_ip=$(hostname -i | awk '{print $1}')
node0_ip=$MASTER_ADDR

export HCCL_IF_IP="$local_ip"
export GLOO_SOCKET_IFNAME="$nic_name"
export TP_SOCKET_IFNAME="$nic_name"
export HCCL_SOCKET_IFNAME="$nic_name"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export HCCL_BUFFSIZE=200
export HCCL_OP_EXPANSION_MODE="AIV"
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_CONNECT_TIMEOUT=120
export HCCL_INTRA_PCIE_ENABLE=1
export HCCL_INTRA_ROCE_ENABLE=0
export ACL_OP_INIT_MODE=1
export TRITON_ALL_BLOCKS_PARALLEL=1
export USE_MULTI_BLOCK_POOL=1
export USE_MULTI_GROUPS_KV_CACHE=1
export ASCEND_BUFFER_POOL=0:0
export VLLM_ASCEND_ENABLE_FLASHCOMM1=1
export VLLM_ENGINE_READY_TIMEOUT_S=3600

# The Ascend driver and the model directory are bind-mounted from the host;
# "ascendlog" is relative to the current directory and is mounted at
# /root/ascend inside the container.
apptainer instance start \
  --no-home \
  --writable-tmpfs \
  -B /usr/local/sbin:/usr/local/sbin \
  -B /usr/local/Ascend/driver:/usr/local/Ascend/driver \
  -B ascendlog:/root/ascend \
  -B "$MODEL_DIR":/model \
  "$VLLM_IMG" \
  appinstance

# Arguments common to the head node and the workers. They are built once so
# the two invocations cannot drift apart (POSIX sh has no arrays, so the
# positional parameters are used as the argument list).
# NOTE(review): the "deepseek_v4" / "deepseek_mtp" spellings and the JSON keys
# had their underscores reconstructed; check them against the vLLM /
# vllm-ascend version shipped in the image.
set -- vllm serve /model \
  --served-model-name "$MODEL_NAME" \
  --host 0.0.0.0 \
  --port "$MODEL_PORT" \
  --data-parallel-size "$SLURM_NNODES" \
  --data-parallel-size-local 1 \
  --data-parallel-address "$node0_ip" \
  --data-parallel-rpc-port 13389 \
  --tensor-parallel-size 8 \
  --quantization ascend \
  --seed 1024 \
  --enable-expert-parallel \
  --max-num-seqs 16 \
  --max-model-len 65536 \
  --max-num-batched-tokens 4096 \
  --tokenizer-mode deepseek_v4 \
  --tool-call-parser deepseek_v4 \
  --enable-auto-tool-choice \
  --reasoning-parser deepseek_v4 \
  --trust-remote-code \
  --async-scheduling \
  --enable-prefix-caching \
  --gpu-memory-utilization 0.95 \
  --safetensors-load-strategy 'prefetch' \
  --default-chat-template-kwargs '{"thinking": true}' \
  --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' \
  --additional-config '{"ascend_compilation_config":{"enable_npugraph_ex":true,"enable_static_kernel":false},"enable_cpu_binding":"True"}' \
  --speculative-config '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'

if [ "$SLURM_NODEID" -eq 0 ]; then
  # Head node: runs the API server.
  apptainer exec instance://appinstance "$@"
else
  # Worker node: no API server; its data-parallel rank is its Slurm node id.
  apptainer exec instance://appinstance "$@" \
    --headless \
    --data-parallel-start-rank "$SLURM_NODEID"
fi
```
来自: