add vllm notes

2026-04-06 11:44:53 -04:00
parent 9776f8ed9f
commit 9eb79d34f1


@@ -45,7 +45,10 @@
- [Benchmark Results](#benchmark-results)
- [Testing with Curl](#testing-with-curl)
- [OpenAI API](#openai-api)
- [VLLM](#vllm)
- [Run VLLM with Podman](#run-vllm-with-podman)
- [Misc](#misc)
- [Quantizing your own Models](#quantizing-your-own-models)
- [Qwen3.5 Settings](#qwen35-settings)
## Notes
@@ -215,6 +218,11 @@ hf download --local-dir . ggml-org/Ministral-3-3B-Instruct-2512-GGUF
##### Qwen
```bash
# qwen3.5-27b-opus
mkdir qwen3.5-27b-opus && cd qwen3.5-27b-opus
hf download --local-dir . Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF Qwen3.5-27B.Q4_K_M.gguf
hf download --local-dir . Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF mmproj-BF16.gguf
# qwen3.5-4b
mkdir qwen3.5-4b && cd qwen3.5-4b
hf download --local-dir . unsloth/Qwen3.5-4B-GGUF Qwen3.5-4B-Q8_0.gguf
@@ -264,6 +272,17 @@ hf download --local-dir . unsloth/GLM-4.7-Flash-GGUF GLM-4.7-Flash-Q8_0.gguf
```bash
# Note "it" vs "pt" suffixes. "it" is instruction following, "pt" is the base model (not as good for out-of-the-box use)
# gemma-4-26b-a4b
mkdir gemma-4-26b-a4b && cd gemma-4-26b-a4b
hf download --local-dir . ggml-org/gemma-4-26B-A4B-it-GGUF gemma-4-26B-A4B-it-Q8_0.gguf
hf download --local-dir . ggml-org/gemma-4-26B-A4B-it-GGUF mmproj-gemma-4-26B-A4B-it-f16.gguf
# gemma-4-31b
mkdir gemma-4-31b && cd gemma-4-31b
hf download --local-dir . ggml-org/gemma-4-31B-it-GGUF gemma-4-31B-it-Q8_0.gguf
hf download --local-dir . ggml-org/gemma-4-31B-it-GGUF mmproj-gemma-4-31B-it-f16.gguf
# gemma-3-27b-it
mkdir gemma-3-27b-it && cd gemma-3-27b-it
hf download --local-dir . unsloth/gemma-3-27b-it-GGUF gemma-3-27b-it-Q8_0.gguf
@@ -353,7 +372,7 @@ podman build -f .devops/vulkan.Dockerfile -t llama-cpp-vulkan:${BUILD_TAG} -t ll
# ROCM
podman build -f .devops/rocm.Dockerfile -t llama-cpp-rocm:${BUILD_TAG} -t llama-cpp-rocm:latest .
# Run llama demo server (Available on port 8000)
# Run llama demo server (Available on port 8010)
podman run \
--rm \
--name llama-server-demo \
@@ -361,10 +380,11 @@ podman run \
--device=/dev/dri \
-v /home/ai/models/text:/models:z \
-p 8010:8000 \
--ipc host \
localhost/llama-cpp-vulkan:latest \
--host 0.0.0.0 \
--port 8000 \
-c 16000 \
-c 128000 \
--perf \
--n-gpu-layers all \
--jinja \
@@ -756,8 +776,64 @@ curl \
}'
```
## VLLM
### Run VLLM with Podman
```bash
# 'latest' and 'nightly' are both viable tags
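# --ipc=host shares the host IPC namespace; vLLM/PyTorch workers rely on shared memory, so this avoids /dev/shm size limits
# Host port 8010 maps to vLLM's default container port 8000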
podman run --rm \
--device /dev/kfd \
--device /dev/dri \
-v ~/.cache/huggingface:/root/.cache/huggingface:z \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8010:8000 \
--ipc=host \
docker.io/vllm/vllm-openai-rocm:nightly \
--enable-offline-docs \
# Pick your model: append exactly one of the following specs to the command above
Qwen/Qwen3.5-35B-A3B --max-model-len 262144 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder
Qwen/Qwen3.5-9B --max-model-len 262144 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder
Qwen/Qwen3.5-35B-A3B-FP8
google/gemma-4-26B-A4B-it
openai/gpt-oss-120b
```
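A quick sanity check once the container is up, assuming the port mapping above (host port 8010) and that the `model` field matches whichever spec was passed to vLLM:
```bash
# List the model the server actually loaded
curl http://localhost:8010/v1/models
# Minimal chat completion against the OpenAI-compatible endpoint
curl http://localhost:8010/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3.5-9B",
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 64
  }'
```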
## Misc
### Quantizing your own Models
```bash
# Create a scratch dir for downloading models
mkdir scratch && cd scratch
# qwen3.5-35b-a3b
mkdir qwen3.5-35b-a3b && cd qwen3.5-35b-a3b
hf download --local-dir . Qwen/Qwen3.5-35B-A3B
# nemotron cascade
mkdir nemotron-cascade-2-30b-a3b && cd nemotron-cascade-2-30b-a3b
hf download --local-dir . nvidia/Nemotron-Cascade-2-30B-A3B
# Run the full llama.cpp image interactively (the :full-* tags bundle the conversion and quantization tools)
podman run -it --rm \
--device=/dev/kfd \
--device=/dev/dri \
-v $(pwd):/models:z \
--entrypoint /bin/bash \
ghcr.io/ggml-org/llama.cpp:full-vulkan
# Run ./llama-quantize to see available quants
# 7 = Q8_0
# 18 = Q6_K
# 17 = Q5_K_M
# 15 = Q4_K_M
./llama-quantize /models/$MODEL_NAME.gguf /models/$MODEL_NAME-Q6_K.gguf 18
./llama-quantize /models/$MODEL_NAME.gguf /models/$MODEL_NAME-Q8_0.gguf 7
```
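`llama-quantize` expects a GGUF input, while the `hf download` lines above pull safetensors repos, so a conversion step comes first. A minimal sketch from inside the same container, assuming the full image ships llama.cpp's `convert_hf_to_gguf.py` in its working directory and that the container was started from the scratch dir, so each repo sits under `/models/<name>`:
```bash
# Convert the safetensors checkpoint to an f16 GGUF (this becomes the $MODEL_NAME.gguf input above)
python3 ./convert_hf_to_gguf.py /models/qwen3.5-35b-a3b \
  --outfile /models/qwen3.5-35b-a3b-f16.gguf \
  --outtype f16
# Then quantize as shown above, e.g. to Q6_K
./llama-quantize /models/qwen3.5-35b-a3b-f16.gguf /models/qwen3.5-35b-a3b-Q6_K.gguf 18
```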
### Qwen3.5 Settings
> We recommend using the following set of sampling parameters for generation