add vllm notes
@@ -45,7 +45,10 @@
- [Benchmark Results](#benchmark-results)
- [Testing with Curl](#testing-with-curl)
- [OpenAI API](#openai-api)
- [VLLM](#vllm)
  - [Run VLLM with Podman](#run-vllm-with-podman)
- [Misc](#misc)
  - [Quantizing your own Models](#quantizing-your-own-models)
  - [Qwen3.5 Settings](#qwen35-settings)

## Notes
@@ -215,6 +218,11 @@ hf download --local-dir . ggml-org/Ministral-3-3B-Instruct-2512-GGUF
##### Qwen

```bash
# qwen3.5-27b-opus
mkdir qwen3.5-27b-opus && cd qwen3.5-27b-opus
hf download --local-dir . Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF Qwen3.5-27B.Q4_K_M.gguf
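# (the mmproj file is the multimodal projector; grab it alongside the main GGUF to enable image input)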
hf download --local-dir . Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF mmproj-BF16.gguf

# qwen3.5-4b
mkdir qwen3.5-4b && cd qwen3.5-4b
hf download --local-dir . unsloth/Qwen3.5-4B-GGUF Qwen3.5-4B-Q8_0.gguf
@@ -264,6 +272,17 @@ hf download --local-dir . unsloth/GLM-4.7-Flash-GGUF GLM-4.7-Flash-Q8_0.gguf

```bash
# Note the "it" vs "pt" suffixes: "it" is the instruction-tuned variant, "pt" is the pretrained base model (not as good for out-of-the-box use)

# gemma-4-26b-a4b
mkdir gemma-4-26b-a4b && cd gemma-4-26b-a4b
hf download --local-dir . ggml-org/gemma-4-26B-A4B-it-GGUF gemma-4-26B-A4B-it-Q8_0.gguf
hf download --local-dir . ggml-org/gemma-4-26B-A4B-it-GGUF mmproj-gemma-4-26B-A4B-it-f16.gguf

# gemma-4-31b
mkdir gemma-4-31b && cd gemma-4-31b
hf download --local-dir . ggml-org/gemma-4-31B-it-GGUF gemma-4-31B-it-Q8_0.gguf
hf download --local-dir . ggml-org/gemma-4-31B-it-GGUF mmproj-gemma-4-31B-it-f16.gguf

# gemma-3-27b-it
mkdir gemma-3-27b-it && cd gemma-3-27b-it
hf download --local-dir . unsloth/gemma-3-27b-it-GGUF gemma-3-27b-it-Q8_0.gguf
@@ -353,7 +372,7 @@ podman build -f .devops/vulkan.Dockerfile -t llama-cpp-vulkan:${BUILD_TAG} -t ll
# ROCM
podman build -f .devops/rocm.Dockerfile -t llama-cpp-rocm:${BUILD_TAG} -t llama-cpp-rocm:latest .

# Run llama demo server (available on host port 8010)
podman run \
--rm \
--name llama-server-demo \
@@ -361,10 +380,11 @@ podman run \
--device=/dev/dri \
-v /home/ai/models/text:/models:z \
-p 8010:8000 \
--ipc host \
localhost/llama-cpp-vulkan:latest \
--host 0.0.0.0 \
--port 8000 \
-c 128000 \
--perf \
--n-gpu-layers all \
--jinja \
@@ -756,8 +776,64 @@ curl \
}'
```

## VLLM
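
vLLM exposes the same OpenAI-compatible API as llama-server, so the curl examples above work against it unchanged (on host port 8010 with the mapping below).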

### Run VLLM with Podman

```bash
# 'latest' and 'nightly' are both viable tags
podman run --rm \
--device /dev/kfd \
--device /dev/dri \
-v ~/.cache/huggingface:/root/.cache/huggingface:z \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8010:8000 \
--ipc=host \
docker.io/vllm/vllm-openai-rocm:nightly \
--enable-offline-docs \

# Pick ONE model line to complete the command above:
Qwen/Qwen3.5-35B-A3B --max-model-len 262144 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder
Qwen/Qwen3.5-9B --max-model-len 262144 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder
Qwen/Qwen3.5-35B-A3B-FP8
google/gemma-4-26B-A4B-it
openai/gpt-oss-120b
```
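
Once the container is up, a quick sanity check against the OpenAI-compatible endpoint (assuming the 8010 host port mapping above):

```bash
# Should return a JSON list containing the model you launched
curl http://localhost:8010/v1/models
```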

## Misc

### Quantizing your own Models

```bash
# Create a scratch dir for downloading models
mkdir scratch && cd scratch

# qwen 3.5 35b
mkdir qwen3.5-35b-a3b && cd qwen3.5-35b-a3b
hf download --local-dir . Qwen/Qwen3.5-35B-A3B

# nemotron cascade
mkdir nemotron-cascade-2-30b-a3b && cd nemotron-cascade-2-30b-a3b
hf download --local-dir . nvidia/Nemotron-Cascade-2-30B-A3B

# Run the full llama.cpp image (a shell with the conversion and quantization tools)
podman run -it --rm \
--device=/dev/kfd \
--device=/dev/dri \
-v $(pwd):/models:z \
--entrypoint /bin/bash \
ghcr.io/ggml-org/llama.cpp:full-vulkan

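# The repos above download the original safetensors weights, so convert them to
# GGUF before quantizing. A sketch, assuming convert_hf_to_gguf.py sits in the
# image's working directory (its exact location may differ):
python3 ./convert_hf_to_gguf.py /models --outfile /models/$MODEL_NAME.gguf --outtype f16
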
# Run ./llama-quantize with no arguments to list all available quant types
# 7 = Q8_0
# 18 = Q6_K
# 17 = Q5_K_M
# 15 = Q4_K_M
./llama-quantize /models/$MODEL_NAME.gguf /models/$MODEL_NAME-Q6_K.gguf 18
./llama-quantize /models/$MODEL_NAME.gguf /models/$MODEL_NAME-Q8_0.gguf 7
```
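
A quick way to sanity-check a fresh quant from the same container (llama-cli ships in the full image):

```bash
# Load the quantized model and generate a few tokens
./llama-cli -m /models/$MODEL_NAME-Q6_K.gguf -p "Hello" -n 16
```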

### Qwen3.5 Settings

> We recommend using the following set of sampling parameters for generation