add vllm notes
This commit is contained in:
@@ -45,7 +45,10 @@
|
|||||||
- [Benchmark Results](#benchmark-results)
|
- [Benchmark Results](#benchmark-results)
|
||||||
- [Testing with Curl](#testing-with-curl)
|
- [Testing with Curl](#testing-with-curl)
|
||||||
- [OpenAI API](#openai-api)
|
- [OpenAI API](#openai-api)
|
||||||
|
- [VLLM](#vllm)
|
||||||
|
- [Run VLLM with Podman](#run-vllm-with-podman)
|
||||||
- [Misc](#misc)
|
- [Misc](#misc)
|
||||||
|
- [Quantizing your own Models](#quantizing-your-own-models)
|
||||||
- [Qwen3.5 Settings](#qwen35-settings)
|
- [Qwen3.5 Settings](#qwen35-settings)
|
||||||
|
|
||||||
## Notes
|
## Notes
|
||||||
@@ -215,6 +218,11 @@ hf download --local-dir . ggml-org/Ministral-3-3B-Instruct-2512-GGUF
|
|||||||
##### Qwen
|
##### Qwen
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
# qwen3.5-27b-opus
|
||||||
|
mkdir qwen3.5-27b-opus && cd qwen3.5-27b-opus
|
||||||
|
hf download --local-dir . Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF Qwen3.5-27B.Q4_K_M.gguf
|
||||||
|
hf download --local-dir . Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF mmproj-BF16.gguf
|
||||||
|
|
||||||
# qwen3.5-4b
|
# qwen3.5-4b
|
||||||
mkdir qwen3.5-4b && cd qwen3.5-4b
|
mkdir qwen3.5-4b && cd qwen3.5-4b
|
||||||
hf download --local-dir . unsloth/Qwen3.5-4B-GGUF Qwen3.5-4B-Q8_0.gguf
|
hf download --local-dir . unsloth/Qwen3.5-4B-GGUF Qwen3.5-4B-Q8_0.gguf
|
||||||
@@ -264,6 +272,17 @@ hf download --local-dir . unsloth/GLM-4.7-Flash-GGUF GLM-4.7-Flash-Q8_0.gguf
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Note "it" vs "pt" suffixes. "it" is instruction following, "pt" is the base model (not as good for out-of-the-box use)
|
# Note "it" vs "pt" suffixes. "it" is instruction following, "pt" is the base model (not as good for out-of-the-box use)
|
||||||
|
|
||||||
|
# gemma-4-26b-a4b
|
||||||
|
mkdir gemma-4-26b-a4b && cd gemma-4-26b-a4b
|
||||||
|
hf download --local-dir . ggml-org/gemma-4-26B-A4B-it-GGUF gemma-4-26B-A4B-it-Q8_0.gguf
|
||||||
|
hf download --local-dir . ggml-org/gemma-4-26B-A4B-it-GGUF mmproj-gemma-4-26B-A4B-it-f16.gguf
|
||||||
|
|
||||||
|
# gemma-4-31b
|
||||||
|
mkdir gemma-4-31b && cd gemma-4-31b
|
||||||
|
hf download --local-dir . ggml-org/gemma-4-31B-it-GGUF gemma-4-31B-it-Q8_0.gguf
|
||||||
|
hf download --local-dir . ggml-org/gemma-4-31B-it-GGUF mmproj-gemma-4-31B-it-f16.gguf
|
||||||
|
|
||||||
# gemma-3-27b-it
|
# gemma-3-27b-it
|
||||||
mkdir gemma-3-27b-it && cd gemma-3-27b-it
|
mkdir gemma-3-27b-it && cd gemma-3-27b-it
|
||||||
hf download --local-dir . unsloth/gemma-3-27b-it-GGUF gemma-3-27b-it-Q8_0.gguf
|
hf download --local-dir . unsloth/gemma-3-27b-it-GGUF gemma-3-27b-it-Q8_0.gguf
|
||||||
@@ -353,7 +372,7 @@ podman build -f .devops/vulkan.Dockerfile -t llama-cpp-vulkan:${BUILD_TAG} -t ll
|
|||||||
# ROCM
|
# ROCM
|
||||||
podman build -f .devops/rocm.Dockerfile -t llama-cpp-rocm:${BUILD_TAG} -t llama-cpp-rocm:latest .
|
podman build -f .devops/rocm.Dockerfile -t llama-cpp-rocm:${BUILD_TAG} -t llama-cpp-rocm:latest .
|
||||||
|
|
||||||
# Run llama demo server (Available on port 8000)
|
# Run llama demo server (Available on port 8010)
|
||||||
podman run \
|
podman run \
|
||||||
--rm \
|
--rm \
|
||||||
--name llama-server-demo \
|
--name llama-server-demo \
|
||||||
@@ -361,10 +380,11 @@ podman run \
|
|||||||
--device=/dev/dri \
|
--device=/dev/dri \
|
||||||
-v /home/ai/models/text:/models:z \
|
-v /home/ai/models/text:/models:z \
|
||||||
-p 8010:8000 \
|
-p 8010:8000 \
|
||||||
|
--ipc host \
|
||||||
localhost/llama-cpp-vulkan:latest \
|
localhost/llama-cpp-vulkan:latest \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 8000 \
|
--port 8000 \
|
||||||
-c 16000 \
|
-c 128000 \
|
||||||
--perf \
|
--perf \
|
||||||
--n-gpu-layers all \
|
--n-gpu-layers all \
|
||||||
--jinja \
|
--jinja \
|
||||||
@@ -756,8 +776,64 @@ curl \
|
|||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## VLLM
|
||||||
|
|
||||||
|
### Run VLLM with Podman
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 'latest' and 'nightly' are both viable tags
|
||||||
|
podman run --rm \
|
||||||
|
--device /dev/kfd \
|
||||||
|
--device /dev/dri \
|
||||||
|
-v ~/.cache/huggingface:/root/.cache/huggingface:z \
|
||||||
|
--env "HF_TOKEN=$HF_TOKEN" \
|
||||||
|
-p 8010:8000 \
|
||||||
|
--ipc=host \
|
||||||
|
docker.io/vllm/vllm-openai-rocm:nightly \
|
||||||
|
--enable-offline-docs \
|
||||||
|
|
||||||
|
# Pick your model
|
||||||
|
Qwen/Qwen3.5-35B-A3B --max-model-len 262144 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder
|
||||||
|
Qwen/Qwen3.5-9B --max-model-len 262144 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder
|
||||||
|
Qwen/Qwen3.5-35B-A3B-FP8
|
||||||
|
google/gemma-4-26B-A4B-it
|
||||||
|
openai/gpt-oss-120b
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
## Misc
|
## Misc
|
||||||
|
|
||||||
|
### Quantizing your own Models
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create a scratch dir for downloading models
|
||||||
|
mkdir scratch && cd scratch
|
||||||
|
|
||||||
|
# qwen 3.5 35b
|
||||||
|
mkdir qwen3.5-35b-a3b && cd qwen3.5-35b-a3b
|
||||||
|
hf download --local-dir . Qwen/Qwen3.5-35B-A3B
|
||||||
|
|
||||||
|
# nemotron cascade
|
||||||
|
mkdir nemotron-cascade-2-30b-a3b && cd nemotron-cascade-2-30b-a3b
|
||||||
|
hf download --local-dir . nvidia/Nemotron-Cascade-2-30B-A3B
|
||||||
|
|
||||||
|
# Run the full llama.cpp image (full-vulkan), which includes llama-quantize
|
||||||
|
podman run -it --rm \
|
||||||
|
--device=/dev/kfd \
|
||||||
|
--device=/dev/dri \
|
||||||
|
-v $(pwd):/models:z \
|
||||||
|
--entrypoint /bin/bash \
|
||||||
|
ghcr.io/ggml-org/llama.cpp:full-vulkan
|
||||||
|
|
||||||
|
# Run ./llama-quantize to see available quants
|
||||||
|
# 7 = q_8
|
||||||
|
# 18 = q_6_k
|
||||||
|
# 17 = q_5_k
|
||||||
|
# 15 = q_4_k
|
||||||
|
./llama-quantize /models/$MODEL_NAME.gguf /models/$MODEL_NAME-Q6_K.gguf 18
|
||||||
|
./llama-quantize /models/$MODEL_NAME.gguf /models/$MODEL_NAME-Q8_0.gguf 7
|
||||||
|
```
|
||||||
|
|
||||||
### Qwen3.5 Settings
|
### Qwen3.5 Settings
|
||||||
|
|
||||||
> We recommend using the following set of sampling parameters for generation
|
> We recommend using the following set of sampling parameters for generation
|
||||||
|
|||||||
Reference in New Issue
Block a user