checkpoint commit
2026-05-05 06:26:40 -04:00
parent e43c534ceb
commit f2015e2c71
76 changed files with 4265 additions and 235 deletions

@@ -1,3 +1,2 @@
[Network]
IPv6=true
Internal=true

@@ -6,6 +6,8 @@ PublishPort=8000:8000/tcp
PublishPort=8001:8001/tcp
# llama.cpp instruct
PublishPort=8002:8002/tcp
# llama.cpp code
PublishPort=8003:8003/tcp
# stable-diffusion.cpp gen
PublishPort=1234:1234/tcp
# stable-diffusion.cpp edit

@@ -34,6 +34,7 @@
- [open-webui](#open-webui)
- [lite-llm](#lite-llm)
- [Install Services with Quadlets](#install-services-with-quadlets)
- [API Keys](#api-keys)
- [Internal and External Pods](#internal-and-external-pods)
- [Llama CPP Server (Port 8000)](#llama-cpp-server-port-8000)
- [Llama CPP Embedding Server (Port 8001)](#llama-cpp-embedding-server-port-8001)
@@ -179,7 +180,11 @@ rsync -av --progress /home/ai/models/ /srv/models/
### Download models
In general I try to run at least an 8-bit quantization.
In my completely subjective opinion, a 5-bit quant is usually the sweet spot for
unsloth models; Q5_K_S is usually just fine.
I usually download the F16 mmproj files. This is also completely subjective:
BF16 is fine, and F32 is overkill.
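If you only want a specific quant out of a large GGUF repo, `hf download` also accepts `--include` glob patterns instead of exact filenames. The repo and directory names below are placeholders, not recommendations:
```bash
# Placeholder repo: grab only the Q5_K_S quant plus the F16 mmproj
mkdir some-model && cd some-model
hf download --local-dir . unsloth/Some-Model-GGUF --include "*Q5_K_S*.gguf" "mmproj-F16.gguf"
```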
#### Text models
@@ -218,8 +223,13 @@ hf download --local-dir . ggml-org/Ministral-3-3B-Instruct-2512-GGUF
##### Qwen
```bash
# qwen3.6-35b-a3b
mkdir qwen3.6-35b-a3b && cd qwen3.6-35b-a3b
hf download --local-dir . unsloth/Qwen3.6-35B-A3B-GGUF Qwen3.6-35B-A3B-UD-Q5_K_M.gguf
hf download --local-dir . unsloth/Qwen3.6-35B-A3B-GGUF mmproj-F16.gguf
# qwen3.5-27b-opus
mkdir qwen3.5-27b-opus && qwen3.5-27b-opus
mkdir qwen3.5-27b-opus && cd qwen3.5-27b-opus
hf download --local-dir . Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF Qwen3.5-27B.Q4_K_M.gguf
hf download --local-dir . Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF mmproj-BF16.gguf
@@ -555,6 +565,22 @@ podman run \
## Install Services with Quadlets
### API Keys
```bash
mkdir -p /home/ai/.llama-api
touch /home/ai/.llama-api/keys.env
chmod 600 /home/ai/.llama-api/keys.env
# Generate keys and append them to the file
openssl rand -base64 48 >> /home/ai/.llama-api/keys.env
openssl rand -base64 48 >> /home/ai/.llama-api/keys.env
openssl rand -base64 48 >> /home/ai/.llama-api/keys.env
# Then edit the file so the keys sit on one comma-separated line:
# LLAMA_API_KEY=<key1>,<key2>,<key3>
vim /home/ai/.llama-api/keys.env
```
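As a sanity check once the services below are running, llama-server should reject requests that do not carry one of these keys. The think server's port 8000 is used here as an example, and `<key1>` is a placeholder for one of the generated values:
```bash
# No token: expect 401 Unauthorized
curl -i http://127.0.0.1:8000/v1/models
# With a valid key from keys.env: expect 200
curl -i -H "Authorization: Bearer <key1>" http://127.0.0.1:8000/v1/models
```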
### Internal and External Pods
These will be used to restrict internet access to our llama.cpp and
@@ -562,10 +588,10 @@ stable-diffusion.cpp services while allowing the frontend services to
communicate with those containers.
```bash
scp -r active/software_ai_stack/quadlets_pods/* deskwork-ai:.config/containers/systemd/
scp -r active/software_ai_stack/ai-internal.* deskwork-ai:.config/containers/systemd/
ssh deskwork-ai
systemctl --user daemon-reload
systemctl --user start ai-internal-pod.service ai-external-pod.service
systemctl --user start ai-internal-pod.service
```
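To confirm the internal network really has no internet egress, something like this should fail. This assumes Quadlet's default network name of `systemd-ai-internal` for `ai-internal.network`; adjust if `NetworkName=` is set:
```bash
# Expect a timeout: Internal=true networks get no outbound route
podman run --rm --network systemd-ai-internal \
  docker.io/curlimages/curl --max-time 5 https://example.com
```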
### Llama CPP Server (Port 8000)
@@ -573,7 +599,7 @@ systemctl --user start ai-internal-pod.service ai-external-pod.service
Installs the llama.cpp server to run our text models.
```bash
scp -r active/software_ai_stack/quadlets_llama_think/* deskwork-ai:.config/containers/systemd/
scp -r active/software_ai_stack/llama-think.container deskwork-ai:.config/containers/systemd/
ssh deskwork-ai
systemctl --user daemon-reload
systemctl --user restart ai-internal-pod.service
@@ -584,7 +610,7 @@ systemctl --user restart ai-internal-pod.service
Installs the llama.cpp server to run our embedding models.
```bash
scp -r active/software_ai_stack/quadlets_llama_embed/* deskwork-ai:.config/containers/systemd/
scp -r active/software_ai_stack/llama-embed.container deskwork-ai:.config/containers/systemd/
ssh deskwork-ai
systemctl --user daemon-reload
systemctl --user restart ai-internal-pod.service
@@ -595,7 +621,7 @@ systemctl --user restart ai-internal-pod.service
Installs the llama.cpp server to run an always-loaded instruct (no thinking) model for quick replies.
```bash
scp -r active/software_ai_stack/quadlets_llama_instruct/* deskwork-ai:.config/containers/systemd/
scp -r active/software_ai_stack/llama-instruct.container deskwork-ai:.config/containers/systemd/
ssh deskwork-ai
systemctl --user daemon-reload
systemctl --user restart ai-internal-pod.service
@@ -711,11 +737,11 @@ Apple M4 max
export TOKEN=$(cat active/software_ai_stack/secrets/aipi-token)
# List Models
curl https://aipi.reeseapps.com/v1/models \
-H "Authorization: Bearer $TOKEN" | jq
curl https://llama-instruct.reeseapps.com/v1/models \
-H "Authorization: Bearer $TOKEN" | jq '.data'
# Text
curl https://aipi.reeseapps.com/v1/chat/completions \
curl https://llama-instruct.reeseapps.com/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $TOKEN" \
-d '{
@@ -724,26 +750,21 @@ curl https://aipi.reeseapps.com/v1/chat/completions \
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello, how are you?"}
],
"temperature": 0.7,
"max_tokens": 500
}' | jq
# Completion
curl https://aipi.reeseapps.com/v1/completions \
curl https://llama-instruct.reeseapps.com/v1/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $TOKEN" \
-d '{
"model": "llama-instruct/instruct",
"prompt": "Write a short poem about the ocean.",
"temperature": 0.7,
"max_tokens": 500,
"top_p": 1,
"frequency_penalty": 0,
"presence_penalty": 0
"max_tokens": 500
}' | jq
# Image Gen
curl https://aipi.reeseapps.com/v1/images/generations \
curl https://image-gen.reeselink.com/v1/images/generations \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $TOKEN" \
-d '{
@@ -766,11 +787,11 @@ curl http://aipi.reeseapps.com/v1/images/edits \
# Embed
curl \
"https://aipi.reeseapps.com/v1/embeddings" \
"https://llama-embed.reeseapps.com/v1/embeddings" \
-H "Authorization: Bearer $TOKEN" \
-H "Content-Type: application/json" \
-d '{
"model": "llama-embed/embed",
"model": "deskwork-embed/embed",
"input":"This is the reason you ended up here:",
"encoding_format": "float"
}'
@@ -789,16 +810,20 @@ podman run --rm \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8010:8000 \
--ipc=host \
-e ROCBLAS_USE_HIPBLASLT=1 \
-e TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 \
-e VLLM_TARGET_DEVICE=rocm \
-e HIP_FORCE_DEV_KERNARG=1 \
-e RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
docker.io/vllm/vllm-openai-rocm:nightly \
--enable-offline-docs
# Pick your model: append one of the following (with its flags) to the command above
# Qwen/Qwen3.5-35B-A3B --max-model-len 262144 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder
# Qwen/Qwen3.5-35B-A3B-FP8 --max-model-len 262144 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder
# Qwen/Qwen3.5-9B --max-model-len 262144 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder
# Qwen/Qwen3.5-35B-A3B-FP8
# google/gemma-4-26B-A4B-it
# openai/gpt-oss-120b
```
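Once the container is up, a quick smoke test against the OpenAI-compatible API (port 8010 is the host side of the `-p 8010:8000` mapping above):
```bash
# List the models vLLM actually loaded
curl http://127.0.0.1:8010/v1/models | jq '.data[].id'
```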
## Misc

@@ -1,5 +1,5 @@
- name: Create Deskwork AI Stack
hosts: toybox-ai
hosts: deskwork-ai
tasks:
- name: Create /home/ai/.config/containers/systemd
ansible.builtin.file:

@@ -15,7 +15,7 @@
- ai-internal.pod
- llama-embed.container
- llama-instruct.container
- llama-think.container
- llama-code.container
- name: Reload and start the ai-internal-pod service
ansible.builtin.systemd_service:
state: restarted

@@ -0,0 +1,22 @@
- name: Create Deskwork AI Stack
hosts: driveripper-ai
tasks:
- name: Create /home/ai/.config/containers/systemd
ansible.builtin.file:
path: /home/ai/.config/containers/systemd
state: directory
mode: "0755"
- name: Copy Quadlets
ansible.builtin.template:
src: "{{ item }}"
dest: "/home/ai/.config/containers/systemd/{{ item }}"
loop:
- ai-internal.network
- ai-internal.pod
- llama-think.container
- name: Reload and start the ai-internal-pod service
ansible.builtin.systemd_service:
state: restarted
name: ai-internal-pod.service
daemon_reload: true
scope: user
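To apply this playbook, run it against your inventory; the filenames here are placeholders for whatever the repo actually uses:
```bash
ansible-playbook -i inventory.ini driveripper-ai-stack.yml
```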

@@ -0,0 +1,49 @@
[Unit]
Description=A Llama CPP Server Running a Coding Model
[Container]
# Shared AI internal pod without internet access
Pod=ai-internal.pod
# Image is built locally via podman build
Image=localhost/llama-cpp-vulkan:latest
# Downloaded models volume
Volume=/home/ai/models/text:/models:z
# GPU Device
AddDevice=/dev/kfd
AddDevice=/dev/dri
# Server command
Exec=--port 8003 \
-c 256000 \
-n 65536 \
--temp 0.7 \
--top-p 0.8 \
--top-k 20 \
--repeat-penalty 1.05 \
--perf \
--n-gpu-layers all \
--jinja \
-m /models/qwen3-coder-30b-a3b/Qwen3-Coder-30B-A3B-Instruct-Q5_K_M.gguf \
--alias code
# Health Check
HealthCmd=CMD-SHELL curl --fail http://127.0.0.1:8003/health || exit 1
HealthInterval=10s
HealthRetries=3
HealthStartPeriod=10s
HealthTimeout=30s
HealthOnFailure=kill
EnvironmentFile=/home/ai/.llama-api/keys.env
[Service]
Restart=always
# Extend Timeout to allow time to pull the image
TimeoutStartSec=900
[Install]
# Start by default on boot
WantedBy=multi-user.target default.target
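To run this health check by hand outside of systemd (the container name assumes Quadlet's default `systemd-` prefix for the `llama-code` unit):
```bash
# Exits non-zero if the /health endpoint is not answering
podman healthcheck run systemd-llama-code && echo healthy
```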

@@ -2,7 +2,7 @@
Description=A Llama CPP Server For Embedding Models
[Container]
# Shared AI internal pod
# Shared AI internal pod without internet access
Pod=ai-internal.pod
# Image is built locally via podman build
@@ -18,21 +18,22 @@ AddDevice=/dev/dri
# Server command
Exec=--port 8001 \
-c 0 \
-b 1024 \
-ub 1024 \
--perf \
--n-gpu-layers all \
--models-max 1 \
--models-dir /models \
--embedding \
-m /models/qwen3-embed-4b/Qwen3-Embedding-4B-Q8_0.gguf \
-m /models/embeddinggemma-300m/embeddinggemma-300M-BF16.gguf \
--alias embed
# Health Check
HealthCmd=CMD-SHELL curl --fail http://127.0.0.1:8001/props || exit 1
HealthCmd=CMD-SHELL curl --fail http://127.0.0.1:8001/health || exit 1
HealthInterval=10s
HealthRetries=3
HealthStartPeriod=10s
HealthTimeout=30s
HealthOnFailure=kill
EnvironmentFile=/home/ai/.llama-api/keys.env
[Service]
Restart=always
@@ -41,4 +42,4 @@ TimeoutStartSec=900
[Install]
# Start by default on boot
WantedBy=multi-user.target default.target
WantedBy=multi-user.target default.target

@@ -1,8 +1,8 @@
[Unit]
Description=A Llama CPP Server Running GPT OSS 120b
Description=A Llama CPP Server Running a Non-Reasoning Model
[Container]
# Shared AI internal pod
# Shared AI internal pod without internet access
Pod=ai-internal.pod
# Image is built locally via podman build
@@ -17,29 +17,31 @@ AddDevice=/dev/dri
# Server command
Exec=--port 8002 \
-c 64000 \
--perf \
-v \
--top-k 20 \
--top-p 0.8 \
--min-p 0 \
--presence-penalty 1.5 \
--repeat-penalty 1 \
-c 262144 \
-n 32768 \
--temp 0.7 \
--top-p 0.8 \
--min-p 0.0 \
--top-k 20 \
--repeat-penalty 1.0 \
--presence-penalty 1.5 \
--reasoning-budget 0 \
--perf \
--n-gpu-layers all \
--jinja \
-m /models/qwen3.6-35b-a3b/Qwen3.6-35B-A3B-UD-Q5_K_M.gguf \
--mmproj /models/qwen3.6-35b-a3b/mmproj-F16.gguf \
--chat-template-kwargs '{"enable_thinking": false}' \
-m /models/qwen3.5-35b-a3b/Qwen3.5-35B-A3B-Q8_0.gguf \
--mmproj /models/qwen3.5-35b-a3b/mmproj-F16.gguf \
--alias instruct
# Health Check
HealthCmd=CMD-SHELL curl --fail http://127.0.0.1:8000/health || exit 1
HealthCmd=CMD-SHELL curl --fail http://127.0.0.1:8002/health || exit 1
HealthInterval=10s
HealthRetries=3
HealthStartPeriod=10s
HealthTimeout=30s
HealthOnFailure=kill
EnvironmentFile=/home/ai/.llama-api/keys.env
[Service]
Restart=always
@@ -48,4 +50,4 @@ TimeoutStartSec=900
[Install]
# Start by default on boot
WantedBy=multi-user.target default.target
WantedBy=multi-user.target default.target

@@ -1,8 +1,8 @@
[Unit]
Description=A Llama CPP Server Running GPT OSS 120b
Description=A Llama CPP Server Running a Reasoning Model
[Container]
# Shared AI internal pod
# Shared AI internal pod without internet access
Pod=ai-internal.pod
# Image is built locally via podman build
@@ -17,16 +17,21 @@ AddDevice=/dev/dri
# Server command
Exec=--port 8000 \
-c 128000 \
--top-k 64 \
-c 262144 \
-n 32768 \
--temp 0.7 \
--top-p 0.95 \
--temp 1.0 \
--top-k 20 \
--min-p 0.0 \
--presence-penalty 0.0 \
--repeat-penalty 1.0 \
--reasoning-budget 5000 \
-fa on \
--perf \
-v \
--n-gpu-layers all \
--jinja \
-m /models/gemma-4-26b-a4b/gemma-4-26B-A4B-it-UD-Q8_K_XL.gguf \
--mmproj /models/gemma-4-26b-a4b/mmproj-BF16.gguf \
-m /models/qwen3.6-35b-a3b/Qwen3.6-35B-A3B-UD-Q5_K_M.gguf \
--mmproj /models/qwen3.6-35b-a3b/mmproj-F16.gguf \
--alias think
# Health Check
@@ -44,4 +49,4 @@ TimeoutStartSec=900
[Install]
# Start by default on boot
WantedBy=multi-user.target default.target
WantedBy=multi-user.target default.target

@@ -2,7 +2,7 @@
Description=A Stable Diffusion CPP Server for Editing Images
[Container]
# Shared AI Internal pod
# Shared AI internal pod without internet access
Pod=ai-internal.pod
# Vulkan image for AMD GPU
@@ -23,7 +23,7 @@ Exec=-l 0.0.0.0 \
--listen-port 1235 \
--diffusion-model /models/image/flux2-klein/flux-2-klein-9b-Q8_0.gguf \
--vae /models/image/flux2-klein/ae.safetensors \
--llm /models/image/flux2-klein/Qwen3-8B-Q8_0.gguf \
--llm /models/image/flux2-klein/Qwen3-8B-Q4_K_M.gguf \
-v \
--sampling-method euler \
--cfg-scale 1.0 \

@@ -2,7 +2,7 @@
Description=A Stable Diffusion CPP Server for Generating Images
[Container]
# Shared AI internal pod
# Shared AI internal pod without internet access
Pod=ai-internal.pod
# Vulkan image for AMD GPU
@@ -23,7 +23,7 @@ Exec=-l 0.0.0.0 \
--listen-port 1234 \
--diffusion-model /models/image/z-turbo/z_image_turbo-Q8_0.gguf \
--vae /models/image/z-turbo/ae.safetensors \
--llm /models/image/z-turbo/Qwen3-4B-Instruct-2507-Q8_0.gguf \
--llm /models/image/z-turbo/Qwen3-4B-Instruct-2507-Q4_K_M.gguf \
-v \
--cfg-scale 1.0 \
--vae-conv-direct \