From cc75227a77841dd463f800f9560bee0d6dddb39f Mon Sep 17 00:00:00 2001
From: ducoterra <git@ducoterra.net>
Date: Mon, 16 Mar 2026 09:54:13 -0400
Subject: [PATCH] reconfigure software ai stack

---
 .../{quadlets_pods => }/ai-internal.network   |   0
 .../{quadlets_pods => }/ai-internal.pod       |   6 +-
 active/software_ai_stack/ai_stack.md          | 315 ++++++++++++++++--
 .../install_ai_image_stack.yaml               |  23 ++
 .../install_ai_text_stack.yaml                |  24 ++
 .../llama-embed.container                     |  11 +-
 .../llama-instruct.container                  |  51 +++
 ...server.container => llama-think.container} |   4 +-
 .../open-webui.container                      |   2 +-
 active/software_ai_stack/openai-example.py    | 133 ++++++++
 .../quadlets_guest/open-webui-guest.container |  32 --
 .../quadlets_pods/ai-external.network         |   2 -
 .../quadlets_pods/ai-external.pod             |   6 -
 .../stable-diffusion-edit-server.container    |   0
 .../stable-diffusion-gen-server.container     |   0
 15 files changed, 526 insertions(+), 83 deletions(-)
 rename active/software_ai_stack/{quadlets_pods => }/ai-internal.network (100%)
 rename active/software_ai_stack/{quadlets_pods => }/ai-internal.pod (60%)
 create mode 100644 active/software_ai_stack/install_ai_image_stack.yaml
 create mode 100644 active/software_ai_stack/install_ai_text_stack.yaml
 rename active/software_ai_stack/{quadlets_guest => }/llama-embed.container (75%)
 create mode 100644 active/software_ai_stack/llama-instruct.container
 rename active/software_ai_stack/{quadlets_llama_server/llama-server.container => llama-think.container} (89%)
 rename active/software_ai_stack/{quadlets_openwebui => }/open-webui.container (96%)
 create mode 100644 active/software_ai_stack/openai-example.py
 delete mode 100644 active/software_ai_stack/quadlets_guest/open-webui-guest.container
 delete mode 100644 active/software_ai_stack/quadlets_pods/ai-external.network
 delete mode 100644 active/software_ai_stack/quadlets_pods/ai-external.pod
 rename active/software_ai_stack/{quadlets_stable_diffusion => }/stable-diffusion-edit-server.container (100%)
 rename active/software_ai_stack/{quadlets_stable_diffusion => }/stable-diffusion-gen-server.container (100%)

diff --git a/active/software_ai_stack/quadlets_pods/ai-internal.network b/active/software_ai_stack/ai-internal.network
similarity index 100%
rename from active/software_ai_stack/quadlets_pods/ai-internal.network
rename to active/software_ai_stack/ai-internal.network
diff --git a/active/software_ai_stack/quadlets_pods/ai-internal.pod b/active/software_ai_stack/ai-internal.pod
similarity index 60%
rename from active/software_ai_stack/quadlets_pods/ai-internal.pod
rename to active/software_ai_stack/ai-internal.pod
index a5b22bb..23cca87 100644
--- a/active/software_ai_stack/quadlets_pods/ai-internal.pod
+++ b/active/software_ai_stack/ai-internal.pod
@@ -1,7 +1,11 @@
 [Pod]
 Network=ai-internal.network
-# llama.cpp
+# llama.cpp server
 PublishPort=8000:8000/tcp
+# llama.cpp embed
+PublishPort=8001:8001/tcp
+# llama.cpp instruct
+PublishPort=8002:8002/tcp
 # stable-diffusion.cpp gen
 PublishPort=1234:1234/tcp
 # stable-diffusion.cpp edit
diff --git a/active/software_ai_stack/ai_stack.md b/active/software_ai_stack/ai_stack.md
index a1f66bc..840227e 100644
--- a/active/software_ai_stack/ai_stack.md
+++ b/active/software_ai_stack/ai_stack.md
@@ -3,6 +3,10 @@
 - [Self Hosted AI Stack](#self-hosted-ai-stack)
   - [Notes](#notes)
     - [Podman Volume Locations](#podman-volume-locations)
+    - [List of Internal Links](#list-of-internal-links)
+  - [Quick Install](#quick-install)
+    - [Text Stack](#text-stack)
+    - [Image Stack](#image-stack)
   - [Setup](#setup)
     - [Create the AI user](#create-the-ai-user)
     - [Helper aliases](#helper-aliases)
@@ -17,6 +21,8 @@
         - [GLM](#glm)
         - [Gemma](#gemma)
         - [Dolphin](#dolphin)
+        - [LiquidAI](#liquidai)
+        - [Level 1 Techs](#level-1-techs)
       - [Image models](#image-models)
         - [Z-Image](#z-image)
         - [Flux](#flux)
@@ -26,22 +32,71 @@
   - [llama.cpp](#llamacpp)
   - [stable-diffusion.cpp](#stable-diffusioncpp)
   - [open-webui](#open-webui)
+  - [lite-llm](#lite-llm)
   - [Install Services with Quadlets](#install-services-with-quadlets)
     - [Internal and External Pods](#internal-and-external-pods)
-    - [Llama CPP Server](#llama-cpp-server)
-    - [Llama CPP Embedding Server](#llama-cpp-embedding-server)
-    - [Stable Diffusion CPP](#stable-diffusion-cpp)
-    - [Open Webui](#open-webui-1)
+    - [Llama CPP Server (Port 8000)](#llama-cpp-server-port-8000)
+    - [Llama CPP Embedding Server (Port 8001)](#llama-cpp-embedding-server-port-8001)
+    - [Llama CPP Instruct Server (Port 8002)](#llama-cpp-instruct-server-port-8002)
+    - [Stable Diffusion CPP (Port 1234 and 1235)](#stable-diffusion-cpp-port-1234-and-1235)
+    - [Open Webui (Port 8080)](#open-webui-port-8080)
     - [Install the update script](#install-the-update-script)
     - [Install Guest Open Webui with Start/Stop Services](#install-guest-open-webui-with-startstop-services)
   - [Benchmark Results](#benchmark-results)
+  - [Testing with Curl](#testing-with-curl)
+    - [OpenAI API](#openai-api)
+  - [Misc](#misc)
+    - [Qwen3.5 Settings](#qwen35-settings)
 
 ## Notes
 
+```bash
+# Shortcut for downloading models
+hf-download ()
+{
+    if [ $# -ne 3 ]; then
+        echo "ERROR: Expected 3 arguments, but only got $#" 1>&2
+        return 1
+    fi
+    BASE_DIR='/opt/ai/models'
+    mkdir -p $BASE_DIR/$1
+    pushd $BASE_DIR/$1 2>&1 >/dev/null
+    hf download --local-dir . $2 $3
+    popd 2>&1 >/dev/null
+}
+```
+
 ### Podman Volume Locations
 
 `~/.local/share/containers/storage/volumes/`
 
+### List of Internal Links
+
+- llama-cpp
+- llama-embed
+- llama-instruct
+- image-gen
+- image-edit
+- openwebui
+
+## Quick Install
+
+### Text Stack
+
+```bash
+ansible-playbook \
+-i ansible/inventory.yaml \
+active/software_ai_stack/install_ai_text_stack.yaml
+```
+
+### Image Stack
+
+```bash
+ansible-playbook \
+-i ansible/inventory.yaml \
+active/software_ai_stack/install_ai_image_stack.yaml
+```
+
 ## Setup
 
 ### Create the AI user
@@ -160,9 +215,15 @@ hf download --local-dir . ggml-org/Ministral-3-3B-Instruct-2512-GGUF
 ##### Qwen
 
 ```bash
-# qwen3-30b-a3b-thinking
-mkdir qwen3-30b-a3b-thinking && cd qwen3-30b-a3b-thinking
-hf download --local-dir . ggml-org/Qwen3-30B-A3B-Thinking-2507-Q8_0-GGUF
+# qwen3.5-4b
+mkdir qwen3.5-4b && cd qwen3.5-4b
+hf download --local-dir . unsloth/Qwen3.5-4B-GGUF Qwen3.5-4B-Q8_0.gguf
+hf download --local-dir . unsloth/Qwen3.5-4B-GGUF mmproj-F16.gguf
+
+# qwen3.5-35b-a3b
+mkdir qwen3.5-35b-a3b && cd qwen3.5-35b-a3b
+hf download --local-dir . unsloth/Qwen3.5-35B-A3B-GGUF Qwen3.5-35B-A3B-Q8_0.gguf
+hf download --local-dir . unsloth/Qwen3.5-35B-A3B-GGUF mmproj-F16.gguf
 
 # qwen3-30b-a3b-instruct
 mkdir qwen3-30b-a3b-instruct && cd qwen3-30b-a3b-instruct
@@ -185,6 +246,10 @@ hf download --local-dir . ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
 # qwen3-coder-next
 mkdir qwen3-coder-next && cd qwen3-coder-next
 hf download --local-dir . unsloth/Qwen3-Coder-Next-GGUF --include "Q8_0/*.gguf"
+
+# qwen3-8b (benchmarks)
+mkdir qwen3-8b && cd qwen3-8b
+hf download --local-dir . Qwen/Qwen3-8B-GGUF Qwen3-8B-Q8_0.gguf
 ```
 
 ##### GLM
@@ -210,10 +275,26 @@ hf download --local-dir . unsloth/gemma-3-27b-it-GGUF mmproj-F16.gguf
 ```bash
 # dolphin-mistral-24b-venice
 mkdir dolphin-mistral-24b-venice && cd dolphin-mistral-24b-venice
-cd dolphin-mistral-24b-venice
 hf download --local-dir . bartowski/cognitivecomputations_Dolphin-Mistral-24B-Venice-Edition-GGUF cognitivecomputations_Dolphin-Mistral-24B-Venice-Edition-Q8_0.gguf
 ```
 
+##### LiquidAI
+
+```bash
+# lfm2-24b
+mkdir lfm2-24b && cd lfm2-24b
+hf download --local-dir . LiquidAI/LFM2-24B-A2B-GGUF LFM2-24B-A2B-Q8_0.gguf
+```
+
+##### Level 1 Techs
+
+```bash
+# kappa-20b
+# https://huggingface.co/eousphoros/kappa-20b-131k-GGUF-Q8_0/tree/main
+mkdir kappa-20b && cd kappa-20b
+hf download --local-dir . eousphoros/kappa-20b-131k-GGUF-Q8_0
+```
+
 #### Image models
 
 ##### Z-Image
@@ -244,8 +325,8 @@ hf download --local-dir . unsloth/Qwen3-8B-GGUF Qwen3-8B-Q8_0.gguf
 ##### Qwen Embedding
 
 ```bash
-mkdir /home/ai/models/embedding/qwen3-vl-embed && cd /home/ai/models/embedding/qwen3-vl-embed
-hf download --local-dir . dam2452/Qwen3-VL-Embedding-8B-GGUF Qwen3-VL-Embedding-8B-Q8_0.gguf
+mkdir qwen3-embed-4b && cd qwen3-embed-4b
+hf download --local-dir . Qwen/Qwen3-Embedding-4B-GGUF Qwen3-Embedding-4B-Q8_0.gguf
 ```
 
 ##### Nomic Embedding
@@ -279,16 +360,44 @@ podman run \
 --device=/dev/kfd \
 --device=/dev/dri \
 -v /home/ai/models/text:/models:z \
--p 8000:8000 \
+-p 8010:8000 \
 localhost/llama-cpp-vulkan:latest \
 --host 0.0.0.0 \
 --port 8000 \
--c 32768 \
+-c 16000 \
 --perf \
 --n-gpu-layers all \
 --jinja \
 --models-max 1 \
---models-dir /models
+--models-dir /models \
+--chat-template-kwargs '{"enable_thinking": false}' \
+-m /models/qwen3.5-35b-a3b
+```
+
+Embedding models
+
+```bash
+podman run \
+--rm \
+--name llama-server-demo \
+--device=/dev/kfd \
+--device=/dev/dri \
+-v /home/ai/models/text:/models:z \
+-p 8001:8001 \
+localhost/llama-cpp-vulkan:latest \
+--host 0.0.0.0 \
+--port 8001 \
+-c 512 \
+--perf \
+--n-gpu-layers all \
+--models-max 1 \
+--models-dir /models \
+--embedding
+```
+
+```bash
+# Test with curl
+curl -X POST "https://llama-embed.reeselink.com/embedding" --data '{"model": "qwen3-embed-4b", "content":"Star Wars is better than Star Trek"}'
 ```
 
 ## stable-diffusion.cpp
@@ -354,6 +463,37 @@ localhost/stable-diffusion-cpp:latest \
 -r /output/output.png \
 -o /output/edit.png \
 -p "Replace the dragon with an old car"
+
+# Video generation with wan2.2
+podman run --rm \
+-v /home/ai/models:/models:z \
+-v /home/ai/output:/output:z \
+--device /dev/kfd \
+--device /dev/dri \
+localhost/stable-diffusion-cpp:latest \
+-M vid_gen \
+--diffusion-model /models/video/wan2.2/Wan2.2-T2V-A14B-LowNoise-Q5_K_M.gguf \
+--high-noise-diffusion-model /models/video/wan2.2/Wan2.2-T2V-A14B-HighNoise-Q5_K_M.gguf \
+--vae /models/video/wan2.2/wan_2.1_vae.safetensors \
+--t5xxl /models/video/wan2.2/umt5-xxl-encoder-Q5_K_M.gguf \
+--cfg-scale 3.5 \
+--sampling-method euler \
+--steps 10 \
+--high-noise-cfg-scale 3.5 \
+--high-noise-sampling-method euler \
+--high-noise-steps 8 \
+--vae-conv-direct \
+--diffusion-conv-direct \
+--vae-tiling \
+-v \
+-n "Colorful tones, overexposed, static, blurred details, subtitles, style, artwork, painting, picture, still, overall graying, worst quality, low quality, JPEG compression residue, ugly, mutilated, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, deformed limbs, finger fusion, still pictures, messy backgrounds, three legs, many people in the background, walking backwards" \
+-W 512 \
+-H 512 \
+--diffusion-fa \
+--video-frames 24 \
+--flow-shift 3.0 \
+-o /output/video_output \
+-p "A normal business meeting. People discuss business for 2 seconds. Suddenly, a horde of furries carrying assault rifles bursts into the room and causes a panic. Hatsune Miku leads the charge screaming in rage."
 ```
 
 ## open-webui
@@ -382,6 +522,17 @@ Use the following connections:
 | stable-diffusion.cpp      | <http://host.containers.internal:1234/v1> |
 | stable-diffusion.cpp edit | <http://host.containers.internal:1235/v1> |
 
+## lite-llm
+
+<https://docs.litellm.ai/docs/proxy/configs>
+
+```bash
+podman run --rm \
+--name litellm \
+-p 4000:4000 \
+ghcr.io/berriai/litellm:main-latest
+```
+
 ## Install Services with Quadlets
 
 ### Internal and External Pods
@@ -397,18 +548,18 @@ systemctl --user daemon-reload
 systemctl --user start ai-internal-pod.service ai-external-pod.service
 ```
 
-### Llama CPP Server
+### Llama CPP Server (Port 8000)
 
 Installs the llama.cpp server to run our text models.
 
 ```bash
-scp -r active/software_ai_stack/quadlets_llama_server/* deskwork-ai:.config/containers/systemd/
+scp active/software_ai_stack/llama-think.container deskwork-ai:.config/containers/systemd/
 ssh deskwork-ai
 systemctl --user daemon-reload
 systemctl --user restart ai-internal-pod.service
 ```
 
-### Llama CPP Embedding Server
+### Llama CPP Embedding Server (Port 8001)
 
 Installs the llama.cpp server to run our embedding models
 
@@ -419,7 +570,18 @@ systemctl --user daemon-reload
 systemctl --user restart ai-internal-pod.service
 ```
 
-### Stable Diffusion CPP
+### Llama CPP Instruct Server (Port 8002)
+
+Installs the llama.cpp server to run a constant instruct (no thinking) model for quick replies
+
+```bash
+scp active/software_ai_stack/llama-instruct.container deskwork-ai:.config/containers/systemd/
+ssh deskwork-ai
+systemctl --user daemon-reload
+systemctl --user restart ai-internal-pod.service
+```
+
+### Stable Diffusion CPP (Port 1234 and 1235)
 
 Installs the stable-diffusion.cpp server to run our image models.
 
@@ -430,7 +592,7 @@ systemctl --user daemon-reload
 systemctl --user restart ai-internal-pod.service
 ```
 
-### Open Webui
+### Open Webui (Port 8080)
 
 Installs the open webui frontend.
 
@@ -482,27 +644,22 @@ podman run -it --rm \
 ghcr.io/ggml-org/llama.cpp:full-vulkan
 
 # Benchmark command
-./llama-bench -m /models/benchmark/gpt-oss-20b-Q8_0.gguf
+./llama-bench -m /models/gpt-oss-20b/gpt-oss-20b-Q8_0.gguf -p 4096 -n 1024
 ```
 
 Framework Desktop
 
-| model            |      size |  params | backend |  ngl |  test |            t/s |
-| ---------------- | --------: | ------: | ------- | ---: | ----: | -------------: |
-| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | Vulkan  |   99 | pp512 | 1128.50 ± 7.60 |
-| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | Vulkan  |   99 | tg128 |   77.94 ± 0.08 |
-
-| model            |      size |  params | backend |  ngl |  test |           t/s |
-| ---------------- | --------: | ------: | ------- | ---: | ----: | ------------: |
-| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | ROCm    |   99 | pp512 | 526.05 ± 7.04 |
-| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | ROCm    |   99 | tg128 |  70.98 ± 0.01 |
+| model            |      size |  params | backend |  ngl |   test |           t/s |
+| ---------------- | --------: | ------: | ------- | ---: | -----: | ------------: |
+| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | Vulkan  |   99 | pp4096 | 992.74 ± 6.07 |
+| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | Vulkan  |   99 | tg1024 |  75.82 ± 0.07 |
 
 AMD R9700
 
-| model            |      size |  params | backend |  ngl |  test |              t/s |
-| ---------------- | --------: | ------: | ------- | ---: | ----: | ---------------: |
-| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | Vulkan  |   99 | pp512 | 3756.79 ± 203.97 |
-| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | Vulkan  |   99 | tg128 |    174.24 ± 0.32 |
+| model            |      size |  params | backend |  ngl |   test |            t/s |
+| ---------------- | --------: | ------: | ------- | ---: | -----: | -------------: |
+| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | Vulkan  |   99 | pp4096 | 3190.85 ± 8.24 |
+| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | Vulkan  |   99 | tg1024 |  168.73 ± 0.15 |
 
 NVIDIA GeForce RTX 4080 SUPER
 
@@ -514,10 +671,10 @@ NVIDIA GeForce RTX 4080 SUPER
 
 NVIDIA GeForce RTX 3090
 
-| model            |      size |  params | backend |  ngl |  test |             t/s |
-| ---------------- | --------: | ------: | ------- | ---: | ----: | --------------: |
-| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | CUDA    |   99 | pp512 | 4297.72 ± 35.60 |
-| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | CUDA    |   99 | tg128 |   197.73 ± 0.62 |
+| model            |      size |  params | backend     |  ngl |   test |             t/s |
+| ---------------- | --------: | ------: | ----------- | ---: | -----: | --------------: |
+| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | CUDA,Vulkan |   99 | pp4096 | 3034.03 ± 80.36 |
+| gpt-oss 20B Q8_0 | 11.27 GiB | 20.91 B | CUDA,Vulkan |   99 | tg1024 |   181.05 ± 9.01 |
 
 Apple M4 max
 
@@ -525,3 +682,89 @@ Apple M4 max
 | :---------------------------- | -----: | -------------: |
 | unsloth/gpt-oss-20b-Q8_0-GGUF | pp2048 | 1579.12 ± 7.12 |
 | unsloth/gpt-oss-20b-Q8_0-GGUF |   tg32 |  113.00 ± 2.81 |
+
+## Testing with Curl
+
+### OpenAI API
+
+```bash
+export TOKEN=$(cat active/software_ai_stack/secrets/aipi-token)
+
+# List Models
+curl https://aipi.reeseapps.com/v1/models \
+-H "Authorization: Bearer $TOKEN" | jq
+
+# Text
+curl https://aipi.reeseapps.com/v1/chat/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer $TOKEN" \
+-d '{
+  "model": "llama-instruct/instruct",
+  "messages": [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Hello, how are you?"}
+  ],
+  "temperature": 0.7,
+  "max_tokens": 500
+}' | jq
+
+# Completion
+curl https://aipi.reeseapps.com/v1/completions \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer $TOKEN" \
+-d '{
+  "model": "llama-instruct/instruct",
+  "prompt": "Write a short poem about the ocean.",
+  "temperature": 0.7,
+  "max_tokens": 500,
+  "top_p": 1,
+  "frequency_penalty": 0,
+  "presence_penalty": 0
+}' | jq
+
+# Image Gen
+curl https://aipi.reeseapps.com/v1/images/generations \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer $TOKEN" \
+-d '{
+  "model": "sdd-gen/sd-cpp-local",
+  "prompt": "A futuristic city with flying cars at sunset, digital art",
+  "n": 1,
+  "size": "1024x1024"
+}' | jq
+
+# Image Edit
+curl https://aipi.reeseapps.com/v1/images/edits \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer $TOKEN" \
+-d '{
+  "model": "sdd-edit/sd-cpp-local",
+  "image": "@path/to/your/image.jpg",
+  "prompt": "Add a sunset background",
+  "n": 1, "size": "1024x1024"
+}'
+
+# Embed
+curl \
+"https://aipi.reeseapps.com/v1/embeddings" \
+-H "Authorization: Bearer $TOKEN" \
+-H "Content-Type: application/json" \
+-d '{
+  "model": "llama-embed/embed",
+  "input":"This is the reason you ended up here:",
+  "encoding_format": "float"
+}'
+```
+
+## Misc
+
+### Qwen3.5 Settings
+
+> We recommend using the following set of sampling parameters for generation
+
+- Non-thinking mode for text tasks: temperature=1.0, top_p=1.00, top_k=20, min_p=0.0, presence_penalty=2.0, repetition_penalty=1.0
+- Non-thinking mode for VL tasks: temperature=0.7, top_p=0.80, top_k=20, min_p=0.0, presence_penalty=1.5, repetition_penalty=1.0
+- Thinking mode for text tasks: temperature=1.0, top_p=0.95, top_k=20, min_p=0.0, presence_penalty=1.5, repetition_penalty=1.0
+- Thinking mode for VL or precise coding (e.g. WebDev) tasks : temperature=0.6, top_p=0.95, top_k=20, min_p=0.0, presence_penalty=0.0, repetition_penalty=1.0
+
+> Please note that the support for sampling parameters varies according to inference frameworks.
diff --git a/active/software_ai_stack/install_ai_image_stack.yaml b/active/software_ai_stack/install_ai_image_stack.yaml
new file mode 100644
index 0000000..460f5c3
--- /dev/null
+++ b/active/software_ai_stack/install_ai_image_stack.yaml
@@ -0,0 +1,23 @@
+- name: Create AI Image Stack
+  hosts: toybox-ai
+  tasks:
+    - name: Create /home/ai/.config/containers/systemd
+      ansible.builtin.file:
+        path: /home/ai/.config/containers/systemd
+        state: directory
+        mode: "0755"
+    - name: Copy image stack Quadlets into the systemd quadlet directory
+      ansible.builtin.template:
+        src: "{{ item }}"
+        dest: "/home/ai/.config/containers/systemd/{{ item }}"
+      loop:
+        - ai-internal.network
+        - ai-internal.pod
+        - stable-diffusion-gen-server.container
+        - stable-diffusion-edit-server.container
+    - name: Reload and start the ai-internal-pod service
+      ansible.builtin.systemd_service:
+        state: restarted
+        name: ai-internal-pod.service
+        daemon_reload: true
+        scope: user
diff --git a/active/software_ai_stack/install_ai_text_stack.yaml b/active/software_ai_stack/install_ai_text_stack.yaml
new file mode 100644
index 0000000..bc1c006
--- /dev/null
+++ b/active/software_ai_stack/install_ai_text_stack.yaml
@@ -0,0 +1,24 @@
+- name: Create AI Text Stack
+  hosts: deskwork-ai
+  tasks:
+    - name: Create /home/ai/.config/containers/systemd
+      ansible.builtin.file:
+        path: /home/ai/.config/containers/systemd
+        state: directory
+        mode: "0755"
+    - name: Copy text stack Quadlets into the systemd quadlet directory
+      ansible.builtin.template:
+        src: "{{ item }}"
+        dest: "/home/ai/.config/containers/systemd/{{ item }}"
+      loop:
+        - ai-internal.network
+        - ai-internal.pod
+        - llama-embed.container
+        - llama-instruct.container
+        - llama-think.container
+    - name: Reload and start the ai-internal-pod service
+      ansible.builtin.systemd_service:
+        state: restarted
+        name: ai-internal-pod.service
+        daemon_reload: true
+        scope: user
diff --git a/active/software_ai_stack/quadlets_guest/llama-embed.container b/active/software_ai_stack/llama-embed.container
similarity index 75%
rename from active/software_ai_stack/quadlets_guest/llama-embed.container
rename to active/software_ai_stack/llama-embed.container
index a800ef7..c3a1a25 100644
--- a/active/software_ai_stack/quadlets_guest/llama-embed.container
+++ b/active/software_ai_stack/llama-embed.container
@@ -1,5 +1,5 @@
 [Unit]
-Description=A Llama CPP Server running an Embedding Model
+Description=A Llama CPP Server For Embedding Models
 
 [Container]
 # Shared AI internal pod
@@ -17,9 +17,14 @@ AddDevice=/dev/dri
 
 # Server command
 Exec=--port 8001 \
+    -c 0 \
+    --perf \
     --n-gpu-layers all \
-    --embeddings \
-    -m /models/nomic-embed-text-v2/nomic-embed-text-v2-moe-q8_0.gguf
+    --models-max 1 \
+    --models-dir /models \
+    --embedding \
+    -m /models/qwen3-embed-4b/Qwen3-Embedding-4B-Q8_0.gguf \
+    --alias embed
 
 # Health Check
 HealthCmd=CMD-SHELL curl --fail http://127.0.0.1:8001/props || exit 1
diff --git a/active/software_ai_stack/llama-instruct.container b/active/software_ai_stack/llama-instruct.container
new file mode 100644
index 0000000..2bc3116
--- /dev/null
+++ b/active/software_ai_stack/llama-instruct.container
@@ -0,0 +1,51 @@
+[Unit]
+Description=A Llama CPP Server Running an Instruct (No Thinking) Model
+
+[Container]
+# Shared AI internal pod
+Pod=ai-internal.pod
+
+# Image is built locally via podman build
+Image=localhost/llama-cpp-vulkan:latest
+
+# Downloaded models volume
+Volume=/home/ai/models/text:/models:z
+
+# GPU Device
+AddDevice=/dev/kfd
+AddDevice=/dev/dri
+
+# Server command
+Exec=--port 8002 \
+    -c 16000 \
+    --perf \
+    -v \
+    --top-k 20 \
+    --top-p 0.8 \
+    --min-p 0 \
+    --presence-penalty 1.5 \
+    --repeat-penalty 1 \
+    --temp 0.7 \
+    --n-gpu-layers all \
+    --jinja \
+    --chat-template-kwargs '{"enable_thinking": false}' \
+    -m /models/qwen3.5-35b-a3b/Qwen3.5-35B-A3B-Q8_0.gguf \
+    --mmproj /models/qwen3.5-35b-a3b/mmproj-F16.gguf \
+    --alias instruct
+
+# Health Check (must target this server's own port, 8002, not a pod sibling)
+HealthCmd=CMD-SHELL curl --fail http://127.0.0.1:8002/health || exit 1
+HealthInterval=10s
+HealthRetries=3
+HealthStartPeriod=10s
+HealthTimeout=30s
+HealthOnFailure=kill
+
+[Service]
+Restart=always
+# Extend Timeout to allow time to pull the image
+TimeoutStartSec=900
+
+[Install]
+# Start by default on boot
+WantedBy=multi-user.target default.target
diff --git a/active/software_ai_stack/quadlets_llama_server/llama-server.container b/active/software_ai_stack/llama-think.container
similarity index 89%
rename from active/software_ai_stack/quadlets_llama_server/llama-server.container
rename to active/software_ai_stack/llama-think.container
index 354552f..b1ce4f2 100644
--- a/active/software_ai_stack/quadlets_llama_server/llama-server.container
+++ b/active/software_ai_stack/llama-think.container
@@ -17,7 +17,7 @@ AddDevice=/dev/dri
 
 # Server command
 Exec=--port 8000 \
-    -c 16384 \
+    -c 64000 \
     --perf \
     --n-gpu-layers all \
     --jinja \
@@ -25,7 +25,7 @@ Exec=--port 8000 \
     --models-dir /models
 
 # Health Check
-HealthCmd=CMD-SHELL curl --fail http://127.0.0.1:8000/props || exit 1
+HealthCmd=CMD-SHELL curl --fail http://127.0.0.1:8000/health || exit 1
 HealthInterval=10s
 HealthRetries=3
 HealthStartPeriod=10s
diff --git a/active/software_ai_stack/quadlets_openwebui/open-webui.container b/active/software_ai_stack/open-webui.container
similarity index 96%
rename from active/software_ai_stack/quadlets_openwebui/open-webui.container
rename to active/software_ai_stack/open-webui.container
index 8babd56..3153fc8 100644
--- a/active/software_ai_stack/quadlets_openwebui/open-webui.container
+++ b/active/software_ai_stack/open-webui.container
@@ -3,7 +3,7 @@ Description=An Open Webui Frontend for Local AI Services
 
 [Container]
 # Shared AI external pod
-Pod=ai-external.pod
+PublishPort=8080:8080
 
 # Open Webui base image
 Image=ghcr.io/open-webui/open-webui:main
diff --git a/active/software_ai_stack/openai-example.py b/active/software_ai_stack/openai-example.py
new file mode 100644
index 0000000..dad4156
--- /dev/null
+++ b/active/software_ai_stack/openai-example.py
@@ -0,0 +1,133 @@
+import base64
+import os
+from datetime import datetime
+from io import BytesIO
+
+import requests
+from PIL import Image
+
+# Configuration
+BASE_URL = "https://llama-cpp.reeselink.com"
+API_KEY = os.getenv("LLAMA_CPP_API_KEY", "")  # Set if required
+
+
+def call_api(endpoint, method="GET", data=None):
+    """Generic API call helper"""
+    url = f"{BASE_URL}/v1/{endpoint}"
+    headers = {"Content-Type": "application/json"}
+    if API_KEY:
+        headers["Authorization"] = f"Bearer {API_KEY}"
+
+    response = requests.request(method, url, headers=headers, json=data)
+    return response
+
+
+# 1. List Models
+models_response = call_api("models")
+models = models_response.json().get("data", [])
+print(f"Available models: {[m['id'] for m in models]}")
+
+# 2. Use First Model
+model_id = models[1]["id"]
+
+# 3. Chat Completion
+chat_data = {
+    "model": model_id,
+    "messages": [
+        {"role": "system", "content": "You are helpful."},
+        {"role": "user", "content": "Tell me about Everquest!"},
+    ],
+    "temperature": 0.95,
+    "max_tokens": 100,
+}
+response = call_api("chat/completions", "POST", chat_data)
+print(response.json()["choices"][0]["message"]["content"])
+
+
+def describe_image(image_path, api_key=None):
+    """
+    Send an image to the LLM for description
+    """
+    base_url = "https://llama-cpp.reeselink.com"
+
+    # Read and encode image to base64
+    with open(image_path, "rb") as f:
+        encoded_image = base64.b64encode(f.read()).decode("utf-8")
+
+    # Prepare headers
+    headers = {"Content-Type": "application/json"}
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+
+    # Create payload
+    payload = {
+        "model": "qwen3-vl-30b-a3b-instruct",  # 👁️ VISION MODEL
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Describe this image in detail"},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
+                    },
+                ],
+            }
+        ],
+        "max_tokens": 1000,
+        "temperature": 0.7,
+    }
+
+    # Send request
+    response = requests.post(
+        f"{base_url}/v1/chat/completions", headers=headers, json=payload
+    )
+
+    if response.status_code == 200:
+        return response.json()["choices"][0]["message"]["content"]
+    else:
+        print(f"Error: {response.status_code}")
+        print(response.text)
+        return None
+
+
+# description = describe_image("generated-image.png", api_key="your_key")
+# print(description)
+
+
+def generate_image(prompt, **kwargs):
+    """
+    Generate image using Stable Diffusion / OpenAI compatible API
+    """
+    base_url = "http://toybox.reeselink.com:1234/v1"
+
+    payload = {"model": "default", "prompt": prompt, "n": 1, "size": "1024x1024"}
+
+    response = requests.post(
+        f"http://toybox.reeselink.com:1234/v1/images/generations",
+        json=payload,
+        timeout=120,
+    )
+
+    if response.status_code == 200:
+        result = response.json()
+        # Save image
+        image_data = base64.b64decode(result["data"][0]["b64_json"])
+        img = Image.open(BytesIO(image_data))
+        filename = f"generated_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
+        img.save(filename)
+        print(f"✅ Saved: {filename}")
+        return result
+    else:
+        print(f"❌ Error: {response.status_code}")
+        print(response.text)
+        return None
+
+
+# Usage (runs on import; the extra keyword arguments are ignored by generate_image):
+result = generate_image(
+    prompt="A beautiful sunset over mountains, photorealistic",
+    negative_prompt="blurry, low quality",
+    steps=8,
+    guidance=7.5,
+)
diff --git a/active/software_ai_stack/quadlets_guest/open-webui-guest.container b/active/software_ai_stack/quadlets_guest/open-webui-guest.container
deleted file mode 100644
index 762f9ee..0000000
--- a/active/software_ai_stack/quadlets_guest/open-webui-guest.container
+++ /dev/null
@@ -1,32 +0,0 @@
-[Unit]
-Description=An Open Webui Frontend for Local AI Services for Guests
-
-[Container]
-# Shared AI external pod
-Pod=ai-external.pod
-
-# Open Webui base image
-Image=ghcr.io/open-webui/open-webui:main
-
-# Nothing too complicated here. Open Webui will basically configure itself.
-Volume=open-webui-data-guest:/app/backend/data
-
-# WEBUI_SECRET_KEY is required to prevent logout on Restart
-EnvironmentFile=/home/ai/.env/open-webui-env-guest
-
-# ai-external is the primary network
-Network=ai-external.network
-Network=ai-internal.network
-
-# open-webui
-PublishPort=8081:8081/tcp
-
-[Service]
-Restart=on-failure
-RestartSec=5
-# Extend Timeout to allow time to pull the image
-TimeoutStartSec=900
-
-[Install]
-# Start by default on boot
-WantedBy=multi-user.target default.target
\ No newline at end of file
diff --git a/active/software_ai_stack/quadlets_pods/ai-external.network b/active/software_ai_stack/quadlets_pods/ai-external.network
deleted file mode 100644
index ce02935..0000000
--- a/active/software_ai_stack/quadlets_pods/ai-external.network
+++ /dev/null
@@ -1,2 +0,0 @@
-[Network]
-IPv6=true
\ No newline at end of file
diff --git a/active/software_ai_stack/quadlets_pods/ai-external.pod b/active/software_ai_stack/quadlets_pods/ai-external.pod
deleted file mode 100644
index 2dd4bbc..0000000
--- a/active/software_ai_stack/quadlets_pods/ai-external.pod
+++ /dev/null
@@ -1,6 +0,0 @@
-[Pod]
-# ai-external is the primary network
-Network=ai-external.network
-Network=ai-internal.network
-# open-webui
-PublishPort=8080:8080/tcp
\ No newline at end of file
diff --git a/active/software_ai_stack/quadlets_stable_diffusion/stable-diffusion-edit-server.container b/active/software_ai_stack/stable-diffusion-edit-server.container
similarity index 100%
rename from active/software_ai_stack/quadlets_stable_diffusion/stable-diffusion-edit-server.container
rename to active/software_ai_stack/stable-diffusion-edit-server.container
diff --git a/active/software_ai_stack/quadlets_stable_diffusion/stable-diffusion-gen-server.container b/active/software_ai_stack/stable-diffusion-gen-server.container
similarity index 100%
rename from active/software_ai_stack/quadlets_stable_diffusion/stable-diffusion-gen-server.container
rename to active/software_ai_stack/stable-diffusion-gen-server.container