Long-time lurker here, thought I should post my speeds...
I have an RTX 4070 Super (12 GB VRAM, +10% OC) and an AMD Ryzen 7 9800X3D with 4x16 GB DDR5-6000 CL30.
EDIT: btw, I offload my display output to the iGPU to save some VRAM on the RTX dGPU; otherwise I lose 10% or so in performance.
EDIT2: Using this with CUDA 13.1.
Please don't ask me how well they handle specific tasks; everything works with no tool-call issues in VS Code with Cline and KiloCode, and subagents work too. I haven't looked into pi-coding yet.
These models are very good for web dev IMHO; I use Qwen3.6-35B-A3B-GGUF Q6_K_XL the most :)
TL;DR (tgs = generated tokens/s, pps = prompt-processing tokens/s; a quick sanity-check sketch follows the list):
- Unsloth: Qwen3.6-35B-A3B-GGUF Q6_K_XL -> tgs 40, pps 2100
- Unsloth: Qwen3.6-27B-IQ3_XXS -> tgs 16, pps 1000
- Unsloth: Gemma 4 26B-A4B-it-UD-Q8 -> tgs 26, pps 2150
- Unsloth: Gemma-4-31B-it-IQ3_XXS -> tgs 13-16, pps 650
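If you want to sanity-check numbers like these against your own server, llama-bench is the proper tool; below is a rough sketch of the quick-and-dirty way over the OpenAI-compatible streaming endpoint instead (assumes the default llama-server port, a throwaway prompt, and roughly one token per streamed chunk, so treat the result as ballpark only):

# rough time-to-first-token / generation-speed probe against a running llama-server
import json, time, requests

URL = "http://localhost:8080/v1/chat/completions"  # default llama-server endpoint
payload = {
    "messages": [{"role": "user", "content": "Write a binary search in Python."}],
    "max_tokens": 256,
    "stream": True,
}

t0 = time.time()
t_first = None
n_tokens = 0
with requests.post(URL, json=payload, stream=True, timeout=600) as r:
    for line in r.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        data = line[len(b"data: "):]
        if data == b"[DONE]":
            break
        delta = json.loads(data)["choices"][0]["delta"]
        if delta.get("content"):
            if t_first is None:
                t_first = time.time()  # ~ prompt processing finished
            n_tokens += 1  # llama-server streams roughly one token per chunk

if t_first is not None:
    print(f"time to first token: {t_first - t0:.2f}s")
    print(f"generation speed: {n_tokens / (time.time() - t_first):.1f} t/s")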
Using the following llama.cpp models.ini config (latest llama.cpp build at the time of writing):
; --- Hardware ---
n-gpu-layers = 999
threads = 8
threads-batch = 16
; --- Batching ---
batch-size = 4096
ubatch-size = 4096
; --- Context ---
ctx-size = 65536
; --- KV Cache ---
cache-ram = 2048
; --- Server ---
parallel = 1
kv-unified = true
flash-attn = true
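; keep the multimodal projector on the CPU to save a little VRAM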
no-mmproj-offload = true
;no-mmap = true
; --- Sampling defaults ---
temp = 1.0
top-k = 40
top-p = 0.95
min-p = 0.01
repeat-penalty = 1.05
seed = 3407
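; everything above is a global default; keys repeated inside a [model]
; section below override it for that model only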
; ==============================================
; Unsloth Qwen3.6-35B-A3B-GGUF Q6_K_XL tgs 40 pps 2100
; ==============================================
[Qwen3.6-35B-A3B-Q6_K_XL-Unsloth]
model = E:\Apps\Ai Models\unsloth\Qwen3.6-35B-A3B-GGUF\Qwen3.6-35B-A3B-UD-Q6_K_XL.gguf
mmproj = E:\Apps\Ai Models\unsloth\Qwen3.6-35B-A3B-GGUF\mmproj-F16.gguf
ctx-size = 131072
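; n-cpu-moe keeps the MoE expert tensors of the first N layers in system RAM
; so the rest fits in 12 GB; raise it if you OOM, lower it for more speed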
n-cpu-moe = 35
;n-cpu-moe = 38
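; q8_0 KV cache is roughly half the size of f16 with little quality loss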
cache-type-k = q8_0
cache-type-v = q8_0
no-mmap = true
reasoning = on
jinja = true
chat-template-kwargs = {"preserve_thinking": true}
reasoning-budget = 8192
reasoning-budget-message = Okay, enough thinking, no more waiting. Let's just jump to it.
temperature = 0.6
top-p = 0.95
top-k = 20
min-p = 0.0
presence-penalty = 0.0
repeat-penalty = 1.0
swa-full = true
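; reuse cached prompt chunks of >= 512 tokens across requests; this is where
; a lot of the pp win in agentic/editor loops comes from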
cache-reuse = 512
; ==============================================
; Gemma 4 26B-A4B-it-UD-Q8 tgs 26 pps 2150
; ==============================================
[Gemma-4-26B-A4B-Q8_0]
model = E:\Apps\Ai Models\unsloth\gemma-4-26B-A4B-it-GGUF\gemma-4-26B-A4B-it-Q8_0.gguf
mmproj = E:\Apps\Ai Models\unsloth\gemma-4-26B-A4B-it-GGUF\mmproj-F16.gguf
ctx-size = 102400
n-cpu-moe = 27
cache-type-k = q8_0
cache-type-v = q8_0
reasoning = on
jinja = true
no-mmap = true
reasoning-budget = 8192
reasoning-budget-message = Okay, enough thinking, no more waiting. Let's just jump into it.
temp = 1.0
top-k = 64
top-p = 0.95
min-p = 0.00
repeat-penalty = 1.0
seed = 3407
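; fit auto-sizes the offload to free VRAM; as noted in the 31B section below,
; fit-ctx is the floor that ctx-size can be reduced to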
fit = on
fit-target = 256
fit-ctx = 32768
; ==============================================
; unsloth gemma-4-31B-it-IQ3_XXS tgs 13-16 pps 650
; ==============================================
[Gemma-4-31B-IQ3_XXS-Unsloth]
model = E:\Apps\Ai Models\unsloth\gemma-4-31B-it-GGUF\gemma-4-31B-it-UD-IQ3_XXS.gguf
ctx-size = 51200
ubatch-size = 256
batch-size = 4096
cache-type-k = q4_0
cache-type-v = q4_0
cache-reuse = 512
; --- GPU offload (hardcoded = fit won't touch it) ---
n-gpu-layers = 58
no-mmap = true
; --- fit only guards ctx-size from being reduced; NGL is already pinned ---
fit = on
fit-target = 256
fit-ctx = 32768
; --- Reasoning / Thinking ---
reasoning = on
jinja = true
;chat-template-kwargs = {"preserve_thinking": true}
reasoning-budget = 8192
reasoning-budget-message = Okay, enough thinking, no more waiting. Let's just jump into it.
; --- Sampling ---
temperature = 0.6
top-p = 0.95
top-k = 20
min-p = 0.0
presence-penalty = 0.0
repeat-penalty = 1.0
; --- Speculative decoding (ngram-mod) ---
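; ngram drafting guesses draft tokens from n-grams already in the context
; instead of a separate draft model; it shines on code edits where the
; output largely repeats the input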
spec-type = ngram-mod
spec-ngram-mod-n-match = 24
spec-draft-n-min = 5
spec-draft-n-max = 64
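; keep the KV cache in system RAM, leaving more VRAM for the weights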
no-kv-offload = true
; ==============================================
; Qwen3.6-27B-IQ3_XXS-Unsloth tgs 16 pps 1000
; ==============================================
[Qwen3.6-27B-IQ3_XXS-Unsloth]
model = E:\Apps\Ai Models\unsloth\Qwen3.6-27B-GGUF\Qwen3.6-27B-UD-IQ3_XXS.gguf
ubatch-size = 256
batch-size = 4096
cache-type-k = q4_0
cache-type-v = q4_0
; --- GPU offload (not pinned here; uncomment below to hardcode it) ---
;n-gpu-layers = 63
no-mmap = true
; --- fit guards ctx-size from being reduced and picks NGL itself, since it isn't pinned here ---
fit = on
fit-target = 256
fit-ctx = 32768
; --- Reasoning / Thinking ---
reasoning = on
;grammar-file = E:\Apps\llama-cpp\grammars\think_qwen3_6.gbnf
jinja = true
chat-template-kwargs = {"preserve_thinking": true}
reasoning-budget = 8192
reasoning-budget-message = Okay, enough thinking, no more waiting. Let's just jump into it.
; --- Sampling ---
temperature = 0.6
top-p = 0.95
top-k = 20
min-p = 0.0
presence-penalty = 0.0
repeat-penalty = 1.0
; --- Speculative decoding (ngram-mod) ---
spec-type = ngram-mod
spec-ngram-mod-n-match = 24
spec-draft-n-min = 5
spec-draft-n-max = 32
no-kv-offload = true
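For completeness, here's how I'd hit one of these sections from a script; a minimal sketch assuming the server exposes the standard OpenAI-compatible API on the default port and uses the INI section name as the model ID (check your build's docs for how it actually maps models):

from openai import OpenAI  # pip install openai

client = OpenAI(base_url="http://localhost:8080/v1", api_key="none")
resp = client.chat.completions.create(
    model="Qwen3.6-35B-A3B-Q6_K_XL-Unsloth",  # assumed: INI section name as model ID
    messages=[{"role": "user", "content": "Say hi in one word."}],
)
print(resp.choices[0].message.content)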