Long-time lurker here, thought I should post my speeds...
I have an RTX 4070 Super (12 GB VRAM, +10% OC) and an AMD Ryzen 7 9800X3D with 4x16 GB DDR5-6000 CL30.
EDIT: btw, I offload my display output to the iGPU to save some VRAM on the RTX dGPU; otherwise I lose 10% or so in performance.
EDIT2: Using this with CUDA 13.1.
Please don't ask me how well they handle specific tasks; everything works with no tool-call issues in VS Code with Cline and KiloCode, and subagents work too. I haven't looked into pi-coding yet.
These models are very good for web dev IMHO; I use Qwen3.6-35B-A3B-GGUF Q6_K_XL the most :)
TL;DR (tgs = generated tokens/s, pps = prompt-processing tokens/s; a quick sanity-check sketch follows the list):
- Unsloth: Qwen3.6-35B-A3B-GGUF Q6_K_XL -> tgs 40, pps 2100
- Unsloth: Qwen3.6-27B-IQ3_XXS -> tgs 16, pps 1000
- Unsloth: Gemma 4 26B-A4B-it-UD-Q8 -> tgs 26, pps 2150
- Unsloth: Gemma-4-31B-it-IQ3_XXS -> tgs 13-16, pps 650
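If you want to sanity-check numbers like these against your own server, llama-bench is the proper tool; below is a rough sketch of the quick-and-dirty way over the OpenAI-compatible streaming endpoint instead (assumes the default llama-server port, a throwaway prompt, and roughly one token per streamed chunk, so treat the result as ballpark only):

# rough time-to-first-token / generation-speed probe against a running llama-server
import json, time, requests

URL = "http://localhost:8080/v1/chat/completions"  # default llama-server endpoint
payload = {
    "messages": [{"role": "user", "content": "Write a binary search in Python."}],
    "max_tokens": 256,
    "stream": True,
}

t0 = time.time()
t_first = None
n_tokens = 0
with requests.post(URL, json=payload, stream=True, timeout=600) as r:
    for line in r.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        data = line[len(b"data: "):]
        if data == b"[DONE]":
            break
        delta = json.loads(data)["choices"][0]["delta"]
        if delta.get("content"):
            if t_first is None:
                t_first = time.time()  # ~ prompt processing finished
            n_tokens += 1  # llama-server streams roughly one token per chunk

if t_first is not None:
    print(f"time to first token: {t_first - t0:.2f}s")
    print(f"generation speed: {n_tokens / (time.time() - t_first):.1f} t/s")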
Using the following llama.cpp models.ini config (latest llama.cpp build at the time of writing):
; --- Hardware ---
n-gpu-layers = 999
threads = 8
threads-batch = 16
; --- Batching ---
batch-size = 4096
ubatch-size = 4096
; --- Context ---
ctx-size = 65536
; --- KV Cache ---
cache-ram = 2048
; --- Server ---
parallel = 1
kv-unified = true
flash-attn = true
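; keep the multimodal projector on the CPU to save a little VRAM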
no-mmproj-offload = true
;no-mmap = true
; --- Sampling defaults ---
temp = 1.0
top-k = 40
top-p = 0.95
min-p = 0.01
repeat-penalty = 1.05
seed = 3407
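; everything above is a global default; keys repeated inside a [model]
; section below override it for that model only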
; ==============================================
; Unsloth Qwen3.6-35B-A3B-GGUF Q6_K_XL tgs 40 pps 2100
; ==============================================
[Qwen3.6-35B-A3B-Q6_K_XL-Unsloth]
model = E:\Apps\Ai Models\unsloth\Qwen3.6-35B-A3B-GGUF\Qwen3.6-35B-A3B-UD-Q6_K_XL.gguf
mmproj = E:\Apps\Ai Models\unsloth\Qwen3.6-35B-A3B-GGUF\mmproj-F16.gguf
ctx-size = 131072
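; n-cpu-moe keeps the MoE expert tensors of the first N layers in system RAM
; so the rest fits in 12 GB; raise it if you OOM, lower it for more speed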
n-cpu-moe = 35
;n-cpu-moe = 38
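; q8_0 KV cache is roughly half the size of f16 with little quality loss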
cache-type-k = q8_0
cache-type-v = q8_0
no-mmap = true
reasoning = on
jinja = true
chat-template-kwargs = {"preserve_thinking": true}
reasoning-budget = 8192
reasoning-budget-message = Okay, enough thinking, no more waiting. Let's just jump to it.
temperature = 0.6
top-p = 0.95
top-k = 20
min-p = 0.0
presence-penalty = 0.0
repeat-penalty = 1.0
swa-full = true
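; reuse cached prompt chunks of >= 512 tokens across requests; this is where
; a lot of the pp win in agentic/editor loops comes from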
cache-reuse = 512
; ==============================================
; Gemma 4 26B-A4B-it-UD-Q8 tgs 26 pps 2150
; ==============================================
[Gemma-4-26B-A4B-Q8_0]
model = E:\Apps\Ai Models\unsloth\gemma-4-26B-A4B-it-GGUF\gemma-4-26B-A4B-it-Q8_0.gguf
mmproj = E:\Apps\Ai Models\unsloth\gemma-4-26B-A4B-it-GGUF\mmproj-F16.gguf
ctx-size = 102400
n-cpu-moe = 27
cache-type-k = q8_0
cache-type-v = q8_0
reasoning = on
jinja = true
no-mmap = true
reasoning-budget = 8192
reasoning-budget-message = Okay, enough thinking, no more waiting. Let's just jump into it.
temp = 1.0
top-k = 64
top-p = 0.95
min-p = 0.00
repeat-penalty = 1.0
seed = 3407
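; fit auto-sizes the offload to free VRAM; as noted in the 31B section below,
; fit-ctx is the floor that ctx-size can be reduced to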
fit = on
fit-target = 256
fit-ctx = 32768
; ==============================================
; unsloth gemma-4-31B-it-IQ3_XXS tgs 13-16 pps 650
; ==============================================
[Gemma-4-31B-IQ3_XXS-Unsloth]
model = E:\Apps\Ai Models\unsloth\gemma-4-31B-it-GGUF\gemma-4-31B-it-UD-IQ3_XXS.gguf
ctx-size = 51200
ubatch-size = 256
batch-size = 4096
cache-type-k = q4_0
cache-type-v = q4_0
cache-reuse = 512
; --- GPU offload (hardcoded = fit won't touch it) ---
n-gpu-layers = 58
no-mmap = true
; --- fit only guards ctx-size from being reduced; NGL is already pinned ---
fit = on
fit-target = 256
fit-ctx = 32768
; --- Reasoning / Thinking ---
reasoning = on
jinja = true
;chat-template-kwargs = {"preserve_thinking": true}
reasoning-budget = 8192
reasoning-budget-message = Okay, enough thinking, no more waiting. Let's just jump into it.
; --- Sampling ---
temperature = 0.6
top-p = 0.95
top-k = 20
min-p = 0.0
presence-penalty = 0.0
repeat-penalty = 1.0
; --- Speculative decoding (ngram-mod) ---
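; ngram drafting guesses draft tokens from n-grams already in the context
; instead of a separate draft model; it shines on code edits where the
; output largely repeats the input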
spec-type = ngram-mod
spec-ngram-mod-n-match = 24
spec-draft-n-min = 5
spec-draft-n-max = 64
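; keep the KV cache in system RAM, leaving more VRAM for the weights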
no-kv-offload = true
; ==============================================
; Qwen3.6-27B-IQ3_XXS-Unsloth tgs 16 pps 1000
; ==============================================
[Qwen3.6-27B-IQ3_XXS-Unsloth]
model = E:\Apps\Ai Models\unsloth\Qwen3.6-27B-GGUF\Qwen3.6-27B-UD-IQ3_XXS.gguf
ubatch-size = 256
batch-size = 4096
cache-type-k = q4_0
cache-type-v = q4_0
; --- GPU offload (not pinned here; uncomment below to hardcode it) ---
;n-gpu-layers = 63
no-mmap = true
; --- fit guards ctx-size from being reduced and picks NGL itself, since it isn't pinned here ---
fit = on
fit-target = 256
fit-ctx = 32768
; --- Reasoning / Thinking ---
reasoning = on
;grammar-file = E:\Apps\llama-cpp\grammars\think_qwen3_6.gbnf
jinja = true
chat-template-kwargs = {"preserve_thinking": true}
reasoning-budget = 8192
reasoning-budget-message = Okay, enough thinking, no more waiting. Let's just jump into it.
; --- Sampling ---
temperature = 0.6
top-p = 0.95
top-k = 20
min-p = 0.0
presence-penalty = 0.0
repeat-penalty = 1.0
; --- Speculative decoding (ngram-mod) ---
spec-type = ngram-mod
spec-ngram-mod-n-match = 24
spec-draft-n-min = 5
spec-draft-n-max = 32
no-kv-offload = true
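For completeness, here's how I'd hit one of these sections from a script; a minimal sketch assuming the server exposes the standard OpenAI-compatible API on the default port and uses the INI section name as the model ID (check your build's docs for how it actually maps models):

from openai import OpenAI  # pip install openai

client = OpenAI(base_url="http://localhost:8080/v1", api_key="none")
resp = client.chat.completions.create(
    model="Qwen3.6-35B-A3B-Q6_K_XL-Unsloth",  # assumed: INI section name as model ID
    messages=[{"role": "user", "content": "Say hi in one word."}],
)
print(resp.choices[0].message.content)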