for anyone who cares... 😄
prompt = spend 1000 tokens
Unsloth MTP models
Strix Halo
llama.cpp:server-rocm-mtp \
--spec-type draft-mtp \
--spec-draft-n-max 3
Qwen3.5-122B-Q5-MTP-General
n_decoded = 100 tg = 29.77 t/s
n_decoded = 179 tg = 27.95 t/s
n_decoded = 254 tg = 26.80 t/s
n_decoded = 4056 tg = 20.23 t/s
n_decoded = 4120 tg = 20.23 t/s
n_decoded = 4181 tg = 20.22 t/s
prompt eval time = 408.99 ms / 19 tokens
eval time = 207516.64 ms / 4200 tokens
tg = 20.24 t/s
Qwen3.5-122B-Q6-MTP-General
n_decoded = 102 tg = 25.10 t/s
n_decoded = 174 tg = 24.25 t/s
n_decoded = 225 tg = 22.04 t/s
n_decoded = 3193 tg = 17.27 t/s
n_decoded = 3244 tg = 17.26 t/s
n_decoded = 3281 tg = 17.18 t/s
prompt eval time = 488.39 ms / 19 tokens
eval time = 191156.72 ms / 3283 tokens
tg = 17.17 t/s
[link] [comments]