llm: glm 4.7 flash (#15738)

* glm 4.7

* test

* temperature, server enable_thinking

* --no-think

* remove think stuff
Author: b1tg
Date: 2026-04-16 22:42:04 +08:00
Committed by: GitHub
Parent: d147e2a549
Commit: 4e88d875ba
2 changed files with 26 additions and 10 deletions

View File

@@ -12,6 +12,7 @@ class TestLLMServer(unittest.TestCase):
     cls.mock_tok.decode = Mock(return_value="Hello")
     cls.mock_tok.stream_decoder = Mock(return_value=lambda tid=None: "Hello" if tid is not None else "")
     cls.mock_tok.end_turn = Mock(return_value=[998])
+    cls.mock_tok.prefix = Mock(return_value=[1])
     cls.mock_tok.preset = "llama3"
     cls.mock_model = Mock()
@@ -27,6 +28,7 @@ class TestLLMServer(unittest.TestCase):
     llm_module.tok = cls.mock_tok
     llm_module.bos_id = cls.bos_id
     llm_module.eos_id = cls.eos_id
+    llm_module.eot_id = None
     from tinygrad.apps.llm import Handler
     from tinygrad.viz.serve import TCPServerWithReuse

View File

@@ -9,7 +9,8 @@ from tinygrad.viz.serve import TCPServerWithReuse, HTTPRequestHandler
 class SimpleTokenizer:
   def __init__(self, normal_tokens:dict[str, int], special_tokens:dict[str, int], preset:str="llama3"):
     preset = {"qwen35":"qwen2","qwen35moe":"qwen2"}.get(preset, preset)
-    if preset not in ("llama3","llama-v3","llama-bpe","qwen2","olmo","kimi-k2","tekken"): raise ValueError(f"Invalid tokenizer preset '{preset}'")
+    if preset not in ("llama3","llama-v3","llama-bpe","qwen2","olmo","kimi-k2","tekken","glm4"):
+      raise ValueError(f"Invalid tokenizer preset '{preset}'")
     # https://github.com/openai/gpt-2/blob/9b63575ef42771a015060c964af2c3da4cf7c8ab/src/encoder.py#L9
     bs = [*range(33, 127), *range(161, 173), *range(174, 256)] # bytes that map to themselves
     self._byte_decoder = {chr(b): b for b in bs} | {chr(256+i): b for i,b in enumerate(b for b in range(256) if b not in bs)}
@@ -63,6 +64,7 @@ class SimpleTokenizer:
     if self.preset == 'olmo': return self.encode("<|" + role + "|>\n") # OLMoE Instruct format
     if self.preset == 'kimi-k2': return self.encode("<|im_" + role + "|>" + role + "<|im_middle|>")
     if self.preset == 'qwen2': return self.encode("<|im_start|>" + role + "\n")
+    if self.preset == 'glm4': return self.encode("<|" + role + "|>")
     if self.preset == 'tekken':
       if role == 'user': return self.encode("[INST]")
       if role == 'assistant': return []
@@ -72,8 +74,11 @@ class SimpleTokenizer:
     if self.preset == 'olmo': return self.encode("\n")
     if self.preset == 'kimi-k2': return [eos_id]
     if self.preset == 'qwen2': return [eos_id] + self.encode("\n")
+    if self.preset == 'glm4': return []
     if self.preset == 'tekken': return self.encode("[/INST]")
     return [eos_id]
+  def prefix(self, bos_id:int|None) -> list[int]:
+    return ([] if bos_id is None else [bos_id]) + (self.encode("<sop>") if self.preset == 'glm4' else [])
 @functools.cache
 def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> Tensor:
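For reference, with the new glm4 preset the pieces above compose a prompt the same way the interactive chat loop later in this file does: role() emits a bare `<|role|>` marker, end_turn() emits nothing, and prefix() prepends `<sop>` after the BOS token. A minimal sketch of that assembly follows; the helper name is mine, and `tok`, `bos_id`, `eos_id` stand in for the objects set up in `__main__`.

```python
# Sketch: assembling a GLM-4 style chat prompt from the tokenizer methods above.
def build_chat_ids(tok, bos_id, eos_id, user_msg: str) -> list[int]:
  ids = tok.prefix(bos_id)        # [bos_id] + encode("<sop>") when preset == "glm4"
  ids += tok.role("user")         # encode("<|user|>")
  ids += tok.encode(user_msg)
  ids += tok.end_turn(eos_id)     # [] for glm4: no explicit end-of-turn token
  ids += tok.role("assistant")    # encode("<|assistant|>"); the model continues from here
  return ids
```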
@@ -129,6 +134,7 @@ class TransformerConfig:
   num_experts: int = 0
   num_experts_per_tok: int = 0
   norm_topk_prob: bool = False
+  q_lora_rank: int = 0
   kv_lora_rank: int = 0
   shared_expert_dim: int = 0
   full_attention_interval: int = 0
@@ -259,7 +265,12 @@ class MLATransformerBlock(FFNBlock):
   def __init__(self, config:TransformerConfig):
     super().__init__(config)
     qk_nope_head_dim = config.head_dim - config.rope_dim
-    self.attn_q = nn.Linear(config.dim, config.n_heads * config.head_dim, bias=False)
+    if config.q_lora_rank > 0:
+      self.attn_q_a = nn.Linear(config.dim, config.q_lora_rank, bias=False)
+      self.attn_q_a_norm = nn.RMSNorm(config.q_lora_rank, config.norm_eps)
+      self.attn_q_b = nn.Linear(config.q_lora_rank, config.n_heads * config.head_dim, bias=False)
+    else:
+      self.attn_q = nn.Linear(config.dim, config.n_heads * config.head_dim, bias=False)
     self.attn_kv_a_mqa = nn.Linear(config.dim, config.kv_lora_rank + config.rope_dim, bias=False)
     self.attn_kv_a_norm = nn.RMSNorm(config.kv_lora_rank, config.norm_eps)
     self.attn_k_b = {"weight": Tensor.zeros(config.n_heads, config.kv_lora_rank, qk_nope_head_dim)}
@@ -269,7 +280,8 @@ class MLATransformerBlock(FFNBlock):
   def _attention(self, x:Tensor, start_pos:int|UOp) -> Tensor:
     B, T, _ = x.shape
     q_nope_head_dim = self.config.head_dim - self.config.rope_dim
-    q = self.attn_q(x).reshape(B, T, self.config.n_heads, self.config.head_dim).transpose(1, 2)
+    q_proj = self.attn_q_b(self.attn_q_a_norm(self.attn_q_a(x))) if self.config.q_lora_rank > 0 else self.attn_q(x)
+    q = q_proj.reshape(B, T, self.config.n_heads, self.config.head_dim).transpose(1, 2)
     q_nope, q_rope = q[..., :q_nope_head_dim], q[..., q_nope_head_dim:]
     q = (q_nope @ self.attn_k_b["weight"].transpose(-1, -2)).cat(apply_rope(q_rope, self.freqs_cis[start_pos:start_pos+T]), dim=-1)
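When `q_lora_rank > 0`, the query projection is factored into a down-projection, an RMSNorm in the low-rank space, and an up-projection, instead of one large `dim -> n_heads*head_dim` Linear. The sketch below only shows that projection path and the resulting shapes; the dimensions are illustrative, not the real GLM-4.7-Flash config.

```python
# Minimal sketch of the low-rank query path used when q_lora_rank > 0 (illustrative sizes).
from tinygrad import Tensor, nn

dim, q_lora_rank, n_heads, head_dim = 2048, 768, 20, 128
attn_q_a = nn.Linear(dim, q_lora_rank, bias=False)                  # down-project: dim -> rank
attn_q_a_norm = nn.RMSNorm(q_lora_rank, 1e-5)                       # normalize in the low-rank space
attn_q_b = nn.Linear(q_lora_rank, n_heads * head_dim, bias=False)   # up-project: rank -> heads*head_dim

x = Tensor.randn(1, 4, dim)                                         # (batch, seq, dim)
q = attn_q_b(attn_q_a_norm(attn_q_a(x)))                            # same output shape as a single dim -> heads*head_dim Linear
q = q.reshape(1, 4, n_heads, head_dim).transpose(1, 2)              # (batch, heads, seq, head_dim)
print(q.shape)
```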
@@ -407,7 +419,7 @@ class Transformer:
     # Permute RoPE weights from interleaved to half-split layout.
     for name in state_dict:
-      if 'attn_q.weight' in name and (arch == 'llama' or kv_lora_rank):
+      if ('attn_q.weight' in name or 'attn_q_b.weight' in name) and (arch == 'llama' or kv_lora_rank):
         w = state_dict[name].reshape(n_heads, state_dict[name].shape[0]//n_heads, -1)
         prefix = head_dim-rope_dim
         state_dict[name] = w[:, :prefix].cat(w[:, prefix:].rearrange("n (h two) d -> n (two h) d", two=2), dim=1).reshape(-1, w.shape[-1])
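Because the query now goes through `attn_q_b`, that weight also needs the interleaved-to-half-split RoPE permutation the loader already applies to `attn_q.weight`. A toy sketch of what the rearrange does on the RoPE rows of a single head; all sizes here are made up for illustration.

```python
# Toy sketch of the interleaved -> half-split RoPE permutation (made-up sizes).
from tinygrad import Tensor

n_heads, head_dim, rope_dim, in_dim = 1, 8, 4, 3
w = Tensor.arange(n_heads * head_dim * in_dim).reshape(n_heads * head_dim, in_dim).float()

w = w.reshape(n_heads, head_dim, -1)
prefix = head_dim - rope_dim                       # non-RoPE rows are left untouched
permuted = w[:, :prefix].cat(
  # RoPE rows reorder from interleaved pairs (r0, i0, r1, i1) to two halves (r0, r1, i0, i1)
  w[:, prefix:].rearrange("n (h two) d -> n (two h) d", two=2), dim=1
).reshape(-1, w.shape[-1])
print(permuted.shape)                              # back to (n_heads*head_dim, in_dim)
```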
@@ -429,7 +441,7 @@ class Transformer:
       qk_norm=int(state_dict['blk.0.attn_q_norm.weight'].shape[0]) if 'blk.0.attn_q_norm.weight' in state_dict else 0,
       num_experts=kv.get(f'{arch}.expert_count', 0), num_experts_per_tok=kv.get(f'{arch}.expert_used_count', 0),
       norm_topk_prob=kv.get(f'{arch}.expert_weights_norm', arch in ('qwen3moe', 'qwen35moe')),
-      kv_lora_rank=kv_lora_rank,
+      kv_lora_rank=kv_lora_rank, q_lora_rank=kv.get(f'{arch}.attention.q_lora_rank', 0),
       leading_dense_blocks=kv.get(f'{arch}.leading_dense_block_count', 0),
       shared_expert_dim=kv.get(
         f'{arch}.expert_shared_feed_forward_length',
@@ -489,6 +501,7 @@ models = {
"qwen3.5:35b-a3b": "https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/Qwen3.5-35B-A3B-Q4_K_M.gguf",
"olmoe": "https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF/resolve/main/olmoe-1b-7b-0924-instruct-q4_k_m.gguf",
"moonlight": "https://huggingface.co/gabriellarson/Moonlight-16B-A3B-Instruct-GGUF/resolve/main/Moonlight-16B-A3B-Instruct-Q4_K_M.gguf",
"glm-4.7-flash": "https://huggingface.co/unsloth/GLM-4.7-Flash-GGUF/resolve/main/GLM-4.7-Flash-Q4_K_M.gguf",
}
# *** simple OpenAI API compatible server with web interface on http://localhost:8000/ ***
@@ -549,7 +562,7 @@ class Handler(HTTPRequestHandler):
     dec = tok.stream_decoder()
     for next_id in model.generate(ids, temperature=temperature):
       if len(out) == 0: stderr_log(f"prefill:{(len(ids)-cache_start_pos)/((pt:=time.perf_counter())-st):4.0f} tok/s {colored('--', 'BLACK')} ")
-      if next_id == eos_id: break
+      if next_id in (eos_id, eot_id): break
       out.append(next_id)
       yield {"choices": [{"index":0, "delta":{"content":dec(next_id)}, "finish_reason":None}], **tmpl}
       if max_tokens is not None and len(out) >= max_tokens:
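GLM-4.7 signals the end of a turn with a separate end-of-turn token, so both the server and the chat loop now stop on either the EOS id or the optional EOT id read from the GGUF metadata. A minimal sketch of that stop condition; the helper name is mine, and `model.generate`, `eos_id`, `eot_id` stand in for the objects set up in `__main__`.

```python
# Sketch: generation halts on either eos_id or eot_id (eot_id may be None; a token id never equals None).
def collect(model, ids, eos_id: int, eot_id: int | None, max_tokens: int | None = None) -> list[int]:
  out: list[int] = []
  for next_id in model.generate(ids):
    if next_id in (eos_id, eot_id): break
    out.append(next_id)
    if max_tokens is not None and len(out) >= max_tokens: break
  return out
```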
@@ -569,7 +582,7 @@ class Handler(HTTPRequestHandler):
     if DEBUG >= 1: print(json.dumps(body, indent=2))
     if self.path == "/v1/chat/completions":
       # extract tokens, last assistant message is treated as prefill
-      ids: list[int] = [bos_id] if bos_id is not None else []
+      ids: list[int] = tok.prefix(bos_id)
       for i, msg in enumerate(body["messages"]):
         ids += tok.role(msg["role"])
         content = msg["content"]
@@ -621,6 +634,7 @@ if __name__ == "__main__":
   tok = SimpleTokenizer.from_gguf_kv(kv)
   bos_id: int|None = kv.get('tokenizer.ggml.bos_token_id') if kv.get('tokenizer.ggml.add_bos_token', True) else None
   eos_id: int = kv['tokenizer.ggml.eos_token_id']
+  eot_id: int|None = kv.get('tokenizer.ggml.eot_token_id')
   # warmup the JIT
   if args.warmup or args.serve:
@@ -642,7 +656,7 @@ if __name__ == "__main__":
     exit(0)
   # interactive chat
-  ids: list[int] = [bos_id] if bos_id is not None else []
+  ids: list[int] = tok.prefix(bos_id)
   while 1:
     try:
       ids += tok.role("user") + tok.encode(input('>>> ')) + tok.end_turn(eos_id) + tok.role("assistant")
@@ -650,6 +664,6 @@ if __name__ == "__main__":
       break
     dec = tok.stream_decoder()
     for next_id in model.generate(ids):
-      sys.stdout.write(dec(next_id) if next_id != eos_id else dec() + "\n\n")
+      sys.stdout.write(dec(next_id) if next_id not in (eos_id, eot_id) else dec() + "\n\n")
       sys.stdout.flush()
-      if next_id == eos_id: break
+      if next_id in (eos_id, eot_id): break