don't pass model in convert_from_huggingface and convert_from_gguf (#10094)

they only need n_layers, not the whole model
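The definition side of the change (presumably in extra/models/llama.py, not shown in this view) would swap len(model.layers) for the explicit n_layers argument. A minimal sketch of the assumed new signatures follows; the key maps are illustrative subsets, not the full tables, and the permute_layers default is inferred from the call sites below:

def convert_from_huggingface(weights, n_layers: int, n_heads: int, n_kv_heads: int, permute_layers: bool = True):
  # remap HuggingFace key names to tinygrad's; only two entries shown here.
  # n_heads/n_kv_heads drive the q/k weight permutation, omitted in this sketch.
  keymap = {"model.embed_tokens.weight": "tok_embeddings.weight",
            **{f"model.layers.{l}.input_layernorm.weight": f"layers.{l}.attention_norm.weight" for l in range(n_layers)}}
  return {keymap.get(k, k): v for k, v in weights.items()}

def convert_from_gguf(weights, n_layers: int):
  # same idea for GGUF key names
  keymap = {"token_embd.weight": "tok_embeddings.weight",
            **{f"blk.{l}.attn_norm.weight": f"layers.{l}.attention_norm.weight" for l in range(n_layers)}}
  return {keymap.get(k, k): v for k, v in weights.items()}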
commit 3eba3d6ee9 (parent a2d0684fc1)
Author: chenyu
Date: 2025-04-28 20:11:19 -04:00
Committed by: GitHub
6 changed files with 22 additions and 22 deletions


@@ -34,8 +34,8 @@ if __name__ == "__main__":
     part2 = nn.state.torch_load(fetch("https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/resolve/main/pytorch_model-00002-of-00002.bin?download=true"))
   with Timing("weights -> model: "):
-    nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part1, model, 32, 8)), strict=False)
-    nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part2, model, 32, 8)), strict=False)
+    nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part1, 32, 32, 8)), strict=False)
+    nn.state.load_state_dict(model, fix_bf16(convert_from_huggingface(part2, 32, 32, 8)), strict=False)
   if not os.path.isfile("/tmp/tokenizer.model"): create_fixed_tokenizer("/tmp/tokenizer.model")
   spp = SentencePieceProcessor(model_file="/tmp/tokenizer.model")


@@ -211,7 +211,7 @@ class LLaMa:
     else:
       weights = load(str(model_path))
     if "model.embed_tokens.weight" in weights:
-      weights = convert_from_huggingface(weights, model, params["args"]["n_heads"], params["args"].get("n_kv_heads", params["args"]["n_heads"]))
+      weights = convert_from_huggingface(weights, params["args"]["n_layers"], params["args"]["n_heads"], params["args"].get("n_kv_heads", params["args"]["n_heads"]))
     weights = fix_bf16(weights)


@@ -173,9 +173,9 @@ def build_transformer(model_path: Path, model_size="8B", quantize=None, scale_dt
   else:
     weights = load(str(model_path))
   if "model.embed_tokens.weight" in weights:
-    weights = convert_from_huggingface(weights, model, MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
+    weights = convert_from_huggingface(weights, MODEL_PARAMS[model_size]["args"]["n_layers"], MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
   elif "token_embd.weight" in weights:
-    weights = convert_from_gguf(weights, model)
+    weights = convert_from_gguf(weights, MODEL_PARAMS[model_size]["args"]["n_layers"])
   weights = fix_bf16(weights)
   with Context(BEAM=0):


@@ -55,7 +55,7 @@ if __name__ == "__main__":
   del model_state_dict['freqs_cis']
   with Timing("load weights to GPU: "):
-    nhf_state = convert_from_huggingface(fetch_weights(), model, 16, 16)
+    nhf_state = convert_from_huggingface(fetch_weights(), 16, 16, 16)
     # NOTE: i'm not sure this actually needs float32, it may just change the type of things downstream from it. but doesn't match torch w/o this
     for needs_float32 in ['tok_embeddings.weight']: nhf_state[needs_float32] = nhf_state[needs_float32].float()
   print(f"ram used: {GlobalCounters.mem_used/1e9:.2f} GB")


@@ -44,7 +44,7 @@ def load_model(model_path:Path, model_params:Dict[str, Union[int, float]]) -> Tr
   model.layers = updated_layers
   # load weights
-  weights = fix_bf16(convert_from_huggingface(load(str(model_path / "model.safetensors.index.json")), model, model_params["n_heads"], model_params["n_kv_heads"], permute_layers=False))
+  weights = fix_bf16(convert_from_huggingface(load(str(model_path / "model.safetensors.index.json")), model_params["n_layers"], model_params["n_heads"], model_params["n_kv_heads"], permute_layers=False))
   # replace weights in model
   load_state_dict(model, weights, strict=False, consume=True)
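A call site that has the model in hand but no params dict can presumably recover the count the same way the old signature did internally (hypothetical caller; weights, model, n_heads, and n_kv_heads stand in for whatever is in scope):

weights = fix_bf16(convert_from_huggingface(weights, len(model.layers), n_heads, n_kv_heads))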