diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index ffb12a92ad..64f992c946 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -85,6 +85,10 @@ jobs: run: | python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt + - name: Run quantized LLaMA3 + run: | + python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | tee llama3_int8.txt + python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | tee llama3_nf4.txt #- name: Run LLaMA 7B on 4 (virtual) GPUs # run: python3.11 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt - name: Run GPT2 @@ -118,6 +122,8 @@ jobs: llama_beam.txt llama_int8.txt llama_nf4.txt + llama3_int8.txt + llama3_nf4.txt llama_four_gpu.txt gpt2_unjitted.txt gpt2_jitted.txt diff --git a/examples/llama3.py b/examples/llama3.py index 9c53438b31..6969818faa 100644 --- a/examples/llama3.py +++ b/examples/llama3.py @@ -126,7 +126,8 @@ def NF4Linear(block_size): return x.linear(unscaled.reshape(self.out_features, self.in_features).T) @staticmethod - def quantize(state_dict: dict[str, Tensor], device, scale_dtype=dtypes.float16) -> dict[str, Tensor]: + def quantize(state_dict: dict[str, Tensor], device, scale_dtype=dtypes.float16, quantize_embeds=False) -> dict[str, Tensor]: + assert not quantize_embeds # TODO: support this? new_state_dict = {} for k, v in state_dict.items(): if "feed_forward" in k or "attention.w" in k: