fix llama example quantize (#4699)

* fix llama example quantize

import the quantize layers from the new llama3 example

add the quantized runs to the mac benchmark

* fix that

* save the files
chenyu
2024-05-23 15:35:26 -04:00
committed by GitHub
parent 532c9e08e3
commit 38bc38cdff
2 changed files with 35 additions and 81 deletions
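
The commit message notes that the quantize layers are now imported from the new llama3 example instead of being defined in examples/llama.py. As a rough illustration of what an int8 quantized linear layer of that kind does, here is a minimal NumPy sketch assuming a per-output-channel absmax scheme; the class name and details are illustrative, not the example's actual code:

```python
import numpy as np

class Int8Linear:
    """Illustrative sketch: keep weights as int8 plus one float scale per
    output channel, and dequantize on the fly for the matmul."""
    def __init__(self, weight: np.ndarray):
        # per-row absmax scale so each output channel spans the full int8 range
        self.scale = np.abs(weight).max(axis=1, keepdims=True) / 127.0
        self.qweight = np.clip(np.round(weight / self.scale), -128, 127).astype(np.int8)

    def __call__(self, x: np.ndarray) -> np.ndarray:
        # dequantize to float for the matmul; a real kernel would fuse this step
        return x @ (self.qweight.astype(np.float32) * self.scale).T

# weights shrink roughly 4x versus float32, at a small accuracy cost
layer = Int8Linear(np.random.randn(16, 32).astype(np.float32))
out = layer(np.random.randn(2, 32).astype(np.float32))   # shape (2, 16)
```

The CI diff below adds benchmark runs for both the int8 and nf4 paths, so regressions in either show up in the mac benchmark artifacts.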

@@ -47,6 +47,10 @@ jobs:
         JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
     - name: Run LLaMA with BEAM
       run: JIT=1 BEAM=2 CACHELEVEL=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
+    - name: Run quantized LLaMA
+      run: |
+        JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt
+        JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt
     - name: Run LLaMA 7B on 4 (virtual) GPUs
       run: JIT=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
     - name: Run GPT2
@@ -76,6 +80,8 @@ jobs:
           llama_unjitted.txt
           llama_jitted.txt
           llama_beam.txt
+          llama_int8.txt
+          llama_nf4.txt
           llama_four_gpu.txt
           gpt2_unjitted.txt
           gpt2_jitted.txt
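
The new benchmark step also runs the example with --quantize nf4, i.e. 4-bit quantization against a small fixed codebook. A minimal sketch of that general idea, assuming block-wise absmax scaling and a placeholder 16-entry codebook (the real NF4 table uses normal-distribution quantiles, and the example's storage format may differ):

```python
import numpy as np

# placeholder 16-entry codebook; evenly spaced values stand in for the
# normal-distribution quantiles a real NF4 table would use
CODEBOOK = np.linspace(-1.0, 1.0, 16).astype(np.float32)

def nf4_quantize(w: np.ndarray, block: int = 64):
    # split the weight into blocks and scale each block by its absmax
    blocks = w.reshape(-1, block)
    scale = np.abs(blocks).max(axis=1, keepdims=True)
    # pick the nearest codebook entry for every scaled weight (a 4-bit index)
    idx = np.abs(blocks / scale - CODEBOOK[:, None, None]).argmin(axis=0)
    return idx.astype(np.uint8), scale

def nf4_dequantize(idx: np.ndarray, scale: np.ndarray, shape) -> np.ndarray:
    # look the codes back up and rescale per block
    return (CODEBOOK[idx] * scale).reshape(shape).astype(np.float32)

w = np.random.randn(8, 64).astype(np.float32)
idx, scale = nf4_quantize(w)
w_hat = nf4_dequantize(idx, scale, w.shape)   # lossy reconstruction of w
```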