Files
tinygrad/test/external/external_test_speed_llama.py
Steven Anderson 93a36c3659 Arm (#1421)
* testing new memops

* better debugging

* testing padded conv

* branching with load

* refactoring a bit

* first try

* fixing bugs

* fixing some

* eq

* eq2

* do not use x's

* working

* fixing imm

* getting things working

* refactor

* pow not working

* working except one

* refactor: one store mem

* refactor: global load

* refactor: imm

* refactor: cleaning

* fixing big offsets

* refactor with ci

* try ci

* typo

* another typo

* ubuntu default

* forgot git

* do i need git?

* missing packages

* adding python-dev

* with cache?

* buildx action

* buildx name issue?

* maybe now?

* python3

* newline warning

* maybe now

* i actually need this

* ci should work now

* improved caching

* fixing cache

* maybe now it will cache

* this

* testing cache

* trying again

* load

* missing platform

* caching gha

* testing cache

* full testing

* typo

* now?

* why

* adding checkout back

* bad formatting

* fixing convention issues

* supporting python

* adding CI flag

* testing all

* better comments

* adding debugging

* takes 12x longer

* does it output progress now?

* ignore models for speed

* fixing merge

* excluding conv_transpose2d

* only 2 test cuz is to slow

* another approach

* let's see

* faster duh

* my bad

* T_T

* typo

* sup

* with output?

* comment test

* comment test

* comment test

* :?

* no comment

* with cache

* back to normal

* testing that ci works

* back to passing

* trying again

* does it create another entry

* does it create another entry?

* build local

* hey

* Revert "excluding conv_transpose2d"

This reverts commit cc7348de03.

* does it cache if done before?

* does it cache?

* done

* adding test ops

* bad formatting

* no need for this

* working static mem

* sum 1d

* add ndim

* better reg import

* fix stack

* back to np

* working except for softmax

* 5 failing

* no pogress

* remove keystone

* remove keystone

* testops passing

* cleanups

* more cleanup

* typo

* ci

* ci2

* cond import

* ci3

* ci4

* ci4

* ci5

* ci5

* ci6

* aligment

* test all

* correct test

* err read_unmapped

* passing test

* ignore for speed

* ignore for speed

* ci7

* cleanup

* remove docker

* fixing merge

* fixing bugs

* add skipload for const ops

* comments

* First merge to master: Renderer

* fix emulation

* passing all tests arm64

* cleaning

* fix handcoded binary

* cleaning

* fix errs

* fix runtime arg binary

* clean git diff

* fix and clean

* fixing metal test

* cleaning

* fix metal test

* ci ~8 min

* fix pylint and clang

* cache the files in ops_clang

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
2023-08-14 19:29:30 -07:00

58 lines
2.2 KiB
Python

# NOTE: this only tests the speed of the LLaMA codegen, it doesn't actually run the net
import unittest, time
import numpy as np
from examples.llama import Transformer, MODEL_PARAMS
from test.test_net_speed import start_profile, stop_profile
from tinygrad.tensor import Tensor
from tinygrad.lazy import Device
from tinygrad.state import get_state_dict
from tinygrad.ops import Compiled
from tinygrad.helpers import dtypes, prod
from tinygrad.runtime.lib import RawBuffer
class FakeProgram:
def __init__(self, name:str, prg:str, binary:bool): pass
def __call__(self, global_size, local_size, *bufs, wait=False): pass
class RawFakeBuffer(RawBuffer):
@classmethod
def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
class TestLLaMASpeed(unittest.TestCase):
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
def test_llama_compile(self):
backup_program = Device[Device.DEFAULT].runtime
backup_buffer = Device[Device.DEFAULT].buffer
Device[Device.DEFAULT].runtime = FakeProgram
Device[Device.DEFAULT].buffer = RawFakeBuffer
print("testing llama python run time")
model = Transformer(**MODEL_PARAMS[1]["7B"]["args"])
print("built model")
# assign fake tensors to the values
for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
print("assigned empty tensors, doing warmup")
def run_llama(st, empty_method_cache=True):
if empty_method_cache: Device[Device.DEFAULT].method_cache.clear()
tms = [time.perf_counter()]
for i in range(10):
model(Tensor([[2]]), i).realize()
tms.append(time.perf_counter())
timings = [(tms[i+1]-tms[i])*1000 for i in range(len(tms)-1)]
print(f"{st:15s} mean runtime: {sum(timings)/len(timings):7.2f}ms, runs: ", ", ".join(f'{x:7.2f}' for x in timings))
run_llama("codegen")
run_llama("methodcache", False)
pr = start_profile()
run_llama("profile")
stop_profile(pr, sort='time', frac=0.1)
Device[Device.DEFAULT].runtime = backup_program
Device[Device.DEFAULT].buffer = backup_buffer
if __name__ == '__main__':
unittest.main()