ROCm/python/test/unit/runtime/test_launch.py
Justin Lebar df08301e76 Reformat Python code with yapf. (#2589)
I've added an option to yapf to do what we want for long lines; see
https://github.com/google/yapf/pull/1177.  We can now have a real Python
formatter, yay!

To make this PR, I ran my modified yapf over the repository, then looked
over the full diff.  Where yapf was mangling the param list of long
function decls/calls (mostly kernels), I manually added `#` to put
linebreaks where we want.  I fixed up other formatting too -- mostly
adding or removing a trailing comma from lists.

Overall, trailing `#` was sufficient to get formatting similar to our
current code.  I didn't have to disable yapf anywhere.
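
For illustration, here's the trick on a made-up kernel (the names below are
hypothetical, not from this repo): a bare trailing `#` is an empty end-of-line
comment, and yapf won't join a line that ends in a comment, so each `#` pins a
linebreak exactly where it sits.

```python
# yapf would otherwise repack these params to fill each line; the bare
# trailing '#' comments pin the breaks where we want them.
@triton.jit
def my_kernel(in_ptr, out_ptr,  #
              stride_m, stride_n,  #
              BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    pass
```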

---------

Co-authored-by: Phil Tillet <phil@openai.com>
2023-11-02 20:44:17 -07:00


import gc
# import importlib
# import os
# import sys
# import tempfile
# import textwrap
# import time
import tracemalloc

import torch

import triton
import triton.language as tl

# from typing import Tuple


def test_memory_leak() -> None:

    @triton.jit
    def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr):
        xnumel = 10
        xoffset = tl.program_id(0) * XBLOCK
        xindex = xoffset + tl.arange(0, XBLOCK)[:]
        xmask = xindex < xnumel
        x0 = xindex
        tmp0 = tl.load(in_ptr0 + (x0), xmask)
        tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)

    tracemalloc.start()
    try:
        inp = torch.randn(10, device='cuda')
        out = torch.randn(10, device='cuda')
        # Warm up once so one-time allocations (compilation, caches) don't
        # count against the leak budget.
        kernel[(10, )](inp, out, 10, XBLOCK=16)
        gc.collect()
        begin, _ = tracemalloc.get_traced_memory()
        # 100 more launches should not grow traced memory if per-launch
        # allocations are released.
        for _ in range(100):
            kernel[(10, )](inp, out, 10, XBLOCK=16)
        gc.collect()
        end, _ = tracemalloc.get_traced_memory()
        assert end - begin < 30000
    finally:
        tracemalloc.stop()


# LATENCY_THRESHOLD_US = 46

# def test_kernel_launch_latency() -> None:

#     def define_kernel(kernel_name: str, num_tensor_args: int) -> str:
#         arg_str = ",".join([f"arg{i}: torch.Tensor" for i in range(num_tensor_args)])
#         arg_str += ", n_elements: int, BLOCK_SIZE: tl.constexpr"
#         func_str = f"""
# import torch
# import triton
# import triton.language as tl

# @triton.jit
# def {kernel_name}({arg_str}):
#     pass
# """
#         with tempfile.NamedTemporaryFile(mode="w+t", suffix=".py", delete=False) as temp_file:
#             temp_file.write(textwrap.dedent(func_str))
#             temp_file_path = temp_file.name
#         return temp_file_path

#     def import_kernel(file_path, kernel_name):
#         directory, filename = os.path.split(file_path)
#         module_name, _ = os.path.splitext(filename)
#         sys.path.insert(0, directory)
#         module = importlib.import_module(module_name)
#         kernel = getattr(module, kernel_name)
#         return kernel

#     def empty(*kernel_args: Tuple[torch.Tensor]):
#         first_arg = kernel_args[0]
#         n_elements = first_arg.numel()
#         grid = (triton.cdiv(n_elements, 1024), )
#         device = torch.cuda.current_device()

#         # Warmup
#         empty_kernel[grid](*kernel_args, n_elements, BLOCK_SIZE=1024, device=device)
#         torch.cuda.synchronize()

#         # Measure launch overhead at steady state
#         num_runs = 1000
#         start_time = time.time()
#         for i in range(num_runs):
#             empty_kernel[grid](*kernel_args, n_elements, BLOCK_SIZE=1024, device=device)
#         end_time = time.time()
#         latency_us = (end_time - start_time) / num_runs * 1e6
#         assert latency_us < LATENCY_THRESHOLD_US, "Kernel launch time has increased!"

#     num_tensor_args = 40
#     kernel_name = 'empty_kernel'
#     file_path = define_kernel(kernel_name, num_tensor_args)
#     empty_kernel = import_kernel(file_path, kernel_name)

#     # Initialize random tensors for the empty_kernel
#     torch.manual_seed(0)
#     size = 1024
#     kernel_args = (torch.rand(size, device='cuda') for i in range(num_tensor_args))

#     # Run empty, which would run empty_kernel internally
#     empty(*kernel_args)