mirror of
https://github.com/ROCm/ROCm.git
synced 2026-04-05 03:01:17 -04:00
Merge pull request #210 from ROCmSoftwarePlatform/tt.load_issue
fix pyt 2.0 issues
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -28,3 +28,6 @@ cmake-build-*
|
||||
# cache dumps
|
||||
triton_cache*
|
||||
log_*
|
||||
|
||||
#
|
||||
python/triton/third_party/cuda/bin/ptxas
|
||||
|
||||
@@ -122,7 +122,7 @@ LogicalResult tritonTranslateMain(int argc, char **argv,
|
||||
}
|
||||
|
||||
llvm::LLVMContext llvmContext;
|
||||
#if 1 // USE_ROCM doesnot work here
|
||||
#ifdef USE_ROCM
|
||||
auto llvmir = translateTritonGPUToLLVMIR(&llvmContext, *module,
|
||||
SMArch.getValue(), true /*isRocm*/);
|
||||
#else
|
||||
|
||||
@@ -337,11 +337,8 @@ translateTritonGPUToLLVMIR(llvm::LLVMContext *llvmContext,
|
||||
llvm::errs() << "Pass execution failed";
|
||||
return nullptr;
|
||||
}
|
||||
#ifdef USE_ROCM
|
||||
auto llvmIR = translateLLVMToLLVMIR(llvmContext, module, true);
|
||||
#else
|
||||
auto llvmIR = translateLLVMToLLVMIR(llvmContext, module, false);
|
||||
#endif
|
||||
|
||||
auto llvmIR = translateLLVMToLLVMIR(llvmContext, module, isROCM);
|
||||
if (!llvmIR) {
|
||||
llvm::errs() << "Translate to LLVM IR failed";
|
||||
return nullptr;
|
||||
|
||||
@@ -70,7 +70,7 @@ def optimize_ttgir(mod, num_stages, arch):
|
||||
pm.enable_debug()
|
||||
pm.add_tritongpu_coalesce_pass()
|
||||
pm.add_tritongpu_remove_layout_conversions_pass()
|
||||
if isinstance(arch, int):
|
||||
if _is_cuda(arch):
|
||||
pm.add_tritongpu_accelerate_matmul_pass(arch)
|
||||
pm.add_tritongpu_remove_layout_conversions_pass()
|
||||
pm.add_tritongpu_optimize_dot_operands_pass()
|
||||
@@ -325,6 +325,9 @@ instance_descriptor = namedtuple("instance_descriptor", ["divisible_by_16", "equ
|
||||
def _is_cuda(arch):
|
||||
return isinstance(arch, int)
|
||||
|
||||
def is_hip():
|
||||
return torch.version.hip is not None
|
||||
|
||||
|
||||
def get_architecture_descriptor(capability):
|
||||
if capability is None:
|
||||
@@ -363,7 +366,11 @@ def add_cuda_stages(arch, extern_libs, stages):
|
||||
|
||||
|
||||
def compile(fn, **kwargs):
|
||||
arch = get_architecture_descriptor(kwargs.get("cc", None))
|
||||
if is_hip():
|
||||
capability = None
|
||||
else:
|
||||
capability = kwargs.get("cc", None)
|
||||
arch = get_architecture_descriptor(capability)
|
||||
is_cuda = _is_cuda(arch)
|
||||
context = _triton.ir.context()
|
||||
asm = dict()
|
||||
|
||||
BIN
python/triton/third_party/cuda/bin/ptxas
vendored
BIN
python/triton/third_party/cuda/bin/ptxas
vendored
Binary file not shown.
@@ -83,24 +83,31 @@ if __name__ == '__main__':
|
||||
print(module)
|
||||
sys.exit(0)
|
||||
|
||||
if not args.sm:
|
||||
raise argparse.ArgumentError(None, "Must specify --sm for PTX compilation")
|
||||
# set arch depending on platform
|
||||
if args.gfx:
|
||||
arch = args.gfx
|
||||
elif args.sm:
|
||||
arch = args.sm
|
||||
else:
|
||||
raise argparse.ArgumentError(None, "Must specify --sm or --gfx for ttgir compilation")
|
||||
|
||||
# triton-ir -> triton-gpu-ir
|
||||
module = tc.ttir_to_ttgir(module, num_warps=args.num_warps)
|
||||
module = tc.optimize_ttgir(module, num_stages=3, arch=args.sm)
|
||||
module = tc.optimize_ttgir(module, num_stages=3, arch=arch)
|
||||
if args.target == 'triton-gpu-ir':
|
||||
print(module.str())
|
||||
sys.exit(0)
|
||||
|
||||
# triton-gpu-ir -> llvm-ir
|
||||
module = tc.ttgir_to_llir(module, extern_libs=None, arch=args.sm)
|
||||
module = tc.ttgir_to_llir(module, extern_libs=None, arch=arch)
|
||||
if args.target == 'llvm-ir':
|
||||
print(module)
|
||||
sys.exit(0)
|
||||
|
||||
# llvm-ir -> ptx
|
||||
if args.target == 'ptx':
|
||||
if not args.sm:
|
||||
raise argparse.ArgumentError(None, "Must specify --sm for PTX compilation")
|
||||
if not args.ptx_version:
|
||||
raise argparse.ArgumentError(None, "Must specify --ptx-version for PTX compilation")
|
||||
module = tc.llir_to_ptx(module, arch=args.sm, ptx_version=args.ptx_version)
|
||||
|
||||
@@ -3,14 +3,13 @@ set -x
|
||||
cd python
|
||||
pip uninstall -y triton
|
||||
|
||||
sh scripts/amd/clean.sh
|
||||
bash scripts/amd/clean.sh
|
||||
|
||||
export MLIR_ENABLE_DUMP=1
|
||||
export LLVM_IR_ENABLE_DUMP=1
|
||||
export AMDGCN_ENABLE_DUMP=1
|
||||
|
||||
export TRITON_USE_ROCM=ON
|
||||
# export MI_GPU_ARCH=gfx90a # not needed
|
||||
|
||||
pip install -U matplotlib pandas filelock tabulate
|
||||
# pip install -U matplotlib pandas filelock tabulate
|
||||
pip install --verbose -e .
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
set -o xtrace
|
||||
|
||||
alias drun='sudo docker run -it --rm --network=host --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined'
|
||||
alias drun='sudo docker run -it --rm --network=host --user root --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined'
|
||||
|
||||
# DEVICES="--gpus all"
|
||||
DEVICES="--device=/dev/kfd --device=/dev/dri"
|
||||
@@ -12,11 +12,9 @@ VOLUMES="-v $HOME/dockerx:/dockerx -v /data:/data"
|
||||
# WORK_DIR='/root/$(basename $(pwd))'
|
||||
WORK_DIR="/dockerx/$(basename $(pwd))"
|
||||
|
||||
# IMAGE_NAME=nvcr.io/nvidia/pytorch:21.08-py3
|
||||
# IMAGE_NAME=rocm/pytorch:latest
|
||||
IMAGE_NAME=rocm/pytorch:rocm5.4_ubuntu20.04_py3.8_pytorch_1.12.1
|
||||
# IMAGE_NAME=rocm/pytorch:rocm4.3.1_ubuntu18.04_py3.6_pytorch_1.10.0
|
||||
# IMAGE_NAME=triton_rocm_20-52 # build this docker before running
|
||||
IMAGE_NAME=rocm/pytorch-nightly:latest
|
||||
# IMAGE_NAME=rocm/pytorch:latest
|
||||
# IMAGE_NAME=nvcr.io/nvidia/pytorch
|
||||
|
||||
CONTAINER_NAME=triton
|
||||
|
||||
|
||||
8
scripts/amd/pytorch.sh
Executable file
8
scripts/amd/pytorch.sh
Executable file
@@ -0,0 +1,8 @@
|
||||
# pip install transformers
|
||||
# pip install --upgrade diffusers[torch]
|
||||
# cd ../stuff/stable_diff
|
||||
# python run.py
|
||||
|
||||
cd ../pytorch_rocm/
|
||||
# TORCHINDUCTOR_COMPILE_THREADS=1 pytest test/inductor/test_torchinductor.py -k "test_views4_cuda"
|
||||
pytest test/inductor/test_torchinductor.py -k "test_views4_cuda"
|
||||
@@ -12,7 +12,8 @@ chmod -R 777 $LOG_DIR
|
||||
|
||||
bash scripts/amd/clean.sh
|
||||
bash scripts/amd/build.sh
|
||||
bash scripts/amd/test.sh 2>&1 |tee $LOG_DIR/test.log
|
||||
# bash scripts/amd/lit.sh 2>&1 |tee $LOG_DIR/lit.log
|
||||
# bash scripts/amd/test.sh 2>&1 |tee $LOG_DIR/test.log
|
||||
# bash scripts/amd/pytorch.sh 2>&1 |tee $LOG_DIR/test.log
|
||||
bash scripts/amd/lit.sh 2>&1 |tee $LOG_DIR/lit.log
|
||||
# bash scripts/amd/test.sh backtrace 2>&1 |tee $LOG_DIR/backtrace.log
|
||||
# bash scripts/amd/cache_print.sh 2>&1 |tee $LOG_DIR/cache.log
|
||||
@@ -14,9 +14,10 @@ chmod -R 777 $LOG_DIR
|
||||
sh scripts/amd/clean.sh
|
||||
|
||||
UNIT_TEST="python/test/unit/language/test_core_amd.py"
|
||||
# UNIT_TEST="python/test/unit/language/test_core.py::test_empty_kernel[float32]"
|
||||
# UNIT_TEST="python/test/unit/runtime/test_cache.py::test_compile_in_subproc"
|
||||
# UNIT_TEST="python/test/unit/language/test_core_amd.py::test_shift_op[int8-int8-<<]"
|
||||
# UNIT_TEST="python/test/unit/language/test_core_amd.py::test_shift_op[int32-int32->>]"
|
||||
# UNIT_TEST="python/test/unit/language/test_core.py::test_empty_kernel[float32]"
|
||||
# UNIT_TEST="python/test/unit/language/test_core.py::test_bin_op"
|
||||
# UNIT_TEST="python/test/unit/language/test_core.py::test_bin_op[float32-float32-+]"
|
||||
# UNIT_TEST="python/test/unit/language/test_core.py::test_bin_op[int8-float16-%]"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// RUN: %PYTHON -m triton.tools.aot %s --target=llvm-ir --sm=80 | FileCheck %s
|
||||
// RUN: %PYTHON -m triton.tools.aot %s --target=llvm-ir --gfx=90a | FileCheck %s
|
||||
|
||||
// == LLVM IR check begin ==
|
||||
// CHECK-LABEL: ; ModuleID = 'LLVMDialectModule'
|
||||
|
||||
Reference in New Issue
Block a user