Merge pull request #210 from ROCmSoftwarePlatform/tt.load_issue

Fix PyTorch 2.0 issues
2026-04-05 03:01:17 -04:00 · 2023-05-16 14:18:24 -04:00
parent 7acc1cb707 94857d1ff0
commit dbf6a638dd
12 changed files with 46 additions and 25 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,6 @@ cmake-build-*
 # cache dumps
 triton_cache*
 log_*
+
+# ptxas binary under third_party/cuda
+python/triton/third_party/cuda/bin/ptxas
--- a/bin/triton-translate.cpp
+++ b/bin/triton-translate.cpp
@@ -122,7 +122,7 @@ LogicalResult tritonTranslateMain(int argc, char **argv,
  }

  llvm::LLVMContext llvmContext;
-#if 1 // USE_ROCM doesnot work here
+#ifdef USE_ROCM
  auto llvmir = translateTritonGPUToLLVMIR(&llvmContext, *module,
                                           SMArch.getValue(), true /*isRocm*/);
 #else
--- a/lib/Target/LLVMIR/LLVMIRTranslation.cpp
+++ b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
@@ -337,11 +337,8 @@ translateTritonGPUToLLVMIR(llvm::LLVMContext *llvmContext,
    llvm::errs() << "Pass execution failed";
    return nullptr;
  }
-#ifdef USE_ROCM
-  auto llvmIR = translateLLVMToLLVMIR(llvmContext, module, true);
-#else
-  auto llvmIR = translateLLVMToLLVMIR(llvmContext, module, false);
-#endif
+
+  auto llvmIR = translateLLVMToLLVMIR(llvmContext, module, isROCM);
  if (!llvmIR) {
    llvm::errs() << "Translate to LLVM IR failed";
    return nullptr;
--- a/python/triton/compiler/compiler.py
+++ b/python/triton/compiler/compiler.py
@@ -70,7 +70,7 @@ def optimize_ttgir(mod, num_stages, arch):
    pm.enable_debug()
    pm.add_tritongpu_coalesce_pass()
    pm.add_tritongpu_remove_layout_conversions_pass()
-    if isinstance(arch, int):
+    if _is_cuda(arch):
        pm.add_tritongpu_accelerate_matmul_pass(arch)
    pm.add_tritongpu_remove_layout_conversions_pass()
    pm.add_tritongpu_optimize_dot_operands_pass()
@@ -325,6 +325,9 @@ instance_descriptor = namedtuple("instance_descriptor", ["divisible_by_16", "equ
 def _is_cuda(arch):
    return isinstance(arch, int)

+def is_hip():
+    return torch.version.hip is not None
+

 def get_architecture_descriptor(capability):
    if capability is None:
@@ -363,7 +366,11 @@ def add_cuda_stages(arch, extern_libs, stages):


 def compile(fn, **kwargs):
-    arch = get_architecture_descriptor(kwargs.get("cc", None))
+    if is_hip():
+        capability = None
+    else:
+        capability = kwargs.get("cc", None)
+    arch = get_architecture_descriptor(capability)
    is_cuda = _is_cuda(arch)
    context = _triton.ir.context()
    asm = dict()
--- a/python/triton/third_party/cuda/bin/ptxas
+++ b/python/triton/third_party/cuda/bin/ptxas
--- a/python/triton/tools/aot.py
+++ b/python/triton/tools/aot.py
@@ -83,24 +83,31 @@ if __name__ == '__main__':
        print(module)
        sys.exit(0)

-    if not args.sm:
-        raise argparse.ArgumentError(None, "Must specify --sm for PTX compilation")
+    # set arch depending on platform
+    if args.gfx:
+        arch = args.gfx
+    elif args.sm:
+        arch = args.sm
+    else:
+        raise argparse.ArgumentError(None, "Must specify --sm or --gfx for ttgir compilation")

    # triton-ir -> triton-gpu-ir
    module = tc.ttir_to_ttgir(module, num_warps=args.num_warps)
-    module = tc.optimize_ttgir(module, num_stages=3, arch=args.sm)
+    module = tc.optimize_ttgir(module, num_stages=3, arch=arch)
    if args.target == 'triton-gpu-ir':
        print(module.str())
        sys.exit(0)

    # triton-gpu-ir -> llvm-ir
-    module = tc.ttgir_to_llir(module, extern_libs=None, arch=args.sm)
+    module = tc.ttgir_to_llir(module, extern_libs=None, arch=arch)
    if args.target == 'llvm-ir':
        print(module)
        sys.exit(0)

    # llvm-ir -> ptx
    if args.target == 'ptx':
+        if not args.sm:
+            raise argparse.ArgumentError(None, "Must specify --sm for PTX compilation")
        if not args.ptx_version:
            raise argparse.ArgumentError(None, "Must specify --ptx-version for PTX compilation")
        module = tc.llir_to_ptx(module, arch=args.sm, ptx_version=args.ptx_version)
--- a/scripts/amd/build.sh
+++ b/scripts/amd/build.sh
@@ -3,14 +3,13 @@ set -x
 cd python
 pip uninstall -y triton

-sh scripts/amd/clean.sh
+bash scripts/amd/clean.sh

 export MLIR_ENABLE_DUMP=1
 export LLVM_IR_ENABLE_DUMP=1
 export AMDGCN_ENABLE_DUMP=1

 export TRITON_USE_ROCM=ON
-# export MI_GPU_ARCH=gfx90a # not needed

-pip install -U matplotlib pandas filelock tabulate
+# pip install -U matplotlib pandas filelock tabulate
 pip install --verbose -e .
--- a/scripts/amd/docker_run.sh
+++ b/scripts/amd/docker_run.sh
@@ -1,6 +1,6 @@
 set -o xtrace

-alias drun='sudo docker run -it --rm --network=host --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined'
+alias drun='sudo docker run -it --rm --network=host --user root --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined'

 # DEVICES="--gpus all"
 DEVICES="--device=/dev/kfd --device=/dev/dri"
@@ -12,11 +12,9 @@ VOLUMES="-v $HOME/dockerx:/dockerx -v /data:/data"
 # WORK_DIR='/root/$(basename $(pwd))'
 WORK_DIR="/dockerx/$(basename $(pwd))"

-# IMAGE_NAME=nvcr.io/nvidia/pytorch:21.08-py3
-# IMAGE_NAME=rocm/pytorch:latest 
-IMAGE_NAME=rocm/pytorch:rocm5.4_ubuntu20.04_py3.8_pytorch_1.12.1
-# IMAGE_NAME=rocm/pytorch:rocm4.3.1_ubuntu18.04_py3.6_pytorch_1.10.0
-# IMAGE_NAME=triton_rocm_20-52 # build this docker before running
+IMAGE_NAME=rocm/pytorch-nightly:latest
+# IMAGE_NAME=rocm/pytorch:latest
+# IMAGE_NAME=nvcr.io/nvidia/pytorch

 CONTAINER_NAME=triton

--- a/scripts/amd/pytorch.sh
+++ b/scripts/amd/pytorch.sh
@@ -0,0 +1,8 @@
+# pip install transformers
+# pip install --upgrade diffusers[torch]
+# cd ../stuff/stable_diff
+# python run.py
+
+cd ../pytorch_rocm/
+# TORCHINDUCTOR_COMPILE_THREADS=1 pytest test/inductor/test_torchinductor.py -k "test_views4_cuda"
+pytest test/inductor/test_torchinductor.py -k "test_views4_cuda"
--- a/scripts/amd/run.sh
+++ b/scripts/amd/run.sh
@@ -12,7 +12,8 @@ chmod -R 777 $LOG_DIR

 bash scripts/amd/clean.sh
 bash scripts/amd/build.sh
-bash scripts/amd/test.sh 2>&1 |tee $LOG_DIR/test.log
-# bash scripts/amd/lit.sh 2>&1 |tee $LOG_DIR/lit.log
+# bash scripts/amd/test.sh 2>&1 |tee $LOG_DIR/test.log
+# bash scripts/amd/pytorch.sh 2>&1 |tee $LOG_DIR/test.log
+bash scripts/amd/lit.sh 2>&1 |tee $LOG_DIR/lit.log
 # bash scripts/amd/test.sh backtrace 2>&1 |tee $LOG_DIR/backtrace.log
 # bash scripts/amd/cache_print.sh  2>&1 |tee $LOG_DIR/cache.log
--- a/scripts/amd/test.sh
+++ b/scripts/amd/test.sh
@@ -14,9 +14,10 @@ chmod -R 777 $LOG_DIR
 sh scripts/amd/clean.sh

 UNIT_TEST="python/test/unit/language/test_core_amd.py"
+# UNIT_TEST="python/test/unit/language/test_core.py::test_empty_kernel[float32]"
+# UNIT_TEST="python/test/unit/runtime/test_cache.py::test_compile_in_subproc"
 # UNIT_TEST="python/test/unit/language/test_core_amd.py::test_shift_op[int8-int8-<<]"
 # UNIT_TEST="python/test/unit/language/test_core_amd.py::test_shift_op[int32-int32->>]"
-# UNIT_TEST="python/test/unit/language/test_core.py::test_empty_kernel[float32]"
 # UNIT_TEST="python/test/unit/language/test_core.py::test_bin_op"
 # UNIT_TEST="python/test/unit/language/test_core.py::test_bin_op[float32-float32-+]"
 # UNIT_TEST="python/test/unit/language/test_core.py::test_bin_op[int8-float16-%]"
--- a/test/Target/tritongpu_to_llvmir.mlir
+++ b/test/Target/tritongpu_to_llvmir.mlir
@@ -1,4 +1,4 @@
-// RUN: %PYTHON -m triton.tools.aot %s --target=llvm-ir --sm=80 | FileCheck %s
+// RUN: %PYTHON -m triton.tools.aot %s --target=llvm-ir --gfx=90a | FileCheck %s

 // == LLVM IR check begin ==
 // CHECK-LABEL: ; ModuleID = 'LLVMDialectModule'