Merge commit 'cb3d79a185e40c9d8a579bea07747a8a8d157d52' into ifu-231117

Conflicts:
	lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
	lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
	lib/Dialect/TritonGPU/IR/Dialect.cpp
	python/setup.py
	python/test/unit/language/assert_helper.py
	python/test/unit/operators/test_flash_attention.py
	python/test/unit/runtime/test_subproc.py
	python/triton/compiler/compiler.py
	python/triton/language/semantic.py
	python/triton/runtime/autotuner.py
	python/triton/runtime/jit.py
	python/tutorials/03-matrix-multiplication.py
	python/tutorials/05-layer-norm.py
	python/tutorials/06-fused-attention.py
	python/tutorials/11-grouped-gemm.py
	test/Conversion/tritongpu_to_llvm.mlir
This commit is contained in:
Jason Furmanek
2023-11-17 20:42:12 +00:00
179 changed files with 10116 additions and 6835 deletions

View File

@@ -55,6 +55,7 @@ class Package(NamedTuple):
lib_flag: str
syspath_var_name: str
# pybind11
@@ -63,6 +64,7 @@ def get_pybind11_package_info():
url = "https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz"
return Package("pybind11", name, url, "PYBIND11_INCLUDE_DIR", "", "PYBIND11_SYSPATH")
# llvm
@@ -74,6 +76,8 @@ def get_llvm_package_info():
arch = 'arm64'
if system == "Darwin":
arch = platform.machine()
if arch == "x86_64":
arch = "x64"
system_suffix = f"macos-{arch}"
elif system == "Linux":
# TODO: arm64
@@ -84,7 +88,7 @@ def get_llvm_package_info():
return Package("llvm", "LLVM-C.lib", "", "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH")
# use_assert_enabled_llvm = check_env_flag("TRITON_USE_ASSERT_ENABLED_LLVM", "False")
# release_suffix = "assert" if use_assert_enabled_llvm else "release"
rev = "b1115f8c"
rev = "49af6502"
name = f"llvm-{rev}-{system_suffix}"
url = f"https://tritonlang.blob.core.windows.net/llvm-builds/{name}.tar.gz"
return Package("llvm", name, url, "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH")
@@ -119,10 +123,13 @@ def get_thirdparty_packages(triton_cache_path):
thirdparty_cmake_args.append(f"-D{p.lib_flag}={package_dir}/lib")
return thirdparty_cmake_args
# ---- package data ---
def download_and_copy(src_path, version, url_func):
def download_and_copy(src_path, variable, version, url_func):
if variable in os.environ:
return
base_dir = os.path.dirname(__file__)
arch = platform.machine()
if arch == "x86_64":
@@ -148,7 +155,7 @@ def download_and_copy(src_path, version, url_func):
src_path = os.path.join(temp_dir, src_path)
os.makedirs(os.path.split(dst_path)[0], exist_ok=True)
shutil.copy(src_path, dst_path)
return dst_suffix
# ---- cmake extension ----
@@ -167,18 +174,21 @@ def get_cmake_dir():
class CMakeClean(clean):
    """Subclass of ``clean`` that points ``build_temp`` at the CMake directory."""

    def initialize_options(self):
        """Apply the parent's option defaults, then override ``build_temp``."""
        super().initialize_options()
        # build_temp is replaced with whatever get_cmake_dir() returns.
        cmake_dir = get_cmake_dir()
        self.build_temp = cmake_dir
class CMakeBuildPy(build_py):
    """Subclass of ``build_py`` that triggers ``build_ext`` before running."""

    def run(self) -> None:
        """Run the ``build_ext`` command, then delegate to the parent ``run``."""
        # The extension build is invoked first; the parent's run() follows.
        self.run_command('build_ext')
        outcome = super().run()
        return outcome
class CMakeExtension(Extension):
def __init__(self, name, path, sourcedir=""):
Extension.__init__(self, name, sources=[])
self.sourcedir = os.path.abspath(sourcedir)
@@ -201,7 +211,8 @@ class CMakeBuild(build_ext):
try:
out = subprocess.check_output(["cmake", "--version"])
except OSError:
raise RuntimeError("CMake must be installed to build the following extensions: " + ", ".join(e.name for e in self.extensions))
raise RuntimeError("CMake must be installed to build the following extensions: " +
", ".join(e.name for e in self.extensions))
match = re.search(r"version\s*(?P<major>\d+)\.(?P<minor>\d+)([\d.]+)?", out.decode())
cmake_major, cmake_minor = int(match.group("major")), int(match.group("minor"))
@@ -228,8 +239,10 @@ class CMakeBuild(build_ext):
# python directories
python_include_dir = sysconfig.get_path("platinclude")
cmake_args = [
"-G", "Ninja", # Ninja is much faster than make
"-DCMAKE_MAKE_PROGRAM=" + ninja_dir, # Pass explicit path to ninja otherwise cmake may cache a temporary path
"-G",
"Ninja", # Ninja is much faster than make
"-DCMAKE_MAKE_PROGRAM=" +
ninja_dir, # Pass explicit path to ninja otherwise cmake may cache a temporary path
"-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
"-DLLVM_ENABLE_WERROR=ON",
"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
@@ -263,12 +276,28 @@ class CMakeBuild(build_ext):
build_args += ['-j' + max_jobs]
if check_env_flag("TRITON_BUILD_WITH_CLANG_LLD"):
cmake_args += ["-DCMAKE_C_COMPILER=clang",
"-DCMAKE_CXX_COMPILER=clang++",
"-DCMAKE_LINKER=lld",
"-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld",
"-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld",
"-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld"]
cmake_args += [
"-DCMAKE_C_COMPILER=clang",
"-DCMAKE_CXX_COMPILER=clang++",
"-DCMAKE_LINKER=lld",
"-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld",
"-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld",
"-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld",
]
# Note that asan doesn't work with binaries that use the GPU, so this is
# only useful for tools like triton-opt that don't run code on the GPU.
#
# I tried and gave up getting msan to work. It seems that libstdc++'s
# std::string does not play nicely with clang's msan (I didn't try
# gcc's). I was unable to configure clang to ignore the error, and I
# also wasn't able to get libc++ to work, but that doesn't mean it's
# impossible. :)
if check_env_flag("TRITON_BUILD_WITH_ASAN"):
cmake_args += [
"-DCMAKE_C_FLAGS=-fsanitize=address",
"-DCMAKE_CXX_FLAGS=-fsanitize=address",
]
if check_env_flag("TRITON_BUILD_WITH_CCACHE"):
cmake_args += [
@@ -282,9 +311,27 @@ class CMakeBuild(build_ext):
subprocess.check_call(["cmake", "--build", ".", "--target", "mlir-doc"], cwd=cmake_dir)
download_and_copy(src_path='bin/ptxas', version='12.1.105', url_func=lambda arch, version: f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-nvcc-{version}-0.tar.bz2")
download_and_copy(src_path='bin/cuobjdump', version='12.1.111', url_func=lambda arch, version: f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-cuobjdump-{version}-0.tar.bz2")
download_and_copy(src_path='bin/nvdisasm', version='12.1.105', url_func=lambda arch, version: f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-nvdisasm-{version}-0.tar.bz2")
download_and_copy(
src_path="bin/ptxas",
variable="TRITON_PTXAS_PATH",
version="12.1.105",
url_func=lambda arch, version:
f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-nvcc-{version}-0.tar.bz2",
)
download_and_copy(
src_path="bin/cuobjdump",
variable="TRITON_CUOBJDUMP_PATH",
version="12.1.111",
url_func=lambda arch, version:
f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-cuobjdump-{version}-0.tar.bz2",
)
download_and_copy(
src_path="bin/nvdisasm",
variable="TRITON_NVDISASM_PATH",
version="12.1.105",
url_func=lambda arch, version:
f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-nvdisasm-{version}-0.tar.bz2",
)
setup(
name="triton",
@@ -307,10 +354,14 @@ setup(
"triton/third_party",
"triton/tools",
],
<<<<<<< HEAD
long_description_content_type="text/markdown",
install_requires=[
"filelock"
],
=======
install_requires=["filelock"],
>>>>>>> cb3d79a185e40c9d8a579bea07747a8a8d157d52
include_package_data=True,
ext_modules=[CMakeExtension("triton", "triton/_C/")],
cmdclass={"build_ext": CMakeBuild, "build_py": CMakeBuildPy, "clean": CMakeClean},