diff --git a/docs/.sphinx/_toc.yml.in b/docs/.sphinx/_toc.yml.in index 0be8809c8..b311775e7 100644 --- a/docs/.sphinx/_toc.yml.in +++ b/docs/.sphinx/_toc.yml.in @@ -182,6 +182,14 @@ subtrees: - caption: How to Guides entries: + - title: Tuning Guides + file: how_to/tuning_guides/index.md + subtrees: + - entries: + - title: MI200 + file: how_to/tuning_guides/mi200.md + - title: MI100 + file: how_to/tuning_guides/mi100.md - file: how_to/deep_learning_rocm subtrees: - entries: diff --git a/docs/data/how_to/tuning_guides/image.001.png b/docs/data/how_to/tuning_guides/image.001.png new file mode 100644 index 000000000..a83fe2ee2 Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.001.png differ diff --git a/docs/data/how_to/tuning_guides/image.002.png b/docs/data/how_to/tuning_guides/image.002.png new file mode 100644 index 000000000..9338104d8 Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.002.png differ diff --git a/docs/data/how_to/tuning_guides/image.003.png b/docs/data/how_to/tuning_guides/image.003.png new file mode 100644 index 000000000..b05b09706 Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.003.png differ diff --git a/docs/data/how_to/tuning_guides/image.004.png b/docs/data/how_to/tuning_guides/image.004.png new file mode 100644 index 000000000..9fdc2a5fa Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.004.png differ diff --git a/docs/data/how_to/tuning_guides/image.005.png b/docs/data/how_to/tuning_guides/image.005.png new file mode 100644 index 000000000..10b3d7ba1 Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.005.png differ diff --git a/docs/data/how_to/tuning_guides/image.006.png b/docs/data/how_to/tuning_guides/image.006.png new file mode 100644 index 000000000..185229d25 Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.006.png differ diff --git a/docs/data/how_to/tuning_guides/image.007.png b/docs/data/how_to/tuning_guides/image.007.png new file mode 100644 index 000000000..6e5b280f2 Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.007.png differ diff --git a/docs/data/how_to/tuning_guides/image.008.png b/docs/data/how_to/tuning_guides/image.008.png new file mode 100644 index 000000000..69718df12 Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.008.png differ diff --git a/docs/data/how_to/tuning_guides/image.009.png b/docs/data/how_to/tuning_guides/image.009.png new file mode 100644 index 000000000..445405a26 Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.009.png differ diff --git a/docs/data/how_to/tuning_guides/image.010.png b/docs/data/how_to/tuning_guides/image.010.png new file mode 100644 index 000000000..6f7b855a4 Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.010.png differ diff --git a/docs/data/how_to/tuning_guides/image.011.png b/docs/data/how_to/tuning_guides/image.011.png new file mode 100644 index 000000000..c2d4df59f Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.011.png differ diff --git a/docs/data/how_to/tuning_guides/image.012.png b/docs/data/how_to/tuning_guides/image.012.png new file mode 100644 index 000000000..4d89256ba Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.012.png differ diff --git a/docs/data/how_to/tuning_guides/image.013.png b/docs/data/how_to/tuning_guides/image.013.png new file mode 100644 index 000000000..4a15a7787 Binary files /dev/null and b/docs/data/how_to/tuning_guides/image.013.png differ diff --git 
a/docs/deploy/linux/install.md b/docs/deploy/linux/install.md
index 0eaa91691..abcb41414 100644
--- a/docs/deploy/linux/install.md
+++ b/docs/deploy/linux/install.md
@@ -899,6 +899,8 @@ but are generally useful. Verification of the install is advised.
 developing these libraries or want to use self-built versions of them.)
 ```
 
+(verifying-kernel-mode-driver-installation)=
+
 ### Verifying Kernel-mode Driver Installation
 
 Check the installation of the kernel-mode driver by typing the command given
diff --git a/docs/how_to/tuning_guides/index.md b/docs/how_to/tuning_guides/index.md
new file mode 100644
index 000000000..1baa3e07e
--- /dev/null
+++ b/docs/how_to/tuning_guides/index.md
@@ -0,0 +1,65 @@
+# Tuning Guides
+
+High Performance Computing (HPC) workloads have unique requirements. The default
+hardware and BIOS configurations for OEM platforms may not provide optimal
+performance for HPC workloads. To enable optimal HPC settings on a per-platform
+and per-workload level, this guide calls out:
+
+- BIOS settings that can impact performance
+- Hardware configuration best practices
+- Supported versions of operating systems
+- Workload-specific recommendations for optimal BIOS and operating system
+  settings
+
+There is also a discussion on the AMD Instinct™ software development
+environment, including information on how to install and run the DGEMM, STREAM,
+HPCG, and HPL benchmarks. This guidance provides a good starting point but is
+not exhaustively tested across all compilers.
+
+Prerequisites to understanding this document and to performing tuning of HPC
+applications include:
+
+- Experience in configuring servers
+- Administrative access to the server's Management Interface (BMC)
+- Administrative access to the operating system
+- Familiarity with the OEM server's BMC (strongly recommended)
+- Familiarity with the OS-specific tools for configuration, monitoring, and
+  troubleshooting (strongly recommended)
+
+This document provides guidance on tuning systems with various AMD Instinct™
+accelerators for HPC workloads. It is not an all-inclusive guide, and some
+items referred to may have similar, but different, names in various OEM
+systems (for example, OEM-specific BIOS settings). This document also provides
+suggestions on items that should be the initial focus of additional,
+application-specific tuning.
+
+This document is based on the AMD EPYC™ 7003-series processor family (formerly
+codenamed "Milan").
+
+While this guide is a good starting point, developers are encouraged to perform
+their own performance testing for additional tuning.
+
+:::::{grid} 1 1 2 2
+:gutter: 1
+
+:::{grid-item-card} AMD Instinct™ MI200
+This chapter goes through how to configure your AMD Instinct™ MI200 accelerated
+compute nodes to get the best performance out of them.
+
+- [Instruction Set Architecture](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf)
+- [Whitepaper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf)
+- [Guide](./gpu_arch/mi250.md)
+
+:::
+
+:::{grid-item-card} AMD Instinct™ MI100
+This chapter briefly reviews hardware aspects of the AMD Instinct™ MI100
+accelerators and the CDNA™ 1 architecture that is the foundation of these GPUs.
+
+- [Instruction Set Architecture](https://www.amd.com/system/files/TechDocs/instinct-mi100-cdna1-shader-instruction-set-architecture%C2%A0.pdf)
+- [Whitepaper](https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf)
+- [Guide](./gpu_arch/mi100.md)
+
+:::
+
+:::::
diff --git a/docs/how_to/tuning_guides/mi100.md b/docs/how_to/tuning_guides/mi100.md
new file mode 100644
index 000000000..44819e7b7
--- /dev/null
+++ b/docs/how_to/tuning_guides/mi100.md
@@ -0,0 +1,504 @@
+# MI100 High Performance Computing and Tuning Guide
+
+## System Settings
+
+This chapter reviews system settings that are required to configure the system
+for AMD Instinct™ MI100 accelerators and that can improve performance of the
+GPUs. It is advised to configure the system for the best possible host
+configuration according to the "High Performance Computing (HPC) Tuning Guide
+for AMD EPYC™ 7002 Series Processors" or "High Performance Computing (HPC)
+Tuning Guide for AMD EPYC™ 7003 Series Processors" depending on the processor
+generation of the system.
+
+In addition to the BIOS settings listed below ({ref}`bios_settings`), the
+following settings will also have to be enacted via the command line (see
+{ref}`os_settings`):
+
+- Core C states
+- AMD-IOPM-UTIL (on AMD EPYC™ 7002 series processors)
+- IOMMU (if needed)
+
+(bios_settings)=
+
+### System BIOS Settings
+
+For maximum MI100 GPU performance on systems with AMD EPYC™ 7002 series
+processors (codename "Rome") and AMI System BIOS, the following configuration of
+System BIOS settings has been validated. These settings must be used for the
+qualification process and should be set as default values for the system BIOS.
+Analogous settings for other non-AMI System BIOS providers could be set
+similarly. For systems with Intel processors, some settings may not apply or be
+available as listed in {numref}`mi100-bios`.
+
+```{list-table} Recommended settings for the system BIOS in a GIGABYTE platform.
+:header-rows: 1 +:name: mi100-bios + +* + - BIOS Setting Location + - Parameter + - Value + - Comments +* + - Advanced / PCI Subsystem Settings + - Above 4G Decoding + - Enabled + - GPU Large BAR Support +* + - AMD CBS / CPU Common Options + - Global C-state Control + - Auto + - Global Core C-States +* + - AMD CBS / CPU Common Options + - CCD/Core/Thread Enablement + - Accept + - Global Core C-States +* + - AMD CBS / CPU Common Options / Performance + - SMT Control + - Disable + - Global Core C-States +* + - AMD CBS / DF Common Options / Memory Addressing + - NUMA nodes per socket + - NPS 1,2,4 + - NUMA Nodes (NPS) +* + - AMD CBS / DF Common Options / Memory Addressing + - Memory interleaving + - Auto + - Numa Nodes (NPS) +* + - AMD CBS / DF Common Options / Link + - 4-link xGMI max speed + - 18 Gbps + - Set AMD CPU xGMI speed to highest rate supported +* + - AMD CBS / DF Common Options / Link + - 3-link xGMI max speed + - 18 Gbps + - Set AMD CPU xGMI speed to highest rate supported +* + - AMD CBS / NBIO Common Options + - IOMMU + - Disable + - +* + - AMD CBS / NBIO Common Options + - PCIe Ten Bit Tag Support + - Enable + - +* + - AMD CBS / NBIO Common Options + - Preferred IO + - Manual + - +* + - AMD CBS / NBIO Common Options + - Preferred IO Bus + - "Use lspci to find pci device id" + - +* + - AMD CBS / NBIO Common Options + - Enhanced Preferred IO Mode + - Enable + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - Determinism Control + - Manual + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - Determinism Slider + - Power + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - cTDP Control + - Manual + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - cTDP + - 240 + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - Package Power Limit Control + - Manual + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - Package Power Limit + - 240 + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - xGMI Link Width Control + - Manual + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - xGMI Force Link Width + - 2 + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - xGMI Force Link Width Control + - Force + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - APBDIS + - 1 + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - DF C-states + - Auto + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - Fixed SOC P-state + - P0 + - +* + - AMD CBS / UMC Common Options / DDR4 Common Options + - Enforce POR + - Accept + - +* + - AMD CBS / UMC Common Options / DDR4 Common Options / Enforce POR + - Overclock + - Enabled + - +* + - AMD CBS / UMC Common Options / DDR4 Common Options / Enforce POR + - Memory Clock Speed + - 1600 MHz + - Set to max Memory Speed, if using 3200 MHz DIMMs +* + - AMD CBS / UMC Common Options / DDR4 Common Options / DRAM Controller + Configuration / DRAM Power Options + - Power Down Enable + - Disabled + - RAM Power Down +* + - AMD CBS / Security + - TSME + - Disabled + - Memory Encryption +``` + +#### NBIO Link Clock Frequency + +The NBIOs (4x per AMD EPYC™ processor) are the serializers/deserializers (also +known as "SerDes") that convert and prepare the I/O signals for the processor's +128 external I/O interface lanes (32 per NBIO). + +LCLK (short for link clock frequency) controls the link speed of the internal +bus that connects the NBIO silicon with the data fabric. 
All data between the +processor and its PCIe lanes flow to the data fabric based on these LCLK +frequency settings. The link clock frequency of the NBIO components need to be +forced to the maximum frequency for optimal PCIe performance. + +For AMD EPYC™ 7002 series processors, this setting cannot be modified via +configuration options in the server BIOS alone. Instead, the AMD-IOPM-UTIL (see +Section 3.2.3) must be run at every server boot to disable Dynamic Power +Management for all PCIe Root Complexes and NBIOs within the system and to lock +the logic into the highest performance operational mode. + +For AMD EPYC™ 7003 series processors, configuring all NBIOs to be in "Enhanced +Preferred I/O" mode is sufficient to enable highest link clock frequency for the +NBIO components. + +#### Memory Configuration + +For the memory addressing modes (see {numref}`mi100-bios`), especially the +number of NUMA nodes per socket/processor (NPS), the recommended setting is +to follow the guidance of the "High Performance Computing (HPC) Tuning Guide +for AMD EPYC™ 7002 Series Processors" and "High Performance Computing (HPC) +Tuning Guide for AMD EPYC™ 7003 Series Processors" to provide the optimal +configuration for host side computation. + +If the system is set to one NUMA domain per socket/processor (NPS1), +bidirectional copy bandwidth between host memory and GPU memory may be +slightly higher (up to about 16% more) than with four NUMA domains per socket +processor (NPS4). For memory bandwidth sensitive applications using MPI, NPS4 +is recommended. For applications that are not optimized for NUMA locality, +NPS1 is the recommended setting. + +(os_settings)= + +### Operating System Settings + +#### CPU Core State - "C States" + +There are several Core-States, or C-states that an AMD EPYC CPU can idle within: + +- C0: active. This is the active state while running an application. +- C1: idle +- C2: idle and power gated. This is a deeper sleep state and will have a + greater latency when moving back to the C0 state, compared to when the + CPU is coming out of C1. + +Disabling C2 is important for running with a high performance, low-latency +network. To disable power-gating on all cores run the following on Linux +systems: + +```shell +cpupower idle-set -d 2 +``` + +Note that the `cpupower` tool must be installed, as it is not part of the base +packages of most Linux® distributions. The package needed varies with the +respective Linux distribution. + +::::{tab-set} +:::{tab-item} Ubuntu +:sync: ubuntu + +```shell +sudo apt install linux-tools-common +``` + +::: + +:::{tab-item} Red Hat Enterprise Linux +:sync: RHEL + +```shell +sudo yum install cpupowerutils +``` + +::: + +:::{tab-item} SUSE Linux Enterprise Server 15 +:sync: SLES + +```shell +sudo zypper install cpupower +``` + +::: +:::: + +#### AMD-IOPM-UTIL + +This section applies to AMD EPYC™ 7002 processors to optimize advanced +Dynamic Power Management (DPM) in the I/O logic (see NBIO description above) +for performance. Certain I/O workloads may benefit from disabling this power +management. This utility disables DPM for all PCI-e root complexes in the +system and locks the logic into the highest performance operational mode. + +Disabling I/O DPM will reduce the latency and/or improve the throughput of +low-bandwidth messages for PCI-e InfiniBand NICs and GPUs. Other workloads +with low-bandwidth bursty PCI-e I/O characteristics may benefit as well if +multiple such PCI-e devices are installed in the system. 
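+
+Because the utility's effect does not persist across reboots, it is typically
+run at every boot, for example from a one-shot `systemd` service unit. The
+following is only a sketch that assumes the utility binary is installed as
+`/usr/bin/amd-iopm-util`; adjust the path to match your installation (the
+installer packages mentioned in the tip below create an equivalent unit
+automatically):
+
+```shell
+# Hypothetical one-shot unit; the binary path below is an assumption.
+sudo tee /etc/systemd/system/amd-iopm-util.service > /dev/null <<'EOF'
+[Unit]
+Description=Disable Dynamic Power Management on all PCIe root complexes/NBIOs
+
+[Service]
+Type=oneshot
+ExecStart=/usr/bin/amd-iopm-util
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+sudo systemctl daemon-reload
+sudo systemctl enable --now amd-iopm-util.service
+```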
+ +The actions of the utility do not persist across reboots. There is no need to +change any existing firmware settings when using this utility. The "Preferred +I/O" and "Enhanced Preferred I/O" settings should remain unchanged at enabled. + +```{tip} +The recommended method to use the utility is either to create a system +start-up script, for example, a one-shot `systemd` service unit, or run the +utility when starting up a job scheduler on the system. The installer +packages (see +[Power Management Utility](https://developer.amd.com/iopm-utility/)) will +create and enable a `systemd` service unit for you. This service unit is +configured to run in one-shot mode. This means that even when the service +unit runs as expected, the status of the service unit will show inactive. +This is the expected behavior when the utility runs normally. If the service +unit shows failed, the utility did not run as expected. The output in either +case can be shown with the `systemctl status` command. + +Stopping the service unit has no effect since the utility does not leave +anything running. To undo the effects of the utility, disable the service +unit with the `systemctl disable` command and reboot the system. + +The utility does not have any command-line options, and it must be run with +super-user permissions. +``` + +#### Systems with 256 CPU Threads - IOMMU Configuration + +For systems that have 256 logical CPU cores or more (e.g., 64-core AMD EPYC™ +7763 in a dual-socket configuration and SMT enabled), setting the Input-Output +Memory Management Unit (IOMMU) configuration to "disabled" can limit the number +of available logical cores to 255. The reason is that the Linux® kernel disables +X2APIC in this case and falls back to Advanced Programmable Interrupt Controller +(APIC), which can only enumerate a maximum of 255 (logical) cores. + +If SMT is enabled by setting "CCD/Core/Thread Enablement > SMT Control" to +"enable", the following steps can be applied to the system to enable all +(logical) cores of the system: + +- In the server BIOS, set IOMMU to "Enabled". +- When configuring the Grub boot loader, add the following arguments for the + Linux kernel: `amd_iommu=on iommu=pt` +- Update Grub to use the modified configuration: + + ```shell + sudo grub2-mkconfig -o /boot/grub2/grub.cfg + ``` + +- Reboot the system. +- Verify IOMMU passthrough mode by inspecting the kernel log via `dmesg`: + + ```none + [...] + [ 0.000000] Kernel command line: [...] amd_iommu=on iommu=pt + [...] + ``` + +Once the system is properly configured, the AMD ROCm platform can be +installed. + +## System Management + +For a complete guide on how to install/manage/uninstall ROCm on Linux, refer to +[Deploy ROCm on Linux](../../deploy/linux/index.md). For verifying that the +installation was successful, refer to +{ref}`verifying-kernel-mode-driver-installation` and +[Validation Tools](../../reference/validation_tools.md). Should verification +fail, consult the [System Debugging Guide](../system_debugging.md). + +(mi100-hw-verification)= + +### Hardware Verification with ROCm + +The AMD ROCm™ platform ships with tools to query the system structure. To query +the GPU hardware, the `rocm-smi` command is available. It can show available +GPUs in the system with their device ID and their respective firmware (or VBIOS) +versions: + +:::{figure-md} mi100-smi-showhw + +rocm-smi --showhw output on an 8*MI100 system. + +`rocm-smi --showhw` output on an 8*MI100 system. 
+::: + +Another important query is to show the system structure, the localization of the +GPUs in the system, and the fabric connections between the system components: + +:::{figure-md} mi100-smi-showtopo + +rocm-smi --showtopo output on an 8*MI100 system. + +`rocm-smi --showtopo` output on an 8*MI100 system. +::: + +The previous command shows the system structure in four blocks: + +- The first block of the output shows the distance between the GPUs similar to + what the `numactl` command outputs for the NUMA domains of a system. The + weight is a qualitative measure for the "distance" data must travel to reach + one GPU from another one. While the values do not carry a special (physical) + meaning, the higher the value the more hops are needed to reach the + destination from the source GPU. +- The second block has a matrix for the number of hops required to send data + from one GPU to another. For the GPUs in the local hive, this number is one, + while for the others it is three (one hop to leave the hive, one hop across + the processors, and one hop within the destination hive). +- The third block outputs the link types between the GPUs. This can either be + "XGMI" for AMD Infinity Fabric™ links or "PCIE" for PCIe Gen4 links. +- The fourth block reveals the localization of a GPU with respect to the NUMA + organization of the shared memory of the AMD EPYC™ processors. + +To query the compute capabilities of the GPU devices, the `rocminfo` command is +available with the AMD ROCm™ platform. It lists specific details about the GPU +devices, including but not limited to the number of compute units, width of the +SIMD pipelines, memory information, and instruction set architecture: + +:::{figure-md} mi100-rocminfo + +rocminfo output fragment on an 8*MI100 system. + +`rocminfo` output fragment on an 8*MI100 system. +::: + +For a complete list of architecture (LLVM target) names, refer to +[GPU OS Support](../../release/gpu_os_support.md). + +### Testing Inter-device Bandwidth + +{numref}`mi100-hw-verification` showed the `rocm-smi --showtopo` command to show +how the system structure and how the GPUs are located and connected in this +structure. For more details, the `rocm-bandwidth-test` can run benchmarks to +show the effective link bandwidth between the components of the system. + +The ROCm Bandwidth Test program can be installed with the following +package-manager commands: + +::::{tab-set} +:::{tab-item} Ubuntu +:sync: ubuntu + +```shell +sudo apt install rocm-bandwidth-test +``` + +::: + +:::{tab-item} Red Hat Enterprise Linux +:sync: RHEL + +```shell +sudo yum install rocm-bandwidth-test +``` + +::: + +:::{tab-item} SUSE Linux Enterprise Server 15 +:sync: SLES + +```shell +sudo zypper install rocm-bandwidth-test +``` + +::: +:::: + +Alternatively, the source code can be downloaded and built from +[source](https://github.com/RadeonOpenCompute/rocm_bandwidth_test). + +The output will list the available compute devices (CPUs and GPUs): + +:::{figure-md} mi100-bandwidth-test-1 + +rocm-bandwidth-test output fragment on an 8*MI100 system listing devices. + +`rocm-bandwidth-test` output fragment on an 8*MI100 system listing devices. +::: + +The output will also show a matrix that contains a "1" if a device can +communicate to another device (CPU and GPU) of the system and it will show the +NUMA distance (similar to `rocm-smi`): + +:::{figure-md} mi100-bandwidth-test-2 + +rocm-bandwidth-test output fragment on an 8*MI100 system showing inter-device access matrix. 
+ +`rocm-bandwidth-test` output fragment on an 8*MI100 system showing inter-device access matrix. +::: + +:::{figure-md} mi100-bandwidth-test-3 + +rocm-bandwidth-test output fragment on an 8*MI100 system showing inter-device NUMA distance. + +`rocm-bandwidth-test` output fragment on an 8*MI100 system showing inter-device NUMA distance. +::: + +The output also contains the measured bandwidth for unidirectional and +bidirectional transfers between the devices (CPU and GPU): + +:::{figure-md} mi100-bandwidth-test-4 + +rocm-bandwidth-test output fragment on an 8*MI100 system showing uni- and bidirectional bandwidths. + +`rocm-bandwidth-test` output fragment on an 8*MI100 system showing uni- and bidirectional bandwidths. +::: diff --git a/docs/how_to/tuning_guides/mi200.md b/docs/how_to/tuning_guides/mi200.md new file mode 100644 index 000000000..5ef8e1e77 --- /dev/null +++ b/docs/how_to/tuning_guides/mi200.md @@ -0,0 +1,483 @@ +# MI200 High Performance Computing and Tuning Guide + +## System Settings + +This chapter reviews system settings that are required to configure the system +for AMD Instinct MI250 accelerators and improve the performance of the GPUs. It +is advised to configure the system for the best possible host configuration +according to the "High Performance Computing (HPC) Tuning Guide for AMD EPYC +7003 Series Processors." + +Configure the system BIOS settings as explained in {ref}`bios_settings` and +enact the below given settings via the command line as explained in +{ref}`os_settings`: + +- Core C states +- IOMMU (if needed) + +(bios_settings)= + +### System BIOS Settings + +For maximum MI250 GPU performance on systems with AMD EPYC™ 7003-series +processors (codename "Milan") and AMI System BIOS, the following configuration +of system BIOS settings has been validated. These settings must be used for the +qualification process and should be set as default values for the system BIOS. +Analogous settings for other non-AMI System BIOS providers could be set +similarly. For systems with Intel processors, some settings may not apply or be +available as listed in {numref}`mi200-bios`. + +Table 2: Recommended settings for the system BIOS in a GIGABYTE platform. + +```{list-table} Recommended settings for the system BIOS in a GIGABYTE platform. 
+:header-rows: 1 +:name: mi200-bios + +* + - BIOS Setting Location + - Parameter + - Value + - Comments +* + - Advanced / PCI Subsystem Settings + - Above 4G Decoding + - Enabled + - GPU Large BAR Support +* + - Advanced / PCI Subsystem Settings + - SR-IOV Support + - Disabled + - Disable Single Root IO Virtualization +* + - AMD CBS / CPU Common Options + - Global C-state Control + - Auto + - Global Core C-States +* + - AMD CBS / CPU Common Options + - CCD/Core/Thread Enablement + - Accept + - Global Core C-States +* + - AMD CBS / CPU Common Options / Performance + - SMT Control + - Disable + - Global Core C-States +* + - AMD CBS / DF Common Options / Memory Addressing + - NUMA nodes per socket + - NPS 1,2,4 + - NUMA Nodes (NPS) +* + - AMD CBS / DF Common Options / Memory Addressing + - Memory interleaving + - Auto + - Numa Nodes (NPS) +* + - AMD CBS / DF Common Options / Link + - 4-link xGMI max speed + - 18 Gbps + - Set AMD CPU xGMI speed to highest rate supported +* + - AMD CBS / NBIO Common Options + - IOMMU + - Disable + - +* + - AMD CBS / NBIO Common Options + - PCIe Ten Bit Tag Support + - Auto + - +* + - AMD CBS / NBIO Common Options + - Preferred IO + - Bus + - +* + - AMD CBS / NBIO Common Options + - Preferred IO Bus + - "Use lspci to find pci device id" + - +* + - AMD CBS / NBIO Common Options + - Enhanced Preferred IO Mode + - Enable + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - Determinism Control + - Manual + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - Determinism Slider + - Power + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - cTDP Control + - Manual + - Set cTDP to the maximum supported by the installed CPU +* + - AMD CBS / NBIO Common Options / SMU Common Options + - cTDP + - 280 + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - Package Power Limit Control + - Manual + - Set Package Power Limit to the maximum supported by the installed CPU +* + - AMD CBS / NBIO Common Options / SMU Common Options + - Package Power Limit + - 280 + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - xGMI Link Width Control + - Manual + - Set AMD CPU xGMI width to 16 bits +* + - AMD CBS / NBIO Common Options / SMU Common Options + - xGMI Force Link Width + - 2 + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - xGMI Force Link Width Control + - Force + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - APBDIS + - 1 + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - DF C-states + - Enabled + - +* + - AMD CBS / NBIO Common Options / SMU Common Options + - Fixed SOC P-state + - P0 + - +* + - AMD CBS / UMC Common Options / DDR4 Common Options + - Enforce POR + - Accept + - +* + - AMD CBS / UMC Common Options / DDR4 Common Options / Enforce POR + - Overclock + - Enabled + - +* + - AMD CBS / UMC Common Options / DDR4 Common Options / Enforce POR + - Memory Clock Speed + - 1600 MHz + - Set to max Memory Speed, if using 3200 MHz DIMMs +* + - AMD CBS / UMC Common Options / DDR4 Common Options / DRAM Controller + Configuration / DRAM Power Options + - Power Down Enable + - Disabled + - RAM Power Down +* + - AMD CBS / Security + - TSME + - Disabled + - Memory Encryption +``` + +#### NBIO Link Clock Frequency + +The NBIOs (4x per AMD EPYC™ processor) are the serializers/deserializers (also +known as "SerDes") that convert and prepare the I/O signals for the processor's +128 external I/O interface lanes (32 per NBIO). 
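+
+Once the platform is configured, it can be useful to confirm that the
+accelerators' PCIe endpoints behind these NBIOs have trained to the expected
+Gen4 speed and width. A minimal sketch using `lspci` is shown below; the only
+assumption is the `1002` filter, which is AMD's PCI vendor ID:
+
+```shell
+# Show the negotiated PCIe link speed/width for all AMD (vendor ID 0x1002)
+# devices. LnkSta should report "Speed 16GT/s" (PCIe Gen4) at the expected
+# width for the accelerator endpoints.
+for dev in $(lspci -d 1002: | awk '{print $1}'); do
+  echo "== $(lspci -s "$dev") =="
+  sudo lspci -vv -s "$dev" | grep -E 'LnkCap:|LnkSta:'
+done
+```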
+ +LCLK (short for link clock frequency) controls the link speed of the internal +bus that connects the NBIO silicon with the data fabric. All data between the +processor and its PCIe lanes flow to the data fabric based on these LCLK +frequency settings. The link clock frequency of the NBIO components need to be +forced to the maximum frequency for optimal PCIe performance. + +For AMD EPYC™ 7003 series processors, configuring all NBIOs to be in "Enhanced +Preferred I/O" mode is sufficient to enable highest link clock frequency for the +NBIO components. + +#### Memory Configuration + +For setting the memory addressing modes (see {numref}`mi200-bios`), especially +the number of NUMA nodes per socket/processor (NPS), follow the guidance of the +"High Performance Computing (HPC) Tuning Guide for AMD EPYC 7003 Series +Processors" to provide the optimal configuration for host side computation. For +most HPC workloads, NPS=4 is the recommended value. + +(os_settings)= + +### Operating System Settings + +#### CPU Core State - "C States" + +There are several Core-States, or C-states that an AMD EPYC CPU can idle within: + +- C0: active. This is the active state while running an application. +- C1: idle +- C2: idle and power gated. This is a deeper sleep state and will have a + greater latency when moving back to the C0 state, compared to when the + CPU is coming out of C1. + +Disabling C2 is important for running with a high performance, low-latency +network. To disable power-gating on all cores run the following on Linux +systems: + +```shell +cpupower idle-set -d 2 +``` + +Note that the `cpupower` tool must be installed, as it is not part of the base +packages of most Linux® distributions. The package needed varies with the +respective Linux distribution. + +::::{tab-set} +:::{tab-item} Ubuntu +:sync: ubuntu + +```shell +sudo apt install linux-tools-common +``` + +::: + +:::{tab-item} Red Hat Enterprise Linux +:sync: RHEL + +```shell +sudo yum install cpupowerutils +``` + +::: + +:::{tab-item} SUSE Linux Enterprise Server 15 +:sync: SLES + +```shell +sudo zypper install cpupower +``` + +::: +:::: + +#### AMD-IOPM-UTIL + +This section applies to AMD EPYC™ 7002 processors to optimize advanced +Dynamic Power Management (DPM) in the I/O logic (see NBIO description above) +for performance. Certain I/O workloads may benefit from disabling this power +management. This utility disables DPM for all PCI-e root complexes in the +system and locks the logic into the highest performance operational mode. + +Disabling I/O DPM will reduce the latency and/or improve the throughput of +low-bandwidth messages for PCI-e InfiniBand NICs and GPUs. Other workloads +with low-bandwidth bursty PCI-e I/O characteristics may benefit as well if +multiple such PCI-e devices are installed in the system. + +The actions of the utility do not persist across reboots. There is no need to +change any existing firmware settings when using this utility. The "Preferred +I/O" and "Enhanced Preferred I/O" settings should remain unchanged at enabled. + +```{tip} +The recommended method to use the utility is either to create a system +start-up script, for example, a one-shot `systemd` service unit, or run the +utility when starting up a job scheduler on the system. The installer +packages (see +[Power Management Utility](https://developer.amd.com/iopm-utility/)) will +create and enable a `systemd` service unit for you. This service unit is +configured to run in one-shot mode. 
This means that even when the service +unit runs as expected, the status of the service unit will show inactive. +This is the expected behavior when the utility runs normally. If the service +unit shows failed, the utility did not run as expected. The output in either +case can be shown with the `systemctl status` command. + +Stopping the service unit has no effect since the utility does not leave +anything running. To undo the effects of the utility, disable the service +unit with the `systemctl disable` command and reboot the system. + +The utility does not have any command-line options, and it must be run with +super-user permissions. +``` + +#### Systems with 256 CPU Threads - IOMMU Configuration + +For systems that have 256 logical CPU cores or more (e.g., 64-core AMD EPYC™ +7763 in a dual-socket configuration and SMT enabled), setting the Input-Output +Memory Management Unit (IOMMU) configuration to "disabled" can limit the number +of available logical cores to 255. The reason is that the Linux® kernel disables +X2APIC in this case and falls back to Advanced Programmable Interrupt Controller +(APIC), which can only enumerate a maximum of 255 (logical) cores. + +If SMT is enabled by setting "CCD/Core/Thread Enablement > SMT Control" to +"enable", the following steps can be applied to the system to enable all +(logical) cores of the system: + +- In the server BIOS, set IOMMU to "Enabled". +- When configuring the Grub boot loader, add the following arguments for the + Linux kernel: `amd_iommu=on iommu=pt` +- Update Grub to use the modified configuration: + + ```shell + sudo grub2-mkconfig -o /boot/grub2/grub.cfg + ``` + +- Reboot the system. +- Verify IOMMU passthrough mode by inspecting the kernel log via `dmesg`: + + ```none + [...] + [ 0.000000] Kernel command line: [...] amd_iommu=on iommu=pt + [...] + ``` + +Once the system is properly configured, the AMD ROCm platform can be +installed. + +## System Management + +For a complete guide on how to install/manage/uninstall ROCm on Linux, refer to +[Deploy ROCm on Linux](../../deploy/linux/index.md). For verifying that the +installation was successful, refer to +{ref}`verifying-kernel-mode-driver-installation` and +[Validation Tools](../../reference/validation_tools.md). Should verification +fail, consult the [System Debugging Guide](../system_debugging.md). + +(mi200-hw-verification)= + +### Hardware Verification with ROCm + +The AMD ROCm™ platform ships with tools to query the system structure. To query +the GPU hardware, the `rocm-smi` command is available. It can show available +GPUs in the system with their device ID and their respective firmware (or VBIOS) +versions: + +:::{figure-md} mi200-smi-showhw + +rocm-smi --showhw output on an 8*MI200 system. + +`rocm-smi --showhw` output on an 8*MI200 system. +::: + +To see the system structure, the localization of the GPUs in the system, and the +fabric connections between the system components, use: + +:::{figure-md} mi200-smi-showtopo + +rocm-smi --showtopo output on an 8*MI200 system. + +`rocm-smi --showtopo` output on an 8*MI200 system. +::: + +- The first block of the output shows the distance between the GPUs similar to + what the `numactl` command outputs for the NUMA domains of a system. The + weight is a qualitative measure for the "distance" data must travel to reach + one GPU from another one. While the values do not carry a special (physical) + meaning, the higher the value the more hops are needed to reach the + destination from the source GPU. 
+- The second block has a matrix named "Hops between two GPUs", where 1 means the + two GPUs are directly connected with XGMI, 2 means both GPUs are linked to the + same CPU socket and GPU communications will go through the CPU, and 3 means + both GPUs are linked to different CPU sockets so communications will go + through both CPU sockets. This number is one for all GPUs in this case since + they are all connected to each other through the Infinity Fabric links. +- The third block outputs the link types between the GPUs. This can either be + "XGMI" for AMD Infinity Fabric links or "PCIE" for PCIe Gen4 links. +- The fourth block reveals the localization of a GPU with respect to the NUMA + organization of the shared memory of the AMD EPYC processors. + +To query the compute capabilities of the GPU devices, use `rocminfo` command. It +lists specific details about the GPU devices, including but not limited to the +number of compute units, width of the SIMD pipelines, memory information, and +instruction set architecture: + +:::{figure-md} mi200-rocminfo + +rocminfo output fragment on an 8*MI200 system. + +`rocminfo` output fragment on an 8*MI200 system. +::: + +For a complete list of architecture (LLVM target) names, refer to +[GPU OS Support](../../release/gpu_os_support.md). + +### Testing Inter-device Bandwidth + +{numref}`mi100-hw-verification` showed the `rocm-smi --showtopo` command to show +how the system structure and how the GPUs are located and connected in this +structure. For more details, the `rocm-bandwidth-test` can run benchmarks to +show the effective link bandwidth between the components of the system. + +The ROCm Bandwidth Test program can be installed with the following +package-manager commands: + +::::{tab-set} +:::{tab-item} Ubuntu +:sync: ubuntu + +```shell +sudo apt install rocm-bandwidth-test +``` + +::: + +:::{tab-item} Red Hat Enterprise Linux +:sync: RHEL + +```shell +sudo yum install rocm-bandwidth-test +``` + +::: + +:::{tab-item} SUSE Linux Enterprise Server 15 +:sync: SLES + +```shell +sudo zypper install rocm-bandwidth-test +``` + +::: +:::: + +Alternatively, the source code can be downloaded and built from +[source](https://github.com/RadeonOpenCompute/rocm_bandwidth_test). + +The output will list the available compute devices (CPUs and GPUs), including +their device ID and PCIe ID: + +:::{figure-md} mi200-bandwidth-test-1 + +rocm-bandwidth-test output fragment on an 8*MI200 system listing devices. + +`rocm-bandwidth-test` output fragment on an 8*MI200 system listing devices. +::: + +The output will also show a matrix that contains a "1" if a device can +communicate to another device (CPU and GPU) of the system and it will show the +NUMA distance (similar to `rocm-smi`): + +:::{figure-md} mi200-bandwidth-test-2 + +rocm-bandwidth-test output fragment on an 8*MI200 system showing inter-device access matrix and NUMA distances. + +`rocm-bandwidth-test` output fragment on an 8*MI200 system showing inter-device access matrix and NUMA distances. +::: + +The output also contains the measured bandwidth for unidirectional and +bidirectional transfers between the devices (CPU and GPU): + +:::{figure-md} mi200-bandwidth-test-3 + +rocm-bandwidth-test output fragment on an 8*MI200 system showing uni- and bidirectional bandwidths. + +`rocm-bandwidth-test` output fragment on an 8*MI200 system showing uni- and bidirectional bandwidths. +:::
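+
+A full sweep over all device combinations can take a while on an 8-GPU system.
+When only a specific link is of interest, individual devices can be selected on
+the command line. The option letters below are a sketch based on the tool's
+help output; confirm them against the installed version with
+`rocm-bandwidth-test -h`:
+
+```shell
+rocm-bandwidth-test -e        # enumerate the CPU/GPU devices and their indices
+rocm-bandwidth-test -s 0 -d 1 # unidirectional copy from device 0 to device 1
+rocm-bandwidth-test -b 0,1    # bidirectional copy between devices 0 and 1
+```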