From 48db1eea8d9de0788d5ccdca9273b8c269f033a7 Mon Sep 17 00:00:00 2001 From: Nara Date: Mon, 24 Apr 2023 15:09:09 +0200 Subject: [PATCH] Spell checking (#2070) * ci: cleanup linters and add spelling checker * docs: fix spelling and styling issues --- .github/workflows/linters.yml | 22 ---- .github/workflows/linting.yml | 51 ++++++++ .github/workflows/markdownlint.yml | 17 --- docs/about.md | 23 ++-- docs/deploy/advanced/spack.md | 2 +- docs/deploy/docker.md | 6 +- docs/deploy/spack.md | 2 +- .../inception_casestudy.md | 110 +++++++++--------- docs/how_to/magma_install/magma_install.md | 4 +- docs/how_to/manage_install/install_linux.md | 38 +++--- .../how_to/pytorch_install/pytorch_install.md | 28 ++--- docs/how_to/quick_start_linux.md | 4 +- docs/how_to/quick_start_windows.md | 3 - .../tensorflow_install/tensorflow_install.md | 6 +- docs/kernel_userspace.md | 2 +- docs/reference/all.md | 2 +- docs/reference/compilers.md | 2 +- .../framework_compatibility.md | 2 +- docs/reference/gpu_arch.md | 10 +- docs/reference/gpu_arch/mi100.md | 8 +- .../reference/gpu_libraries/linear_algebra.md | 4 +- docs/reference/gpu_libraries/rand.md | 2 +- .../kernel_userspace_comp.md | 2 +- docs/reference/management_tools.md | 4 +- docs/reference/openmp/openmp.md | 66 +++++------ docs/reference/rocmcc/rocmcc.md | 91 ++++++++------- docs/release/docker_support_matrix.md | 22 ++-- docs/release/licensing.md | 1 + docs/rocm_stack.md | 6 +- docs/understand/file_reorg.md | 10 +- docs/understand/installing_linux.md | 20 ++-- .../package_manager_integration.md | 42 +++---- 32 files changed, 312 insertions(+), 300 deletions(-) delete mode 100644 .github/workflows/linters.yml create mode 100644 .github/workflows/linting.yml delete mode 100644 .github/workflows/markdownlint.yml diff --git a/.github/workflows/linters.yml b/.github/workflows/linters.yml deleted file mode 100644 index 1fd77a3e5..000000000 --- a/.github/workflows/linters.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Linting - -on: - push: - pull_request: - branches: [develop, main] - -concurrency: - group: ${{ github.ref }}-${{ github.workflow }} - cancel-in-progress: true - -jobs: - build: - name: Restructured text - runs-on: ubuntu-latest - steps: - - name: Checkout Code - uses: actions/checkout@v3 - - name: Install rst-lint - run: pip install restructuredtext-lint - - name: Lint ResT files - run: rst-lint ${{ join(github.workspace, '/docs') }} diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml new file mode 100644 index 000000000..e1a579d4f --- /dev/null +++ b/.github/workflows/linting.yml @@ -0,0 +1,51 @@ +name: Linting + +on: + push: + pull_request: + branches: [develop, main] + +concurrency: + group: ${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + lint-rest: + name: "RestructuredText" + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Install rst-lint + run: pip install restructuredtext-lint + - name: Lint ResT files + run: rst-lint ${{ join(github.workspace, '/docs') }} + + lint-md: + name: "Markdown" + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Use markdownlint + uses: actionshub/markdownlint@v3.1.3 + with: + filesToIgnoreRegex: "CHANGELOG.md|tools\\/autotag\\/templates\\/." 
+ + spelling: + name: "Spelling" + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Fetch config + shell: sh + run: | + curl --silent --show-error --fail --location https://raw.github.com/RadeonOpenCompute/rocm-docs-core/develop/.spellcheck.yaml -O + curl --silent --show-error --fail --location https://raw.github.com/RadeonOpenCompute/rocm-docs-core/develop/.wordlist.txt >> .wordlist.txt + - name: Run spellcheck + uses: rojopolis/spellcheck-github-actions@0.30.0 + - name: On fail + if: failure() + run: | + echo "Please check for spelling mistakes or add them to '.wordlist.txt' in either the root of this project or in rocm-docs-core." diff --git a/.github/workflows/markdownlint.yml b/.github/workflows/markdownlint.yml deleted file mode 100644 index 0d5f57dd1..000000000 --- a/.github/workflows/markdownlint.yml +++ /dev/null @@ -1,17 +0,0 @@ - -name: Markdownlint Action -on: - push: - pull_request: - branches: [develop, main] - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: Check out code - uses: actions/checkout@v3 - - name: Use markdownlint - uses: actionshub/markdownlint@v3.1.3 - with: - filesToIgnoreRegex: "CHANGELOG.md|tools\\/autotag\\/templates\\/." \ No newline at end of file diff --git a/docs/about.md b/docs/about.md index 5235089d7..b16c8ec1e 100644 --- a/docs/about.md +++ b/docs/about.md @@ -7,8 +7,8 @@ yourself with our documentation toolchain. ## ReadTheDocs -[ReadTheDocs](https://docs.readthedocs.io/en/stable/) is our frontend for the -our documentation. By frontend, this is the tool that serves our HTML based +[ReadTheDocs](https://docs.readthedocs.io/en/stable/) is our front end for the +our documentation. By front end, this is the tool that serves our HTML based documentation to our end users. ## Doxygen @@ -21,18 +21,17 @@ upstream project is using a different tool). [Sphinx](https://www.sphinx-doc.org/en/master/) is a documentation generator originally used for python. It is now widely used in the Open Source community. -Originally, sphinx supported rst based documentation. Markdown support is now +Originally, sphinx supported RST based documentation. Markdown support is now available. ROCm documentation plans to default to markdown for new projects. -Existing projects using rst are under no obligation to convert to markdown. New +Existing projects using RST are under no obligation to convert to markdown. New projects that believe markdown is not suitable should contact the documentation -team prior to selecting rst. +team prior to selecting RST. ### MyST [Markedly Structured Text (MyST)](https://myst-tools.org/docs/spec) is an extended -flavor of Markdown ([https://commonmark.org/](CommonMark)) influenced by ReStructured -Text (RST) and Sphinx. -It is intergrated via [`myst-parser`](https://myst-parser.readthedocs.io/en/latest/). +flavor of Markdown ([CommonMark](https://commonmark.org/)) influenced by reStructuredText (RST) and Sphinx. +It is integrated via [`myst-parser`](https://myst-parser.readthedocs.io/en/latest/). A cheat sheet that showcases how to use the MyST syntax is available over at [the Jupyter reference](https://jupyterbook.org/en/stable/reference/cheatsheet.html). @@ -56,18 +55,18 @@ Other features may be used in the future. ROCm uses the [sphinx-external-toc](https://sphinx-external-toc.readthedocs.io/en/latest/intro.html) -for our navigation. This tool allows a yml file based left navigation menu. This +for our navigation. This tool allows a YAML file based left navigation menu. 
This tool was selected due to its flexibility that allows scripts to operate on the -yml file. Please transition to this file for the project's navigation. You can +YAML file. Please transition to this file for the project's navigation. You can see the `_toc.yml.in` file in this repository in the docs/sphinx folder for an example. ### Breathe -Sphinx uses [Breathe](https://www.breathe-doc.org/) to integrate doxygen +Sphinx uses [Breathe](https://www.breathe-doc.org/) to integrate Doxygen content. -## rocm-docs-core pip package +## `rocm-docs-core` pip package [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) is an AMD maintained project that applies customization for our documentation. This diff --git a/docs/deploy/advanced/spack.md b/docs/deploy/advanced/spack.md index 3f7eb96d0..483ec4f9b 100644 --- a/docs/deploy/advanced/spack.md +++ b/docs/deploy/advanced/spack.md @@ -1 +1 @@ -# spack +# Spack diff --git a/docs/deploy/docker.md b/docs/deploy/docker.md index 68a46f1b3..5c7279f41 100644 --- a/docs/deploy/docker.md +++ b/docs/deploy/docker.md @@ -1,6 +1,6 @@ # Docker -## Prequisites +## Prerequisites Docker containers share the kernel with the host operating system, therefore the ROCm kernel-mode driver must be installed on the host. Please refer to the @@ -56,9 +56,9 @@ docker run --device /dev/kfd --device /dev/dri/renderD128 --device /dev/dri/rend ### Base images hosts images useful for users -wishing to build their own containers levaraging ROCm. The built images are +wishing to build their own containers leveraging ROCm. The built images are available from [Docker Hub](https://hub.docker.com/u/rocm). In particular -`rocm/rocm-terminal` is a small image with the prequisites to build HIP +`rocm/rocm-terminal` is a small image with the prerequisites to build HIP applications, but does not include any libraries. ### Applications diff --git a/docs/deploy/spack.md b/docs/deploy/spack.md index 3f7eb96d0..483ec4f9b 100644 --- a/docs/deploy/spack.md +++ b/docs/deploy/spack.md @@ -1 +1 @@ -# spack +# Spack diff --git a/docs/examples/inception_casestudy/inception_casestudy.md b/docs/examples/inception_casestudy/inception_casestudy.md index 3cb119bf7..4c1800775 100644 --- a/docs/examples/inception_casestudy/inception_casestudy.md +++ b/docs/examples/inception_casestudy/inception_casestudy.md @@ -46,11 +46,11 @@ The following sections contain case studies for the Inception v3 model. Convolution Neural Networks are forms of artificial neural networks commonly used for image processing. One of the core layers of such a network is the convolutional layer, which convolves the input with a weight tensor and passes the result to the next layer. Inception v3 [1] is an architectural development over the ImageNet competition-winning entry, AlexNet, using more profound and broader networks while attempting to meet computational and memory budgets. -The implementation uses PyTorch as a framework. This case study utilizes torchvision [2], a repository of popular datasets and model architectures, for obtaining the model. Torchvision also provides pretrained weights as a starting point to develop new models or fine-tune the model for a new task. +The implementation uses PyTorch as a framework. This case study utilizes `torchvision` [2], a repository of popular datasets and model architectures, for obtaining the model. `torchvision` also provides pre-trained weights as a starting point to develop new models or fine-tune the model for a new task. 
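A minimal sketch of what that looks like, assuming a `torchvision` release that still accepts the `pretrained=True` flag used elsewhere in this case study:

```py
import torchvision.models as models

# Minimal sketch: fetch Inception v3 with pre-trained ImageNet weights,
# mirroring the call made later in this guide before exporting to ONNX.
inception = models.inception_v3(pretrained=True)
inception.eval()  # switch to inference mode before evaluating images
```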
-#### Evaluating a Pretrained Model +#### Evaluating a Pre-Trained Model -The Inception v3 model introduces a simple image classification task with the pretrained model. This does not involve training but utilizes an already pretrained model from torchvision. +The Inception v3 model introduces a simple image classification task with the pre-trained model. This does not involve training but utilizes an already pre-trained model from `torchvision`. This example is adapted from the PyTorch research hub page on Inception v3 [3]. @@ -85,7 +85,7 @@ Follow these steps: except: urllib.request.urlretrieve(url, filename) ``` -5. Import torchvision and PIL Image support libraries. +5. Import `torchvision` and `PIL.Image` support libraries. ```py from PIL import Image @@ -124,7 +124,7 @@ Follow these steps: print(probabilities) ``` -9. To understand the probabilities, download and examine the Imagenet labels. +9. To understand the probabilities, download and examine the ImageNet labels. ```py wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt @@ -153,13 +153,13 @@ Follow these steps: docker run -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G rocm/pytorch:latest ``` -2. Download an imagenet database. For this example, the tiny-imagenet-200 [4], a smaller ImageNet variant with 200 image classes and a training dataset with 100,000 images, was downsized to 64x64 color images. +2. Download an ImageNet database. For this example, the `tiny-imagenet-200` [4], a smaller ImageNet variant with 200 image classes and a training dataset with 100,000 images, was downsized to 64x64 color images. ```py wget http://cs231n.stanford.edu/tiny-imagenet-200.zip ``` -3. Process the database to set the validation directory to the format expected by PyTorch DataLoader. +3. Process the database to set the validation directory to the format expected by PyTorch's `DataLoader`. 4. Run the following script: @@ -196,7 +196,7 @@ Follow these steps: 5. Open a Python shell. -6. Import dependencies, including torch, OS, and torchvision. +6. Import dependencies, including `torch`, `os`, and `torchvision`. ```py import torch @@ -209,14 +209,14 @@ Follow these steps: 7. Set parameters to guide the training process. :::{note} - The device is set to "cuda". In PyTorch, "cuda" is a generic keyword to denote a GPU. + The device is set to `"cuda"`. In PyTorch, `"cuda"` is a generic keyword to denote a GPU. ::: ```py device = "cuda" ``` -8. Set the data_path to the location of the training and validation data. In this case, the tiny-imagenet-200 is present as a subdirectory to the current directory. +8. Set the data_path to the location of the training and validation data. In this case, the `tiny-imagenet-200` is present as a subdirectory to the current directory. ```py data_path = "tiny-imagenet-200" @@ -241,7 +241,7 @@ Follow these steps: val_resize_size = 342 ``` - The pretrained Inception v3 model is chosen to be downloaded from torchvision. + The pre-trained Inception v3 model is chosen to be downloaded from `torchvision`. ```py model_name = "inception_v3" @@ -254,13 +254,13 @@ Follow these steps: batch_size = 32 ``` - This refers to the number of CPU threads the data loader uses to perform efficient multiprocess data loading. + This refers to the number of CPU threads the data loader uses to perform efficient multi-process data loading. 
```py num_workers = 16 ``` - The PyTorch optim package provides methods to adjust the learning rate as the training progresses. This example uses the StepLR scheduler, which decays the learning rate by lr_gamma at every lr_step_size number of epochs. + The `torch.optim` package provides methods to adjust the learning rate as the training progresses. This example uses the `StepLR` scheduler, which decays the learning rate by `lr_gamma` at every `lr_step_size` number of epochs. ```py learning_rate = 0.1 @@ -334,7 +334,7 @@ Follow these steps: ``` :::{note} - Use torchvision to obtain the Inception v3 model. Use the pretrained model weights to speed up training. + Use `torchvision` to obtain the Inception v3 model. Use the pre-trained model weights to speed up training. ::: ```py @@ -343,7 +343,7 @@ Follow these steps: model = torchvision.models.__dict__[model_name](pretrained=pretrained) ``` -11. Adapt Inception v3 for the current dataset. Tiny-imagenet-200 contains only 200 classes, whereas Inception v3 is designed for 1,000-class output. The last layer of Inception v3 is replaced to match the output features required. +11. Adapt Inception v3 for the current dataset. `tiny-imagenet-200` contains only 200 classes, whereas Inception v3 is designed for 1,000-class output. The last layer of Inception v3 is replaced to match the output features required. ```py model.fc = torch.nn.Linear(model.fc.in_features, len(dataset.classes)) @@ -477,7 +477,7 @@ The CIFAR-10 (Canadian Institute for Advanced Research) dataset is a subset of t Follow these steps: -1. Import dependencies, including torch, OS, and torchvision. +1. Import dependencies, including `torch`, `os`, and `torchvision`. ```py import torch @@ -487,7 +487,7 @@ Follow these steps: import numpy as np ``` -2. The output of torchvision datasets is PILImage images of range [0, 1]. Transform them to Tensors of normalized range [-1, 1]. +2. The output of `torchvision` datasets is `PILImage` images of range [0, 1]. Transform them to Tensors of normalized range [-1, 1]. ```py transform = transforms.Compose( @@ -501,7 +501,7 @@ Follow these steps: batch_size = 4 ``` -3. Download the dataset train and test datasets as follows. Specify the batch size, shuffle the dataset once, and specify the number of workers to the number of CPU threads used by the data loader to perform efficient multiprocess data loading. +3. Download the dataset train and test datasets as follows. Specify the batch size, shuffle the dataset once, and specify the number of workers to the number of CPU threads used by the data loader to perform efficient multi-process data loading. ```py train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) @@ -523,7 +523,7 @@ Follow these steps: print("defined classes") ``` -6. Unnormalize the images and then iterate over them. +6. Denormalize the images and then iterate over them. ```py global image_number @@ -544,7 +544,7 @@ Follow these steps: print("image created and saved ") ``` -7. Import the torch.nn for constructing neural networks and torch.nn.functional to use the convolution functions. +7. Import the `torch.nn` for constructing neural networks and `torch.nn.functional` to use the convolution functions. ```py import torch.nn as nn @@ -682,7 +682,7 @@ Access the source code from the following repository: To understand the code step by step, follow these steps: -1. Import libraries like TensorFlow, Numpy, and Matplotlib to train the neural network and calculate and plot graphs. +1. 
Import libraries like TensorFlow, NumPy, and Matplotlib to train the neural network and calculate and plot graphs. ```py import tensorflow as tf @@ -775,7 +775,7 @@ To understand the code step by step, follow these steps: --- ``` - The basic building block of a neural network is the layer. Layers extract representations from the data fed into them. Deep Learning consists of chaining together simple layers. Most layers, such as tf.keras.layers.Dense, have parameters that are learned during training. + The basic building block of a neural network is the layer. Layers extract representations from the data fed into them. Deep Learning consists of chaining together simple layers. Most layers, such as `tf.keras.layers.Dense`, have parameters that are learned during training. ```py model = tf.keras.Sequential([ @@ -785,9 +785,9 @@ To understand the code step by step, follow these steps: ]) ``` - - The first layer in this network tf.keras.layers.Flatten transforms the format of the images from a two-dimensional array (of 28 x 28 pixels) to a one-dimensional array (of 28 * 28 = 784 pixels). Think of this layer as unstacking rows of pixels in the image and lining them up. This layer has no parameters to learn; it only reformats the data. + - The first layer in this network `tf.keras.layers.Flatten` transforms the format of the images from a two-dimensional array (of 28 x 28 pixels) to a one-dimensional array (of 28 * 28 = 784 pixels). Think of this layer as unstacking rows of pixels in the image and lining them up. This layer has no parameters to learn; it only reformats the data. - - After the pixels are flattened, the network consists of a sequence of two tf.keras.layers.Dense layers. These are densely connected or fully connected neural layers. The first Dense layer has 128 nodes (or neurons). The second (and last) layer returns a logits array with a length of 10. Each node contains a score that indicates the current image belongs to one of the 10 classes. + - After the pixels are flattened, the network consists of a sequence of two `tf.keras.layers.Dense` layers. These are densely connected or fully connected neural layers. The first Dense layer has 128 nodes (or neurons). The second (and last) layer returns a logits array with a length of 10. Each node contains a score that indicates the current image belongs to one of the 10 classes. 12. You must add the Loss function, Metrics, and Optimizer at the time of model compilation. @@ -883,7 +883,7 @@ To understand the code step by step, follow these steps: thisplot[true_label].set_color('blue') ``` - 9. With the model trained, you can use it to make predictions about some images. Review the 0th image predictions and the prediction array. Correct prediction labels are blue, and incorrect prediction labels are red. The number gives the percentage (out of 100) for the predicted label. + 9. With the model trained, you can use it to make predictions about some images. Review the 0-th image predictions and the prediction array. Correct prediction labels are blue, and incorrect prediction labels are red. The number gives the percentage (out of 100) for the predicted label. ```py i = 0 @@ -925,7 +925,7 @@ To understand the code step by step, follow these steps: print(img.shape) ``` - 11. tf.keras models are optimized to make predictions on a batch, or collection, of examples at once. Accordingly, even though you are using a single image, you must add it to a list. + 11. 
`tf.keras` models are optimized to make predictions on a batch, or collection, of examples at once. Accordingly, even though you are using a single image, you must add it to a list. ```py # Add the image to a batch where it's the only member. @@ -952,7 +952,7 @@ To understand the code step by step, follow these steps: --- ``` - 13. tf.keras.Model.predict returns a list of lists—one for each image in the batch of data. Grab the predictions for our (only) image in the batch. + 13. `tf.keras.Model.predict` returns a list of lists—one for each image in the batch of data. Grab the predictions for our (only) image in the batch. ```py np.argmax(predictions_single[0]) @@ -960,7 +960,7 @@ To understand the code step by step, follow these steps: ### Case Study: TensorFlow with Text Classification -This procedure demonstrates text classification starting from plain text files stored on disk. You will train a binary classifier to perform sentiment analysis on an IMDB dataset. At the end of the notebook, there is an exercise for you to try in which you will train a multiclass classifier to predict the tag for a programming question on Stack Overflow. +This procedure demonstrates text classification starting from plain text files stored on disk. You will train a binary classifier to perform sentiment analysis on an IMDB dataset. At the end of the notebook, there is an exercise for you to try in which you will train a multi-class classifier to predict the tag for a programming question on Stack Overflow. Follow these steps: @@ -1042,7 +1042,7 @@ Follow these steps: raw_train_ds=tf.keras.utils.text_dataset_from_directory('aclImdb/train',batch_size=batch_size, validation_split=0.2,subset='training', seed=seed) ``` -8. As you will see in a moment, you can train a model by passing a dataset directly to model.fit. If you are new to tf.data, you can also iterate over the dataset and print a few examples as follows: +8. As you will see in a moment, you can train a model by passing a dataset directly to `model.fit`. If you are new to `tf.data`, you can also iterate over the dataset and print a few examples as follows: ```py for text_batch, label_batch in raw_train_ds.take(1): @@ -1072,7 +1072,7 @@ Follow these steps: To prepare the data for training, follow these steps: -1. Standardize, tokenize, and vectorize the data using the helpful tf.keras.layers.TextVectorization layer. +1. Standardize, tokenize, and vectorize the data using the helpful `tf.keras.layers.TextVectorization` layer. ```py def custom_standardization(input_data): @@ -1081,7 +1081,7 @@ To prepare the data for training, follow these steps: return tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation),'') ``` -2. Create a TextVectorization layer. Use this layer to standardize, tokenize, and vectorize our data. Set the output_mode to int to create unique integer indices for each token. Note that we are using the default split function and the custom standardization function you defined above. You will also define some constants for the model, like an explicit maximum sequence_length, which will cause the layer to pad or truncate sequences to exactly sequence_length values. +2. Create a `TextVectorization` layer. Use this layer to standardize, tokenize, and vectorize our data. Set the output_mode to int to create unique integer indices for each token. Note that we are using the default split function and the custom standardization function you defined above. 
You will also define some constants for the model, like an explicit maximum sequence_length, which will cause the layer to pad or truncate sequences to exactly sequence_length values. ```py max_features = 10000 @@ -1129,7 +1129,7 @@ To prepare the data for training, follow these steps: print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary()))) ``` -6. You are nearly ready to train your model. As a final preprocessing step, apply the TextVectorization layer we created earlier to train, validate, and test the dataset. +6. You are nearly ready to train your model. As a final preprocessing step, apply the `TextVectorization` layer we created earlier to train, validate, and test the dataset. ```py train_ds = raw_train_ds.map(vectorize_text) @@ -1137,9 +1137,9 @@ To prepare the data for training, follow these steps: test_ds = raw_test_ds.map(vectorize_text) ``` - The cache() function keeps data in memory after it is loaded off disk. This ensures the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files. + The `cache()` function keeps data in memory after it is loaded off disk. This ensures the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files. - The prefetch() function overlaps data preprocessing and model execution while training. + The `prefetch()` function overlaps data preprocessing and model execution while training. ```py AUTOTUNE = tf.data.AUTOTUNE @@ -1308,7 +1308,7 @@ MIGraphX provides easy-to-use APIs in C++ and Python to import machine models in After optimization passes, all these operators get mapped to different kernels on GPUs or CPUs. -After importing a model into MIGraphX, the model is represented as migraphx::program. migraphx::program is made up of migraphx::module. The program can consist of several modules, but it always has one main_module. Modules are made up of migraphx::instruction_ref. Instructions contain the migraphx::op and arguments to the operator.   +After importing a model into MIGraphX, the model is represented as `migraphx::program`. `migraphx::program` is made up of `migraphx::module`. The program can consist of several modules, but it always has one main_module. Modules are made up of `migraphx::instruction_ref`. Instructions contain the `migraphx::op` and arguments to the operator.   ### MIGraphX Installation @@ -1322,19 +1322,19 @@ To install MIGraphX on Debian-based systems like Ubuntu, use the following comma sudo apt update && sudo apt install -y migraphx ``` -The header files and libs are installed under /opt/rocm-\, where \ is the ROCm version. +The header files and libraries are installed under `/opt/rocm-\`, where \ is the ROCm version. #### Option 2: Building from Source There are two ways to build the MIGraphX sources. -- [Use the ROCm build tool](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#use-the-rocm-build-tool-rbuild) - This approach uses [rbuild](https://github.com/RadeonOpenCompute/rbuild) to install the prerequisites and build the libs with just one command. 
+- [Use the ROCm build tool](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#use-the-rocm-build-tool-rbuild) - This approach uses [rbuild](https://github.com/RadeonOpenCompute/rbuild) to install the prerequisites and build the libraries with just one command. or - [Use CMake](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#use-cmake-to-build-migraphx) - This approach uses a script to install the prerequisites, then uses CMake to build the source. -For detailed steps on building from source and installing dependencies, refer to the following README file: +For detailed steps on building from source and installing dependencies, refer to the following `README` file: [https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#building-from-source](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX#building-from-source) @@ -1342,13 +1342,13 @@ For detailed steps on building from source and installing dependencies, refer to To use Docker, follow these steps: -1. The easiest way to set up the development environment is to use Docker. To build Docker from scratch, first clone the MIGraphX repo by running: +1. The easiest way to set up the development environment is to use Docker. To build Docker from scratch, first clone the MIGraphX repository by running: ```bash git clone --recursive https://github.com/ROCmSoftwarePlatform/AMDMIGraphX ``` -2. The repo contains a Dockerfile from which you can build a Docker image as: +2. The repository contains a Dockerfile from which you can build a Docker image as: ```bash docker build -t migraphx . @@ -1373,19 +1373,19 @@ inception = models.inception_v3(pretrained=True) torch.onnx.export(inception,torch.randn(1,3,299,299), "inceptioni1.onnx") ``` -This will create inceptioni1.onnx, which can be imported in MIGraphX using C++ or Python API. +This will create `inceptioni1.onnx`, which can be imported in MIGraphX using C++ or Python API. ### MIGraphX Python API Follow these steps: -1. To import the migraphx module in Python script, set PYTHONPATH to migraphx libs installation. If binaries are installed using steps mentioned in [Option 1: Installing Binaries](#option-1-installing-binaries), perform the following action: +1. To import the MIGraphX module in Python script, set `PYTHONPATH` to the MIGraphX libraries installation. If binaries are installed using steps mentioned in [Option 1: Installing Binaries](#option-1-installing-binaries), perform the following action: ```py export PYTHONPATH=$PYTHONPATH:/opt/rocm/ ``` -2. The following script shows the usage of Python API to import the ONNX model, compile it, and run inference on it. Set LD_LIBRARY_PATH to /opt/rocm/ if required. +2. The following script shows the usage of Python API to import the ONNX model, compile it, and run inference on it. Set `LD_LIBRARY_PATH` to `/opt/rocm/` if required. ```py # import migraphx and numpy @@ -1407,13 +1407,13 @@ Follow these steps: print(np.argmax(result_np)) ``` - Find additional examples of Python API in the /examples directory of the MIGraphX repo. + Find additional examples of Python API in the /examples directory of the MIGraphX repository. ### MIGraphX C++ API Follow these steps: -1. The following is a minimalist example that shows the usage of MIGraphX C++ API to load ONNX file, compile it for the GPU, and run inference on it. To use MIGraphX C++ API, you only need to load the migraphx.hpp file. This example runs inference on the Inception v3 model. +1. 
The following is a minimalist example that shows the usage of MIGraphX C++ API to load ONNX file, compile it for the GPU, and run inference on it. To use MIGraphX C++ API, you only need to load the `migraphx.hpp` file. This example runs inference on the Inception v3 model. ```c++ #include @@ -1461,7 +1461,7 @@ Follow these steps: } ``` -2. To compile this program, you can use CMake and you only need to link the migraphx::c library to use MIGraphX's C++ API. The following is the CMakeLists.txt file that can build the earlier example: +2. To compile this program, you can use CMake and you only need to link the `migraphx::c` library to use MIGraphX's C++ API. The following is the `CMakeLists.txt` file that can build the earlier example: ```py cmake_minimum_required(VERSION 3.5) @@ -1479,7 +1479,7 @@ Follow these steps: target_link_libraries(${EXAMPLE} migraphx::c) ``` -3. To build the executable file, run the following from the directory containing the inception_inference.cpp file: +3. To build the executable file, run the following from the directory containing the `inception_inference.cpp` file: ```py mkdir build @@ -1490,7 +1490,7 @@ Follow these steps: ``` :::{note} - Set LD_LIBRARY_PATH to /opt/rocm/lib if required during the build. Additional examples can be found in the MIGraphX repo under the examples/ directory. + Set `LD_LIBRARY_PATH` to `/opt/rocm/lib` if required during the build. Additional examples can be found in the MIGraphX repository under the `/examples/` directory. ::: ### Tuning MIGraphX @@ -1577,38 +1577,38 @@ Inference time: 0.004ms #### YModel -The best inference performance through MIGraphX is conditioned upon having tuned kernel configs stored in a /home local User Database (DB). If a user were to move their model to a different server or allow a different user to use it, they would have to run through the MIOpen tuning process again to populate the next User DB with the best kernel configs and corresponding solvers. +The best inference performance through MIGraphX is conditioned upon having tuned kernel configurations stored in a /home local User Database (DB). If a user were to move their model to a different server or allow a different user to use it, they would have to run through the MIOpen tuning process again to populate the next User DB with the best kernel configurations and corresponding solvers. Tuning is time consuming, and if the users have not performed tuning, they would see discrepancies between expected or claimed inference performance and actual inference performance. This has led to repetitive and time-consuming tuning tasks for each user. -MIGraphX introduces a feature, known as YModel, that stores the kernel config parameters found during tuning into a .mxr file. This ensures the same level of expected performance, even when a model is copied to a different user/system. +MIGraphX introduces a feature, known as YModel, that stores the kernel config parameters found during tuning into a `.mxr` file. This ensures the same level of expected performance, even when a model is copied to a different user/system. The YModel feature is available starting from ROCm 5.4.1 and UIF 1.1. ##### YModel Example -Through the \`migraphx-driver\` functionality, you can generate .mxr files with tuning information stored inside it by passing additional --binary --output model.mxr to \`migraphx-driver\` along with the rest of the necessary flags. 
+Through the `migraphx-driver` functionality, you can generate `.mxr` files with tuning information stored inside it by passing additional `--binary --output model.mxr` to `migraphx-driver` along with the rest of the necessary flags. -For example, to generate .mxr file from the onnx model, use the following: +For example, to generate `.mxr` file from the ONNX model, use the following: ```bash ./path/to/migraphx-driver compile --onnx resnet50.onnx --enable-offload-copy --binary --output resnet50.mxr ``` -To run generated .mxr files through \`migraphx-driver\`, use the following: +To run generated `.mxr` files through `migraphx-driver`, use the following: ```bash ./path/to/migraphx-driver run --migraphx resnet50.mxr --enable-offload-copy ``` -Alternatively, you can use MIGraphX’s C++ or Python API to generate .mxr file. Refer to {numref}`image018` for an example. +Alternatively, you can use MIGraphX's C++ or Python API to generate `.mxr` file. Refer to {numref}`image018` for an example. ```{figure} ../../data/understand/deep_learning/image.018.png :name: image018 --- align: center --- -Generating a .mxr File +Generating a `.mxr` File ``` ## Troubleshooting diff --git a/docs/how_to/magma_install/magma_install.md b/docs/how_to/magma_install/magma_install.md index cb33492ea..b088b431d 100644 --- a/docs/how_to/magma_install/magma_install.md +++ b/docs/how_to/magma_install/magma_install.md @@ -2,7 +2,7 @@ ## MAGMA for ROCm -Matrix Algebra on GPU and Multicore Architectures, abbreviated as MAGMA, is a +Matrix Algebra on GPU and Multi-core Architectures, abbreviated as MAGMA, is a collection of next-generation dense linear algebra libraries that is designed for heterogeneous architectures, such as multiple GPUs and multi- or many-core CPUs. @@ -29,7 +29,7 @@ To build MAGMA from the source, follow these steps: export PYTORCH_ROCM_ARCH= ``` - `` is the architecture reported by the rocminfo command. + `` is the architecture reported by the `rocminfo` command. 2. Use the following: diff --git a/docs/how_to/manage_install/install_linux.md b/docs/how_to/manage_install/install_linux.md index 52aebb77d..b4082dc57 100644 --- a/docs/how_to/manage_install/install_linux.md +++ b/docs/how_to/manage_install/install_linux.md @@ -204,15 +204,15 @@ automation). Example: `amdgpu-install -y --usecase=rocm` When the installation is initiated in Docker, the installer tries to install the use case along with the kernel-mode driver. However, you cannot install the kernel-mode driver in a Docker container. To skip the installation of the -kernel-mode driver, proceed with the --no-dkms option, as shown below: +kernel-mode driver, proceed with the `--no-dkms` option, as shown below: ```shell sudo amdgpu-install --usecase=rocm --no-dkms ``` -### Using the Installer Script for Multiversion ROCm Installation +### Using the Installer Script for Multi-version ROCm Installation -The multiversion ROCm installation requires you to download and install the +The multi-version ROCm installation requires you to download and install the latest ROCm release installer from the list of ROCm releases you want to install simultaneously on your system. @@ -225,7 +225,7 @@ Installer Script](#download-and-install-the-installer-script) section. ```{attention} If the existing ROCm release contains non-versioned ROCm packages, uninstall -those packages before proceeding with the multiversion installation to avoid +those packages before proceeding with the multi-version installation to avoid conflicts. 
``` @@ -308,7 +308,7 @@ sudo zypper ref ::::: :::::: -#### Use the Installer to Install Multiversion ROCm Meta-packages +#### Use the Installer to Install Multi-version ROCm Meta-packages Use the installer script as given below: @@ -329,7 +329,7 @@ sudo amdgpu-install --usecase=rocm --rocmrelease=4.5.0 --no-dkms sudo amdgpu-install --usecase=rocm --rocmrelease=5.4.3 --no-dkms ``` -Following are examples of ROCm multiversion installation. The kernel-mode +Following are examples of ROCm multi-version installation. The kernel-mode driver, associated with the ROCm release v5.4.3, will be installed as its latest release in the list. @@ -378,7 +378,7 @@ distribution, follow the steps below: 6. **Add ROCm stack repository** – Ensure the ROCm stack repository is added. -7. **Install single-version or multiversion ROCm meta-packages** – Install the +7. **Install single-version or multi-version ROCm meta-packages** – Install the desired meta-packages. 8. **Verify installation for the applicable distributions** – Verify if the @@ -398,7 +398,7 @@ packages with subsequent releases. When a new ROCm release is available, the new repository, specific to that release, is added. You can select a specific release to install, update the previously installed single version to the later available release, or add the latest version of ROCm along with the currently -installed version by using the multiversion ROCm packages. +installed version by using the multi-version ROCm packages. ```{note} Users installing multiple versions of the ROCm stack must use the @@ -476,10 +476,10 @@ follow these steps: ``` ```{note} - The gpg key may change; ensure it is updated when installing a new release. If + The GPG key may change; ensure it is updated when installing a new release. If the key signature verification fails while updating, re-add the key from the - ROCm to the apt repository as mentioned above. The current rocm.gpg.key is not - available in a standard key ring distribution but has the following sha1sum + ROCm to the apt repository as mentioned above. The current `rocm.gpg.key` is not + available in a standard key ring distribution but has the following SHA1 sum hash: `73f5d8100de6048aa38a8b84cd9a87f05177d208 rocm.gpg.key` ``` @@ -568,7 +568,7 @@ follow these steps: ```{important} If the existing ROCm release contains non-versioned ROCm packages, you must - uninstall those packages before proceeding to the multiversion installation + uninstall those packages before proceeding to the multi-version installation to avoid conflicts. ``` @@ -730,7 +730,7 @@ follow these steps: ```{important} If the existing ROCm release contains non-versioned ROCm packages, you must - uninstall those packages before proceeding to the multiversion installation + uninstall those packages before proceeding to the multi-version installation to avoid conflicts. ``` @@ -750,7 +750,7 @@ ROCm installation requires you to install `linux-headers` and kernel's version. **Example:** If the system is running the Linux kernel version -`5.3.18-57_11.0.18`, you must install the same versions of linux-headers and +`5.3.18-57_11.0.18`, you must install the same versions of Linux headers and development packages. Refer to {ref}`check-kernel-info` on to how to check the system's kernel version. @@ -759,7 +759,7 @@ these steps: 1. Ensure that the correct version of the latest `kernel-default-devel` and `kernel-default` packages are installed. 
The following command lists the - installed kernel-default-devel and kernel-default package: + installed `kernel-default-devel` and `kernel-default` package: ```shell sudo zypper info kernel-default-devel or kernel-default @@ -841,7 +841,7 @@ these steps: ```{important} If the existing ROCm release contains non-versioned ROCm packages, you must - uninstall those packages before proceeding to the multiversion installation + uninstall those packages before proceeding to the multi-version installation to avoid conflicts. ``` @@ -894,9 +894,9 @@ but are generally useful. Verification of the install is advised. for details. (Entries in the `PATH` minus `bin` and `sbin` are added to library search - paths, therefore this convenience will affect builds and result in ROCm libs - almost always being found. This may be an issue when you're developing these - libraries or want to use self-built versions of them.) + paths, therefore this convenience will affect builds and result in ROCm + libraries almost always being found. This may be an issue when you're + developing these libraries or want to use self-built versions of them.) ``` ### Verifying Kernel-mode Driver Installation diff --git a/docs/how_to/pytorch_install/pytorch_install.md b/docs/how_to/pytorch_install/pytorch_install.md index c5894669a..c3124725b 100644 --- a/docs/how_to/pytorch_install/pytorch_install.md +++ b/docs/how_to/pytorch_install/pytorch_install.md @@ -8,7 +8,7 @@ automatic differentiation. Other advanced features include: - Support for distributed training - Native ONNX support -- C++ frontend +- C++ front-end - The ability to deploy at scale using TorchServe - A production-ready deployment mechanism through TorchScript @@ -19,7 +19,7 @@ To install ROCm on bare metal, refer to the section The recommended option to get a PyTorch environment is through Docker. However, installing the PyTorch wheels package on bare metal is also supported. -#### Option 1 (Recommended): Use Docker Image with PyTorch Pre-installed +#### Option 1 (Recommended): Use Docker Image with PyTorch Pre-Installed Using Docker gives you portability and access to a prebuilt Docker container that has been rigorously tested within AMD. This might also save on the @@ -56,7 +56,7 @@ Follow these steps: PyTorch supports the ROCm platform by providing tested wheels packages. To access this feature, refer to [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/) -and choose the "ROCm" compute platform. {numref}`Installation-Matrix-from-Pytorch` is a matrix from pytroch.org that illustrates the installation compatibility between ROCm and the PyTorch build. +and choose the "ROCm" compute platform. {numref}`Installation-Matrix-from-Pytorch` is a matrix from that illustrates the installation compatibility between ROCm and the PyTorch build. ```{figure} ../../data/how_to/magma_install/image.006.png :name: Installation-Matrix-from-Pytorch @@ -79,7 +79,7 @@ To install PyTorch using the wheels package, follow these installation steps: installation directions in the section [Installation](https://docs.amd.com/bundle/ROCm-Deep-Learning-Guide-v5.4-/page/Prerequisites.html#d2999e60). ROCm 5.2 is installed in this example, as supported by the installation - matrix from pytorch.org. + matrix from . or @@ -103,7 +103,7 @@ To install PyTorch using the wheels package, follow these installation steps: pip3 install wheel setuptools ``` -4. Install torch, torchvision, and torchaudio as specified by the installation +4. 
Install torch, `torchvision`, and `torchaudio` as specified by the installation matrix. :::{note} @@ -158,7 +158,7 @@ Follow these steps: 4. Build PyTorch for ROCm. :::{note} - By default in the rocm/pytorch:latest-base, PyTorch builds for these + By default in the `rocm/pytorch:latest-base`, PyTorch builds for these architectures simultaneously: - gfx900 - gfx906 @@ -253,7 +253,7 @@ Follow these steps: 5. Build PyTorch for ROCm. :::{note} - By default in the rocm/pytorch:latest-base, PyTorch builds for these + By default in the `rocm/pytorch:latest-base`, PyTorch builds for these architectures simultaneously: - gfx900 - gfx906 @@ -274,7 +274,7 @@ Follow these steps: export PYTORCH_ROCM_ARCH= ``` - `` is the architecture reported by the rocminfo command. + `` is the architecture reported by the `rocminfo` command. 8. Build PyTorch using: @@ -314,7 +314,7 @@ Follow these steps: ``` 2. Test if the GPU is accessible from PyTorch. In the PyTorch framework, - torch.cuda is a generic mechanism to access the GPU; it will access an AMD + `torch.cuda` is a generic mechanism to access the GPU; it will access an AMD GPU only if available. ```bash @@ -338,8 +338,8 @@ Follow these steps: errors when running the unit tests. ::: - This will first install some dependencies, such as a supported torchvision - version for PyTorch. Torchvision is used in some PyTorch tests for loading + This will first install some dependencies, such as a supported `torchvision` + version for PyTorch. `torchvision` is used in some PyTorch tests for loading models. Next, this will run all the unit tests. :::{note} @@ -356,7 +356,7 @@ Follow these steps: PYTORCH_TEST_WITH_ROCM=1 python3 test/test_nn.py --verbose ``` - test_nn.py can be replaced with any other test set. + `test_nn.py` can be replaced with any other test set. ### Run a Basic PyTorch Example @@ -381,7 +381,7 @@ Follow these steps: cd examples/mnist ``` -3. Follow the instructions in the README file in this folder. In this case: +3. Follow the instructions in the `README` file in this folder. In this case: ```bash pip3 install -r requirements.txt @@ -394,7 +394,7 @@ Follow these steps: cd examples/imagenet ``` -5. Follow the instructions in the README file in this folder. In this case: +5. Follow the instructions in the `README` file in this folder. In this case: ```bash pip3 install -r requirements.txt diff --git a/docs/how_to/quick_start_linux.md b/docs/how_to/quick_start_linux.md index 760b7d9f5..4fcbb81e7 100644 --- a/docs/how_to/quick_start_linux.md +++ b/docs/how_to/quick_start_linux.md @@ -4,7 +4,7 @@ The driver package uses [`DKMS`](https://en.wikipedia.org/wiki/Dynamic_Kernel_Module_Support) to build -the amdgpu module (driver) for the installed kernels. This requires the linux +the `amdgpu-dkms` module (driver) for the installed kernels. This requires the Linux kernel headers and modules to be installed for each. Usually these are automatically installed with the kernel, but if you have multiple kernel versions or you have downloaded the kernel images and not the kernel @@ -251,7 +251,7 @@ sudo zypper ref ## Install Drivers -Install the amdgpu kernel module, aka driver, on your system. +Install the `amdgpu-dkms` kernel module, aka driver, on your system. 
::::{tab-set} diff --git a/docs/how_to/quick_start_windows.md b/docs/how_to/quick_start_windows.md index c7d80b553..7277ed3fd 100644 --- a/docs/how_to/quick_start_windows.md +++ b/docs/how_to/quick_start_windows.md @@ -7,7 +7,6 @@ The steps to install the HIP SDK for Windows are described in this document. The HIP SDK is supported on Windows 10 and 11. The HIP SDK may be installed on a system without AMD GPUs to use the build toolchains. To run HIP applications, a compatible GPU is required. Please see the supported GPU guide for more details. -TODO: provide link to supported GPU guide. ## SDK Installation @@ -24,8 +23,6 @@ Installation options are listed in {numref}`installation-components`. | **BitCode Profiler** | **Full** | | | ::: -TODO: describe each installation option. - ## HIP SDK Installer The AMD HIP SDK Installer manages the installation and uninstallation process of diff --git a/docs/how_to/tensorflow_install/tensorflow_install.md b/docs/how_to/tensorflow_install/tensorflow_install.md index 52660a7bb..3131f1379 100644 --- a/docs/how_to/tensorflow_install/tensorflow_install.md +++ b/docs/how_to/tensorflow_install/tensorflow_install.md @@ -114,14 +114,14 @@ To install TensorFlow using the wheels package, follow these steps: sudo apt install rocm-libs rccl ``` -6. Update protobuf to 3.19 or lower. +6. Update `protobuf` to 3.19 or lower. ```bash /usr/bin/python3.7 -m pip install protobuf=3.19.0 sudo pip3 install tensorflow ``` -7. Set the environment variable PYTHONPATH. +7. Set the environment variable `PYTHONPATH`. ```bash export PYTHONPATH="./.local/lib/python[version]/site-packages:$PYTHONPATH" #Use same python version as in step 2 @@ -140,7 +140,7 @@ To install TensorFlow using the wheels package, follow these steps: ``` :::{note} - For details on tensorflow-rocm wheels and ROCm version compatibility, see: + For details on `tensorflow-rocm` wheels and ROCm version compatibility, see: [https://github.com/ROCmSoftwarePlatform/tensorflow-upstream/blob/develop-upstream/rocm_docs/tensorflow-rocm-release.md](https://github.com/ROCmSoftwarePlatform/tensorflow-upstream/blob/develop-upstream/rocm_docs/tensorflow-rocm-release.md) ::: diff --git a/docs/kernel_userspace.md b/docs/kernel_userspace.md index 03b6e5e08..41d37e5ad 100644 --- a/docs/kernel_userspace.md +++ b/docs/kernel_userspace.md @@ -1 +1 @@ -# Kernel and Userspace Compatibility +# Kernel and User Space Compatibility diff --git a/docs/reference/all.md b/docs/reference/all.md index fc940b6d4..5fd77c129 100644 --- a/docs/reference/all.md +++ b/docs/reference/all.md @@ -32,7 +32,7 @@ ROCm template libraries for C++ primitives and algorithms are as follows: ::: :::{grid-item-card} [Communication Libraries](gpu_libraries/communication) -Inter and intra node communication is supported by the following projects: +Inter and intra-node communication is supported by the following projects: - [RCCL](https://rocmdocs.amd.com/projects/rccl/en/latest/) diff --git a/docs/reference/compilers.md b/docs/reference/compilers.md index 41d413c19..210442a5f 100644 --- a/docs/reference/compilers.md +++ b/docs/reference/compilers.md @@ -18,7 +18,7 @@ This is ROCgdb, the ROCm source-level debugger for Linux, based on GDB, the GNU ::: :::{grid-item-card} [ROCProfiler](https://rocmdocs.amd.com/projects/rocprofiler/en/latest/) -ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. HW specific low-level performance analysis interface for profiling of GPU compute applications. 
The profiling includes HW performance counters with complex performance metrics. +ROC profiler library. Profiling with performance counters and derived metrics. Library supports GFX8/GFX9. Hardware specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes hardware performance counters with complex performance metrics. - [Documentation](https://rocmdocs.amd.com/projects/rocprofiler/en/latest/) diff --git a/docs/reference/framework_compatibility/framework_compatibility.md b/docs/reference/framework_compatibility/framework_compatibility.md index 1798bd1b8..9bced739f 100644 --- a/docs/reference/framework_compatibility/framework_compatibility.md +++ b/docs/reference/framework_compatibility/framework_compatibility.md @@ -2,7 +2,7 @@ Pull content from . -Only the frameworks content. Link to kernel/userspace guide. +Only the frameworks content. Link to kernel/user space guide. Also pull content from diff --git a/docs/reference/gpu_arch.md b/docs/reference/gpu_arch.md index 4c94a6172..9fc6845fa 100644 --- a/docs/reference/gpu_arch.md +++ b/docs/reference/gpu_arch.md @@ -11,12 +11,12 @@ - [AMD RDNA Instruction Set Architecture](https://www.amd.com/system/files/TechDocs/rdna-shader-instruction-set-architecture.pdf) - [AMD GCN3 Instruction Set Architecture](https://www.amd.com/system/files/TechDocs/gcn3-instruction-set-architecture.pdf) -## Whitepapers +## White Papers -- [AMD CDNA™ 2 Architecture Whitepaper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf) -- [AMD CDNA Architecture Whitepaper](https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf) -- [AMD Vega Architecture Whitepaper](https://en.wikichip.org/w/images/a/a1/vega-whitepaper.pdf) -- [AMD RDNA Architecture Whitepaper](https://www.amd.com/system/files/documents/rdna-whitepaper.pdf) +- [AMD CDNA™ 2 Architecture White Paper](https://www.amd.com/system/files/documents/amd-cdna2-white-paper.pdf) +- [AMD CDNA Architecture White Paper](https://www.amd.com/system/files/documents/amd-cdna-whitepaper.pdf) +- [AMD Vega Architecture White Paper](https://en.wikichip.org/w/images/a/a1/vega-whitepaper.pdf) +- [AMD RDNA Architecture White Paper](https://www.amd.com/system/files/documents/rdna-whitepaper.pdf) ## Architecture Guides diff --git a/docs/reference/gpu_arch/mi100.md b/docs/reference/gpu_arch/mi100.md index 9e4da6b0a..10dfd2eb5 100644 --- a/docs/reference/gpu_arch/mi100.md +++ b/docs/reference/gpu_arch/mi100.md @@ -39,7 +39,7 @@ Fabric™ bridge for the AMD Instinct™ accelerators. The micro-architecture of the AMD Instinct accelerators is based on the AMD CDNA architecture, which targets compute applications such as high-performance computing (HPC) and AI & machine learning (ML) that run on everything from -individual servers to the world’s largest exascale supercomputers. The overall +individual servers to the world's largest exascale supercomputers. The overall system architecture is designed for extreme scalability and compute performance. :::{figure-md} mi100-block @@ -56,7 +56,7 @@ high-speed links (23 GT/sec, also at the bottom) to the other GPUs of the local hive as shown in {numref}`mi100-arch`. On the left and right of the floor plan, the High Bandwidth Memory (HBM) -attaches via the GPU’s memory controller. The MI100 generation of the AMD +attaches via the GPU's memory controller. The MI100 generation of the AMD Instinct accelerator offers four stacks of HBM generation 2 (HBM2) for a total of 32GB with a 4,096bit-wide memory interface. 
The peak memory bandwidth of the attached HBM2 is 1.228 TB/sec at a memory clock frequency of 1.2 GHz. @@ -66,7 +66,7 @@ Units (CU). There are a total 120 compute units that are physically organized into eight Shader Engines (SE) with fifteen compute units per shader engine. Each compute unit is further sub-divided into four SIMD units that process SIMD instructions of 16 data elements per instruction. This enables the CU to process -64 data elements (a so-called ‘wavefront’) at a peak clock frequency of 1.5 GHz. +64 data elements (a so-called 'wavefront') at a peak clock frequency of 1.5 GHz. Therefore, the theoretical maximum FP64 peak performance is 11.5 TFLOPS (`4 [SIMD units] x 16 [elements per instruction] x 120 [CU] x 1.5 [GHz]`). @@ -91,7 +91,7 @@ thus may affect execution performance. A wavefront can occupy any number of VGPRs from 0 to 256, directly affecting occupancy; that is, the number of concurrently active wavefronts in the CU. For -instance, with 119 VPGRs used, only two wavefronts can be active in the CU at +instance, with 119 VGPRs used, only two wavefronts can be active in the CU at the same time. With the instruction latency of four cycles per SIMD instruction, the occupancy should be as high as possible such that the compute unit can improve execution efficiency by scheduling instructions from multiple diff --git a/docs/reference/gpu_libraries/linear_algebra.md b/docs/reference/gpu_libraries/linear_algebra.md index 462e592b6..fe1321d4f 100644 --- a/docs/reference/gpu_libraries/linear_algebra.md +++ b/docs/reference/gpu_libraries/linear_algebra.md @@ -15,7 +15,7 @@ rocBLAS is an AMD GPU optimized library for BLAS. ::: :::{grid-item-card} [hipBLAS](https://rocmdocs.amd.com/projects/hipBLAS/en/develop/) -hipBLAS is a compatiblity layer for GPU accelerated BLAS optimized for AMD GPUs +hipBLAS is a compatibility layer for GPU accelerated BLAS optimized for AMD GPUs via rocBLAS and rocSOLVER. hipBLAS allows for a common interface for other GPU BLAS libraries. @@ -25,7 +25,7 @@ BLAS libraries. ::: :::{grid-item-card} [hipBLASLt](https://rocmdocs.amd.com/projects/hipBLASLt/en/develop/) -hipBLASLt is a library that provides general matrix-matrix operations with a flexible API and extends funtionalities beyond traditional BLAS library. hipBLASLt is exposed APIs in HIP programming language with an underlying optimized generator as a backend kernel provider. +hipBLASLt is a library that provides general matrix-matrix operations with a flexible API and extends functionalities beyond traditional BLAS library. hipBLASLt is exposed APIs in HIP programming language with an underlying optimized generator as a back-end kernel provider. - [Documentation](https://rocmdocs.amd.com/projects/hipBLASLt/en/develop/) - [Changelog](https://github.com/ROCmSoftwarePlatform/hipBLASLt/blob/develop/CHANGELOG.md) diff --git a/docs/reference/gpu_libraries/rand.md b/docs/reference/gpu_libraries/rand.md index 904bea6e0..52c5f935b 100644 --- a/docs/reference/gpu_libraries/rand.md +++ b/docs/reference/gpu_libraries/rand.md @@ -13,7 +13,7 @@ rocRAND is an AMD GPU optimized library for pseudo-random number generators (PRN ::: :::{grid-item-card} [hipRAND](https://rocmdocs.amd.com/projects/hipRAND/en/rtd/) -hipRAND is a compatiblity layer for GPU accelerated FFT optimized for AMD GPUs +hipRAND is a compatibility layer for GPU accelerated FFT optimized for AMD GPUs using rocFFT. hipFFT allows for a common interface for other non AMD GPU FFT libraries. 
diff --git a/docs/reference/kernel_userspace_compatibility/kernel_userspace_comp.md b/docs/reference/kernel_userspace_compatibility/kernel_userspace_comp.md index 70b97dce9..f706b7a34 100644 --- a/docs/reference/kernel_userspace_compatibility/kernel_userspace_comp.md +++ b/docs/reference/kernel_userspace_compatibility/kernel_userspace_comp.md @@ -1 +1 @@ -# Kernel Userspace Compatibility Reference +# Kernel User Space Compatibility Reference diff --git a/docs/reference/management_tools.md b/docs/reference/management_tools.md index 40fd14bc9..38e9872bf 100644 --- a/docs/reference/management_tools.md +++ b/docs/reference/management_tools.md @@ -17,7 +17,7 @@ provided by the AMD E-SMI inband library and the ROCm SMI GPU library to the Pro ::: :::{grid-item-card} [ROCm SMI](https://rocmdocs.amd.com/projects/rocmsmi/en/latest/) -This tool acts as a command line interface for manipulating and monitoring the amdgpu kernel, and is intended to replace and deprecate the existing rocm_smi.py CLI tool. It uses Ctypes to call the rocm_smi_lib API. +This tool acts as a command line interface for manipulating and monitoring the AMD GPU kernel, and is intended to replace and deprecate the existing `rocm_smi.py` CLI tool. It uses `ctypes` to call the `rocm_smi_lib` API. - [Documentation](https://rocmdocs.amd.com/projects/rocmsmi/en/latest/) - [Examples](https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/python_smi_tools) @@ -25,7 +25,7 @@ This tool acts as a command line interface for manipulating and monitoring the a ::: :::{grid-item-card} [ROCm Datacenter Tool](https://rocmdocs.amd.com/projects/rdc/en/latest/) -The ROCm™ Data Center Tool simplifies the administration and addresses key infrastructure challenges in AMD GPUs in cluster and datacenter environments. +The ROCm™ Data Center Tool simplifies the administration and addresses key infrastructure challenges in AMD GPUs in cluster and data center environments. - [Documentation](https://rocmdocs.amd.com/projects/rdc/en/latest/) - [Examples](https://github.com/RadeonOpenCompute/rdc/tree/master/example) diff --git a/docs/reference/openmp/openmp.md b/docs/reference/openmp/openmp.md index e5bc7f4ad..e632aab59 100644 --- a/docs/reference/openmp/openmp.md +++ b/docs/reference/openmp/openmp.md @@ -6,9 +6,9 @@ The ROCm™ installation includes an LLVM-based implementation that fully suppor ### Installation -The OpenMP toolchain is automatically installed as part of the standard ROCm installation and is available under /opt/rocm-{version}/llvm. The sub-directories are: +The OpenMP toolchain is automatically installed as part of the standard ROCm installation and is available under `/opt/rocm-{version}/llvm`. The sub-directories are: -bin: Compilers (flang and clang) and other binaries. +bin: Compilers (`flang` and `clang`) and other binaries. - examples: The usage section below shows how to compile and run these programs. @@ -36,13 +36,13 @@ The above invocation of Make compiles and runs the program. Note the options tha -fopenmp --offload-arch= ``` -Obtain the value of gpu-arch by running the following command: +Obtain the value of `gpu-arch` by running the following command: ```bash % /opt/rocm-{version}/bin/rocminfo | grep gfx ``` -[//]: # (dated link below, needs upading) +[//]: # (dated link below, needs updating) See the complete list of compiler command-line references [here](https://github.com/RadeonOpenCompute/llvm-project/blob/amd-stg-open/clang/docs/CommandGuide/clang.rst). 
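To make the flags above concrete, here is a minimal sketch of an offloaded loop together with a hypothetical compile line; the file name and the `gfx90a` architecture are placeholders, so substitute the value reported by `rocminfo | grep gfx`:

```cpp
// saxpy_omp.cpp -- illustrative OpenMP target-offload sketch (file name is hypothetical).
// Compile with the toolchain shipped under /opt/rocm-{version}/llvm, for example:
//   /opt/rocm-{version}/llvm/bin/clang++ -O2 -fopenmp --offload-arch=gfx90a saxpy_omp.cpp -o saxpy_omp
#include <cstdio>
#include <vector>

int main() {
  const int n = 1 << 20;
  std::vector<float> x(n, 1.0f), y(n, 2.0f);
  float *px = x.data(), *py = y.data();

  // The loop body is offloaded to the GPU named by --offload-arch.
  #pragma omp target teams distribute parallel for map(to: px[0:n]) map(tofrom: py[0:n])
  for (int i = 0; i < n; ++i)
    py[i] = 2.0f * px[i] + py[i];

  std::printf("y[0] = %f\n", py[0]); // expected: 4.000000
  return 0;
}
```

The resulting binary can then be run under `rocprof`, as described in the profiling workflow below.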
@@ -56,7 +56,7 @@ The following steps describe a typical workflow for using rocprof with OpenMP co % rocprof ``` - This produces a results.csv file in the user’s current directory that shows basic stats such as kernel names, grid size, number of registers used, etc. The user can choose to specify the preferred output file name using the o option. + This produces a `results.csv` file in the user’s current directory that shows basic stats such as kernel names, grid size, number of registers used, etc. The user can choose to specify the preferred output file name using the o option. 2. Add options for a detailed result: @@ -64,9 +64,9 @@ The following steps describe a typical workflow for using rocprof with OpenMP co --stats: % rocprof --stats ``` - The stats option produces timestamps for the kernels. Look into the output CSV file for the field, DurationNs, which is useful in getting an understanding of the critical kernels in the code. + The stats option produces timestamps for the kernels. Look into the output CSV file for the field, `Durations`, which is useful in getting an understanding of the critical kernels in the code. - Apart from --stats, the option --timestamp on produces a timestamp for the kernels. + Apart from `--stats`, the option `--timestamp` on produces a timestamp for the kernels. 3. After learning about the required kernels, the user can take a detailed look at each one of them. rocprof has support for hardware counters: a set of basic and a set of derived ones. See the complete list of counters using options --list-basic and --list-derived. rocprof accepts either a text or an XML file as an input. @@ -74,7 +74,7 @@ For more details on rocprof, refer to the ROCm Profiling Tools document on [http ### Using Tracing Options -**Prerequisite:** When using the --sys-trace option, compile the OpenMP program with: +**Prerequisite:** When using the `--sys-trace` option, compile the OpenMP program with: ```bash -Wl,–rpath,/opt/rocm-{version}/lib -lamdhip64 @@ -82,9 +82,9 @@ For more details on rocprof, refer to the ROCm Profiling Tools document on [http The following tracing options are widely used to generate useful information: -- **--hsa-trace**: This option is used to get a JSON output file with the HSA API execution traces and a flat profile in a CSV file. +- **`--hsa-trace`**: This option is used to get a JSON output file with the HSA API execution traces and a flat profile in a CSV file. -- **--sys-trace**: This allows programmers to trace both HIP and HSA calls. Since this option results in loading ``libamdhip64.so``, follow the prerequisite as mentioned above. +- **`--sys-trace`**: This allows programmers to trace both HIP and HSA calls. Since this option results in loading ``libamdhip64.so``, follow the prerequisite as mentioned above. A CSV and a JSON file are produced by the above trace options. The CSV file presents the data in a tabular format, and the JSON file can be visualized using Google Chrome at chrome://tracing/ or [Perfetto](https://perfetto.dev/). Navigate to Chrome or Perfetto and load the JSON file to see the timeline of the HSA calls. @@ -94,14 +94,14 @@ For more details on tracing, refer to the ROCm Profiling Tools document on [http :::{table} :widths: auto -| Environment Variable | Description | -| ----------- | ----------- | -| OMP_NUM_TEAMS | The implementation chooses the number of teams for kernel launch. The user can change this number for performance tuning using this environment variable, subject to implementation limits. 
| -| OMPX_DISABLE_MAPS | Under USM mode, the implementation automatically checks for correctness of the map clauses without performing any copying. The user can disable this check by setting this environment variable to 1. | -| LIBOMPTARGET_KERNEL_TRACE | This environment variable is used to print useful statistics for device operations. Setting it to 1 and running the program emits the name of every kernel launched, the number of teams and threads used, and the corresponding register usage. Setting it to 2 additionally emits timing information for kernel launches and data transfer operations between the host and the device. | -| LIBOMPTARGET_INFO | This environment variable is used to print informational messages from the device runtime as the program executes. Users can request fine-grain information by setting it to the value of 1 or higher and can set the value of -1 for complete information. | -| LIBOMPTARGET_DEBUG | If a debug version of the device library is present, setting this environment variable to 1 and using that library emits further detailed debugging information about data transfer operations and kernel launch. | -| GPU_MAX_HW_QUEUES | This environment variable is used to set the number of HSA queues in the OpenMP runtime. | +| Environment Variable | Description | +| --------------------------- | ----------- | +| `OMP_NUM_TEAMS` | The implementation chooses the number of teams for kernel launch. The user can change this number for performance tuning using this environment variable, subject to implementation limits. | +| `OMPX_DISABLE_MAPS` | Under USM mode, the implementation automatically checks for correctness of the map clauses without performing any copying. The user can disable this check by setting this environment variable to 1. | +| `LIBOMPTARGET_KERNEL_TRACE` | This environment variable is used to print useful statistics for device operations. Setting it to 1 and running the program emits the name of every kernel launched, the number of teams and threads used, and the corresponding register usage. Setting it to 2 additionally emits timing information for kernel launches and data transfer operations between the host and the device. | +| `LIBOMPTARGET_INFO` | This environment variable is used to print informational messages from the device runtime as the program executes. Users can request fine-grain information by setting it to the value of 1 or higher and can set the value of -1 for complete information. | +| `LIBOMPTARGET_DEBUG` | If a debug version of the device library is present, setting this environment variable to 1 and using that library emits further detailed debugging information about data transfer operations and kernel launch. | +| `GPU_MAX_HW_QUEUES` | This environment variable is used to set the number of HSA queues in the OpenMP runtime. | ::: ## OpenMP: Features @@ -110,9 +110,9 @@ The OpenMP programming model is greatly enhanced with the following new features ### Asynchronous Behavior in OpenMP Target Regions -- Multithreaded offloading on the same device +- Multi-threaded offloading on the same device -The libomptarget plugin for GPU offloading allows creation of separate configurable HSA queues per chiplet, which enables two or more threads to concurrently offload to the same device. +The `libomptarget` plugin for GPU offloading allows creation of separate configurable HSA queues per chiplet, which enables two or more threads to concurrently offload to the same device. 
- Parallel memory copy invocations @@ -146,7 +146,7 @@ xnack- with -–offload-arch=gfx908:xnack- #### Unified Shared Memory Pragma -This OpenMP pragma is available on MI200 through xnack+ support. +This OpenMP pragma is available on MI200 through `xnack+` support. ```bash omp requires unified_shared_memory @@ -192,20 +192,20 @@ The difference between the memory pages pointed to by these two variables is tha The OpenMP runtime in ROCm implements a subset of the OMPT device APIs, as described in the OpenMP specification document. These APIs allow first-party tools to examine the profile and kernel traces that execute on a device. A tool can register callbacks for data transfer and kernel dispatch entry points or use APIs to start and stop tracing for device-related activities such as data transfer and kernel dispatch timings and associated metadata. If device tracing is enabled, trace records for device activities are collected during program execution and returned to the tool using the APIs described in the specification. -The following example demonstrates how a tool uses the supported OMPT target APIs. The README in /opt/rocm/llvm/examples/tools/ompt outlines the steps to be followed, and the provided example can be run as shown below: +The following example demonstrates how a tool uses the supported OMPT target APIs. The `README` in `/opt/rocm/llvm/examples/tools/ompt` outlines the steps to be followed, and the provided example can be run as shown below: ```bash % cd $ROCM_PATH/share/openmp-extras/examples/tools/ompt/veccopy-ompt-target-tracing % make run ``` -The file veccopy-ompt-target-tracing.c simulates how a tool initiates device activity tracing. The file callbacks.h shows the callbacks registered and implemented by the tool. +The file `veccopy-ompt-target-tracing.c` simulates how a tool initiates device activity tracing. The file `callbacks.h` shows the callbacks registered and implemented by the tool. ### Floating Point Atomic Operations -The MI200-series GPUs support the generation of hardware floating-point atomics using the OpenMP atomic pragma. The support includes single- and double-precision floating-point atomic operations. The programmer must ensure that the memory subjected to the atomic operation is in coarse-grain memory by mapping it explicitly with the help of map clauses when not implicitly mapped by the compiler as per the [OpenMP specifications](https://www.openmp.org/specifications/). This makes these hardware floating-point atomic instructions “fast,” as they are faster than using a default compare-and-swap loop scheme, but at the same time “unsafe,” as they are not supported on fine-grain memory. The operation in unified_shared_memory mode also requires programmers to map the memory explicitly when not implicitly mapped by the compiler. +The MI200-series GPUs support the generation of hardware floating-point atomics using the OpenMP atomic pragma. The support includes single- and double-precision floating-point atomic operations. The programmer must ensure that the memory subjected to the atomic operation is in coarse-grain memory by mapping it explicitly with the help of map clauses when not implicitly mapped by the compiler as per the [OpenMP specifications](https://www.openmp.org/specifications/). This makes these hardware floating-point atomic instructions “fast,” as they are faster than using a default compare-and-swap loop scheme, but at the same time “unsafe,” as they are not supported on fine-grain memory. 
The operation in `unified_shared_memory` mode also requires programmers to map the memory explicitly when not implicitly mapped by the compiler. -To request fast floating-point atomic instructions at the file level, use compiler flag -munsafe-fp-atomics or a hint clause on a specific pragma: +To request fast floating-point atomic instructions at the file level, use compiler flag `-munsafe-fp-atomics` or a hint clause on a specific pragma: ```bash double a = 0.0; @@ -213,9 +213,9 @@ double a = 0.0; a = a + 1.0; ``` -NOTE AMD_unsafe_fp_atomics is an alias for AMD_fast_fp_atomics, and AMD_safe_fp_atomics is implemented with a compare-and-swap loop. +NOTE `AMD_unsafe_fp_atomics` is an alias for `AMD_fast_fp_atomics`, and `AMD_safe_fp_atomics` is implemented with a compare-and-swap loop. -To disable the generation of fast floating-point atomic instructions at the file level, build using the option -msafe-fp-atomics or use a hint clause on a specific pragma: +To disable the generation of fast floating-point atomic instructions at the file level, build using the option `-msafe-fp-atomics` or use a hint clause on a specific pragma: ```bash double a = 0.0; @@ -225,7 +225,7 @@ a = a + 1.0; The hint clause value always has a precedence over the compiler flag, which allows programmers to create atomic constructs with a different behavior than the rest of the file. -See the example below, where the user builds the program using -msafe-fp-atomics to select a file-wide “safe atomic” compilation. However, the fast atomics hint clause over variable “a” takes precedence and operates on “a” using a fast/unsafe floating-point atomic, while the variable “b” in the absence of a hint clause is operated upon using safe floating-point atomics as per the compiler flag. +See the example below, where the user builds the program using `-msafe-fp-atomics` to select a file-wide “safe atomic” compilation. However, the fast atomics hint clause over variable “a” takes precedence and operates on “a” using a fast/unsafe floating-point atomic, while the variable “b” in the absence of a hint clause is operated upon using safe floating-point atomics as per the compiler flag. ```bash double a = 0.0;. @@ -239,7 +239,7 @@ b = b + 1.0; ### Address Sanitizer (ASan) Tool -Address Sanitizer is a memory error detector tool utilized by applications to detect various errors ranging from spatial issues such as out-of-bound access to temporal issues such as use-after-free. The AOMP compiler supports ASan for AMDGPUs with applications written in both HIP and OpenMP. +Address Sanitizer is a memory error detector tool utilized by applications to detect various errors ranging from spatial issues such as out-of-bound access to temporal issues such as use-after-free. The AOMP compiler supports ASan for AMD GPUs with applications written in both HIP and OpenMP. **Features Supported on Host Platform (Target x86_64):** @@ -259,7 +259,7 @@ Address Sanitizer is a memory error detector tool utilized by applications to de - Initialization order bugs -**Features Supported on AMDGPU Platform (amdgcn-amd-amdhsa):** +**Features Supported on AMDGPU Platform (`amdgcn-amd-amdhsa`):** - Heap buffer overflow @@ -318,11 +318,11 @@ The No-loop kernel generation feature optimizes the compiler performance by gene To enable the generation of the specialized kernel, follow these guidelines: -- Do not specify teams, threads, and schedule-related environment variables. 
The num_teams or a thread_limit clause in an OpenMP target construct acts as an override and prevents the generation of the specialized kernel. As the user is unable to specify the number of teams and threads used within target regions in the absence of the above-mentioned environment variables, the runtime will select the best values for the launch configuration based on runtime knowledge of the program. +- Do not specify teams, threads, and schedule-related environment variables. The `num_teams` or a `thread_limit` clause in an OpenMP target construct acts as an override and prevents the generation of the specialized kernel. As the user is unable to specify the number of teams and threads used within target regions in the absence of the above-mentioned environment variables, the runtime will select the best values for the launch configuration based on runtime knowledge of the program. -- Assert the absence of the above-mentioned environment variables by adding the command-line option fopenmp-target-ignore-env-vars. This option also allows programmers to enable the No-loop functionality at lower optimization levels. +- Assert the absence of the above-mentioned environment variables by adding the command-line option `-fopenmp-target-ignore-env-vars`. This option also allows programmers to enable the No-loop functionality at lower optimization levels. -- Also, the No-loop functionality is automatically enabled when -O3 or -Ofast is used for compilation. To disable this feature, use -fno-openmp-target-ignore-env-vars. +- Also, the No-loop functionality is automatically enabled when `-O3` or `-Ofast` is used for compilation. To disable this feature, use `-fno-openmp-target-ignore-env-vars`. Note The compiler might not generate the No-loop kernel in certain scenarios where the performance improvement is not substantial. diff --git a/docs/reference/rocmcc/rocmcc.md b/docs/reference/rocmcc/rocmcc.md index 2fb1091c4..1f5db3c9a 100644 --- a/docs/reference/rocmcc/rocmcc.md +++ b/docs/reference/rocmcc/rocmcc.md @@ -11,10 +11,10 @@ The differences are listed in [the table below](rocm-llvm-vs-alt). :::{table} Differences between `rocm-llvm` and `rocm-llvm-alt` :name: rocm-llvm-vs-alt -| **rocm-llvm** | **rocm-llvm-alt** | -|:---------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------:| -| Installed by default when ROCm™ itself is installed | An optional package | -| Provides an open-source compiler | Provides an additional closed-source compiler for users interested in additional CPU optimizations not available in rocm-llvm | +| **`rocm-llvm`** | **`rocm-llvm-alt`** | +|:---------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------:| +| Installed by default when ROCm™ itself is installed | An optional package | +| Provides an open-source compiler | Provides an additional closed-source compiler for users interested in additional CPU optimizations not available in `rocm-llvm` | ::: For more details, see: @@ -30,25 +30,25 @@ ROCm currently provides two compiler interfaces for compiling HIP programs: - `/opt/rocm/bin/amdclang++` Both leverage the same LLVM compiler technology with the AMD GCN GPU support; -however, they offer a slightly different user experience. The hipcc command-line +however, they offer a slightly different user experience. 
The `hipcc` command-line interface aims to provide a more familiar user interface to users who are experienced in CUDA but relatively new to the ROCm/HIP development environment. -On the other hand, amdclang++ provides a user interface identical to the clang++ +On the other hand, `amdclang++` provides a user interface identical to the clang++ compiler. It is more suitable for experienced developers who want to directly interact with the clang compiler and gain full control of their application’s build process. -The major differences between hipcc and amdclang++ are listed below: +The major differences between `hipcc` and `amdclang++` are listed below: -::::{table} Differences between hipcc and amdclang++ +::::{table} Differences between `hipcc` and `amdclang++` :name: hipcc-vs-amdclang -| * | **hipcc** | **amdclang++** | -|:----------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|:--------------:| -| Compiling HIP source files | Treats all source files as HIP language source files | Enables the HIP language support for files with the “.hip” extension or through the -x hip compiler option | -| Detecting GPU architecture | Auto-detects the GPUs available on the system and generates code for those devices when no GPU architecture is specified | Has AMD GCN gfx803 as the default GPU architecture. The --offload-arch compiler option may be used to target other GPU architectures | -| Finding a HIP installation | Finds the HIP installation based on its own location and its knowledge about the ROCm directory structure | First looks for HIP under the same parent directory as its own LLVM directory and then falls back on /opt/rocm. Users can use the --rocm-path option to instruct the compiler to use HIP from the specified ROCm installation. | -| Linking to the HIP runtime library | Is configured to automatically link to the HIP runtime from the detected HIP installation | Requires the --hip-link flag to be specified to link to the HIP runtime. Alternatively, users can use the -l`` -lamdhip64 option to link to a HIP runtime library. | -| Device function inlining | Inlines all GPU device functions, which provide greater performance and compatibility for codes that contain file scoped or device function scoped `__shared__` variables. However, it may increase compile time. | Relies on inlining heuristics to control inlining. Users experiencing performance or compilation issues with code using file scoped or device function scoped `__shared__` variables could try -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false to work around the issue. There are plans to address these issues with future compiler improvements. | +| * | **`hipcc`** | **`amdclang++`** | +|:----------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|:----------------:| +| Compiling HIP source files | Treats all source files as HIP language source files | Enables the HIP language support for files with the `.hip` extension or through the `-x hip` compiler option | +| Detecting GPU architecture | Auto-detects the GPUs available on the system and generates code for those devices when no GPU architecture is specified | Has AMD GCN gfx803 as the default GPU architecture. 
The `--offload-arch` compiler option may be used to target other GPU architectures | +| Finding a HIP installation | Finds the HIP installation based on its own location and its knowledge about the ROCm directory structure | First looks for HIP under the same parent directory as its own LLVM directory and then falls back on `/opt/rocm`. Users can use the `--rocm-path` option to instruct the compiler to use HIP from the specified ROCm installation. | +| Linking to the HIP runtime library | Is configured to automatically link to the HIP runtime from the detected HIP installation | Requires the `--hip-link` flag to be specified to link to the HIP runtime. Alternatively, users can use the `-l -lamdhip64` option to link to a HIP runtime library. | +| Device function inlining | Inlines all GPU device functions, which provide greater performance and compatibility for codes that contain file scoped or device function scoped `__shared__` variables. However, it may increase compile time. | Relies on inlining heuristics to control inlining. Users experiencing performance or compilation issues with code using file scoped or device function scoped `__shared__` variables could try `-mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false` to work around the issue. There are plans to address these issues with future compiler improvements. | | Source code location | | | :::: @@ -115,8 +115,8 @@ This section outlines commonly used compiler flags for `hipcc` and `amdclang++`. The CPU compiler optimizations described in this chapter originate from the AMD Optimizing C/C++ Compiler (AOCC) compiler. They are available in ROCmCC if the -optional rocm-llvm-alt package is installed. The user’s interaction with the -compiler does not change once rocm-llvm-alt is installed. The user should use +optional `rocm-llvm-alt` package is installed. The user’s interaction with the +compiler does not change once `rocm-llvm-alt` is installed. The user should use the same compiler entry point, provided AMD provides high-performance compiler optimizations for Zen-based processors in AOCC. @@ -149,13 +149,13 @@ feasible, this optimization transforms the code to enable these improvements. This transformation is likely to improve cache utilization and memory bandwidth. It is expected to improve the scalability of programs executed on multiple cores. -This is effective only under `flto`, as the whole program analysis is required +This is effective only under `-flto`, as the whole program analysis is required to perform this optimization. Users can choose different levels of aggressiveness with which this optimization can be applied to the application, with 1 being the least aggressive and 7 being the most aggressive level. :::{table} -fstruct-layout Values and Their Effects -| -fstruct-layout value | Structure peeling | Pointer size after selective compression of self-referential pointers in structures, wherever safe | Type of structure fields eligible for compression | Whether compression performed under safety check | +| `-fstruct-layout` value | Structure peeling | Pointer size after selective compression of self-referential pointers in structures, wherever safe | Type of structure fields eligible for compression | Whether compression performed under safety check | | ----------- | ----------- | ----------- | ----------- | ----------- | | 1 | Enabled | NA | NA | NA | | 2 | Enabled | 32-bit | NA | NA | @@ -191,14 +191,14 @@ optimization, which is invoked as `-flto -fitodcallsbyclone`. 
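To show how these link-time options compose in practice, a hypothetical invocation (the source and output names are placeholders) could look like the following:

```bash
# Illustrative only: whole-program LTO combining structure peeling with
# indirect-to-direct call promotion from the rocm-llvm-alt (AOCC) optimizations.
clang -O3 -flto -fstruct-layout=3 -fitodcalls -fitodcallsbyclone app.c -o app
```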
#### `-fremap-arrays` Transforms the data layout of a single dimensional array to provide better cache -locality. This optimization is effective only under `flto`, as the whole program +locality. This optimization is effective only under `-flto`, as the whole program needs to be analyzed to perform this optimization, which can be invoked as `-flto -fremap-arrays`. #### `-finline-aggressive` Enables improved inlining capability through better heuristics. This -optimization is more effective when used with `flto`, as the whole program +optimization is more effective when used with `-flto`, as the whole program analysis is required to perform this optimization, which can be invoked as `-flto -finline-aggressive`. @@ -282,7 +282,7 @@ or factor of 16. This vectorization width of 16 may be overwritten by ##### `-enable-redundant-movs` -Removes any redundant mov operations including redundant loads from memory and +Removes any redundant `mov` operations including redundant loads from memory and stores to memory. This can be invoked using `-Wl,-plugin-opt=-enable-redundant-movs`. @@ -322,13 +322,13 @@ functions at call sites. | 4 | 10 | ::: -This is more effective with flto as the whole program needs to be analyzed to +This is more effective with `-flto` as the whole program needs to be analyzed to perform this optimization, which can be invoked as `-flto -inline-recursion=[1,2,3,4]`. ##### `-reduce-array-computations=[1,2,3]` -Performs array dataflow analysis and optimizes the unused array computations. +Performs array data flow analysis and optimizes the unused array computations. :::{table} -reduce-array-computations Values and Their Effects | -reduce-array-computations value | Array elements eligible for elimination of computations | @@ -338,7 +338,7 @@ Performs array dataflow analysis and optimizes the unused array computations. | 3 | Both unused and zero valued | ::: -This optimization is effective with flto as the whole program needs to be +This optimization is effective with `-flto` as the whole program needs to be analyzed to perform this optimization, which can be invoked as `-flto -reduce-array-computations=[1,2,3]`. @@ -352,7 +352,7 @@ vector operations. This option is set to **true** by default. Experimental flag for enabling vectorization on certain loops with complex control flow, which the normal vectorizer cannot handle. -This optimization is effective with flto as the whole program needs to be +This optimization is effective with `-flto` as the whole program needs to be analyzed to perform this optimization, which can be invoked as `-flto -region-vectorize`. @@ -423,12 +423,12 @@ This option is set to false by default. ##### `-Hz,1,0x1 [Fortran]` Helps to preserve array index information for array access expressions which get -linearized in the compiler frontend. The preserved information is used by the +linearized in the compiler front end. The preserved information is used by the compiler optimization phase in performing optimizations such as loop transformations. It is recommended that any user who is using optimizations such as loop transformations and other optimizations requiring de-linearized index expressions should use the Hz option. This option has no impact on any -other aspects of the Flang frontend. +other aspects of the Flang front end. ### Inline ASM Statements @@ -467,7 +467,7 @@ compiler. An LLVM library and tool that is used to query the execution capability of the current system as well as to query requirements of a binary file. 
It is used by OpenMP device runtime to ensure compatibility of an image with the current -system while loading it. It is compatible with TargetID support and multi-image +system while loading it. It is compatible with target ID support and multi-image fat binary support. **Usage:** @@ -478,7 +478,7 @@ offload-arch [Options] [Optional lookup-value] When used without an option, offload-arch prints the value of the first offload arch found in the underlying system. This can be used by various clang -frontends. For example, to compile for OpenMP offloading on your current system, +front ends. For example, to compile for OpenMP offloading on your current system, invoke clang with the following command: ```bash @@ -507,11 +507,11 @@ The options are listed below: ::: :::{option} -m - Prints device code name (often found in pci.ids file). + Prints device code name (often found in `pci.ids` file). ::: :::{option} -n - Prints numeric pci-id. + Prints numeric `pci-id`. ::: :::{option} -t @@ -530,12 +530,12 @@ The options are listed below: Prints offload capabilities of the underlying system. This option is used by the language runtime to select an image when multiple images are available. A capability must exist for each requirement of the selected image. ::: -There are symbolic link aliases amdgpu-offload-arch and nvidia-arch for -offload-arch. These aliases return 1 if no amdgcn GPU or cuda GPU is found. +There are symbolic link aliases `amdgpu-offload-arch` and `nvidia-arch` for +`offload-arch`. These aliases return 1 if no AMD GCN GPU or CUDA GPU is found. These aliases are useful in determining whether architecture-specific tests should be run or to conditionally load architecture-specific software. -#### Command-Line Simplification Using offload-arch Flag +#### Command-Line Simplification Using `offload-arch` Flag Legacy mechanism of specifying offloading target for OpenMP involves using three flags, `-fopenmp-targets`, `-Xopenmp-target`, and `-march`. The first two flags @@ -562,14 +562,14 @@ clang -fopenmp -target x86_64-linux-gnu \ ``` To ensure backward compatibility, both styles are supported. This option is -compatible with TargetID support and multi-image fat binaries. +compatible with target ID support and multi-image fat binaries. -#### TargetID Support for OpenMP +#### Target ID Support for OpenMP The ROCmCC compiler supports specification of target features along with the GPU name while specifying a target offload device in the command line, using `-march` or `--offload-arch` options. The compiled image in such cases is -specialized for a given configuration of device and target features (TargetID). +specialized for a given configuration of device and target features (target ID). **Example:** @@ -598,8 +598,8 @@ clang -fopenmp -target x86_64-linux-gnu \ -march=gfx908:sramecc+:xnack- helloworld.c -o helloworld ``` -The TargetID specified on the command line is passed to the clang driver using -target-feature flag, to the LLVM optimizer and backend using `-mattr` flag, and +The target ID specified on the command line is passed to the clang driver using +`target-feature` flag, to the LLVM optimizer and back end using `-mattr` flag, and to linker using `-plugin-opt=-mattr` flag. This feature is compatible with offload-arch command-line option and multi-image binaries for multiple architectures. @@ -609,14 +609,14 @@ architectures. The ROCmCC compiler is enhanced to generate binaries that can contain heterogenous images. 
This heterogeneity could be in terms of: -- Images of different architectures, like amdgcn and nvptx +- Images of different architectures, like AMD GCN and NVPTX - Images of same architectures but for different GPUs, like gfx906 and gfx908 - Images of same architecture and same GPU but for different target features, - like gfx908:xnack+ and gfx908:xnack- + like `gfx908:xnack+` and `gfx908:xnack-` An appropriate image is selected by the OpenMP device runtime for execution depending on the capability of the current system. This feature is compatible -with TargetID support and offload-arch command-line options and uses +with target ID support and offload-arch command-line options and uses offload-arch tool to determine capability of the current system. **Example:** @@ -660,7 +660,7 @@ capability of the current system. #### Unified Shared Memory (USM) The following OpenMP pragma is available on MI200, and it must be executed with -xnack+ support. +`xnack+` support. ```cpp omp requires unified_shared_memory @@ -674,6 +674,8 @@ refer to the OpenMP Support Guide at [https://docs.amd.com](https://docs.amd.com The following table lists the other Clang options and their support status. + + :::{table} Clang Options :name: clang-options :widths: auto @@ -1440,3 +1442,4 @@ The following table lists the other Clang options and their support status. |-x \|Supported|Assumes subsequent input files to have the given type \| |-z \|Supported|Passes -z \ to the linker| ::: + diff --git a/docs/release/docker_support_matrix.md b/docs/release/docker_support_matrix.md index 3c85419e0..ba67ee31c 100644 --- a/docs/release/docker_support_matrix.md +++ b/docs/release/docker_support_matrix.md @@ -4,9 +4,9 @@ The software support matrices for ROCm container releases is listed. ## ROCm 5.6 -### Pytorch +### PyTorch -#### Ubuntu+ rocm5.6_internal_testing+169530b +#### `Ubuntu+ rocm5.6_internal_testing +169530b` * [ROCm5.6](http://repo.radeon.com/rocm/apt/latest/) * [Python 3.8](https://www.python.org/downloads/release/python-380/) @@ -19,7 +19,7 @@ The software support matrices for ROCm container releases is listed. * [OMPI 4.0.3](https://github.com/open-mpi/ompi/tree/v4.0.3) * [OFED 5.4.3](http://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz) -#### Centos7+ rocm5.6_internal_testing + 169530b +#### `CentOS7+ rocm5.6_internal_testing +169530b` * [ROCm5.6](http://repo.radeon.com/rocm/apt/latest/) * [Python 3.8](https://www.python.org/downloads/release/python-380/) @@ -29,7 +29,7 @@ The software support matrices for ROCm container releases is listed. * [Tensorboard 2.12.0](https://github.com/tensorflow/tensorboard/tree/2.12.0) * [MAGMA](https://bitbucket.org/icl/magma/src/master/) -#### 1.13 + bfeb431 +#### `1.13 +bfeb431` * [ROCm5.6](http://repo.radeon.com/rocm/apt/latest/) * [Python 3.8](https://www.python.org/downloads/release/python-380/) @@ -42,7 +42,7 @@ The software support matrices for ROCm container releases is listed. * [OMPI 4.0.3](https://github.com/open-mpi/ompi/tree/v4.0.3) * [OFED 5.4.3](http://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz) -#### 1.12+05d5d04 +#### `1.12 +05d5d04` * [ROCm5.6](http://repo.radeon.com/rocm/apt/latest/) * [Python 3.8](https://www.python.org/downloads/release/python-380/) @@ -57,31 +57,31 @@ The software support matrices for ROCm container releases is listed. 
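For context, the containers listed above are typically launched with the GPU devices passed through to Docker. The following is a sketch only; the `rocm/pytorch:latest` image tag is an assumption, so pick the published tag that matches the matrix entry you need:

```bash
# Illustrative only: run a ROCm PyTorch container with access to the GPU devices.
docker pull rocm/pytorch:latest
docker run -it \
  --device=/dev/kfd --device=/dev/dri \
  --group-add video \
  --security-opt seccomp=unconfined \
  rocm/pytorch:latest
```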
### TensorFlow -#### tensorflow_develop-upstream-QA-rocm56 +c88a9f4 +#### `tensorflow_develop-upstream-QA-rocm56 +c88a9f4` * [ROCm5.6](http://repo.radeon.com/rocm/apt/latest/) * [Python 3.9](https://www.python.org/downloads/release/python-390/) -* Tensorflow-rocm 2.13.0 +* `tensorflow-rocm` 2.13.0 * [OFED 5.3](http://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz) * [OMPI 4.0.7](https://github.com/open-mpi/ompi/tree/v4.0.7) * [Horovod 0.27.0](https://github.com/horovod/horovod/tree/v0.27.0) * [Tensorboard 2.12.0](https://github.com/tensorflow/tensorboard/tree/2.12.0) -#### r2.11-rocm-enhanced + 5be4141 +#### `r2.11-rocm-enhanced +5be4141` * [ROCm5.6](http://repo.radeon.com/rocm/apt/latest/) * [Python 3.9](https://www.python.org/downloads/release/python-390/) -* Tensorflow-rocm 2.11.0 +* `tensorflow-rocm` 2.11.0 * [OFED 5.3](http://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz) * [OMPI 4.0.7](https://github.com/open-mpi/ompi/tree/v4.0.7) * [Horovod 0.27.0](https://github.com/horovod/horovod/tree/v0.27.0) * [Tensorboard 2.11.2](https://github.com/tensorflow/tensorboard/tree/2.11.2) -#### r2.10-rocm-enhanced +72789a3 +#### `r2.10-rocm-enhanced +72789a3` * [ROCm5.6](http://repo.radeon.com/rocm/apt/latest/) * [Python 3.9](https://www.python.org/downloads/release/python-390/) -* Tensorflow-rocm 2.10.1 +* `tensorflow-rocm` 2.10.1 * [OFED 5.3](http://content.mellanox.com/ofed/MLNX_OFED-5.3-1.0.5.0/MLNX_OFED_LINUX-5.3-1.0.5.0-ubuntu20.04-x86_64.tgz) * [OMPI 4.0.7](https://github.com/open-mpi/ompi/tree/v4.0.7) * [Horovod 0.27.0](https://github.com/horovod/horovod/tree/v0.27.0) diff --git a/docs/release/licensing.md b/docs/release/licensing.md index 2ff866378..eefa7a8c3 100644 --- a/docs/release/licensing.md +++ b/docs/release/licensing.md @@ -7,6 +7,7 @@ additional licenses. Please review individual repositories for more information. The table shows ROCm components, the name of license and link to the license terms. The table is ordered to follow ROCm's manifest file. + | Component | License | |:------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------------:| | [ROCK-Kernel-Driver](https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/) | [GPL 2.0 WITH Linux-syscall-note](https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/blob/master/COPYING) | diff --git a/docs/rocm_stack.md b/docs/rocm_stack.md index 1930717aa..24fcae7a9 100644 --- a/docs/rocm_stack.md +++ b/docs/rocm_stack.md @@ -9,7 +9,7 @@ components described in this page. Kernel mo ## Compiler -### hipcc +### `hipcc` ### AMD Clang @@ -17,9 +17,9 @@ components described in this page. Kernel mo ### Math Libraries -The Math libraries are grouped into libraries starting with a roc-prefix and +The Math libraries are grouped into libraries starting with a `roc`-prefix and hip-prefix. Libraries starting with a hip-prefix provide a support for AMD GPUs -and NVIDIA GPUs. Libraries beginning the roc-prefix support AMD GPUs only. +and NVIDIA GPUs. Libraries beginning the `roc`-prefix support AMD GPUs only. 
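As a sketch of what the prefix distinction means in code, the example below uses the hipBLAS `hipblasSaxpy` entry point (the corresponding `roc`-prefixed call would be `rocblas_saxpy`); the header path may differ between ROCm versions:

```cpp
// Illustrative only: the hip-prefixed BLAS layer targets AMD GPUs via rocBLAS
// and NVIDIA GPUs via cuBLAS behind the same interface.
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <cstdio>
#include <vector>

int main() {
  const int n = 1024;
  const float alpha = 2.0f;
  std::vector<float> x(n, 1.0f), y(n, 3.0f);

  float *dx = nullptr, *dy = nullptr;
  hipMalloc(&dx, n * sizeof(float));
  hipMalloc(&dy, n * sizeof(float));
  hipMemcpy(dx, x.data(), n * sizeof(float), hipMemcpyHostToDevice);
  hipMemcpy(dy, y.data(), n * sizeof(float), hipMemcpyHostToDevice);

  hipblasHandle_t handle;
  hipblasCreate(&handle);
  hipblasSaxpy(handle, n, &alpha, dx, 1, dy, 1);  // y = alpha * x + y
  hipblasDestroy(handle);

  hipMemcpy(y.data(), dy, n * sizeof(float), hipMemcpyDeviceToHost);
  std::printf("y[0] = %f\n", y[0]);               // expected: 5.000000
  hipFree(dx);
  hipFree(dy);
  return 0;
}
```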
## #Compute Primitives diff --git a/docs/understand/file_reorg.md b/docs/understand/file_reorg.md index 06840bfad..a4f482375 100644 --- a/docs/understand/file_reorg.md +++ b/docs/understand/file_reorg.md @@ -84,7 +84,7 @@ wrapper header files in its old location for backward compatibility. Wrapper header files are placed in the old location ( `/opt/rocm-xxx//include`) with a warning message to include files -from the new location (/opt/rocm-xxx/include) as shown in the example below. +from the new location (`/opt/rocm-xxx/include`) as shown in the example below. ```cpp #pragma message "This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with hip." @@ -103,7 +103,7 @@ follows ### Executable files -Executable files are available in the /opt/rocm-xxx/bin folder. For backward +Executable files are available in the `/opt/rocm-xxx/bin` folder. For backward compatibility, the old library location (`/opt/rocm-xxx//bin`) has a soft link to the library at the new location. Soft links will be removed in a future release, tentatively ROCm v6.0. @@ -148,13 +148,13 @@ correct header file and use correct search paths. 1. `#include` needs to be changed to `#include ` - For eg: `#include ` needs to change + For example: `#include ` needs to change to `#include ` -2. Any variable in cmake or makefiles pointing to component folder needs to +2. Any variable in CMake or Makefiles pointing to component folder needs to changed. - For eg: `VAR1=/opt/rocm/hip` needs to be changed to `VAR1=/opt/rocm` + For example: `VAR1=/opt/rocm/hip` needs to be changed to `VAR1=/opt/rocm` `VAR2=/opt/rocm/hsa` needs to be changed to `VAR2=/opt/rocm` 3. Any reference to `/opt/rocm//bin` or `/opt/rocm//lib` diff --git a/docs/understand/installing_linux.md b/docs/understand/installing_linux.md index 08c220909..a8c82969e 100644 --- a/docs/understand/installing_linux.md +++ b/docs/understand/installing_linux.md @@ -9,15 +9,15 @@ installation/uninstallation of ROCm on the various Linux distributions. ```{note} The rest of this document refers to _Radeon™ Software for Linux_ as the AMDGPU -stack and _amdgpu-dkms_ driver as the kernel-mode driver. +stack and `amdgpu-dkms` driver as the kernel-mode driver. ``` The guide provides instructions for the following: - Kernel-mode driver installation -- ROCm single-version and multiversion installation +- ROCm single-version and multi-version installation - ROCm and kernel-mode driver version upgrade -- ROCm single-version and multiversion uninstallation +- ROCm single-version and multi-version uninstallation - Kernel-mode driver uninstallation ## Installation Methods @@ -47,12 +47,12 @@ terms of this agreement, do not install, copy or use the AQL Profiler and/or the AOCC CPU Optimizations. ``` -Acces the EULA agreement at: +Access the EULA agreement at: For the rest of the ROCm packages, you can find the licensing information at the following location: `/opt/rocm/share/doc//` -For example, you can fetch the licensing information of the _amd_comgr_ +For example, you can fetch the licensing information of the `_amd_comgr_` component (Code Object Manager) from the `amd_comgr` folder. 
A file named `LICENSE.txt` contains the license details at: `/opt/rocm-5.4.3/share/doc/amd_comgr/LICENSE.txt` @@ -74,11 +74,11 @@ The `amdgpu-install` script streamlines the installation process by: the required packages - Installing multiple ROCm releases simultaneously on a system - Automating updating local repository information through enhanced - functionality of the amdgpu-install script + functionality of the `amdgpu-install` script - Performing post-install checks to verify whether the installation was completed successfully - Upgrading the installed ROCm release -- Uninstalling the installed single-version or multiversion ROCm releases +- Uninstalling the installed single-version or multi-version ROCm releases ```{tip} The installer script is provided for convenience. It doesn't do anything the @@ -110,7 +110,7 @@ For more information, refer to the How to Install ROCm section in this guide. ## Installation types -This section discusses the single-version and multiversion installation of the +This section discusses the single-version and multi-version installation of the ROCm software stack. ### Single-version Installation @@ -122,7 +122,7 @@ The single-version ROCm installation refers to the following: ### Multi-version Installation -The multiversion installation refers to the following: +The multi-version installation refers to the following: - Installation of multiple instances of the ROCm stack on a system. Extending the package name and its dependencies with the release version adds the @@ -134,7 +134,7 @@ Multiversion install is not available for the AMDGPU stack. ``` The following image demonstrates the difference between single-version and -multiversion ROCm installation types: +multi-version ROCm installation types: ```{figure-md} install-types diff --git a/docs/understand/installing_linux/package_manager_integration.md b/docs/understand/installing_linux/package_manager_integration.md index 668724f0f..5e8e71e61 100644 --- a/docs/understand/installing_linux/package_manager_integration.md +++ b/docs/understand/installing_linux/package_manager_integration.md @@ -17,7 +17,7 @@ support a specific use case. All meta-packages exist in both versioned and non-versioned forms. - Non-versioned packages – For a single-version installation of the ROCm stack -- Versioned packages – For multiversion installations of the ROCm stack +- Versioned packages – For multi-version installations of the ROCm stack ```{figure-md} package-naming @@ -26,7 +26,7 @@ All meta-packages exist in both versioned and non-versioned forms. ROCm Release Package Naming ``` -{numref}`package-naming` demonstrates the single and multiversion ROCm packages' naming +{numref}`package-naming` demonstrates the single and multi-version ROCm packages' naming structure, including examples for various Linux distributions. See terms below: _Module_ - It is the part of the package that represents the name of the ROCm @@ -40,16 +40,16 @@ should increase with a newer release. _Release version_ - It shows the ROCm release version when the package was released. -**Example:** 50400 points to the ROCm 5.4.0 release. +**Example:** `50400` points to the ROCm 5.4.0 release. -_Build id_ - It represents the jenkins build number for that release. +_Build id_ - It represents the Jenkins build number for that release. _Arch_ - It shows the architecture for which the package was created. _Distro_ - It describes the distribution for which the package was created. It is valid only for rpm packages. 
-**Example:** el8 represents RHEL 8.x packages. +**Example:** `el8` represents RHEL 8.x packages. ## Components of ROCm Programming Models @@ -59,9 +59,9 @@ of required packages and libraries. **Example:** -- rocm-hip-runtime is used to deploy on supported machines to execute HIP +- `rocm-hip-runtime` is used to deploy on supported machines to execute HIP applications. -- rocm-hip-sdk contains runtime components to deploy and execute HIP +- `rocm-hip-sdk` contains runtime components to deploy and execute HIP applications. ```{figure-md} meta-packages @@ -72,7 +72,7 @@ ROCm Meta Packages ``` ```{note} -_rocm-llvm_ is not a meta-package but a single package that installs the ROCm +`rocm-llvm` is not a meta-package but a single package that installs the ROCm clang compiler files. ``` @@ -80,18 +80,18 @@ clang compiler files. :name: meta-package-desc | **Meta-packages** | **Description** | |:--------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------:| -| **rocm-language-runtime** | installs the ROCm runtime | -| **rocm-hip-runtime** | installs packages necessary to run an application written in HIP for the AMD platform | -| **rocm-opencl-runtime** | installs packages required to run OpenCL-based applications on the AMD platform | -| **rocm-hip-runtime-devel** | contains packages to develop an application on HIP or port it from CUDA | -| **rocm-opencl-sdk** | installs packages required to develop applications in OpenCL for the AMD platform | -| **rocm-hip-libraries** | installs HIP libraries optimized for AMD platforms | -| **rocm-hip-sdk** | installs packages necessary to develop/port applications using HIP and libraries for AMD platforms | -| **rocm-developer-tools** | installs packages required to debug and profile HIP-based applications | -| **rocm-ml-sdk** | installs packages necessary to develop and run Machine Learning applications with Machine Learning primitives optimized for AMD platforms | -| **rocm-ml-libraries** | installs packages for key Machine Learning libraries, specifically MIOpen | -| **rocm-openmp-sdk** | installs packages necessary to develop OpenMP-based applications for AMD platforms | -| **rocm-openmp-runtime** | installs packages necessary to run OpenMP-based applications for AMD platforms | +| `rocm-language-runtime` | installs the ROCm runtime | +| `rocm-hip-runtime` | installs packages necessary to run an application written in HIP for the AMD platform | +| `rocm-opencl-runtime` | installs packages required to run OpenCL-based applications on the AMD platform | +| `rocm-hip-runtime-devel` | contains packages to develop an application on HIP or port it from CUDA | +| `rocm-opencl-sdk` | installs packages required to develop applications in OpenCL for the AMD platform | +| `rocm-hip-libraries` | installs HIP libraries optimized for AMD platforms | +| `rocm-hip-sdk` | installs packages necessary to develop/port applications using HIP and libraries for AMD platforms | +| `rocm-developer-tools` | installs packages required to debug and profile HIP-based applications | +| `rocm-ml-sdk` | installs packages necessary to develop and run Machine Learning applications with Machine Learning primitives optimized for AMD platforms | +| `rocm-ml-libraries` | installs packages for key Machine Learning libraries, specifically MIOpen | +| `rocm-openmp-sdk` | installs packages necessary to develop OpenMP-based applications for AMD platforms | +| `rocm-openmp-runtime` | installs 
packages necessary to run OpenMP-based applications for AMD platforms | ``` ## Packages in ROCm Programming Models @@ -108,7 +108,7 @@ Associated Packages ``` - Meta-packages can include another meta-package. -- rocm-core package is common across all the meta-packages. +- `rocm-core` package is common across all the meta-packages. - Meta-packages and associated packages are represented in the same color. ```{note}