Update training Docker docs for Primus 25.10 (#5737)

(cherry picked from commit fb644412d5)
This commit is contained in:
peterjunpark
2025-12-04 09:08:00 -05:00
committed by peterjunpark
parent 48ca38b0dc
commit 8647ebcf76
18 changed files with 4158 additions and 560 deletions

View File

@@ -1,21 +1,17 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/megatron-lm:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: aab4234
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/megatron-lm:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
docker:
pull_tag: rocm/primus:v25.10
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components:
ROCm: 7.1.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.10.0.dev20251112+rocm7.1
Python: "3.10"
Transformer Engine: 2.4.0.dev0+32e2d1d4
Flash Attention: 2.8.3
hipBLASLt: 1.2.0-09ab7153e2
Triton: 3.4.0
RCCL: 2.27.7
model_groups:
- group: Meta Llama
tag: llama

View File

@@ -0,0 +1,53 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/megatron-lm:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: aab4234
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/megatron-lm:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/megatron-lm/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.3 70B
mad_tag: pyt_megatron_lm_train_llama-3.3-70b
- model: Llama 3.1 8B
mad_tag: pyt_megatron_lm_train_llama-3.1-8b
- model: Llama 3.1 70B
mad_tag: pyt_megatron_lm_train_llama-3.1-70b
- model: Llama 2 7B
mad_tag: pyt_megatron_lm_train_llama-2-7b
- model: Llama 2 70B
mad_tag: pyt_megatron_lm_train_llama-2-70b
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3 (proxy)
mad_tag: pyt_megatron_lm_train_deepseek-v3-proxy
- model: DeepSeek-V2-Lite
mad_tag: pyt_megatron_lm_train_deepseek-v2-lite-16b
- group: Mistral AI
tag: mistral
models:
- model: Mixtral 8x7B
mad_tag: pyt_megatron_lm_train_mixtral-8x7b
- model: Mixtral 8x22B (proxy)
mad_tag: pyt_megatron_lm_train_mixtral-8x22b-proxy
- group: Qwen
tag: qwen
models:
- model: Qwen 2.5 7B
mad_tag: pyt_megatron_lm_train_qwen2.5-7b
- model: Qwen 2.5 72B
mad_tag: pyt_megatron_lm_train_qwen2.5-72b

View File

@@ -0,0 +1,65 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/primus:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/primus:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.3 70B
mad_tag: primus_pyt_megatron_lm_train_llama-3.3-70b
config_name: llama3.3_70B-pretrain.yaml
- model: Llama 3.1 70B
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-70b
config_name: llama3.1_70B-pretrain.yaml
- model: Llama 3.1 8B
mad_tag: primus_pyt_megatron_lm_train_llama-3.1-8b
config_name: llama3.1_8B-pretrain.yaml
- model: Llama 2 7B
mad_tag: primus_pyt_megatron_lm_train_llama-2-7b
config_name: llama2_7B-pretrain.yaml
- model: Llama 2 70B
mad_tag: primus_pyt_megatron_lm_train_llama-2-70b
config_name: llama2_70B-pretrain.yaml
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek-V3 (proxy)
mad_tag: primus_pyt_megatron_lm_train_deepseek-v3-proxy
config_name: deepseek_v3-pretrain.yaml
- model: DeepSeek-V2-Lite
mad_tag: primus_pyt_megatron_lm_train_deepseek-v2-lite-16b
config_name: deepseek_v2_lite-pretrain.yaml
- group: Mistral AI
tag: mistral
models:
- model: Mixtral 8x7B
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x7b
config_name: mixtral_8x7B_v0.1-pretrain.yaml
- model: Mixtral 8x22B (proxy)
mad_tag: primus_pyt_megatron_lm_train_mixtral-8x22b-proxy
config_name: mixtral_8x22B_v0.1-pretrain.yaml
- group: Qwen
tag: qwen
models:
- model: Qwen 2.5 7B
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-7b
config_name: primus_qwen2.5_7B-pretrain.yaml
- model: Qwen 2.5 72B
mad_tag: primus_pyt_megatron_lm_train_qwen2.5-72b
config_name: qwen2.5_72B-pretrain.yaml

View File

@@ -0,0 +1,39 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/primus:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/primus:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: primus_pyt_train_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
config_file:
bf16: "./llama3_8b_fsdp_bf16.toml"
fp8: "./llama3_8b_fsdp_fp8.toml"
- model: Llama 3.1 70B
mad_tag: primus_pyt_train_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
precision: BF16
config_file:
bf16: "./llama3_70b_fsdp_bf16.toml"
fp8: "./llama3_70b_fsdp_fp8.toml"

View File

@@ -0,0 +1,186 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/pytorch-training:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: aab4234
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/pytorch-training:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 4 Scout 17B-16E
mad_tag: pyt_train_llama-4-scout-17b-16e
model_repo: Llama-4-17B_16E
url: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.3 70B
mad_tag: pyt_train_llama-3.3-70b
model_repo: Llama-3.3-70B
url: https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 3.2 1B
mad_tag: pyt_train_llama-3.2-1b
model_repo: Llama-3.2-1B
url: https://huggingface.co/meta-llama/Llama-3.2-1B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 3B
mad_tag: pyt_train_llama-3.2-3b
model_repo: Llama-3.2-3B
url: https://huggingface.co/meta-llama/Llama-3.2-3B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3.2 Vision 11B
mad_tag: pyt_train_llama-3.2-vision-11b
model_repo: Llama-3.2-Vision-11B
url: https://huggingface.co/meta-llama/Llama-3.2-11B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.2 Vision 90B
mad_tag: pyt_train_llama-3.2-vision-90b
model_repo: Llama-3.2-Vision-90B
url: https://huggingface.co/meta-llama/Llama-3.2-90B-Vision
precision: BF16
training_modes: [finetune_fw]
- model: Llama 3.1 8B
mad_tag: pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
training_modes: [pretrain, finetune_fw, finetune_lora, HF_pretrain]
- model: Llama 3.1 70B
mad_tag: pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
precision: BF16
training_modes: [pretrain, finetune_fw, finetune_lora]
- model: Llama 3.1 405B
mad_tag: pyt_train_llama-3.1-405b
model_repo: Llama-3.1-405B
url: https://huggingface.co/meta-llama/Llama-3.1-405B
precision: BF16
training_modes: [finetune_qlora]
- model: Llama 3 8B
mad_tag: pyt_train_llama-3-8b
model_repo: Llama-3-8B
url: https://huggingface.co/meta-llama/Meta-Llama-3-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 3 70B
mad_tag: pyt_train_llama-3-70b
model_repo: Llama-3-70B
url: https://huggingface.co/meta-llama/Meta-Llama-3-70B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 7B
mad_tag: pyt_train_llama-2-7b
model_repo: Llama-2-7B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora, finetune_qlora]
- model: Llama 2 13B
mad_tag: pyt_train_llama-2-13b
model_repo: Llama-2-13B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Llama 2 70B
mad_tag: pyt_train_llama-2-70b
model_repo: Llama-2-70B
url: https://github.com/meta-llama/llama-models/tree/main/models/llama2
precision: BF16
training_modes: [finetune_lora, finetune_qlora]
- group: OpenAI
tag: openai
models:
- model: GPT OSS 20B
mad_tag: pyt_train_gpt_oss_20b
model_repo: GPT-OSS-20B
url: https://huggingface.co/openai/gpt-oss-20b
precision: BF16
training_modes: [HF_finetune_lora]
- model: GPT OSS 120B
mad_tag: pyt_train_gpt_oss_120b
model_repo: GPT-OSS-120B
url: https://huggingface.co/openai/gpt-oss-120b
precision: BF16
training_modes: [HF_finetune_lora]
- group: Qwen
tag: qwen
models:
- model: Qwen 3 8B
mad_tag: pyt_train_qwen3-8b
model_repo: Qwen3-8B
url: https://huggingface.co/Qwen/Qwen3-8B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Qwen 3 32B
mad_tag: pyt_train_qwen3-32b
model_repo: Qwen3-32
url: https://huggingface.co/Qwen/Qwen3-32B
precision: BF16
training_modes: [finetune_lora]
- model: Qwen 2.5 32B
mad_tag: pyt_train_qwen2.5-32b
model_repo: Qwen2.5-32B
url: https://huggingface.co/Qwen/Qwen2.5-32B
precision: BF16
training_modes: [finetune_lora]
- model: Qwen 2.5 72B
mad_tag: pyt_train_qwen2.5-72b
model_repo: Qwen2.5-72B
url: https://huggingface.co/Qwen/Qwen2.5-72B
precision: BF16
training_modes: [finetune_lora]
- model: Qwen 2 1.5B
mad_tag: pyt_train_qwen2-1.5b
model_repo: Qwen2-1.5B
url: https://huggingface.co/Qwen/Qwen2-1.5B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- model: Qwen 2 7B
mad_tag: pyt_train_qwen2-7b
model_repo: Qwen2-7B
url: https://huggingface.co/Qwen/Qwen2-7B
precision: BF16
training_modes: [finetune_fw, finetune_lora]
- group: Stable Diffusion
tag: sd
models:
- model: Stable Diffusion XL
mad_tag: pyt_huggingface_stable_diffusion_xl_2k_lora_finetuning
model_repo: SDXL
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
precision: BF16
training_modes: [posttrain-p]
- group: Flux
tag: flux
models:
- model: FLUX.1-dev
mad_tag: pyt_train_flux
model_repo: Flux
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [posttrain-p]
- group: NCF
tag: ncf
models:
- model: NCF
mad_tag: pyt_ncf_training
model_repo:
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
precision: FP32

View File

@@ -1,22 +1,15 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/primus:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/primus:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
docker:
pull_tag: rocm/primus:v25.10
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components:
ROCm: 7.1.0
PyTorch: 2.10.0.dev20251112+rocm7.1
Python: "3.10"
Transformer Engine: 2.4.0.dev0+32e2d1d4
Flash Attention: 2.8.3
hipBLASLt: 1.2.0-09ab7153e2
Triton: 3.4.0
RCCL: 2.27.7
model_groups:
- group: Meta Llama
tag: llama

View File

@@ -1,39 +1,32 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/primus:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/primus:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
docker:
pull_tag: rocm/primus:v25.10
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components:
ROCm: 7.1.0
PyTorch: 2.10.0.dev20251112+rocm7.1
Python: "3.10"
Transformer Engine: 2.4.0.dev0+32e2d1d4
Flash Attention: 2.8.3
hipBLASLt: 1.2.0-09ab7153e2
model_groups:
- group: Meta Llama
tag: llama
models:
- model: Llama 3.1 8B
mad_tag: primus_pyt_train_llama-3.1-8b
model_repo: meta-llama/Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
config_file:
bf16: "./llama3_8b_fsdp_bf16.toml"
fp8: "./llama3_8b_fsdp_fp8.toml"
- model: Llama 3.1 70B
mad_tag: primus_pyt_train_llama-3.1-70b
model_repo: meta-llama/Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
precision: BF16
config_file:
bf16: "./llama3_70b_fsdp_bf16.toml"
fp8: "./llama3_70b_fsdp_fp8.toml"
- model: Llama 3.1 8B
mad_tag: primus_pyt_train_llama-3.1-8b
model_repo: Llama-3.1-8B
url: https://huggingface.co/meta-llama/Llama-3.1-8B
precision: BF16
- model: Llama 3.1 70B
mad_tag: primus_pyt_train_llama-3.1-70b
model_repo: Llama-3.1-70B
url: https://huggingface.co/meta-llama/Llama-3.1-70B
precision: BF16
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek V2 16B
mad_tag: primus_pyt_train_deepseek-v2
model_repo: DeepSeek-V2
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
precision: BF16

View File

@@ -1,21 +1,15 @@
dockers:
MI355X and MI350X:
pull_tag: rocm/pytorch-training:v25.9_gfx950
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components: &docker_components
ROCm: 7.0.0
Primus: aab4234
PyTorch: 2.9.0.dev20250821+rocm7.0.0.lw.git125803b7
Python: "3.10"
Transformer Engine: 2.2.0.dev0+54dd2bdc
Flash Attention: 2.8.3
hipBLASLt: 911283acd1
Triton: 3.4.0+rocm7.0.0.git56765e8c
RCCL: 2.26.6
MI325X and MI300X:
pull_tag: rocm/pytorch-training:v25.9_gfx942
docker_hub_url: https://hub.docker.com/layers/rocm/pytorch-training/v25.9_gfx942/images/sha256-df6ab8f45b4b9ceb100fb24e19b2019a364e351ee3b324dbe54466a1d67f8357
components: *docker_components
docker:
pull_tag: rocm/primus:v25.10
docker_hub_url: https://hub.docker.com/layers/rocm/primus/v25.9_gfx950/images/sha256-1a198be32f49efd66d0ff82066b44bd99b3e6b04c8e0e9b36b2c481e13bff7b6
components:
ROCm: 7.1.0
Primus: 0.3.0
Primus Turbo: 0.1.1
PyTorch: 2.10.0.dev20251112+rocm7.1
Python: "3.10"
Transformer Engine: 2.4.0.dev0+32e2d1d4
Flash Attention: 2.8.3
hipBLASLt: 1.2.0-09ab7153e2
model_groups:
- group: Meta Llama
tag: llama
@@ -119,6 +113,15 @@ model_groups:
url: https://huggingface.co/openai/gpt-oss-120b
precision: BF16
training_modes: [HF_finetune_lora]
- group: DeepSeek
tag: deepseek
models:
- model: DeepSeek V2 16B
mad_tag: primus_pyt_train_deepseek-v2
model_repo: DeepSeek-V2
url: https://huggingface.co/deepseek-ai/DeepSeek-V2
precision: BF16
training_modes: [pretrain]
- group: Qwen
tag: qwen
models:
@@ -166,7 +169,7 @@ model_groups:
model_repo: SDXL
url: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
precision: BF16
training_modes: [posttrain-p]
training_modes: [posttrain]
- group: Flux
tag: flux
models:
@@ -175,12 +178,20 @@ model_groups:
model_repo: Flux
url: https://huggingface.co/black-forest-labs/FLUX.1-dev
precision: BF16
training_modes: [posttrain-p]
training_modes: [posttrain]
- group: NCF
tag: ncf
models:
- model: NCF
mad_tag: pyt_ncf_training
model_repo:
url: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF
url: https://github.com/ROCm/FluxBenchmark
precision: FP32
- group: DLRM
tag: dlrm
models:
- model: DLRM v2
mad_tag: pyt_train_dlrm
model_repo: DLRM
url: https://github.com/AMD-AGI/DLRMBenchmark
training_modes: [pretrain]