From 5fc5bb5237421dbbe0658001e4685cef931bb692 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Wed, 30 Jul 2025 22:15:18 +0300 Subject: [PATCH] ci: clear processes (#11434) * unified hcq_smi for managment * fix * fix * no reset for amd --- .github/workflows/benchmark.yml | 12 ++-- extra/hcq/hcq_smi.py | 122 ++++++++++++++++++++++++++++++++ extra/nvpci/gpu_reset.sh | 2 - extra/nvpci/nv_smi.py | 65 ----------------- 4 files changed, 129 insertions(+), 72 deletions(-) create mode 100755 extra/hcq/hcq_smi.py delete mode 100755 extra/nvpci/gpu_reset.sh delete mode 100755 extra/nvpci/nv_smi.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 99befc1269..8285aa0270 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -649,10 +649,10 @@ jobs: steps: - name: Checkout Code uses: actions/checkout@v4 - - name: Remove amdgpu - run: sudo rmmod amdgpu || true - - name: Cleanup running AM processes - run: python extra/amdpci/am_smi.py --pids --kill + - name: Remove amd modules + run: ./extra/hcq/hcq_smi.py amd rmmod + - name: Kill stale pids + run: ./extra/hcq/hcq_smi.py amd kill_pids - name: Symlink models and datasets run: | mkdir -p weights @@ -716,7 +716,9 @@ jobs: - name: Checkout Code uses: actions/checkout@v4 - name: Remove nv modules - run: ./extra/nvpci/nv_smi.py rmmod + run: ./extra/hcq/hcq_smi.py nv rmmod + - name: Kill stale pids + run: ./extra/hcq/hcq_smi.py nv kill_pids - name: Symlink models and datasets run: | mkdir -p weights diff --git a/extra/hcq/hcq_smi.py b/extra/hcq/hcq_smi.py new file mode 100755 index 0000000000..e3e8ce95dc --- /dev/null +++ b/extra/hcq/hcq_smi.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 + +from tinygrad.runtime.support.system import System +import argparse, glob, os, re, time, subprocess, sys + +def scan_devs_based_on_lock(prefix:str, args) -> list[str]: + target_dev = args.pci_bus if 'pci_bus' in args.__dir__() else "" + + devs = 
[] + for dev in glob.glob(f'/tmp/{prefix}_*.lock'): + dev_id = dev[8:-5] + if os.path.exists(f"/sys/bus/pci/devices/{dev_id}") and dev_id.startswith(target_dev): devs.append(dev_id) + return devs + +def _do_reset_device(pci_bus): System.pci_reset(pci_bus) +def _is_module_loaded(name: str) -> bool: return os.path.isdir(f"/sys/module/{name}") + +def cmd_remove_module(args): + modules = ["nvidia_drm", "nvidia_modeset", "nvidia_uvm", "nvidia"] if args.backend == "nv" else ["amdgpu"] + to_unload = [m for m in modules if _is_module_loaded(m)] + if not to_unload: print("Kernel modules are not loaded") + else: + print("Removing kernel modules:", ", ".join(to_unload)) + try: subprocess.run(["sudo", "modprobe", "-r", *to_unload], check=True) + except subprocess.CalledProcessError as e: + print("Failed to unload all modules — they may be in use.", file=sys.stderr) + sys.exit(e.returncode) + +def cmd_insert_module(args): + cmd_remove_module(args) + cmd_reset_devices(args) + + module = "nvidia" if args.backend == "nv" else "amdgpu" + if _is_module_loaded(module): + print(f"{module} kernel module already loaded") + return + + print(f"Inserting kernel module: {module}") + if args.backend == "nv": + subprocess.run(["nvidia-smi"], check=True) + elif args.backend == "amd": + subprocess.run(["sudo", "modprobe", "amdgpu"], check=True) + +def cmd_reset_devices(args): + devs = scan_devs_based_on_lock({"amd":"am", "nv":"nv"}[args.backend], args) + + for dev in devs: + print(f"Resetting device {dev}") + if args.backend != "amd": _do_reset_device(dev) + time.sleep(0.2) + +def cmd_show_pids(args): + devs = scan_devs_based_on_lock(prefix:={"amd":"am", "nv":"nv"}[args.backend], args) + + for dev in devs: + try: + pid = subprocess.check_output(['sudo', 'lsof', f'/tmp/{prefix}_{dev}.lock']).decode('utf-8').strip().split('\n')[1].split()[1] + print(f"{dev}: {pid}") + except subprocess.CalledProcessError: print(f"{dev}: No processes found using this device") + +def cmd_kill_pids(args): + devs = 
scan_devs_based_on_lock(prefix:={"amd":"am", "nv":"nv"}[args.backend], args)
+
+  for dev in devs:
+    for i in range(128):
+      if i > 0: time.sleep(0.2)
+
+      try:
+        try: pid = subprocess.check_output(['sudo', 'lsof', f'/tmp/{prefix}_{dev}.lock']).decode('utf-8').strip().split('\n')[1].split()[1]
+        except subprocess.CalledProcessError: break
+
+        print(f"Killing process {pid} (which uses {dev})")
+        subprocess.run(['sudo', 'kill', '-9', pid], check=True)
+      except subprocess.CalledProcessError as e:
+        print(f"Failed to kill process for device {dev}: {e}", file=sys.stderr)
+
+def add_common_commands(parent_subparsers):
+  p_insmod = parent_subparsers.add_parser("insmod", help="Insert a kernel module")
+  p_insmod.set_defaults(func=cmd_insert_module)
+
+  p_rmmod = parent_subparsers.add_parser("rmmod", help="Remove a kernel module")
+  p_rmmod.set_defaults(func=cmd_remove_module)
+
+  p_reset = parent_subparsers.add_parser("reset", help="Reset a device")
+  p_reset.add_argument("--pci_bus", default="", help="PCI bus ID of the device to reset")
+  p_reset.set_defaults(func=cmd_reset_devices)
+
+  p_reset = parent_subparsers.add_parser("pids", help="Show pids of processes using the device")
+  p_reset.add_argument("--pci_bus", default="", help="PCI bus ID of the device")
+  p_reset.set_defaults(func=cmd_show_pids)
+
+  p_reset = parent_subparsers.add_parser("kill_pids", help="Kill pids of processes using the device")
+  p_reset.add_argument("--pci_bus", default="", help="PCI bus ID of the device")
+  p_reset.set_defaults(func=cmd_kill_pids)
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser()
+  
backend_subparsers = parser.add_subparsers(dest="backend", required=True, metavar="{nv,amd}", help="Hardware backend to target") + + nv_parser = backend_subparsers.add_parser("nv", help="NVIDIA GPUs") + nv_commands = nv_parser.add_subparsers(dest="command", required=True) + add_common_commands(nv_commands) + + amd_parser = backend_subparsers.add_parser("amd", help="AMD GPUs") + amd_commands = amd_parser.add_subparsers(dest="command", required=True) + add_common_commands(amd_commands) + + args = parser.parse_args() + if args.command is None: + parser.print_help(sys.stderr) + sys.exit(1) + + args.func(args) diff --git a/extra/nvpci/gpu_reset.sh b/extra/nvpci/gpu_reset.sh deleted file mode 100755 index a76103675d..0000000000 --- a/extra/nvpci/gpu_reset.sh +++ /dev/null @@ -1,2 +0,0 @@ -GPU="$1" -echo 1 | sudo tee /sys/bus/pci/devices/$GPU/reset 2>/dev/null diff --git a/extra/nvpci/nv_smi.py b/extra/nvpci/nv_smi.py deleted file mode 100755 index 8491d56d1e..0000000000 --- a/extra/nvpci/nv_smi.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 - -from tinygrad.runtime.support.system import System -import argparse, glob, os, re, time, subprocess, sys - -def scan_devs_based_on_lock(prefix:str) -> list[str]: - devs = [] - for dev in glob.glob(f'/tmp/{prefix}_*.lock'): - dev_id = dev[8:-5] - if os.path.exists(f"/sys/bus/pci/devices/{dev_id}"): devs.append(dev_id) - return devs - -def _do_reset_device(pci_bus): System.pci_reset(pci_bus) -def _is_module_loaded(name: str) -> bool: return os.path.isdir(f"/sys/module/{name}") - -def cmd_remove_module(args): - to_unload = [m for m in ["nvidia_drm", "nvidia_modeset", "nvidia_uvm", "nvidia"] if _is_module_loaded(m)] - if not to_unload: - print("NVIDIA kernel modules are not loaded") - else: - print("Removing NVIDIA kernel modules:", ", ".join(to_unload)) - try: subprocess.run(["sudo", "modprobe", "-r", *to_unload], check=True) - except subprocess.CalledProcessError as e: - print("Failed to unload all modules — they may be in 
use.", file=sys.stderr) - sys.exit(e.returncode) - -def cmd_insert_module(args): - cmd_remove_module(args) - cmd_reset_devices(args) - - if not os.path.exists("/sys/module/nvidia"): - print("Inserting nvidia kernel module") - subprocess.run(["nvidia-smi"], check=True) - else: print("Nvidia kernel module already loaded") - -def cmd_reset_devices(args): - devs = scan_devs_based_on_lock("nv") - dev_to_reset = args.pci_bus if 'pci_bus' in args.__dir__() else "" - - for dev in devs: - if dev.startswith(dev_to_reset): - print(f"Resetting device {dev}") - _do_reset_device(dev) - time.sleep(0.2) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - subparsers = parser.add_subparsers(required=True, dest="cmd") - - parser_insmod = subparsers.add_parser('insmod', help='Insert a nvidia kernel module') - parser_insmod.set_defaults(func=cmd_insert_module) - - parser_rmmod = subparsers.add_parser('rmmod', help='Remove a nvidia kernel module') - parser_rmmod.set_defaults(func=cmd_remove_module) - - parser_reset = subparsers.add_parser('reset', help='Reset a nvidia device') - parser_reset.add_argument('--pci_bus', type=str, default="", help='PCI bus ID of the device to reset') - parser_reset.set_defaults(func=cmd_reset_devices) - - args = parser.parse_args() - if args.cmd is None: - parser.print_help(sys.stderr) - sys.exit(1) - - args.func(args)