From ece29b6e689e40299322b553f8d99b9239111775 Mon Sep 17 00:00:00 2001 From: teoxoy <28601907+teoxoy@users.noreply.github.com> Date: Wed, 2 Apr 2025 15:11:19 +0200 Subject: [PATCH] [D3D12/VK] add OOM check on submit and poll that will lose the device if we are over 95% of our budget --- wgpu-core/src/device/global.rs | 2 ++ wgpu-core/src/device/queue.rs | 2 ++ wgpu-core/src/device/resource.rs | 13 +++++++++ wgpu-hal/src/dx12/device.rs | 29 ++++++++++++++++++++ wgpu-hal/src/dynamic/device.rs | 6 +++++ wgpu-hal/src/gles/device.rs | 4 +++ wgpu-hal/src/lib.rs | 2 ++ wgpu-hal/src/metal/device.rs | 6 +++++ wgpu-hal/src/noop/mod.rs | 4 +++ wgpu-hal/src/vulkan/adapter.rs | 7 +++++ wgpu-hal/src/vulkan/device.rs | 45 +++++++++++++++++++++++++++++++- 11 files changed, 119 insertions(+), 1 deletion(-) diff --git a/wgpu-core/src/device/global.rs b/wgpu-core/src/device/global.rs index e9db7d5697..f48574267b 100644 --- a/wgpu-core/src/device/global.rs +++ b/wgpu-core/src/device/global.rs @@ -1977,6 +1977,8 @@ impl Global { let fence = device.fence.read(); let maintain_result = device.maintain(fence, poll_type, snatch_guard); + device.lose_if_oom(); + // Some deferred destroys are scheduled in maintain so run this right after // to avoid holding on to them until the next device poll. device.deferred_resource_destruction(); diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index 21b8b5e62c..746b0ddc56 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -1361,6 +1361,8 @@ impl Queue { // the closures should execute with nothing locked! 
callbacks.fire(); + self.device.lose_if_oom(); + api_log!("Queue::submit returned submit index {submit_index}"); Ok(submit_index) diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs index b715fa828d..4f3c5ab797 100644 --- a/wgpu-core/src/device/resource.rs +++ b/wgpu-core/src/device/resource.rs @@ -361,6 +361,19 @@ impl Device { } } + /// Checks that we are operating within the memory budget reported by the native APIs. + /// + /// If we are not, the device gets invalidated. + /// + /// The budget might fluctuate over the lifetime of the application, so it should be checked + /// somewhat frequently. + pub fn lose_if_oom(&self) { + let _ = self + .raw() + .check_if_oom() + .map_err(|e| self.handle_hal_error(e)); + } + pub fn handle_hal_error(&self, error: hal::DeviceError) -> DeviceError { match error { hal::DeviceError::OutOfMemory diff --git a/wgpu-hal/src/dx12/device.rs b/wgpu-hal/src/dx12/device.rs index 1417fe1a85..edb0f064a6 100644 --- a/wgpu-hal/src/dx12/device.rs +++ b/wgpu-hal/src/dx12/device.rs @@ -2320,4 +2320,33 @@ impl crate::Device for super::Device { bytemuck::bytes_of(&Desc::wrap(temp)).to_vec() } + + fn check_if_oom(&self) -> Result<(), crate::DeviceError> { + let info = self + .shared + .adapter + .query_video_memory_info(Dxgi::DXGI_MEMORY_SEGMENT_GROUP_LOCAL)?; + + // Make sure we don't exceed 95% of the budget + if info.CurrentUsage >= info.Budget / 100 * 95 { + return Err(crate::DeviceError::OutOfMemory); + } + + if matches!( + self.shared.private_caps.memory_architecture, + super::MemoryArchitecture::NonUnified + ) { + let info = self + .shared + .adapter + .query_video_memory_info(Dxgi::DXGI_MEMORY_SEGMENT_GROUP_NON_LOCAL)?; + + // Make sure we don't exceed 95% of the budget + if info.CurrentUsage >= info.Budget / 100 * 95 { + return Err(crate::DeviceError::OutOfMemory); + } + } + + Ok(()) + } } diff --git a/wgpu-hal/src/dynamic/device.rs b/wgpu-hal/src/dynamic/device.rs index 76cd47c5ce..fd7c10f254 100644 --- 
a/wgpu-hal/src/dynamic/device.rs +++ b/wgpu-hal/src/dynamic/device.rs @@ -171,6 +171,8 @@ pub trait DynDevice: DynResource { fn get_internal_counters(&self) -> wgt::HalCounters; fn generate_allocator_report(&self) -> Option<wgt::AllocatorReport>; + + fn check_if_oom(&self) -> Result<(), DeviceError>; } impl<D: Device + DynResource> DynDevice for D { @@ -563,4 +565,8 @@ impl<D: Device + DynResource> DynDevice for D { fn generate_allocator_report(&self) -> Option<wgt::AllocatorReport> { D::generate_allocator_report(self) } + + fn check_if_oom(&self) -> Result<(), DeviceError> { + D::check_if_oom(self) + } } diff --git a/wgpu-hal/src/gles/device.rs b/wgpu-hal/src/gles/device.rs index cc23b981f2..c5539eae35 100644 --- a/wgpu-hal/src/gles/device.rs +++ b/wgpu-hal/src/gles/device.rs @@ -1622,6 +1622,10 @@ impl crate::Device for super::Device { fn get_internal_counters(&self) -> wgt::HalCounters { self.counters.as_ref().clone() } + + fn check_if_oom(&self) -> Result<(), crate::DeviceError> { + Ok(()) + } } #[cfg(send_sync)] diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs index 444d896802..0e9c972eee 100644 --- a/wgpu-hal/src/lib.rs +++ b/wgpu-hal/src/lib.rs @@ -1020,6 +1020,8 @@ pub trait Device: WasmNotSendSync { fn generate_allocator_report(&self) -> Option<wgt::AllocatorReport> { None } + + fn check_if_oom(&self) -> Result<(), DeviceError>; } pub trait Queue: WasmNotSendSync { diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs index 8bc3595605..4516286d5d 100644 --- a/wgpu-hal/src/metal/device.rs +++ b/wgpu-hal/src/metal/device.rs @@ -1601,4 +1601,10 @@ impl crate::Device for super::Device { fn get_internal_counters(&self) -> wgt::HalCounters { self.counters.as_ref().clone() } + + fn check_if_oom(&self) -> Result<(), crate::DeviceError> { + // TODO: see https://github.com/gfx-rs/wgpu/issues/7460 + + Ok(()) + } } diff --git a/wgpu-hal/src/noop/mod.rs b/wgpu-hal/src/noop/mod.rs index 929ff329fc..f91fc49948 100644 --- a/wgpu-hal/src/noop/mod.rs +++ b/wgpu-hal/src/noop/mod.rs @@ -457,4 +457,8 @@ impl crate::Device for Context { fn 
get_internal_counters(&self) -> wgt::HalCounters { Default::default() } + + fn check_if_oom(&self) -> DeviceResult<()> { + Ok(()) + } } diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs index 38a5b80a57..d38f26de74 100644 --- a/wgpu-hal/src/vulkan/adapter.rs +++ b/wgpu-hal/src/vulkan/adapter.rs @@ -1062,6 +1062,13 @@ impl PhysicalDeviceProperties { extensions.push(ext::external_memory_dma_buf::NAME); } + // Optional `VK_EXT_memory_budget` + if self.supports_extension(ext::memory_budget::NAME) { + extensions.push(ext::memory_budget::NAME); + } else { + log::warn!("VK_EXT_memory_budget is not available.") + } + // Require `VK_KHR_draw_indirect_count` if the associated feature was requested // Even though Vulkan 1.2 has promoted the extension to core, we must require the extension to avoid // large amounts of spaghetti involved with using PhysicalDeviceVulkan12Features. diff --git a/wgpu-hal/src/vulkan/device.rs b/wgpu-hal/src/vulkan/device.rs index 9cd0fa36c9..9e5ccc174f 100644 --- a/wgpu-hal/src/vulkan/device.rs +++ b/wgpu-hal/src/vulkan/device.rs @@ -10,7 +10,7 @@ use std::{ }; use arrayvec::ArrayVec; -use ash::{khr, vk}; +use ash::{ext, khr, vk}; use hashbrown::hash_map::Entry; use parking_lot::Mutex; @@ -2872,6 +2872,49 @@ impl crate::Device for super::Device { }; bytemuck::bytes_of(&temp).to_vec() } + + fn check_if_oom(&self) -> Result<(), crate::DeviceError> { + if !self + .shared + .enabled_extensions + .contains(&ext::memory_budget::NAME) + { + return Ok(()); + } + + let get_physical_device_properties = self + .shared + .instance + .get_physical_device_properties + .as_ref() + .unwrap(); + + let mut memory_budget_properties = vk::PhysicalDeviceMemoryBudgetPropertiesEXT::default(); + + let mut memory_properties = + vk::PhysicalDeviceMemoryProperties2::default().push_next(&mut memory_budget_properties); + + unsafe { + get_physical_device_properties.get_physical_device_memory_properties2( + self.shared.physical_device, + &mut 
memory_properties, + ); + } + + let memory_properties = memory_properties.memory_properties; + + for i in 0..memory_properties.memory_heap_count { + let heap_usage = memory_budget_properties.heap_usage[i as usize]; + let heap_budget = memory_budget_properties.heap_budget[i as usize]; + + // Make sure we don't exceed 95% of the budget + if heap_usage >= heap_budget / 100 * 95 { + return Err(crate::DeviceError::OutOfMemory); + } + } + + Ok(()) + } } impl super::DeviceShared {