Skip to content

Commit aeb2067

Browse files
committed
[core] Make poll(Wait) not hang after bad command submission.
Add `wgpu_core::device::Device::last_successful_submission_index`, which records the fence value that `Maintain::Wait` should actually wait for. See comments for details. Fixes #5969.
1 parent 2bc328c commit aeb2067

File tree

3 files changed

+87
-21
lines changed

3 files changed

+87
-21
lines changed

tests/tests/poll.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,37 @@ static WAIT_OUT_OF_ORDER: GpuTestConfiguration =
125125
.await
126126
.panic_on_timeout();
127127
});
128+
129+
/// Submit a command buffer to the wrong device. A wait poll shouldn't hang.
130+
///
131+
/// We can't catch panics on Wasm, since they get reported directly to the
132+
/// console.
133+
#[gpu_test]
134+
static WAIT_AFTER_BAD_SUBMISSION: GpuTestConfiguration = GpuTestConfiguration::new()
135+
.parameters(wgpu_test::TestParameters::default().skip(wgpu_test::FailureCase::webgl2()))
136+
.run_async(wait_after_bad_submission);
137+
138+
async fn wait_after_bad_submission(ctx: TestingContext) {
139+
let (device2, queue2) =
140+
wgpu_test::initialize_device(&ctx.adapter, ctx.device_features, ctx.device_limits.clone())
141+
.await;
142+
143+
let command_buffer1 = ctx
144+
.device
145+
.create_command_encoder(&CommandEncoderDescriptor::default())
146+
.finish();
147+
148+
// This should panic, since the command buffer belongs to the wrong
149+
// device, and queue submission errors seem to be fatal errors?
150+
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
151+
queue2.submit([command_buffer1]);
152+
}));
153+
assert!(result.is_err());
154+
155+
// This should not hang.
156+
//
157+
// Specifically, the failed submission should not cause a new fence value to
158+
// be allocated that will not be signalled until further work is
159+
// successfully submitted, causing a greater fence value to be signalled.
160+
device2.poll(wgpu::Maintain::Wait);
161+
}

wgpu-core/src/device/queue.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1069,7 +1069,7 @@ impl Global {
10691069
let fence = fence_guard.as_mut().unwrap();
10701070
let submit_index = device
10711071
.active_submission_index
1072-
.fetch_add(1, Ordering::Relaxed)
1072+
.fetch_add(1, Ordering::SeqCst)
10731073
+ 1;
10741074
let mut active_executions = Vec::new();
10751075

@@ -1392,6 +1392,11 @@ impl Global {
13921392
)
13931393
.map_err(DeviceError::from)?;
13941394
}
1395+
1396+
// Advance the successful submission index.
1397+
device
1398+
.last_successful_submission_index
1399+
.fetch_max(submit_index, Ordering::SeqCst);
13951400
}
13961401

13971402
profiling::scope!("cleanup");

wgpu-core/src/device/resource.rs

Lines changed: 47 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,27 @@ pub struct Device<A: HalApi> {
8888
label: String,
8989

9090
pub(crate) command_allocator: command::CommandAllocator<A>,
91+
92+
/// The index of the last command submission that was attempted.
93+
///
94+
/// Note that `fence` may never be signalled with this value, if the command
95+
/// submission failed. If you need to wait for everything running on a
96+
/// `Queue` to complete, wait for [`last_successful_submission_index`].
97+
///
98+
/// [`last_successful_submission_index`]: Device::last_successful_submission_index
9199
pub(crate) active_submission_index: hal::AtomicFenceValue,
100+
101+
/// The index of the last successful submission to this device's
102+
/// [`hal::Queue`].
103+
///
104+
/// Unlike [`active_submission_index`], which is incremented each time
105+
/// submission is attempted, this is updated only when submission succeeds,
106+
/// so waiting for this value won't hang waiting for work that was never
107+
/// submitted.
108+
///
109+
/// [`active_submission_index`]: Device::active_submission_index
110+
pub(crate) last_successful_submission_index: hal::AtomicFenceValue,
111+
92112
// NOTE: if both are needed, the `snatchable_lock` must be consistently acquired before the
93113
// `fence` lock to avoid deadlocks.
94114
pub(crate) fence: RwLock<Option<A::Fence>>,
@@ -257,6 +277,7 @@ impl<A: HalApi> Device<A> {
257277
label: desc.label.to_string(),
258278
command_allocator,
259279
active_submission_index: AtomicU64::new(0),
280+
last_successful_submission_index: AtomicU64::new(0),
260281
fence: RwLock::new(rank::DEVICE_FENCE, Some(fence)),
261282
snatchable_lock: unsafe { SnatchLock::new(rank::DEVICE_SNATCHABLE_LOCK) },
262283
valid: AtomicBool::new(true),
@@ -387,37 +408,41 @@ impl<A: HalApi> Device<A> {
387408
profiling::scope!("Device::maintain");
388409

389410
let fence = fence_guard.as_ref().unwrap();
390-
let last_done_index = if maintain.is_wait() {
391-
let index_to_wait_for = match maintain {
392-
wgt::Maintain::WaitForSubmissionIndex(submission_index) => {
393-
// We don't need to check to see if the queue id matches
394-
// as we already checked this from inside the poll call.
395-
submission_index.index
396-
}
397-
_ => self.active_submission_index.load(Ordering::Relaxed),
398-
};
399-
unsafe {
411+
412+
// Determine which submission index `maintain` represents.
413+
let submission_index = match maintain {
414+
wgt::Maintain::WaitForSubmissionIndex(submission_index) => {
415+
// We don't need to check to see if the queue id matches
416+
// as we already checked this from inside the poll call.
417+
submission_index.index
418+
}
419+
wgt::Maintain::Wait => self
420+
.last_successful_submission_index
421+
.load(Ordering::Acquire),
422+
wgt::Maintain::Poll => unsafe {
400423
self.raw
401424
.as_ref()
402425
.unwrap()
403-
.wait(fence, index_to_wait_for, CLEANUP_WAIT_MS)
426+
.get_fence_value(fence)
404427
.map_err(DeviceError::from)?
405-
};
406-
index_to_wait_for
407-
} else {
428+
},
429+
};
430+
431+
// If necessary, wait for that submission to complete.
432+
if maintain.is_wait() {
408433
unsafe {
409434
self.raw
410435
.as_ref()
411436
.unwrap()
412-
.get_fence_value(fence)
437+
.wait(fence, submission_index, CLEANUP_WAIT_MS)
413438
.map_err(DeviceError::from)?
414-
}
415-
};
416-
log::info!("Device::maintain: last done index {last_done_index}");
439+
};
440+
}
441+
log::info!("Device::maintain: waiting for submission index {submission_index}");
417442

418443
let mut life_tracker = self.lock_life();
419444
let submission_closures =
420-
life_tracker.triage_submissions(last_done_index, &self.command_allocator);
445+
life_tracker.triage_submissions(submission_index, &self.command_allocator);
421446

422447
life_tracker.triage_mapped();
423448

@@ -3586,7 +3611,9 @@ impl<A: HalApi> Device<A> {
35863611
/// Wait for idle and remove resources that we can, before we die.
35873612
pub(crate) fn prepare_to_die(&self) {
35883613
self.pending_writes.lock().as_mut().unwrap().deactivate();
3589-
let current_index = self.active_submission_index.load(Ordering::Relaxed);
3614+
let current_index = self
3615+
.last_successful_submission_index
3616+
.load(Ordering::Acquire);
35903617
if let Err(error) = unsafe {
35913618
let fence = self.fence.read();
35923619
let fence = fence.as_ref().unwrap();

0 commit comments

Comments
 (0)