Skip to content

Early frees on CPU Implementations #3193

@teoxoy

Description

@teoxoy

Found in #3174 (comment)
Related: #3031, #2285

Description
Running the repro below on the DX12 backend produces the following errors:

[ERROR wgpu_hal::auxil::dxgi::exception] ID3D12CommandAllocator::Reset: A command allocator 0x000001FD96C262F0:'Unnamed ID3D12CommandAllocator Object' is being reset before previous executions associated with the allocator have completed. [ EXECUTION ERROR #552: COMMAND_ALLOCATOR_SYNC]
[ERROR wgpu_hal::auxil::dxgi::exception] ID3D12Resource2::<final-release>: CORRUPTION: An ID3D12Resource object (0x000001FD96C25E80:'(wgpu internal) Staging') is referenced by GPU operations in-flight on Command Queue (0x000001FD96B64D10:'Unnamed ID3D12CommandQueue Object').  It is not safe to final-release objects that may have GPU operations pending.  This can result in application instability. [ EXECUTION ERROR #921: OBJECT_DELETED_WHILE_STILL_IN_USE]

or

[ERROR wgpu_hal::auxil::dxgi::exception] ID3D12CommandAllocator::Reset: The command allocator cannot be reset because a command list is currently being recorded with the allocator. [ EXECUTION ERROR #543: COMMAND_ALLOCATOR_CANNOT_RESET]

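depending on whether `queue.submit` is called in the repro below: the first pair of errors appears when it is called, and the second error appears when it is removed.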

This feels like a timing issue (also pointed out by @kvark in #2285 (comment)) since I could only reproduce this locally by increasing array_size to 2048. I also can't reproduce the issue on actual hardware (tried on an Nvidia dGPU and Intel iGPU).

Repro steps

use wgpu::{
    BindGroupDescriptor, BindGroupEntry, BindGroupLayoutDescriptor, BindGroupLayoutEntry,
    BindingType, BufferBindingType, BufferDescriptor, BufferUsages, CommandEncoderDescriptor,
    ComputePassDescriptor, ComputePipelineDescriptor, DownlevelFlags, Limits,
    PipelineLayoutDescriptor, ShaderModuleDescriptor, ShaderSource, ShaderStages,
};

use crate::common::{initialize_test, TestParameters, TestingContext};

/// Runs the repro through the shared test harness.
///
/// The harness is asked for compute-shader support and the downlevel default
/// limits, then hands a `TestingContext` to [`test_impl`].
#[test]
fn test() {
    let parameters = TestParameters::default()
        .downlevel_flags(DownlevelFlags::COMPUTE_SHADERS)
        .limits(Limits::downlevel_defaults());

    initialize_test(parameters, test_impl);
}

/// WGSL source of the compute shader used by the repro.
///
/// The `main` entry point (workgroup size 1) assigns a zero-valued array to the
/// workgroup variable `w_mem`, issues a workgroup barrier, then scans all
/// `array_size` elements. It stores `0` in the storage binding at
/// group 0 / binding 0 when every element is zero and `1` otherwise.
///
/// `array_size` is the knob for the repro: the larger it is, the more work the
/// single invocation does.
// `const` items are implicitly `'static`, so the lifetime is not spelled out.
const SRC: &str = r#"
let array_size = 2048u; // increase this if you can't reproduce

var<workgroup> w_mem: array<u32, array_size>;

@group(0) @binding(0)
var<storage, read_write> output: u32;

@compute @workgroup_size(1)
fn main() {
    w_mem = array<u32, array_size>();
    workgroupBarrier();

    var is_zero = true;
    for(var i = 0u; i < array_size; i++) {
        is_zero &= w_mem[i] == 0u;
    }
    output = u32(!is_zero);
}
"#;

/// Body of the repro, run by the test harness with a ready `TestingContext`.
///
/// Steps, in order:
/// 1. create a bind group layout with one read-write storage buffer at binding 0,
/// 2. create a 4-byte `COPY_DST | STORAGE` buffer and bind it,
/// 3. build a compute pipeline from [`SRC`] with entry point `main`,
/// 4. write into the buffer through the queue,
/// 5. record a single-workgroup dispatch and submit it.
///
/// Nothing is read back or asserted; the issue is about errors logged while
/// this runs.
fn test_impl(ctx: TestingContext) {
    let device = &ctx.device;
    let queue = &ctx.queue;

    // One storage buffer, writable from the compute stage.
    let bind_group_layout = device.create_bind_group_layout(&BindGroupLayoutDescriptor {
        label: None,
        entries: &[BindGroupLayoutEntry {
            binding: 0,
            visibility: ShaderStages::COMPUTE,
            ty: BindingType::Buffer {
                ty: BufferBindingType::Storage { read_only: false },
                has_dynamic_offset: false,
                min_binding_size: None,
            },
            count: None,
        }],
    });

    // Holds the single `u32` the shader writes.
    let result_buffer = device.create_buffer(&BufferDescriptor {
        label: None,
        size: 4,
        usage: BufferUsages::COPY_DST | BufferUsages::STORAGE,
        mapped_at_creation: false,
    });

    let bind_group = device.create_bind_group(&BindGroupDescriptor {
        label: None,
        layout: &bind_group_layout,
        entries: &[BindGroupEntry {
            binding: 0,
            resource: result_buffer.as_entire_binding(),
        }],
    });

    let pipeline_layout = device.create_pipeline_layout(&PipelineLayoutDescriptor {
        label: None,
        bind_group_layouts: &[&bind_group_layout],
        push_constant_ranges: &[],
    });

    let shader_module = device.create_shader_module(ShaderModuleDescriptor {
        label: None,
        source: ShaderSource::Wgsl(SRC.into()),
    });

    let compute_pipeline = device.create_compute_pipeline(&ComputePipelineDescriptor {
        label: None,
        layout: Some(&pipeline_layout),
        module: &shader_module,
        entry_point: "main",
    });

    // With this write present, the reported error is OBJECT_DELETED_WHILE_STILL_IN_USE.
    queue.write_buffer(&result_buffer, 0, bytemuck::cast_slice(&[1]));

    let mut encoder = device.create_command_encoder(&CommandEncoderDescriptor::default());

    // The pass borrows the encoder mutably, so it is confined to this scope and
    // ends before `finish()` is called.
    {
        let mut pass = encoder.begin_compute_pass(&ComputePassDescriptor::default());
        pass.set_pipeline(&compute_pipeline);
        pass.set_bind_group(0, &bind_group, &[]);
        pass.dispatch_workgroups(1, 1, 1);
    }

    // With this submit removed, the reported error is COMMAND_ALLOCATOR_CANNOT_RESET.
    queue.submit(Some(encoder.finish()));
}

Expected vs observed behavior
Expected: no errors.
Observed: the DX12 errors quoted in the description above.

Platform
Windows 11, wgpu master (08b160c)

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

Status

Done

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions