
Vulkan barriers not working as expected.

2 comments, last by Raph 4 years, 3 months ago

Hi,

I have a stream compaction algorithm based on a few compute shaders using a prefix sum. First, a number of scan passes, then a number of add passes and then the compaction.

It seems that something goes wrong during the add passes, as if the buffer barriers between them aren't working… Am I missing something? Do I need a different type of barrier?
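
For reference, between two compute dispatches I would expect the equivalent of this raw Vulkan barrier (just a sketch with placeholder handles, not the engine wrapper shown below):

// Sketch: an explicit compute-to-compute barrier on the partial-sums buffer.
// `cmd` and `partialSumsBuffer` are placeholders, not the engine objects below.
VkBufferMemoryBarrier barrier = {};
barrier.sType               = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
barrier.srcAccessMask       = VK_ACCESS_SHADER_WRITE_BIT;                             // writes from the previous dispatch
barrier.dstAccessMask       = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; // reads/writes in the next dispatch
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.buffer              = partialSumsBuffer;
barrier.offset              = 0;
barrier.size                = VK_WHOLE_SIZE;

vkCmdPipelineBarrier(cmd,
                     VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,  // producer: previous dispatch
                     VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,  // consumer: next dispatch
                     0,
                     0, nullptr,
                     1, &barrier,
                     0, nullptr);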

Scan Shader:

#version 450
#extension GL_KHR_shader_subgroup_arithmetic : enable

layout(std430, set = 0, binding = 0) buffer Input
{
    uvec4 dataInput[];
};

layout(std430, set = 0, binding = 1) buffer Output
{
    uvec4 dataOutput[];
};

layout (local_size_x = 256) in;
const int sumSubGroupSize = 64;

shared uint sdata[sumSubGroupSize];

void main()
{
    // the element count for this pass is stored in the .z component of element 0
    uint numInstances = dataInput[0].z;

    uint sum = 0;
    if (gl_GlobalInvocationID.x < numInstances)
    {
        sum = dataInput[gl_GlobalInvocationID.x].x;
    }

    // inclusive prefix sum within the subgroup
    sum = subgroupInclusiveAdd(sum);

    // the last invocation of each subgroup stores the subgroup total in shared memory
    if (gl_SubgroupInvocationID == gl_SubgroupSize - 1)
    {
        sdata[gl_SubgroupID] = sum;
    }

    memoryBarrierShared();
    barrier();

    // the first subgroup scans the per-subgroup totals
    if (gl_SubgroupID == 0)
    {
        uint warpSum = gl_SubgroupInvocationID < gl_NumSubgroups ? sdata[gl_SubgroupInvocationID] : 0;
        warpSum = subgroupInclusiveAdd(warpSum);
        sdata[gl_SubgroupInvocationID] = warpSum;
    }

    memoryBarrierShared();
    barrier();

    // add the combined total of all preceding subgroups to this subgroup's results
    uint blockSum = 0;
    if (gl_SubgroupID > 0)
    {
        blockSum = sdata[gl_SubgroupID - 1];
    }

    sum += blockSum;

    if (gl_GlobalInvocationID.x < numInstances)
    {
        dataInput[gl_GlobalInvocationID.x].x = sum;
    }
    else
    {
        dataInput[gl_GlobalInvocationID.x].x = 0;
    }

    // the last invocation writes the workgroup total to the partial-sums buffer for the next scan pass
    if (gl_LocalInvocationID.x == gl_WorkGroupSize.x - 1)
    {
        dataOutput[gl_WorkGroupID.x].x = sum;
    }

    // invocation 0 writes the element count for the next pass
    if (gl_GlobalInvocationID.x == 0)
    {
        dataOutput[0].z = (numInstances + 255) / 256;
    }
}

Add Shader:

#version 450
#extension GL_ARB_separate_shader_objects : enable
#extension GL_GOOGLE_include_directive : enable

//layout (local_size_x_id = 1) in;
layout (local_size_x = 256) in;

layout(std430, set = 0, binding = 0) buffer Input
{
    uvec4 InputData[];
};
layout(std430, set = 0, binding = 1) buffer Output
{
    uvec4 OutputData[];
};

shared uint sum;

void main()
{
    uint numInstances = OutputData[0].z;
    if (gl_WorkGroupID.x > 0 && gl_GlobalInvocationID.x < numInstances)
    {
        sum = 0;
        // one invocation fetches the scanned total of the preceding workgroups
        if (gl_LocalInvocationID.x == 0)
        {
            sum = InputData[gl_WorkGroupID.x - 1].x;
        }

        memoryBarrierShared();
        barrier();

        // offset every element in this workgroup by that total
        OutputData[gl_GlobalInvocationID.x].x += sum;
    }
}

C++ Side:

		{
			renderCmd->CmdBeginDebugMarker("Stream Compaction");
			{
				renderCmd->CmdBeginDebugMarker("Scan");
				{
					Renderer::BufferBarrier bbarriers[] = {
						{ mInstanceVisibilityBuffer, Renderer::ResourceState::UNORDERED_ACCESS  },
						{ mInstanceVisibilityPartialSumsBuffer[0], Renderer::ResourceState::UNORDERED_ACCESS  },
					};

					renderCmd->CmdResourceBarrier(2, bbarriers, 0, nullptr);

					renderCmd->CmdBindPipeline(mVisibilityScanPipeline);
					renderCmd->CmdBindDescriptorSet(0, mVisibilityScanDescriptorSetTexture);
					//renderCmd->CmdBindDescriptorSet(frameIndex, mVisibilityScanDescriptorSetUniforms);
					const uint32_t* threadGroupSizes = mVisibilityScanShader->mReflection.mStageReflections[0].mNumThreadsPerGroup;

					uint32_t totalInstance = mInstancedMeshData.mTotalNumInstances;
					renderCmd->CmdDispatch((totalInstance + threadGroupSizes[0] - 1) / threadGroupSizes[0], 1, 1);

					for (uint32_t pass = 1; pass < mInstanceVisibilityPartialSumsBuffer.size(); pass++)
					{
						Renderer::BufferBarrier bbarriers[] = {
							{ mInstanceVisibilityPartialSumsBuffer[pass - 1], Renderer::ResourceState::UNORDERED_ACCESS  },
							{ mInstanceVisibilityPartialSumsBuffer[pass], Renderer::ResourceState::UNORDERED_ACCESS  },
						};

						renderCmd->CmdResourceBarrier(2, bbarriers, 0, nullptr);

						renderCmd->CmdBindPipeline(mVisibilityScanPipeline);
						renderCmd->CmdBindDescriptorSet(pass, mVisibilityScanDescriptorSetTexture);
						//renderCmd->CmdBindDescriptorSet(frameIndex, mVisibilityScanDescriptorSetUniforms);
						const uint32_t* threadGroupSizes = mVisibilityScanShader->mReflection.mStageReflections[0].mNumThreadsPerGroup;

						totalInstance = (totalInstance + 255) / 256;
						renderCmd->CmdDispatch((totalInstance + threadGroupSizes[0] - 1) / threadGroupSizes[0], 1, 1);
					}
				}
				renderCmd->CmdEndDebugMarker();
				renderCmd->CmdBeginDebugMarker("Add");
				{
					for (uint32_t pass = (uint32_t)mInstanceVisibilityPartialSumsBuffer.size() - 1; pass > 0 ; pass--)
					{
						Renderer::BufferBarrier bbarriers[] = {
							{ mInstanceVisibilityPartialSumsBuffer[pass - 1], Renderer::ResourceState::UNORDERED_ACCESS  },
							{ mInstanceVisibilityPartialSumsBuffer[pass], Renderer::ResourceState::UNORDERED_ACCESS  },
						};

						renderCmd->CmdResourceBarrier(2, bbarriers, 0, nullptr);

						renderCmd->CmdBindPipeline(mVisibilityAddPipeline);
						renderCmd->CmdBindDescriptorSet(pass, mVisibilityAddDescriptorSetTexture);
						//renderCmd->CmdBindDescriptorSet(frameIndex, mVisibilityAddDescriptorSetUniforms);
						const uint32_t* threadGroupSizes = mVisibilityAddShader->mReflection.mStageReflections[0].mNumThreadsPerGroup;

						uint32_t divide = (256 << (8 << (pass - 2)));
						uint32_t totalInstance = (mInstancedMeshData.mTotalNumInstances + divide - 1) / divide;
						renderCmd->CmdDispatch((totalInstance + threadGroupSizes[0] - 1) / threadGroupSizes[0], 1, 1);
					}

					Renderer::BufferBarrier bbarriers[] = {
						{ mInstanceVisibilityBuffer, Renderer::ResourceState::UNORDERED_ACCESS  },
						{ mInstanceVisibilityPartialSumsBuffer[0], Renderer::ResourceState::UNORDERED_ACCESS  },
					};

					renderCmd->CmdResourceBarrier(2, bbarriers, 0, nullptr);

					renderCmd->CmdBindPipeline(mVisibilityAddPipeline);
					renderCmd->CmdBindDescriptorSet(0, mVisibilityAddDescriptorSetTexture);
					//renderCmd->CmdBindDescriptorSet(frameIndex, mVisibilityAddDescriptorSetUniforms);
					const uint32_t* threadGroupSizes = mVisibilityAddShader->mReflection.mStageReflections[0].mNumThreadsPerGroup;

					uint32_t totalInstance = mInstancedMeshData.mTotalNumInstances;
					renderCmd->CmdDispatch((totalInstance + threadGroupSizes[0] - 1) / threadGroupSizes[0], 1, 1);
				}
				renderCmd->CmdEndDebugMarker();
				renderCmd->CmdBeginDebugMarker("Compact");
				{
					Renderer::BufferBarrier bbarriers[] = {
						{ mInstanceVisibilityBuffer, Renderer::ResourceState::UNORDERED_ACCESS  },
						{ mDrawInstancesIndirectCommandBuffer, Renderer::ResourceState::UNORDERED_ACCESS  },
						{ mInstanceDataBuffer, Renderer::ResourceState::UNORDERED_ACCESS  },
					};

					renderCmd->CmdResourceBarrier(3, bbarriers, 0, nullptr);

					renderCmd->CmdBindPipeline(mVisibilityCompactPipeline);
					renderCmd->CmdBindDescriptorSet(0, mVisibilityCompactDescriptorSetTexture);
					renderCmd->CmdBindDescriptorSet(frameIndex, mVisibilityCompactDescriptorSetUniforms);
					const uint32_t* threadGroupSizes = mVisibilityCompactShader->mReflection.mStageReflections[0].mNumThreadsPerGroup;

					uint32_t totalInstance = mInstancedMeshData.mTotalNumInstances;
					renderCmd->CmdDispatch((totalInstance + threadGroupSizes[0] - 1) / threadGroupSizes[0], 1, 1);
				}
				renderCmd->CmdEndDebugMarker();
			}
			renderCmd->CmdEndDebugMarker();
		}

The number of threads is stored and written out in the [0].z component of some of the buffers for the next pass to use.


Have you tried running with the validation layer active to see if it reports any issues?
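
If you don't have it wired up already, the quickest check is to enable VK_LAYER_KHRONOS_validation at instance creation (a minimal sketch, error handling omitted), or to turn it on externally with the Vulkan Configurator (vkconfig). Synchronization validation in particular is good at catching missing or wrong barriers.

// Minimal sketch: enable the Khronos validation layer when creating the instance.
const char* validationLayers[] = { "VK_LAYER_KHRONOS_validation" };

VkInstanceCreateInfo instanceInfo = {};
instanceInfo.sType               = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
instanceInfo.enabledLayerCount   = 1;
instanceInfo.ppEnabledLayerNames = validationLayers;
// ... application info and extensions as usual

VkInstance instance = VK_NULL_HANDLE;
vkCreateInstance(&instanceInfo, nullptr, &instance);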

@fries Is this by any chance on Intel? There's a “fun feature” where gl_SubgroupSize reports 32 even when the number of invocations in the subgroup is less (commonly 16). You can work around this by using the workgroup size divided by the number of invocations.
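
If you'd rather take the guesswork out of it on the host side, and assuming the driver exposes VK_EXT_subgroup_size_control, you can also pin the compute stage to a specific subgroup size when creating the pipeline (a rough sketch, not something I've verified on Intel):

// Rough sketch: force a known subgroup size via VK_EXT_subgroup_size_control.
// The extension must be enabled and the size must lie within the reported
// min/maxSubgroupSize limits; `computeShaderModule` is a placeholder.
VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT requiredSize = {};
requiredSize.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT;
requiredSize.requiredSubgroupSize = 32; // example value

VkPipelineShaderStageCreateInfo stage = {};
stage.sType  = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
stage.pNext  = &requiredSize;
stage.stage  = VK_SHADER_STAGE_COMPUTE_BIT;
stage.module = computeShaderModule;
stage.pName  = "main";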

I'm trying to figure out where the best place is to report this “fun feature.”

