🎉 Celebrating 25 Years of GameDev.net! 🎉

Not many can claim 25 years on the Internet! Join us in celebrating this milestone. Learn more about our history, and thank you for being a part of our community!

[Vulkan] Incorrect results when measuring time using Timestamp

Started by
2 comments, last by mrjbom 2 years, 11 months ago

Hello.
I am trying to measure the render time using timestamp queries.

I have this code:

void MainWindow::updateCommandBuffer(int imageIndex)
{
	//Start recording
	if (vkBeginCommandBuffer(commandBuffers[imageIndex], &beginInfo) != VK_SUCCESS) {
		throw MakeErrorInfo("Failed to begin recording command buffer!");
	}

	vkCmdResetQueryPool(commandBuffers[imageIndex], queryPools[imageIndex], 0, 2);
	vkCmdWriteTimestamp(commandBuffers[imageIndex], VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, queryPools[imageIndex], 0);

	vkCmdBeginRenderPass(commandBuffers[imageIndex], &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE);
	vkCmdBindPipeline(commandBuffers[imageIndex], VK_PIPELINE_BIND_POINT_GRAPHICS, graphicsPipeline);

	vkCmdBindVertexBuffers(commandBuffers[imageIndex], 0, 1, vertexBuffers, offsets);
	vkCmdBindIndexBuffer(commandBuffers[imageIndex], indexBuffer, 0, VK_INDEX_TYPE_UINT16);
	vkCmdBindDescriptorSets(commandBuffers[imageIndex], VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSets[imageIndex], 0, nullptr);
	
	for (size_t i = 0; i < iterationsNumber; ++i) {
		vkCmdDrawIndexed(commandBuffers[imageIndex], static_cast<uint32_t>(indices.size()), 1, 0, 0, 0);
	}
	vkCmdEndRenderPass(commandBuffers[imageIndex]);

	vkCmdWriteTimestamp(commandBuffers[imageIndex], VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, queryPools[imageIndex], 1);
	if (vkEndCommandBuffer(commandBuffers[imageIndex]) != VK_SUCCESS) {
		throw MakeErrorInfo("Failed to record command buffer!");
	}
}

So, if iterationsNumber is equal to 1, then I get the following results:
Total frametime: 0.354 ms
Square command buffer rendertime: 0.016 ms

But if the iterationsNumber is 2000, then I get strange results:
Total frametime: 5.483 ms
Squares command buffers rendertime: 2.331 ms

Obviously, the imgui render time could not grow by itself, which means that the frame time is considered incorrect.

The only thing I do in my scene is render 2 command buffers, an imgui command buffer and a command buffer with squares.

image

Why is this happening and how to fix it?

I’ll leave my render function just in case:

Render code(It also shows how I take data from the query pool and count the time)

void MainWindow::drawFrame()
{
	//Waiting for the fence to be ready(Waiting for the command buffer to finish executing for the frame)
	vkWaitForFences(logicalDevice, 1, &inFlightFences[currentFrame], VK_TRUE, UINT64_MAX);

	//Acquire image from swapchain
	uint32_t imageIndex;
	//UINT64_MAX disables timeout, timeout is infinite
	VkResult result = vkAcquireNextImageKHR(logicalDevice, swapchain, UINT64_MAX, imageAvailableSemaphores[currentFrame], VK_NULL_HANDLE, &imageIndex);
	if (result == VK_ERROR_OUT_OF_DATE_KHR) {
		recreateSwapchain();
		return;
	}
	else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) {
		throw MakeErrorInfo("Failed to acquire swapchain image!");
	}

	// Check if a previous frame is using this image (i.e. there is its fence to wait on)
	if (imagesInFlight[imageIndex] != VK_NULL_HANDLE) {
		vkWaitForFences(logicalDevice, 1, &imagesInFlight[imageIndex], VK_TRUE, UINT64_MAX);
	}

	static std::vector<bool> isFirstTimeFrameRender(MAX_FRAMES_IN_FLIGHT, true);
	if (isFirstTimeFrameRender[currentFrame] == false) {
		uint64_t timestampsResults[2];
		vkGetQueryPoolResults(logicalDevice, queryPools[imageIndex], 0, 2, sizeof(timestampsResults), timestampsResults, 0, VK_QUERY_RESULT_64_BIT);
		//timestampsResults[0] is start point
		////timestampsResults[1] is end point
		VkPhysicalDeviceProperties deviceProperties;
		vkGetPhysicalDeviceProperties(physicalDevice, &deviceProperties);
		float nanosecondsInTimestamp = deviceProperties.limits.timestampPeriod;
		float timestampValueInMilliseconds = ((timestampsResults[1] - timestampsResults[0]) * nanosecondsInTimestamp) / 1000000;
		g_timestampValueInMilliseconds = timestampValueInMilliseconds;
	}
	else {
		isFirstTimeFrameRender[currentFrame] = false;
	}

	// Mark the image as now being in use by this frame
	imagesInFlight[imageIndex] = inFlightFences[currentFrame];

	glfwGetWindowPos(window, &windowXPos, &windowYPos);

	updateCommandBuffer(imageIndex);
	imguiUpdateCommandBuffer(imageIndex);

	updateUniformBuffer(imageIndex);

	//Submitting the command buffer
	//Each entry in the waitStages array corresponds to the semaphore with the same index in pWaitSemaphores.
	VkSemaphore waitSemaphores[] = { imageAvailableSemaphores[currentFrame] };
	std::vector<VkPipelineStageFlags> waitStages = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT };
	VkSubmitInfo submitInfo{};
	submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
	//The first three parameters specify which semaphores to wait on before execution begins
	//and in which stage(s) of the pipeline to wait.
	//We want to wait with writing colors to the image until it's available, so we're specifying the stage of the graphics pipeline that writes to the color attachment.
	submitInfo.waitSemaphoreCount = 1;
	submitInfo.pWaitSemaphores = waitSemaphores;
	submitInfo.pWaitDstStageMask = waitStages.data();
	std::vector<VkCommandBuffer> currentCommandBuffers = { commandBuffers[imageIndex], imguiCommandBuffers[imageIndex] };
	submitInfo.commandBufferCount = 2;
	submitInfo.pCommandBuffers = currentCommandBuffers.data();
	std::vector<VkSemaphore> signalSemaphores = { renderFinishedSemaphores[currentFrame], imguiRenderFinishedSemaphores[currentFrame] };
	submitInfo.signalSemaphoreCount = 2;
	submitInfo.pSignalSemaphores = signalSemaphores.data();

	//Restore the fence to the unsignaled state
	vkResetFences(logicalDevice, 1, &inFlightFences[currentFrame]);

	//Start executing the command buffer
	if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFences[currentFrame]) != VK_SUCCESS) {
		throw MakeErrorInfo("Failed to submit draw command buffer!");
	}

	VkPresentInfoKHR presentInfo{};
	presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
	presentInfo.waitSemaphoreCount = 2;
	presentInfo.pWaitSemaphores = signalSemaphores.data();

	VkSwapchainKHR swapchains[] = { swapchain };
	presentInfo.swapchainCount = 1;
	presentInfo.pSwapchains = swapchains;
	presentInfo.pImageIndices = &imageIndex;
	presentInfo.pResults = nullptr; // Optional

	result = vkQueuePresentKHR(presentQueue, &presentInfo);
	if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || framebufferResized) {
		framebufferResized = false;
		recreateSwapchain();
		if (minimizedWindowNeedClose) {
			return;
		}
	}
	else if (result != VK_SUCCESS) {
		throw MakeErrorInfo("Failed to present swapchain image!");
	}

	currentFrame = (currentFrame + 1) % MAX_FRAMES_IN_FLIGHT;
}
Advertisement

I compared this with my own, but did not spot anything suspicious.

mrjbom said:
So, if iterationsNumber is equal to 1, then I get the following results:

I ignore the first 3 frames for profiling. Can't remember exactly why, but likely the first timings were off or noise.

After that, i also do averaging to get a better impression from the fluctuating timings of each frame.

mrjbom said:
But if the iterationsNumber is 2000, then I get strange results:

Is there some abrupt change, becoming gradually slower, or are your results quite consistent after a small number of initial frames?
Likely you want to log all frame timings to some text file and see how it makes sense.

Posting my profiler code for reference, but it's messy and confusing.

	struct Profiler 
	{
		enum
		{
			MAX_TIMERS = 1024,
			IGNORE_INITIAL_FRAMES = 3,
		};

		VkQueryPool queryPool;

		int numTimers;
		int tick;
		uint64_t usedFlags [(MAX_TIMERS*2+63) / 64];
		uint64_t timestamps [MAX_TIMERS*2];
		uint64_t startTime;
		uint64_t endTime;
		double averageDuration [MAX_TIMERS];
		double totalAverageDuration;

		void SetUsed (const int id)
		{
			usedFlags[id>>6] |= (uint64_t(1)<<uint64_t(id&63)); 
		}

		bool IsUsed (const int id)
		{
			return (usedFlags[id>>6] & (uint64_t(1)<<uint64_t(id&63))) != 0;
		}




		void Init (Gpu *gpu, uint32_t numTimers)
		{
			this->numTimers = min(MAX_TIMERS, numTimers);

			for (int i=0; i < (MAX_TIMERS*2+63) / 64; i++) usedFlags[i] = 0;
			for (int i=0; i < MAX_TIMERS*2; i++) timestamps[i] = 0;
			for (int i=0; i < MAX_TIMERS; i++) averageDuration[i] = 0;
			totalAverageDuration = 0;
			tick = 0;

			VkQueryPoolCreateInfo poolCI = {};
			poolCI.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
			poolCI.queryType = VK_QUERY_TYPE_TIMESTAMP;
			poolCI.queryCount = this->numTimers*2;

			VkResult err = vkCreateQueryPool (gpu->device, &poolCI, nullptr, &queryPool);
			assert (!err);
		}

		void Destroy (Gpu *gpu)
		{
			vkDestroyQueryPool (gpu->device, queryPool, nullptr);
		}

		void RecordReset (VkCommandBuffer commandBuffer)
		{
			VkCommandBufferBeginInfo commandBufferBeginInfo = {};
			commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
			commandBufferBeginInfo.flags = 0;
			vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
			vkCmdResetQueryPool (commandBuffer, queryPool, 0, MAX_TIMERS*2);
			vkEndCommandBuffer(commandBuffer);
		}

		void DownloadResults (Gpu *gpu)
		{
			bool oldUsed = false;
			uint32_t oldId = 0;
			for (uint32_t id = 0; id <= uint32_t(numTimers)*2; id++)
			{
				bool used = (id<MAX_TIMERS ? IsUsed (id) : false);
				if (!used && oldUsed)
				{
//SystemTools::Log ("Query: %i, %i\n", oldId, id-oldId);
					VkResult err = vkGetQueryPoolResults(gpu->device, queryPool, oldId, id-oldId,
						sizeof(timestamps), (void*)&timestamps[oldId], sizeof(uint64_t), VK_QUERY_RESULT_64_BIT);
				}
				if (used && !oldUsed)
				{
					oldId = id;
				}
				oldUsed = used;
			}
			
			tick++;
			double f = double(gpu->physicalDeviceProps.limits.timestampPeriod) / 1000000.0;

			uint64_t start = 0xFFFFFFFFFFFFFFFFUL;
			uint64_t end = 0;

			for (int i=0; i<numTimers; i++)
			{
				double dur = f * double(GetDuration(i));
				if (tick > IGNORE_INITIAL_FRAMES) averageDuration[i] += dur;
				
				//if (IsUsed(i*2)) start = min (start, GetStartTime (i));
				//if (IsUsed(i*2+1)) end = max (end, GetStopTime (i));
			}
			
			{
				if (IsUsed(TS_TOTAL0*2)) start = min (start, GetStartTime (TS_TOTAL0));
				if (IsUsed(TS_TOTAL0*2+1)) end = max (end, GetStopTime (TS_TOTAL0));
				if (IsUsed(TS_TOTAL1*2)) start = min (start, GetStartTime (TS_TOTAL1));
				if (IsUsed(TS_TOTAL1*2+1)) end = max (end, GetStopTime (TS_TOTAL1));
				if (IsUsed(TS_TOTAL2*2)) start = min (start, GetStartTime (TS_TOTAL2));
				if (IsUsed(TS_TOTAL2*2+1)) end = max (end, GetStopTime (TS_TOTAL2));
			}

			startTime = start;
			endTime = end;
			double dur = f * double(end - start);
			if (tick > IGNORE_INITIAL_FRAMES) totalAverageDuration += dur;


			ImGui::Text("dur %f, start %f, end %f, ", f * float(end-start), f * float(start), f * float(end));
			if (IsUsed(TS_TOTAL0*2)) ImGui::Text("start0 %f", f * double(GetStartTime (TS_TOTAL0) - start));
			if (IsUsed(TS_TOTAL0*2+1)) ImGui::Text("end0 %f", f * double(GetStopTime (TS_TOTAL0) - start));
			if (IsUsed(TS_TOTAL1*2)) ImGui::Text("start1 %f", f * double(GetStartTime (TS_TOTAL1) - start));
			if (IsUsed(TS_TOTAL1*2+1)) ImGui::Text("end1 %f", f * double(GetStopTime (TS_TOTAL1) - start));
			if (IsUsed(TS_TOTAL2*2)) ImGui::Text("start2 %f", f * double(GetStartTime (TS_TOTAL2) - start));
			if (IsUsed(TS_TOTAL2*2+1)) ImGui::Text("end2 %f", f * double(GetStopTime (TS_TOTAL2) - start));
			
		}


		void Start (int id, VkCommandBuffer cmd, VkPipelineStageFlagBits pipelineStage)
		{
			id = (id*2) & (MAX_TIMERS*2-1);
			SetUsed (id);
			//vkCmdResetQueryPool (cmd, queryPool, id, 2);
			vkCmdWriteTimestamp (cmd, pipelineStage, queryPool, id);
		}

		void Stop (int id, VkCommandBuffer cmd, VkPipelineStageFlagBits pipelineStage)
		{
			id = (id*2+1) & (MAX_TIMERS*2-1);
			SetUsed (id);
			vkCmdWriteTimestamp (cmd, pipelineStage, queryPool, id);
		}


		uint64_t GetDuration (int id) { return timestamps[id*2+1] - timestamps[id*2]; }
		uint64_t GetStartTime (int id) { return timestamps[id*2]; }
		uint64_t GetStopTime (int id) { return timestamps[id*2+1]; }
		float GetAverageDuration (int id) 
		{
			double d = averageDuration[id] / double(tick - IGNORE_INITIAL_FRAMES);
			return float(d); 
		}
		float GetAverageTotalDuration () 
		{
			double d = totalAverageDuration / double(tick - IGNORE_INITIAL_FRAMES);
			return float(d); 
		}

	};
	
	

// some example where i used it for a CS:

		if (taskFlags & (1<<tREBUILD_RESTORE))
		{
			vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayouts[tREBUILD_RESTORE], 0, 1, &descriptorSets[tREBUILD_RESTORE], 0, nullptr);
		
			vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines[taskToPipeline[tREBUILD_RESTORE]]);
	#ifdef PROFILE_TASKS
			profiler.Start (TS_BRESTORE, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
	#endif
			vkCmdDispatchIndirect(commandBuffer, buffers[bDISPATCH].deviceBuffer, sizeof(VkDispatchIndirectCommand) * OFFSET_DISPATCH_REBUILD_RESTORE);
	#ifdef PROFILE_TASKS
			profiler.Stop (TS_BRESTORE, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
	#endif
			int barrierBuffers[] = {bS_POS, bS_V4, bS_I4, bS_SI, bS_CONE};
			MemoryBarriers (commandBuffer, barrierBuffers, 5);
		}

I decided to use an index buffer for drawing, and since in order to draw one square using two triangles, 6 values are required in the index buffer, I tried to add more values to it in order to draw 2000 squares. Now i draw 2000 squares using an index buffer in which I have 12000 values using one vkCmdDrawIndexed() call and in this case I get logical results:

It turns out that those extra 3 ms were the cost of a cycle and multiple calls to the drawing function.
Thank you very much for the answer, I think your code will be useful to me or someone.

This topic is closed to new replies.

Advertisement