SpiecsEngine
 
Loading...
Searching...
No Matches
NsightAftermathGpuCrashTracker.cpp
Go to the documentation of this file.
1/**
2* @file NsightAftermathGpuCrashTracker.cpp
3* @brief The GpuCrashTracker Class Implementation.
4* @author NVIDIA
5* @see https://github.com/NVIDIA/nsight-aftermath-samples
6*/
7
8#include "Pchheader.h"
9#include <fstream>
10#include <iomanip>
11#include <string>
12#include <array>
13#include <winuser.h>
14#include <filesystem>
15
17
18namespace Spices {
19
21
23 : m_Initialized(false)
24 , m_Mutex()
27 , m_FrameCut(0)
28 {}
29
31 {
33
34 /**
35 * @brief If initialized, disable GPU crash dumps.
36 */
37 if (m_Initialized)
38 {
39 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_DisableGpuCrashDumps());
40 }
41 }
42
44 {
46
47 /**
48 * @brief Enable GPU crash dumps and set up the callbacks for crash dump notifications,
49 * shader debug information notifications, and providing additional crash
50 * dump description data. Only the crash dump callback is mandatory. The other two
51 * callbacks are optional and can be omitted, by passing nullptr, if the corresponding
52 * functionality is not used.
53 * The DeferDebugInfoCallbacks flag enables caching of shader debug information data
54 * in memory. If the flag is set, ShaderDebugInfoCallback will be called only
55 * in the event of a crash, right before GpuCrashDumpCallback. If the flag is not set,
56 * ShaderDebugInfoCallback will be called for every shader that is compiled.
57 */
58 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_EnableGpuCrashDumps(
59 GFSDK_Aftermath_Version_API ,
60 GFSDK_Aftermath_GpuCrashDumpWatchedApiFlags_Vulkan ,
61 GFSDK_Aftermath_GpuCrashDumpFeatureFlags_DeferDebugInfoCallbacks , /* @brief Let the Nsight Aftermath library cache shader debug information. */
62 GpuCrashDumpCallback , /* @brief Register callback for GPU crash dumps. */
63 ShaderDebugInfoCallback , /* @brief Register callback for shader debug information. */
64 CrashDumpDescriptionCallback , /* @brief Register callback for GPU crash dump description. */
65 ResolveMarkerCallback , /* @brief Register callback for resolving application-managed markers. */
66 this
67 )); /* @brief Set the GpuCrashTracker object as user data for the above callbacks. */
68
69 m_Initialized = true;
70 }
71
73 {
75
76 if (!m_GpuCrashTracker)
77 {
78 m_GpuCrashTracker = std::make_unique<GpuCrashTracker>();
79 m_GpuCrashTracker->Initialize();
80 }
81 }
82
84 {
86
87 // Device lost notification is asynchronous to the NVIDIA display
88 // driver's GPU crash handling. Give the Nsight Aftermath GPU crash dump
89 // thread some time to do its work before terminating the process.
90 constexpr auto tdrTerminationTimeout = std::chrono::seconds(3);
91 const auto tStart = std::chrono::steady_clock::now();
92 auto tElapsed = std::chrono::milliseconds::zero();
93
94 GFSDK_Aftermath_CrashDump_Status status = GFSDK_Aftermath_CrashDump_Status_Unknown;
95 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_GetCrashDumpStatus(&status));
96
97 while (status != GFSDK_Aftermath_CrashDump_Status_CollectingDataFailed &&
98 status != GFSDK_Aftermath_CrashDump_Status_Finished &&
99 tElapsed < tdrTerminationTimeout)
100 {
101 // Sleep 50ms and poll the status again until timeout or Aftermath finished processing the crash dump.
102 std::this_thread::sleep_for(std::chrono::milliseconds(50));
103 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_GetCrashDumpStatus(&status));
104
105 auto tEnd = std::chrono::steady_clock::now();
106 tElapsed = std::chrono::duration_cast<std::chrono::milliseconds>(tEnd - tStart);
107 }
108
109 if (status == GFSDK_Aftermath_CrashDump_Status_Finished)
110 {
111 std::stringstream ss;
112 ss << "Aftermath finished processing the crash dump ";
113 SPICES_CORE_INFO(ss.str());
114 }
115 else
116 {
117 std::stringstream err_msg;
118 err_msg << "Unexpected crash dump status: " << status;
119 SPICES_CORE_CRITICAL(err_msg.str().c_str())
120 }
121
122 // Terminate on failure
123 exit(1);
124 }
125
126 void GpuCrashTracker::SetFrameCut(uint64_t frameCut)
127 {
129
130 m_FrameCut = frameCut % c_MarkerFrameHistory;
131
132 m_MarkerMap[m_FrameCut].clear();
133 }
134
135 void GpuCrashTracker::SetMarker(uint64_t& markerId, const std::string& info)
136 {
138
139 markerId = m_MarkerMap[m_FrameCut].size();
140 m_MarkerMap[m_FrameCut][markerId] = info;
141 }
142
143 void GpuCrashTracker::OnCrashDump(const void* pGpuCrashDump, const uint32_t gpuCrashDumpSize)
144 {
146
147 std::lock_guard<std::mutex> lock(m_Mutex);
148
149 /**
150 * @brief Write to file for later in-depth analysis with Nsight Graphics.
151 */
152 WriteGpuCrashDumpToFile(pGpuCrashDump, gpuCrashDumpSize);
153 }
154
155 void GpuCrashTracker::OnShaderDebugInfo(const void* pShaderDebugInfo, const uint32_t shaderDebugInfoSize)
156 {
158
159 std::lock_guard<std::mutex> lock(m_Mutex);
160
161 /**
162 * @brief Get shader debug information identifier.
163 */
164 GFSDK_Aftermath_ShaderDebugInfoIdentifier identifier = {};
165 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_GetShaderDebugInfoIdentifier(
166 GFSDK_Aftermath_Version_API ,
167 pShaderDebugInfo ,
168 shaderDebugInfoSize ,
169 &identifier
170 ));
171
172 /**
173 * @brief Store information for decoding of GPU crash dumps with shader address mapping
174 * from within the application.
175 */
176 std::vector<uint8_t> data(
177 (uint8_t*)pShaderDebugInfo,
178 (uint8_t*)pShaderDebugInfo + shaderDebugInfoSize
179 );
180 m_ShaderDebugInfo[identifier].swap(data);
181
182 /**
183 * @brief Write to file for later in-depth analysis of crash dumps with Nsight Graphics.
184 */
185 WriteShaderDebugInformationToFile(identifier, pShaderDebugInfo, shaderDebugInfoSize);
186 }
187
188 void GpuCrashTracker::OnDescription(PFN_GFSDK_Aftermath_AddGpuCrashDumpDescription addDescription)
189 {
191
192 /**
193 * @brief Add some basic description about the crash. This is called after the GPU crash happens, but before
194 * the actual GPU crash dump callback. The provided data is included in the crash dump and can be
195 * retrieved using GFSDK_Aftermath_GpuCrashDump_GetDescription().
196 */
197 addDescription(GFSDK_Aftermath_GpuCrashDumpDescriptionKey_ApplicationName , "SpicesEngineNsightAftermath");
198 addDescription(GFSDK_Aftermath_GpuCrashDumpDescriptionKey_ApplicationVersion , "v1.0");
199 addDescription(GFSDK_Aftermath_GpuCrashDumpDescriptionKey_UserDefined , "Aftermath GPU Crash Dump.");
200 addDescription(GFSDK_Aftermath_GpuCrashDumpDescriptionKey_UserDefined + 1 , "Engine State: Rendering.");
201 addDescription(GFSDK_Aftermath_GpuCrashDumpDescriptionKey_UserDefined + 2 , "More user-defined information...");
202 }
203
205 const void* pMarkerData ,
206 const uint32_t markerDataSize ,
207 void** ppResolvedMarkerData ,
208 uint32_t* pResolvedMarkerDataSize
209 )
210 {
212
213 /**
214 * @brief Important: the pointer passed back via ppResolvedMarkerData must remain valid after this function returns.
215 * Using references for all of the m_MarkerMap accesses ensures that the pointers refer to the persistent data.
216 */
217 for (auto& map : m_MarkerMap)
218 {
219 const auto& foundMarker = map.find(reinterpret_cast<uint64_t>(pMarkerData));
220 if (foundMarker != map.end())
221 {
222 const std::string& foundMarkerData = foundMarker->second;
223 // std::string::data() will return a valid pointer until the string is next modified
224 // we don't modify the string after calling data() here, so the pointer should remain valid
225 *ppResolvedMarkerData = (void*)foundMarkerData.data();
226 *pResolvedMarkerDataSize = static_cast<uint32_t>(foundMarkerData.length());
227 return;
228 }
229 }
230 }
231
232 void GpuCrashTracker::WriteGpuCrashDumpToFile(const void* pGpuCrashDump, const uint32_t gpuCrashDumpSize)
233 {
235
236 /**
237 * @brief Create a GPU crash dump decoder object for the GPU crash dump.
238 */
239 GFSDK_Aftermath_GpuCrashDump_Decoder decoder = {};
240 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_GpuCrashDump_CreateDecoder(
241 GFSDK_Aftermath_Version_API ,
242 pGpuCrashDump ,
243 gpuCrashDumpSize ,
244 &decoder
245 ));
246
247 /**
248 * @brief Use the decoder object to read basic information, like application
249 * name, PID, etc.from the GPU crash dump.
250 */
251 GFSDK_Aftermath_GpuCrashDump_BaseInfo baseInfo = {};
252 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_GpuCrashDump_GetBaseInfo(decoder, &baseInfo));
253
254 /**
255 * @brief Use the decoder object to query the application name that was set
256 * in the GPU crash dump description.
257 */
258 uint32_t applicationNameLength = 0;
259 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_GpuCrashDump_GetDescriptionSize(
260 decoder ,
261 GFSDK_Aftermath_GpuCrashDumpDescriptionKey_ApplicationName ,
262 &applicationNameLength
263 ));
264
265 std::vector<char> applicationName(applicationNameLength, '\0');
266
267 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_GpuCrashDump_GetDescription(
268 decoder ,
269 GFSDK_Aftermath_GpuCrashDumpDescriptionKey_ApplicationName ,
270 static_cast<uint32_t>(applicationName.size()) ,
271 applicationName.data()
272 ));
273
274 /**
275 * @brief Create a unique file name for writing the crash dump data to a file.
276 * Note: due to an Nsight Aftermath bug (will be fixed in an upcoming
277 * driver release) we may see redundant crash dumps. As a workaround,
278 * attach a unique count to each generated file name.
279 */
280 static int count = 0;
281 const std::string baseFileName =
282 std::string(applicationName.data())
283 + "-"
284 + std::to_string(baseInfo.pid)
285 + "-"
286 + std::to_string(++count);
287
288 /**
289 * @brief Aftermath file folder.
290 */
291 time_t timep;
292 tm* p;
293
294 auto error = time(&timep);
295 p = localtime(&timep);
296
297 std::stringstream ss;
298 ss << SPICES_GPUCRASHREPORT_PATH <<
299 p->tm_year + 1900 <<
300 p->tm_mon + 1 <<
301 p->tm_mday << "_" <<
302 p->tm_hour <<
303 p->tm_min << "00" << "/";
304 std::filesystem::create_directories(ss.str());
305
306 /**
307 * @brief Write the crash dump data to a file using the .nv-gpudmp extension
308 * registered with Nsight Graphics.
309 */
310 const std::string crashDumpFileName = baseFileName + ".nv-gpudmp";
311 std::ofstream dumpFile(ss.str() + crashDumpFileName, std::ios::out | std::ios::binary);
312 if (dumpFile)
313 {
314 dumpFile.write(static_cast<const char*>(pGpuCrashDump), gpuCrashDumpSize);
315 dumpFile.close();
316 }
317
318 /*
319 * @brief Decode the crash dump to a JSON string.
320 * Step 1: Generate the JSON and get the size.
321 */
322 uint32_t jsonSize = 0;
323 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_GpuCrashDump_GenerateJSON(
324 decoder ,
325 GFSDK_Aftermath_GpuCrashDumpDecoderFlags_ALL_INFO ,
326 GFSDK_Aftermath_GpuCrashDumpFormatterFlags_NONE ,
327 ShaderDebugInfoLookupCallback ,
328 ShaderLookupCallback ,
329 ShaderSourceDebugInfoLookupCallback ,
330 this ,
331 &jsonSize
332 ));
333
334 /**
335 * @brief Step 2: Allocate a buffer and fetch the generated JSON.
336 */
337 std::vector<char> json(jsonSize);
338 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_GpuCrashDump_GetJSON(
339 decoder ,
340 static_cast<uint32_t>(json.size()) ,
341 json.data()
342 ));
343
344 /**
345 * @brief Write the crash dump data as JSON to a file.
346 */
347 const std::string jsonFileName = crashDumpFileName + ".json";
348 std::ofstream jsonFile(ss.str() + jsonFileName, std::ios::out | std::ios::binary);
349 if (jsonFile)
350 {
351 /**
352 * @brief Write the JSON to the file (excluding string termination).
353 */
354 jsonFile.write(json.data(), json.size() - 1);
355 jsonFile.close();
356 }
357
358 /**
359 * @brief Destroy the GPU crash dump decoder object.
360 */
361 AFTERMATH_CHECK_ERROR(GFSDK_Aftermath_GpuCrashDump_DestroyDecoder(decoder));
362 }
363
365 GFSDK_Aftermath_ShaderDebugInfoIdentifier identifier ,
366 const void* pShaderDebugInfo ,
367 const uint32_t shaderDebugInfoSize
368 ) const
369 {
371
372 /**
373 * @brief Aftermath file folder.
374 */
375 time_t timep;
376 tm* p;
377
378 auto error = time(&timep);
379 p = localtime(&timep);
380
381 std::stringstream ss;
382 ss << SPICES_GPUCRASHREPORT_PATH <<
383 p->tm_year + 1900 <<
384 p->tm_mon + 1 <<
385 p->tm_mday << "_" <<
386 p->tm_hour <<
387 p->tm_min << "00" << "/";
388 std::filesystem::create_directories(ss.str());
389
390 /**
391 * @brief Create a unique file name.
392 */
393 const std::string filePath = "shader-" + std::to_string(identifier) + ".nvdbg";
394 std::ofstream f(ss.str() + filePath, std::ios::out | std::ios::binary);
395 if (f)
396 {
397 f.write(static_cast<const char*>(pShaderDebugInfo), shaderDebugInfoSize);
398 f.close();
399 }
400 }
401
403 const GFSDK_Aftermath_ShaderDebugInfoIdentifier& identifier ,
404 PFN_GFSDK_Aftermath_SetData setShaderDebugInfo
405 )
406 const
407 {
409
410 /**
411 * @brief Search the list of shader debug information blobs received earlier.
412 */
413 const auto i_debugInfo = m_ShaderDebugInfo.find(identifier);
414 if (i_debugInfo == m_ShaderDebugInfo.end())
415 {
416 /**
417 * @brief Early exit, nothing found. No need to call setShaderDebugInfo.
418 */
419 return;
420 }
421
422 /**
423 * @brief Let the GPU crash dump decoder know about the shader debug information
424 * that was found.
425 */
426 setShaderDebugInfo(i_debugInfo->second.data(), static_cast<uint32_t>(i_debugInfo->second.size()));
427 }
428
430 const GFSDK_Aftermath_ShaderBinaryHash& shaderHash ,
431 PFN_GFSDK_Aftermath_SetData setShaderBinary
432 )
433 const
434 {
436
437 /**
438 * @brief Find shader binary data for the shader hash in the shader database.
439 */
440 std::vector<uint8_t> shaderBinary;
441 if (!m_ShaderDataBase.FindShaderBinary(shaderHash, shaderBinary))
442 {
443 /**
444 * @brief Early exit, nothing found. No need to call setShaderBinary.
445 */
446 return;
447 }
448
449 /**
450 * @brief Let the GPU crash dump decoder know about the shader data
451 * that was found.
452 */
453 setShaderBinary(shaderBinary.data(), static_cast<uint32_t>(shaderBinary.size()));
454 }
455
457 const GFSDK_Aftermath_ShaderDebugName& shaderDebugName ,
458 PFN_GFSDK_Aftermath_SetData setShaderBinary
459 )
460 const
461 {
463
464 /**
465 * @brief Find source debug info for the shader DebugName in the shader database.
466 */
467 std::vector<uint8_t> shaderBinary;
468 if (!m_ShaderDataBase.FindShaderBinaryWithDebugData(shaderDebugName, shaderBinary))
469 {
470 /**
471 * @brief Early exit, nothing found. No need to call setShaderBinary.
472 */
473 return;
474 }
475
476 /**
477 * @brief Let the GPU crash dump decoder know about the shader debug data that was
478 * found.
479 */
480 setShaderBinary(shaderBinary.data(), static_cast<uint32_t>(shaderBinary.size()));
481 }
482
484 const void* pGpuCrashDump ,
485 const uint32_t gpuCrashDumpSize ,
486 void* pUserData
487 )
488 {
490
491 GpuCrashTracker* pGpuCrashTracker = static_cast<GpuCrashTracker*>(pUserData);
492 pGpuCrashTracker->OnCrashDump(pGpuCrashDump, gpuCrashDumpSize);
493 }
494
496 const void* pShaderDebugInfo ,
497 const uint32_t shaderDebugInfoSize ,
498 void* pUserData
499 )
500 {
502
503 GpuCrashTracker* pGpuCrashTracker = static_cast<GpuCrashTracker*>(pUserData);
504 pGpuCrashTracker->OnShaderDebugInfo(pShaderDebugInfo, shaderDebugInfoSize);
505 }
506
507 void GpuCrashTracker::CrashDumpDescriptionCallback(
508 PFN_GFSDK_Aftermath_AddGpuCrashDumpDescription addDescription ,
509 void* pUserData
510 )
511 {
513
514 const GpuCrashTracker* pGpuCrashTracker = static_cast<GpuCrashTracker*>(pUserData);
515 pGpuCrashTracker->OnDescription(addDescription);
516 }
517
519 const void* pMarkerData ,
520 const uint32_t markerDataSize ,
521 void* pUserData ,
522 void** ppResolvedMarkerData ,
523 uint32_t* pResolvedMarkerDataSize
524 )
525 {
527
528 GpuCrashTracker* pGpuCrashTracker = static_cast<GpuCrashTracker*>(pUserData);
529 pGpuCrashTracker->OnResolveMarker(pMarkerData, markerDataSize, ppResolvedMarkerData, pResolvedMarkerDataSize);
530 }
531
532 void GpuCrashTracker::ShaderDebugInfoLookupCallback(
533 const GFSDK_Aftermath_ShaderDebugInfoIdentifier* pIdentifier ,
534 PFN_GFSDK_Aftermath_SetData setShaderDebugInfo ,
535 void* pUserData
536 )
537 {
539
540 const GpuCrashTracker* pGpuCrashTracker = static_cast<GpuCrashTracker*>(pUserData);
541 pGpuCrashTracker->OnShaderDebugInfoLookup(*pIdentifier, setShaderDebugInfo);
542 }
543
544 void GpuCrashTracker::ShaderLookupCallback(
545 const GFSDK_Aftermath_ShaderBinaryHash* pShaderHash ,
546 PFN_GFSDK_Aftermath_SetData setShaderBinary ,
547 void* pUserData
548 )
549 {
551
552 const GpuCrashTracker* pGpuCrashTracker = static_cast<GpuCrashTracker*>(pUserData);
553 pGpuCrashTracker->OnShaderLookup(*pShaderHash, setShaderBinary);
554 }
555
556 void GpuCrashTracker::ShaderSourceDebugInfoLookupCallback(
557 const GFSDK_Aftermath_ShaderDebugName* pShaderDebugName ,
558 PFN_GFSDK_Aftermath_SetData setShaderBinary ,
559 void* pUserData
560 )
561 {
563
564 const GpuCrashTracker* pGpuCrashTracker = static_cast<GpuCrashTracker*>(pUserData);
565 pGpuCrashTracker->OnShaderSourceDebugInfoLookup(*pShaderDebugName, setShaderBinary);
566 }
567
568}
#define AFTERMATH_CHECK_ERROR(FC)
Helper macro for checking Nsight Aftermath results and throwing exception in case of a failure.
#define SPICES_PROFILE_ZONE
static void Init()
Create single instance of this class.
static constexpr unsigned int c_MarkerFrameHistory
keep four frames worth of marker history.
void WriteShaderDebugInformationToFile(GFSDK_Aftermath_ShaderDebugInfoIdentifier identifier, const void *pShaderDebugInfo, const uint32_t shaderDebugInfoSize) const
Helper for writing shader debug information to a file.
void SetMarker(uint64_t &markerId, const std::string &info)
Set Marker.
void OnShaderLookup(const GFSDK_Aftermath_ShaderBinaryHash &shaderHash, PFN_GFSDK_Aftermath_SetData setShaderBinary) const
Handler for shader lookup callbacks. This is used by the JSON decoder for mapping shader instruction ...
void SetFrameCut(uint64_t frameCut)
Set FrameCut.
virtual ~GpuCrashTracker()
Destructor Function.
static void AftermathDeviceLostCheck()
Aftermath handle device lost function.
void OnResolveMarker(const void *pMarkerData, const uint32_t markerDataSize, void **ppResolvedMarkerData, uint32_t *pResolvedMarkerDataSize)
Handler for app-managed marker resolve callback.
std::mutex m_Mutex
For thread-safe access of GPU crash tracker state.
static std::unique_ptr< GpuCrashTracker > m_GpuCrashTracker
GpuCrashTracker single instance.
static void ShaderDebugInfoCallback(const void *pShaderDebugInfo, const uint32_t shaderDebugInfoSize, void *pUserData)
Shader debug information callback.
bool m_Initialized
Is the GPU crash dump tracker initialized?
void OnCrashDump(const void *pGpuCrashDump, const uint32_t gpuCrashDumpSize)
Handler for GPU crash dump callbacks from Nsight Aftermath.
void OnShaderDebugInfo(const void *pShaderDebugInfo, const uint32_t shaderDebugInfoSize)
Handler for shader debug information callbacks.
void Initialize()
Initialize the GPU crash dump tracker.
static void GpuCrashDumpCallback(const void *pGpuCrashDump, const uint32_t gpuCrashDumpSize, void *pUserData)
GPU crash dump callback.
void OnShaderSourceDebugInfoLookup(const GFSDK_Aftermath_ShaderDebugName &shaderDebugName, PFN_GFSDK_Aftermath_SetData setShaderBinary) const
Handler for shader source debug info lookup callbacks. This is used by the JSON decoder for mapping s...
void WriteGpuCrashDumpToFile(const void *pGpuCrashDump, const uint32_t gpuCrashDumpSize)
Helper for writing a GPU crash dump to a file.
static void ResolveMarkerCallback(const void *pMarkerData, const uint32_t markerDataSize, void *pUserData, void **ppResolvedMarkerData, uint32_t *pResolvedMarkerDataSize)
App-managed marker resolve callback.
void OnShaderDebugInfoLookup(const GFSDK_Aftermath_ShaderDebugInfoIdentifier &identifier, PFN_GFSDK_Aftermath_SetData setShaderDebugInfo) const
Handler for shader debug information lookup callbacks. This is used by the JSON decoder for mapping s...
Implements GPU crash dump tracking using the Nsight Aftermath API.