SpiecsEngine
 
Loading...
Searching...
No Matches
NsightPerfGPUProfilerContinuous.cpp
Go to the documentation of this file.
1/**
2* @file NsightPerfGPUProfilerContinuous.cpp
3* @brief The NsightPerfGPUProfilerContinuous Class Implementation.
4* @author Spices
5*/
6
7#include "Pchheader.h"
10#include "Render/FrameInfo.h"
11
12#include <NvPerfCounterConfiguration.h>
13#include <NvPerfCpuMarkerTrace.h>
14#include <NvPerfVulkan.h>
15
16namespace Spices {
17
19
20 /**
21 * @brief Names of the metrics that Create() converts to eval requests and adds to the counter configuration. They are for demonstration purposes only.
22 * For a more comprehensive set of single-pass metrics, please refer to the 'HudConfigurations'.
23 */
24 const char* Metrics[] = {
25 "gpc__cycles_elapsed.avg.per_second",
26 "sys__cycles_elapsed.avg.per_second",
27 "lts__cycles_elapsed.avg.per_second",
28 };
29
31 : m_VulkanState(state)
32 , m_IsInSession(false)
34 {
36
37 /**
38 * @brief Create (begin) the sampler session during construction.
39 */
40 Create(state);
41
42 /**
43 * @brief End the session right after initialization,
44 * because a session cannot be owned by multiple instances.
45 */
46 Reset();
47 }
48
50 {
52
53 if (!m_NsightPerfGPUProfilerContinuous)
54 {
55 m_NsightPerfGPUProfilerContinuous = std::make_shared<NsightPerfGPUProfilerContinuous>(state);
56 }
57 }
58
60 {
62
63 //NSPERF_CHECK(nv::perf::InitializeNvPerf())
64 NSPERF_CHECK(nv::perf::VulkanIsNvidiaDevice(state.m_PhysicalDevice))
65 const size_t deviceIndex = nv::perf::VulkanGetNvperfDeviceIndex(state.m_Instance, state.m_PhysicalDevice, state.m_Device);
66
67 /**
68 * @brief Initialize the periodic sampler.
69 */
70 NSPERF_CHECK(sampler.Initialize(deviceIndex))
71 const nv::perf::DeviceIdentifiers deviceIdentifiers = sampler.GetDeviceIdentifiers();
72
73 /**
74 * @brief Create the metrics evaluator.
75 */
76 {
77 std::vector<uint8_t> metricsEvaluatorScratchBuffer;
78 NVPW_MetricsEvaluator* pMetricsEvaluator = nv::perf::sampler::DeviceCreateMetricsEvaluator(metricsEvaluatorScratchBuffer, deviceIdentifiers.pChipName);
79
80 /**
81 * @brief Transfer ownership to metricsEvaluator.
82 */
83 metricsEvaluator = nv::perf::MetricsEvaluator(pMetricsEvaluator, std::move(metricsEvaluatorScratchBuffer));
84 }
85
86 /**
87 * @brief Create the config builder, this is used to create a counter configuration.
88 */
89 nv::perf::MetricsConfigBuilder configBuilder;
90 {
91 NVPA_RawMetricsConfig* pRawMetricsConfig = nv::perf::sampler::DeviceCreateRawMetricsConfig(deviceIdentifiers.pChipName);
92
93 /**
94 * @brief transfer pRawMetricsConfig's ownership to configBuilder.
95 */
96 NSPERF_CHECK(configBuilder.Initialize(metricsEvaluator, pRawMetricsConfig, deviceIdentifiers.pChipName))
97 }
98
99 /**
100 * @brief Add metrics into config builder.
101 */
102 for (size_t ii = 0; ii < sizeof(Metrics) / sizeof(Metrics[0]); ++ii)
103 {
104 const char* const pMetric = Metrics[ii];
105 NVPW_MetricEvalRequest request{};
106 NSPERF_CHECK(ToMetricEvalRequest(metricsEvaluator, pMetric, request))
107
108 /**
109 * @brief By setting "keepInstances" to false, the counter data will only store GPU-level values, reducing its size and improving the performance of metric evaluation.
110 * However, this option has the drawback of making max/min submetrics non-evaluable.
111 */
112 constexpr bool keepInstances = false;
113 NSPERF_CHECK(configBuilder.AddMetrics(&request, 1, keepInstances))
114 metricEvalRequests.emplace_back(std::move(request));
115 }
116
117 /**
118 * @brief Create the counter configuration out of the config builder.
119 */
120 nv::perf::CounterConfiguration counterConfiguration;
121 NSPERF_CHECK(CreateConfiguration(configBuilder, counterConfiguration))
122
123 /**
124 * @brief Periodic sampler supports only single-pass configurations, meaning that all scheduled metrics must be collectable in a single pass.
125 */
126 assert(counterConfiguration.numPasses == 1);
127
128 /**
129 * @brief Initialize the counter data
130 * Below setting determines the maximum size of a counter data image. However, because the counter data here is requested to work in the ring buffer mode,
131 * when the put pointer reaches the end, it will start from the beginning and overwrite previous data even if it hasn't been read yet.
132 * Therefore, the size specified here must be sufficient to cover the latency.
133 */
134 constexpr uint32_t MaxSamples = 1024;
135 constexpr bool Validate = true; // Setting this to true enables extra validation, which is useful for debugging. In production environments, it can be set to false for improved performance.
136 NSPERF_CHECK(counterData.Initialize(
137 MaxSamples ,
138 Validate ,
139 [&](
140 uint32_t maxSamples ,
141 NVPW_PeriodicSampler_CounterData_AppendMode appendMode ,
142 std::vector<uint8_t>& counterData
143 )
144 {
145 return nv::perf::sampler::GpuPeriodicSamplerCreateCounterData(
146 deviceIndex ,
147 counterConfiguration.counterDataPrefix.data() ,
148 counterConfiguration.counterDataPrefix.size() ,
149 maxSamples ,
150 appendMode ,
151 counterData
152 );
153 }))
154
155 /**
156 * @brief Update the metrics evaluator with the actual device's attributes stored in the counter data.
157 */
158 NSPERF_CHECK(MetricsEvaluatorSetDeviceAttributes(
159 metricsEvaluator ,
160 counterData.GetCounterData().data() ,
161 counterData.GetCounterData().size()
162 ))
163
164 /**
165 * @brief Output the header in CSV format.
166 */
167 {
168 std::cout << "StartTime, EndTime, Duration";
169 const auto countersEnumerator = EnumerateCounters(metricsEvaluator);
170 const auto ratiosEnumerator = EnumerateRatios(metricsEvaluator);
171 const auto throughputsEnumerator = EnumerateThroughputs(metricsEvaluator);
172 for (const NVPW_MetricEvalRequest& metricEvalRequest : metricEvalRequests)
173 {
174 std::cout << ", " << ToString(countersEnumerator, ratiosEnumerator, throughputsEnumerator, metricEvalRequest);
175 }
176 std::cout << "\n";
177 }
178
179 /**
180 * @brief Start a periodic sampler session.
181 */
182 constexpr size_t SamplingFrequency = 120; // 120 Hz
183 constexpr size_t samplingIntervalInNanoSeconds = 1000 * 1000 * 1000 / SamplingFrequency;
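184 constexpr size_t MaxDecodeLatencyInNanoSeconds = 1000 * 1000 * 1000 * 10; // NOTE(review): the literal is 10 seconds in nanoseconds (not 1 second), and 1000 * 1000 * 1000 * 10 is evaluated in int before conversion to size_t, which overflows a 32-bit int - confirm the intended latency and widen the first operand (e.g. size_t{1000}).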
185 const nv::perf::sampler::GpuPeriodicSampler::GpuPulseSamplingInterval samplingInterval = sampler.GetGpuPulseSamplingInterval(samplingIntervalInNanoSeconds);
186 const size_t maxNumUndecodedSamples = MaxDecodeLatencyInNanoSeconds / samplingIntervalInNanoSeconds;
187 size_t recordBufferSize = 0;
188 NSPERF_CHECK(nv::perf::sampler::GpuPeriodicSamplerCalculateRecordBufferSize(deviceIndex, counterConfiguration.configImage, maxNumUndecodedSamples, recordBufferSize))
189
190 const size_t MaxNumUndecodedSamplingRanges = 1; // must be 1
191 NSPERF_CHECK(sampler.BeginSession(
192 recordBufferSize ,
193 MaxNumUndecodedSamplingRanges ,
194 { samplingInterval.triggerSource } ,
195 samplingInterval.samplingInterval
196 ))
197
198 /**
199 * @brief Apply the previously generated counter configuration to the periodic sampler.
200 */
201 constexpr size_t passIndex = 0; // This is a single-pass configuration, so the pass index is fixed at 0.
202 NSPERF_CHECK(sampler.SetConfig(counterConfiguration.configImage, passIndex))
203
204 /**
205 * @brief Start sampling.
206 * Ideally, sampling should only start right before executing the target workloads to prevent the record buffer from being occupied by records generated by GPU triggers before the target workloads.
207 * However, in this use case, it is acceptable because the trigger source is set to "NVPW_GPU_PERIODIC_SAMPLER_TRIGGER_SOURCE_GPU_ENGINE_TRIGGER", which doesn't automatically generate GPU triggers but
208 * relies on clients manually pushing triggers through the command list. Furthermore, since the metric configuration used is for low-speed sampling, no "overflow prevention records" will be emitted.
209 */
210 NSPERF_CHECK(sampler.StartSampling())
211
212 /**
213 * @brief Set InSession true.
214 */
215 m_IsInSession = true;
216 }
217
219 {
221
222 /**
223 * @brief Capture One frame.
224 */
226 {
227 Create(state);
229 }
230 }
231
233 {
235
236 if (!m_IsInSession) return;
237
238 nv::perf::sampler::GpuPeriodicSampler::GetRecordBufferStatusParams getRecordBufferStatusParams = {};
239 getRecordBufferStatusParams.queryOverflow = true;
240 getRecordBufferStatusParams.queryNumUnreadBytes = true;
241 const bool success = sampler.GetRecordBufferStatus(getRecordBufferStatusParams);
242 if (!success)
243 {
244 return;
245 }
246 if (getRecordBufferStatusParams.overflow)
247 {
248 SPICES_CORE_ERROR("Record buffer has overflowed. Please ensure that the value of `maxNumUndecodedSamples` is sufficiently large.")
249 }
250
251 if (getRecordBufferStatusParams.numUnreadBytes == 0)
252 {
253 return;
254 }
255
256 /**
257 * @brief Decode the record buffer and store the decoded counters into the counter data.
258 */
259 NVPW_GPU_PeriodicSampler_DecodeStopReason decodeStopReason = NVPW_GPU_PERIODIC_SAMPLER_DECODE_STOP_REASON_OTHER;
260 size_t numSamplesMerged = 0;
261 size_t numBytesConsumed = 0;
262 if (!sampler.DecodeCounters(
263 counterData.GetCounterData() ,
264 getRecordBufferStatusParams.numUnreadBytes ,
265 decodeStopReason ,
266 numSamplesMerged ,
267 numBytesConsumed
268 ))
269 {
270 SPICES_CORE_WARN("Failed to decode counters.")
271 }
272
273 if (numSamplesMerged)
274 {
275 SPICES_CORE_WARN("Samples appear to be merged, this can reduce the accuracy of the collected samples. Please check for any back-to-back triggers!")
276 }
277 if (decodeStopReason != NVPW_GPU_PERIODIC_SAMPLER_DECODE_STOP_REASON_ALL_GIVEN_BYTES_READ)
278 {
279 SPICES_CORE_WARN("DecodeCounters stopped unexpectedly.")
280 }
281 if (!sampler.AcknowledgeRecordBuffer(numBytesConsumed))
282 {
283 SPICES_CORE_WARN("Failed to acknowledge record buffer")
284 }
285 if (!counterData.UpdatePut())
286 {
287 SPICES_CORE_WARN("Failed to update counter data's put pointer.")
288 }
289
290 const uint32_t numUnreadRanges = counterData.GetNumUnreadRanges();
291 if (numUnreadRanges)
292 {
293 std::vector<double> metricValues(metricEvalRequests.size());
294 uint32_t numRangesConsumed = 0;
295 NSPERF_CHECK(counterData.ConsumeData(
296 [&](
297 const uint8_t* pCounterDataImage ,
298 size_t counterDataImageSize ,
299 uint32_t rangeIndex ,
300 bool& stop
301 )
302 {
303 nv::perf::sampler::SampleTimestamp timestamp{};
304 if (!CounterDataGetSampleTime(pCounterDataImage, rangeIndex, timestamp))
305 {
306 return false;
307 }
308
309 if (!nv::perf::EvaluateToGpuValues(
310 metricsEvaluator,
311 pCounterDataImage,
312 counterDataImageSize,
313 rangeIndex,
314 metricEvalRequests.size(),
315 metricEvalRequests.data(),
316 metricValues.data()))
317 {
318 return false;
319 }
320 {
321 std::cout << std::fixed << std::setprecision(0) << timestamp.start << ", " << timestamp.end << ", " << (timestamp.end - timestamp.start);
322 for (const double& metricValue : metricValues)
323 {
324 std::cout << ", " << metricValue;
325 }
326 std::cout << "\n";
327 }
328 if (++numRangesConsumed == numUnreadRanges)
329 {
330 /**
331 * @brief Inform counter data to stop iterating because all existing data has been consumed.
332 */
333 stop = true;
334 }
335 return true;
336 }))
337
338 std::cout << std::flush;
339 if (!counterData.UpdateGet(numRangesConsumed))
340 {
341 SPICES_CORE_WARN("Counter data failed to update get pointer.")
342 }
343 }
344
345 /**
346 * @brief Reset (stop sampling and end the session) after the samples have been output.
347 */
348 Reset();
349 }
350
352 {
354
356 }
357
359 {
361
362 if(!m_IsInSession) return;
363
364 NSPERF_CHECK(sampler.StopSampling())
365 NSPERF_CHECK(sampler.EndSession())
366 sampler.Reset();
367
368 m_IsInSession = false;
369 }
370}
#define NSPERF_CHECK(val)
#define SPICES_PROFILE_ZONE
static void CreateInstance(VulkanState &state)
Create this Single Instance.
NsightPerfGPUProfilerContinuous(VulkanState &state)
Constructor Function.
void Create(VulkanState &state)
Begin this Session.
bool m_EnableCaptureNextFrame
True if want capture next frame.
static std::shared_ptr< NsightPerfGPUProfilerContinuous > m_NsightPerfGPUProfilerContinuous
This Single Instance.
void BeginFrame(VulkanState &state)
Begin a frame.
Wrapper of Nvidia Nsight Performance Metrics.
const char * Metrics[]
The following metrics are for demonstration purposes only. For a more comprehensive set of single-pas...
This struct contains all Vulkan object in used global.
Definition VulkanUtils.h:74