NVAPI Reference Documentation 545
nvHLSLExtns.h
1 /************************************************************************************************************************************\
2|* *|
3|* Copyright © 2012 NVIDIA Corporation. All rights reserved. *|
4|* *|
5|* NOTICE TO USER: *|
6|* *|
7|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *|
8|* *|
9|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *|
10|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *|
11|* Otherwise, you have no rights to use or access this software in any manner. *|
12|* *|
13|* If not covered by the applicable NVIDIA software license agreement: *|
14|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *|
15|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *|
16|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *|
17|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. *|
18|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *|
19|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *|
20|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *|
21|* *|
22|* U.S. Government End Users. *|
23|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *|
24|* consisting of "commercial computer software" and "commercial computer software documentation" *|
25|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *|
26|* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *|
27|* all U.S. Government End Users acquire the software with only those rights set forth herein. *|
28|* *|
29|* Any use of this software in individual and commercial software must include, *|
30|* in the user documentation and internal comments to the code, *|
31|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. *|
32|* *|
33 \************************************************************************************************************************************/
34
36
37// This file is to be #included in the application's HLSL shader code to make
38// use of NVIDIA shader extensions.
39
40
41#include "nvHLSLExtnsInternal.h"
42
43//----------------------------------------------------------------------------//
44//------------------------- Warp Shuffle Functions ---------------------------//
45//----------------------------------------------------------------------------//
46
47// All functions have variants with a width parameter, which permits sub-division
48// of the warp into segments - for example to exchange data between 4 groups of
49// 8 lanes in a SIMD manner. If width is less than warpSize then each subsection
50// of the warp behaves as a separate entity with a starting logical lane ID of 0.
51// A thread may only exchange data with others in its own subsection. Width must
52// have a value which is a power of 2 so that the warp can be subdivided equally;
53// results are undefined if width is not a power of 2, or is a number greater
54// than warpSize.
55
56//
57// simple variant of SHFL instruction
58// returns val from the specified lane
59// optional width parameter must be a power of two and width <= 32
60//
61int NvShfl(int val, uint srcLane, int width = NV_WARP_SIZE)
62{
63 uint index = g_NvidiaExt.IncrementCounter();
64 g_NvidiaExt[index].src0u.x = val; // variable to be shuffled
65 g_NvidiaExt[index].src0u.y = srcLane; // source lane
66 g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width);
67 g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL;
68
69 // result is returned as the return value of IncrementCounter on fake UAV slot
70 return g_NvidiaExt.IncrementCounter();
71}
72
73int2 NvShfl(int2 val, uint srcLane, int width = NV_WARP_SIZE)
74{
75 int x = NvShfl(val.x, srcLane, width);
76 int y = NvShfl(val.y, srcLane, width);
77 return int2(x, y);
78}
79
80int4 NvShfl(int4 val, uint srcLane, int width = NV_WARP_SIZE)
81{
82 int x = NvShfl(val.x, srcLane, width);
83 int y = NvShfl(val.y, srcLane, width);
84 int z = NvShfl(val.z, srcLane, width);
85 int w = NvShfl(val.w, srcLane, width);
86 return int4(x, y, z, w);
87}
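
// [Editor's note] Usage sketch, not part of the original header: a minimal
// warp-wide broadcast built on NvShfl. It assumes the NVAPI extension slot has
// been configured by the application; the helper name is illustrative only.
int NvShflBroadcastExample(int laneValue)
{
    // every lane reads the copy of laneValue held by logical lane 0;
    // passing a width of 8 instead would broadcast within each group of 8 lanes
    return NvShfl(laneValue, 0);
}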
88
89//
90// Copy from a lane with lower ID relative to caller
91//
92int NvShflUp(int val, uint delta, int width = NV_WARP_SIZE)
93{
94 uint index = g_NvidiaExt.IncrementCounter();
95 g_NvidiaExt[index].src0u.x = val; // variable to be shuffled
96 g_NvidiaExt[index].src0u.y = delta; // relative lane offset
97 g_NvidiaExt[index].src0u.z = (NV_WARP_SIZE - width) << 8; // minIndex = maxIndex for shfl_up (src2[4:0] is expected to be 0)
98 g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_UP;
99 return g_NvidiaExt.IncrementCounter();
100}
101
102//
103// Copy from a lane with higher ID relative to caller
104//
105int NvShflDown(int val, uint delta, int width = NV_WARP_SIZE)
106{
107 uint index = g_NvidiaExt.IncrementCounter();
108 g_NvidiaExt[index].src0u.x = val; // variable to be shuffled
109 g_NvidiaExt[index].src0u.y = delta; // relative lane offset
110 g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width);
111 g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_DOWN;
112 return g_NvidiaExt.IncrementCounter();
113}
114
115//
116// Copy from a lane based on bitwise XOR of own lane ID
117//
118int NvShflXor(int val, uint laneMask, int width = NV_WARP_SIZE)
119{
120 uint index = g_NvidiaExt.IncrementCounter();
121 g_NvidiaExt[index].src0u.x = val; // variable to be shuffled
122 g_NvidiaExt[index].src0u.y = laneMask; // laneMask to be XOR'ed with current laneId to get the source lane id
123 g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width);
124 g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_XOR;
125 return g_NvidiaExt.IncrementCounter();
126}
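
// [Editor's note] Usage sketch, not part of the original header: NvShflXor is
// commonly used for a butterfly reduction across the warp. This sketch assumes
// all NV_WARP_SIZE (32) lanes are active; the helper name is illustrative.
int NvWarpReduceSumExample(int laneValue)
{
    // XOR distances 16, 8, 4, 2, 1 - every lane ends up with the warp-wide sum
    for (uint xorMask = NV_WARP_SIZE / 2; xorMask > 0; xorMask >>= 1)
        laneValue += NvShflXor(laneValue, xorMask);
    return laneValue;
}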
127
128
129//----------------------------------------------------------------------------//
130//----------------------------- Warp Vote Functions---------------------------//
131//----------------------------------------------------------------------------//
132
133// returns 0xFFFFFFFF if the predicate is true for any thread in the warp, returns 0 otherwise
134uint NvAny(int predicate)
135{
136 uint index = g_NvidiaExt.IncrementCounter();
137 g_NvidiaExt[index].src0u.x = predicate;
138 g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_ANY;
139 return g_NvidiaExt.IncrementCounter();
140}
141
142// returns 0xFFFFFFFF if the predicate is true for ALL threads in the warp, returns 0 otherwise
143uint NvAll(int predicate)
144{
145 uint index = g_NvidiaExt.IncrementCounter();
146 g_NvidiaExt[index].src0u.x = predicate;
147 g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_ALL;
148 return g_NvidiaExt.IncrementCounter();
149}
150
151// returns a mask of all threads in the warp with bits set for threads that have predicate true
152uint NvBallot(int predicate)
153{
154 uint index = g_NvidiaExt.IncrementCounter();
155 g_NvidiaExt[index].src0u.x = predicate;
156 g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_BALLOT;
157 return g_NvidiaExt.IncrementCounter();
158}
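
// [Editor's note] Usage sketch, not part of the original header: since NvBallot
// returns one bit per lane, countbits() on the result counts the lanes for
// which the predicate holds. The helper name is illustrative.
uint NvCountLanesExample(int predicate)
{
    return countbits(NvBallot(predicate));
}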
159
160
161//----------------------------------------------------------------------------//
162//----------------------------- Utility Functions ----------------------------//
163//----------------------------------------------------------------------------//
164
165// returns the lane index of the current thread (thread index in warp)
166int NvGetLaneId()
167{
168 uint index = g_NvidiaExt.IncrementCounter();
169 g_NvidiaExt[index].opcode = NV_EXTN_OP_GET_LANE_ID;
170 return g_NvidiaExt.IncrementCounter();
171}
172
173// returns the value of a special register - specify the subopcode using one of the NV_SPECIALOP_* values defined in nvShaderExtnEnums.h; other subopcodes result in undefined behavior
174uint NvGetSpecial(uint subOpCode)
175{
176 return __NvGetSpecial(subOpCode);
177}
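
// [Editor's note] Usage sketch, not part of the original header: for example,
// the mask of lower-numbered lanes used by the multi-prefix helpers later in
// this file can be read with the NV_SPECIALOP_THREADLTMASK subopcode from
// nvShaderExtnEnums.h. The helper name is illustrative.
uint NvThreadLtMaskExample()
{
    return NvGetSpecial(NV_SPECIALOP_THREADLTMASK);
}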
178
179//----------------------------------------------------------------------------//
180//----------------------------- FP16 Atomic Functions-------------------------//
181//----------------------------------------------------------------------------//
182
183// The functions below perform atomic operations on two consecutive fp16
184// values in the given raw UAV.
185// The uint parameter 'fp16x2Val' is treated as two fp16 values; byteAddress must be a multiple of 4.
186// The returned value is the two fp16 values packed into a single uint.
187
188uint NvInterlockedAddFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
189{
190 return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_ADD);
191}
192
193uint NvInterlockedMinFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
194{
195 return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_MIN);
196}
197
198uint NvInterlockedMaxFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
199{
200 return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_MAX);
201}
202
203
204// versions of the above functions taking two fp32 values (internally converted to fp16 values)
205uint NvInterlockedAddFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
206{
207 return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
208}
209
210uint NvInterlockedMinFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
211{
212 return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
213}
214
215uint NvInterlockedMaxFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
216{
217 return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
218}
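
// [Editor's note] Usage sketch, not part of the original header: accumulating a
// float2 into element 'elementIndex' of a raw UAV holding packed fp16x2 values.
// The buffer name, register and 4-byte stride are assumptions of the sketch;
// the multiple-of-4 byteAddress requirement comes from the comment above.
RWByteAddressBuffer gFp16AccumBuf : register(u1); // placeholder binding

void NvAccumulateFp16x2Example(uint elementIndex, float2 contribution)
{
    // each packed fp16x2 element occupies 4 bytes
    NvInterlockedAddFp16x2(gFp16AccumBuf, elementIndex * 4, contribution);
}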
219
220
221//----------------------------------------------------------------------------//
222
223// The functions below perform atomic operations on a R16G16_FLOAT UAV at the given address.
224// The uint parameter 'fp16x2Val' is treated as two fp16 values.
225// The returned value is the two fp16 values (.x and .y components) packed into a single uint.
226// Warning: Behavior of this set of functions is undefined if the UAV is not
227// of R16G16_FLOAT format (might result in an app crash or TDR).
228
229uint NvInterlockedAddFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
230{
231 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
232}
233
234uint NvInterlockedMinFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
235{
236 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
237}
238
239uint NvInterlockedMaxFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
240{
241 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
242}
243
244uint NvInterlockedAddFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
245{
246 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
247}
248
249uint NvInterlockedMinFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
250{
251 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
252}
253
254uint NvInterlockedMaxFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
255{
256 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
257}
258
259uint NvInterlockedAddFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
260{
261 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
262}
263
264uint NvInterlockedMinFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
265{
266 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
267}
268
269uint NvInterlockedMaxFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
270{
271 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
272}
273
274
275// versions taking two fp32 values (internally converted to fp16)
276uint NvInterlockedAddFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
277{
278 return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
279}
280
281uint NvInterlockedMinFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
282{
283 return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
284}
285
286uint NvInterlockedMaxFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
287{
288 return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
289}
290
291uint NvInterlockedAddFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
292{
293 return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
294}
295
296uint NvInterlockedMinFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
297{
298 return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
299}
300
301uint NvInterlockedMaxFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
302{
303 return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
304}
305
306uint NvInterlockedAddFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
307{
308 return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
309}
310
311uint NvInterlockedMinFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
312{
313 return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
314}
315
316uint NvInterlockedMaxFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
317{
318 return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
319}
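
// [Editor's note] Usage sketch, not part of the original header: adding two
// moments into a UAV that must be of R16G16_FLOAT format (see the warning
// above). Resource name and register are placeholders.
RWTexture2D<float2> gMomentsUav : register(u2); // placeholder binding

void NvAddMomentsExample(uint2 pixel, float2 moments)
{
    NvInterlockedAddFp16x2(gMomentsUav, pixel, moments);
}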
320
321
322//----------------------------------------------------------------------------//
323
324// The functions below perform atomic operations on a R16G16B16A16_FLOAT UAV at the given address.
325// The uint2 parameter 'fp16x2Val' is treated as four fp16 values,
326// i.e., fp16x2Val.x holds uav.xy and fp16x2Val.y holds uav.zw.
327// The returned value is the four fp16 values (.xyzw components) packed into a uint2.
328// Warning: Behavior of this set of functions is undefined if the UAV is not
329// of R16G16B16A16_FLOAT format (might result in an app crash or TDR).
330
331uint2 NvInterlockedAddFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
332{
333 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
334}
335
336uint2 NvInterlockedMinFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
337{
338 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
339}
340
341uint2 NvInterlockedMaxFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
342{
343 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
344}
345
346uint2 NvInterlockedAddFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
347{
348 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
349}
350
351uint2 NvInterlockedMinFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
352{
353 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
354}
355
356uint2 NvInterlockedMaxFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
357{
358 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
359}
360
361uint2 NvInterlockedAddFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
362{
363 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
364}
365
366uint2 NvInterlockedMinFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
367{
368 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
369}
370
371uint2 NvInterlockedMaxFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
372{
373 return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
374}
375
376// versions taking four fp32 values (internally converted to fp16)
377uint2 NvInterlockedAddFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
378{
379 return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
380}
381
382uint2 NvInterlockedMinFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
383{
384 return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
385}
386
387uint2 NvInterlockedMaxFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
388{
389 return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
390}
391
392uint2 NvInterlockedAddFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
393{
394 return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
395}
396
397uint2 NvInterlockedMinFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
398{
399 return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
400}
401
402uint2 NvInterlockedMaxFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
403{
404 return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
405}
406
407uint2 NvInterlockedAddFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
408{
409 return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
410}
411
412uint2 NvInterlockedMinFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
413{
414 return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
415}
416
417uint2 NvInterlockedMaxFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
418{
419 return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
420}
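
// [Editor's note] Usage sketch, not part of the original header: the float4
// overload converts the four components to fp16 internally; the UAV must be of
// R16G16B16A16_FLOAT format. Resource name and register are placeholders.
RWTexture2D<float4> gRadianceUav : register(u3); // placeholder binding

void NvAddRadianceExample(uint2 pixel, float4 radiance)
{
    NvInterlockedAddFp16x4(gRadianceUav, pixel, radiance);
}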
421
422
423//----------------------------------------------------------------------------//
424//----------------------------- FP32 Atomic Functions-------------------------//
425//----------------------------------------------------------------------------//
426
427// The function below performs an atomic add on the given UAV, treating the value as a float.
428// byteAddress must be a multiple of 4.
429// The returned value is the value present at the memory location before the atomic add.
430
431float NvInterlockedAddFp32(RWByteAddressBuffer uav, uint byteAddress, float val)
432{
433 return __NvAtomicAddFP32(uav, byteAddress, val);
434}
435
436//----------------------------------------------------------------------------//
437
438// The functions below perform an atomic add on a R32_FLOAT UAV at the given address.
439// The returned value is the value before performing the atomic add.
440// Warning: Behavior of this set of functions is undefined if the UAV is not
441// of R32_FLOAT format (might result in an app crash or TDR).
442
443float NvInterlockedAddFp32(RWTexture1D<float> uav, uint address, float val)
444{
445 return __NvAtomicAddFP32(uav, address, val);
446}
447
448float NvInterlockedAddFp32(RWTexture2D<float> uav, uint2 address, float val)
449{
450 return __NvAtomicAddFP32(uav, address, val);
451}
452
453float NvInterlockedAddFp32(RWTexture3D<float> uav, uint3 address, float val)
454{
455 return __NvAtomicAddFP32(uav, address, val);
456}
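
// [Editor's note] Usage sketch, not part of the original header: a float atomic
// add into a R32_FLOAT UAV; the previous contents are returned, which can be
// useful for first-writer detection. Names are placeholders.
RWTexture2D<float> gLuminanceUav : register(u4); // placeholder binding

float NvAccumulateLuminanceExample(uint2 pixel, float lum)
{
    // returns the value that was in memory before the add
    return NvInterlockedAddFp32(gLuminanceUav, pixel, lum);
}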
457
458
459//----------------------------------------------------------------------------//
460//--------------------------- UINT64 Atomic Functions-------------------------//
461//----------------------------------------------------------------------------//
462
463// The functions below perform atomic operations on the given UAV, treating the value as a uint64.
464// byteAddress must be a multiple of 8.
465// The returned value is the value present at the memory location before the atomic operation.
466// The uint2 vector type is used to represent a single uint64 value, with the x component containing the low 32 bits and the y component the high 32 bits.
467
468uint2 NvInterlockedAddUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
469{
470 return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_ADD);
471}
472
473uint2 NvInterlockedMaxUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
474{
475 return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_MAX);
476}
477
478uint2 NvInterlockedMinUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
479{
480 return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_MIN);
481}
482
483uint2 NvInterlockedAndUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
484{
485 return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_AND);
486}
487
488uint2 NvInterlockedOrUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
489{
490 return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_OR);
491}
492
493uint2 NvInterlockedXorUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
494{
495 return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_XOR);
496}
497
498uint2 NvInterlockedCompareExchangeUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 compare_value, uint2 value)
499{
500 return __NvAtomicCompareExchangeUINT64(uav, byteAddress, compare_value, value);
501}
502
503uint2 NvInterlockedExchangeUint64(RWByteAddressBuffer uav, uint byteAddress, uint2 value)
504{
505 return __NvAtomicOpUINT64(uav, byteAddress, value, NV_EXTN_ATOM_SWAP);
506}
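
// [Editor's note] Usage sketch, not part of the original header: a 64-bit
// counter stored in the first 8 bytes of a raw UAV, packed as described above
// (x = low 32 bits, y = high 32 bits). Buffer name and register are placeholders.
RWByteAddressBuffer gCounters64 : register(u5); // placeholder binding

uint2 NvIncrementCounter64Example()
{
    // add 1 to the uint64 at byte offset 0 and return the previous value
    return NvInterlockedAddUint64(gCounters64, 0, uint2(1, 0));
}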
507
508//----------------------------------------------------------------------------//
509
510// The functions below perform atomic operations on a R32G32_UINT UAV at the given address, treating the value as a uint64.
511// The returned value is the value before performing the atomic operation.
512// The uint2 vector type is used to represent a single uint64 value, with the x component containing the low 32 bits and the y component the high 32 bits.
513// Warning: Behavior of this set of functions is undefined if the UAV is not of R32G32_UINT format (might result in an app crash or TDR).
514
515uint2 NvInterlockedAddUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
516{
517 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_ADD);
518}
519
520uint2 NvInterlockedMaxUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
521{
522 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MAX);
523}
524
525uint2 NvInterlockedMinUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
526{
527 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MIN);
528}
529
530uint2 NvInterlockedAndUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
531{
532 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_AND);
533}
534
535uint2 NvInterlockedOrUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
536{
537 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_OR);
538}
539
540uint2 NvInterlockedXorUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
541{
542 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_XOR);
543}
544
545uint2 NvInterlockedCompareExchangeUint64(RWTexture1D<uint2> uav, uint address, uint2 compare_value, uint2 value)
546{
547 return __NvAtomicCompareExchangeUINT64(uav, address, compare_value, value);
548}
549
550uint2 NvInterlockedExchangeUint64(RWTexture1D<uint2> uav, uint address, uint2 value)
551{
552 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_SWAP);
553}
554
555uint2 NvInterlockedAddUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
556{
557 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_ADD);
558}
559
560uint2 NvInterlockedMaxUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
561{
562 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MAX);
563}
564
565uint2 NvInterlockedMinUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
566{
567 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MIN);
568}
569
570uint2 NvInterlockedAndUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
571{
572 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_AND);
573}
574
575uint2 NvInterlockedOrUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
576{
577 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_OR);
578}
579
580uint2 NvInterlockedXorUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
581{
582 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_XOR);
583}
584
585uint2 NvInterlockedCompareExchangeUint64(RWTexture2D<uint2> uav, uint2 address, uint2 compare_value, uint2 value)
586{
587 return __NvAtomicCompareExchangeUINT64(uav, address, compare_value, value);
588}
589
590uint2 NvInterlockedExchangeUint64(RWTexture2D<uint2> uav, uint2 address, uint2 value)
591{
592 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_SWAP);
593}
594
595uint2 NvInterlockedAddUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
596{
597 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_ADD);
598}
599
600uint2 NvInterlockedMaxUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
601{
602 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MAX);
603}
604
605uint2 NvInterlockedMinUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
606{
607 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_MIN);
608}
609
610uint2 NvInterlockedAndUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
611{
612 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_AND);
613}
614
615uint2 NvInterlockedOrUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
616{
617 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_OR);
618}
619
620uint2 NvInterlockedXorUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
621{
622 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_XOR);
623}
624
625uint2 NvInterlockedCompareExchangeUint64(RWTexture3D<uint2> uav, uint3 address, uint2 compare_value, uint2 value)
626{
627 return __NvAtomicCompareExchangeUINT64(uav, address, compare_value, value);
628}
629
630uint2 NvInterlockedExchangeUint64(RWTexture3D<uint2> uav, uint3 address, uint2 value)
631{
632 return __NvAtomicOpUINT64(uav, address, value, NV_EXTN_ATOM_SWAP);
633}
634
635//----------------------------------------------------------------------------//
636//--------------------------- VPRS functions ---------------------------------//
637//----------------------------------------------------------------------------//
638
639// Returns the shading rate and the number of per-pixel shading passes for the current VPRS pixel
640uint3 NvGetShadingRate()
641{
642 uint3 shadingRate = (uint3)0;
643 uint index = g_NvidiaExt.IncrementCounter();
644 g_NvidiaExt[index].opcode = NV_EXTN_OP_GET_SHADING_RATE;
645 g_NvidiaExt[index].numOutputsForIncCounter = 3;
646 shadingRate.x = g_NvidiaExt.IncrementCounter();
647 shadingRate.y = g_NvidiaExt.IncrementCounter();
648 shadingRate.z = g_NvidiaExt.IncrementCounter();
649 return shadingRate;
650}
651
652float NvEvaluateAttributeAtSampleForVPRS(float attrib, uint sampleIndex, int2 pixelOffset)
653{
654 float value = (float)0;
655 uint ext = g_NvidiaExt.IncrementCounter();
656 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
657 g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
658 g_NvidiaExt[ext].src1u.x = sampleIndex;
659 g_NvidiaExt[ext].src2u.xy = pixelOffset;
660 g_NvidiaExt[ext].numOutputsForIncCounter = 1;
661 value.x = asfloat(g_NvidiaExt.IncrementCounter());
662 return value;
663}
664
665float2 NvEvaluateAttributeAtSampleForVPRS(float2 attrib, uint sampleIndex, int2 pixelOffset)
666{
667 float2 value = (float2)0;
668 uint ext = g_NvidiaExt.IncrementCounter();
669 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
670 g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
671 g_NvidiaExt[ext].src1u.x = sampleIndex;
672 g_NvidiaExt[ext].src2u.xy = pixelOffset;
673 g_NvidiaExt[ext].numOutputsForIncCounter = 2;
674 value.x = asfloat(g_NvidiaExt.IncrementCounter());
675 value.y = asfloat(g_NvidiaExt.IncrementCounter());
676 return value;
677}
678
679float3 NvEvaluateAttributeAtSampleForVPRS(float3 attrib, uint sampleIndex, int2 pixelOffset)
680{
681 float3 value = (float3)0;
682 uint ext = g_NvidiaExt.IncrementCounter();
683 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
684 g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
685 g_NvidiaExt[ext].src1u.x = sampleIndex;
686 g_NvidiaExt[ext].src2u.xy = pixelOffset;
687 g_NvidiaExt[ext].numOutputsForIncCounter = 3;
688 value.x = asfloat(g_NvidiaExt.IncrementCounter());
689 value.y = asfloat(g_NvidiaExt.IncrementCounter());
690 value.z = asfloat(g_NvidiaExt.IncrementCounter());
691 return value;
692}
693
694float4 NvEvaluateAttributeAtSampleForVPRS(float4 attrib, uint sampleIndex, int2 pixelOffset)
695{
696 float4 value = (float4)0;
697 uint ext = g_NvidiaExt.IncrementCounter();
698 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
699 g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
700 g_NvidiaExt[ext].src1u.x = sampleIndex;
701 g_NvidiaExt[ext].src2u.xy = pixelOffset;
702 g_NvidiaExt[ext].numOutputsForIncCounter = 4;
703 value.x = asfloat(g_NvidiaExt.IncrementCounter());
704 value.y = asfloat(g_NvidiaExt.IncrementCounter());
705 value.z = asfloat(g_NvidiaExt.IncrementCounter());
706 value.w = asfloat(g_NvidiaExt.IncrementCounter());
707 return value;
708}
709
710int NvEvaluateAttributeAtSampleForVPRS(int attrib, uint sampleIndex, int2 pixelOffset)
711{
712 int value = (int)0;
713 uint ext = g_NvidiaExt.IncrementCounter();
714 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
715 g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
716 g_NvidiaExt[ext].src1u.x = sampleIndex;
717 g_NvidiaExt[ext].src2u.xy = pixelOffset;
718 g_NvidiaExt[ext].numOutputsForIncCounter = 1;
719 value.x = asint(g_NvidiaExt.IncrementCounter());
720 return value;
721}
722
723int2 NvEvaluateAttributeAtSampleForVPRS(int2 attrib, uint sampleIndex, int2 pixelOffset)
724{
725 int2 value = (int2)0;
726 uint ext = g_NvidiaExt.IncrementCounter();
727 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
728 g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
729 g_NvidiaExt[ext].src1u.x = sampleIndex;
730 g_NvidiaExt[ext].src2u.xy = pixelOffset;
731 g_NvidiaExt[ext].numOutputsForIncCounter = 2;
732 value.x = asint(g_NvidiaExt.IncrementCounter());
733 value.y = asint(g_NvidiaExt.IncrementCounter());
734 return value;
735}
736
737int3 NvEvaluateAttributeAtSampleForVPRS(int3 attrib, uint sampleIndex, int2 pixelOffset)
738{
739 int3 value = (int3)0;
740 uint ext = g_NvidiaExt.IncrementCounter();
741 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
742 g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
743 g_NvidiaExt[ext].src1u.x = sampleIndex;
744 g_NvidiaExt[ext].src2u.xy = pixelOffset;
745 g_NvidiaExt[ext].numOutputsForIncCounter = 3;
746 value.x = asint(g_NvidiaExt.IncrementCounter());
747 value.y = asint(g_NvidiaExt.IncrementCounter());
748 value.z = asint(g_NvidiaExt.IncrementCounter());
749 return value;
750}
751
752int4 NvEvaluateAttributeAtSampleForVPRS(int4 attrib, uint sampleIndex, int2 pixelOffset)
753{
754 int4 value = (int4)0;
755 uint ext = g_NvidiaExt.IncrementCounter();
756 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
757 g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
758 g_NvidiaExt[ext].src1u.x = sampleIndex;
759 g_NvidiaExt[ext].src2u.xy = pixelOffset;
760 g_NvidiaExt[ext].numOutputsForIncCounter = 4;
761 value.x = asint(g_NvidiaExt.IncrementCounter());
762 value.y = asint(g_NvidiaExt.IncrementCounter());
763 value.z = asint(g_NvidiaExt.IncrementCounter());
764 value.w = asint(g_NvidiaExt.IncrementCounter());
765 return value;
766}
767
768uint NvEvaluateAttributeAtSampleForVPRS(uint attrib, uint sampleIndex, int2 pixelOffset)
769{
770 uint value = (uint)0;
771 uint ext = g_NvidiaExt.IncrementCounter();
772 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
773 g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
774 g_NvidiaExt[ext].src1u.x = sampleIndex;
775 g_NvidiaExt[ext].src2u.xy = pixelOffset;
776 g_NvidiaExt[ext].numOutputsForIncCounter = 1;
777 value.x = asuint(g_NvidiaExt.IncrementCounter());
778 return value;
779}
780
781uint2 NvEvaluateAttributeAtSampleForVPRS(uint2 attrib, uint sampleIndex, int2 pixelOffset)
782{
783 uint2 value = (uint2)0;
784 uint ext = g_NvidiaExt.IncrementCounter();
785 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
786 g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
787 g_NvidiaExt[ext].src1u.x = sampleIndex;
788 g_NvidiaExt[ext].src2u.xy = pixelOffset;
789 g_NvidiaExt[ext].numOutputsForIncCounter = 2;
790 value.x = asuint(g_NvidiaExt.IncrementCounter());
791 value.y = asuint(g_NvidiaExt.IncrementCounter());
792 return value;
793}
794
795uint3 NvEvaluateAttributeAtSampleForVPRS(uint3 attrib, uint sampleIndex, int2 pixelOffset)
796{
797 uint3 value = (uint3)0;
798 uint ext = g_NvidiaExt.IncrementCounter();
799 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
800 g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
801 g_NvidiaExt[ext].src1u.x = sampleIndex;
802 g_NvidiaExt[ext].src2u.xy = pixelOffset;
803 g_NvidiaExt[ext].numOutputsForIncCounter = 3;
804 value.x = asuint(g_NvidiaExt.IncrementCounter());
805 value.y = asuint(g_NvidiaExt.IncrementCounter());
806 value.z = asuint(g_NvidiaExt.IncrementCounter());
807 return value;
808}
809
810uint4 NvEvaluateAttributeAtSampleForVPRS(uint4 attrib, uint sampleIndex, int2 pixelOffset)
811{
812 uint4 value = (uint4)0;
813 uint ext = g_NvidiaExt.IncrementCounter();
814 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_AT_SAMPLE;
815 g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
816 g_NvidiaExt[ext].src1u.x = sampleIndex;
817 g_NvidiaExt[ext].src2u.xy = pixelOffset;
818 g_NvidiaExt[ext].numOutputsForIncCounter = 4;
819 value.x = asuint(g_NvidiaExt.IncrementCounter());
820 value.y = asuint(g_NvidiaExt.IncrementCounter());
821 value.z = asuint(g_NvidiaExt.IncrementCounter());
822 value.w = asuint(g_NvidiaExt.IncrementCounter());
823 return value;
824}
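
// [Editor's note] Usage sketch, not part of the original header: evaluating an
// interpolated attribute at a specific sample of the current pixel from a VPRS
// pixel shader (zero pixel offset). Parameter names are illustrative.
float2 NvEvalUvAtSampleExample(float2 uvAttrib, uint sampleIndex)
{
    return NvEvaluateAttributeAtSampleForVPRS(uvAttrib, sampleIndex, int2(0, 0));
}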
825
826
827float NvEvaluateAttributeSnappedForVPRS(float attrib, uint2 offset)
828{
829 float value = (float)0;
830 uint ext = g_NvidiaExt.IncrementCounter();
831 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
832 g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
833 g_NvidiaExt[ext].src1u.xy = offset;
834 g_NvidiaExt[ext].numOutputsForIncCounter = 1;
835 value.x = asfloat(g_NvidiaExt.IncrementCounter());
836 return value;
837}
838
839float2 NvEvaluateAttributeSnappedForVPRS(float2 attrib, uint2 offset)
840{
841 float2 value = (float2)0;
842 uint ext = g_NvidiaExt.IncrementCounter();
843 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
844 g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
845 g_NvidiaExt[ext].src1u.xy = offset;
846 g_NvidiaExt[ext].numOutputsForIncCounter = 2;
847 value.x = asfloat(g_NvidiaExt.IncrementCounter());
848 value.y = asfloat(g_NvidiaExt.IncrementCounter());
849 return value;
850}
851
852float3 NvEvaluateAttributeSnappedForVPRS(float3 attrib, uint2 offset)
853{
854 float3 value = (float3)0;
855 uint ext = g_NvidiaExt.IncrementCounter();
856 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
857 g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
858 g_NvidiaExt[ext].src1u.xy = offset;
859 g_NvidiaExt[ext].numOutputsForIncCounter = 3;
860 value.x = asfloat(g_NvidiaExt.IncrementCounter());
861 value.y = asfloat(g_NvidiaExt.IncrementCounter());
862 value.z = asfloat(g_NvidiaExt.IncrementCounter());
863 return value;
864}
865
866float4 NvEvaluateAttributeSnappedForVPRS(float4 attrib, uint2 offset)
867{
868 float4 value = (float4)0;
869 uint ext = g_NvidiaExt.IncrementCounter();
870 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
871 g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
872 g_NvidiaExt[ext].src1u.xy = offset;
873 g_NvidiaExt[ext].numOutputsForIncCounter = 4;
874 value.x = asfloat(g_NvidiaExt.IncrementCounter());
875 value.y = asfloat(g_NvidiaExt.IncrementCounter());
876 value.z = asfloat(g_NvidiaExt.IncrementCounter());
877 value.w = asfloat(g_NvidiaExt.IncrementCounter());
878 return value;
879}
880
881int NvEvaluateAttributeSnappedForVPRS(int attrib, uint2 offset)
882{
883 int value = (int)0;
884 uint ext = g_NvidiaExt.IncrementCounter();
885 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
886 g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
887 g_NvidiaExt[ext].src1u.xy = offset;
888 g_NvidiaExt[ext].numOutputsForIncCounter = 1;
889 value.x = asint(g_NvidiaExt.IncrementCounter());
890 return value;
891}
892
893int2 NvEvaluateAttributeSnappedForVPRS(int2 attrib, uint2 offset)
894{
895 int2 value = (int2)0;
896 uint ext = g_NvidiaExt.IncrementCounter();
897 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
898 g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
899 g_NvidiaExt[ext].src1u.xy = offset;
900 g_NvidiaExt[ext].numOutputsForIncCounter = 2;
901 value.x = asint(g_NvidiaExt.IncrementCounter());
902 value.y = asint(g_NvidiaExt.IncrementCounter());
903 return value;
904}
905
906int3 NvEvaluateAttributeSnappedForVPRS(int3 attrib, uint2 offset)
907{
908 int3 value = (int3)0;
909 uint ext = g_NvidiaExt.IncrementCounter();
910 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
911 g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
912 g_NvidiaExt[ext].src1u.xy = offset;
913 g_NvidiaExt[ext].numOutputsForIncCounter = 3;
914 value.x = asint(g_NvidiaExt.IncrementCounter());
915 value.y = asint(g_NvidiaExt.IncrementCounter());
916 value.z = asint(g_NvidiaExt.IncrementCounter());
917 return value;
918}
919
920int4 NvEvaluateAttributeSnappedForVPRS(int4 attrib, uint2 offset)
921{
922 int4 value = (int4)0;
923 uint ext = g_NvidiaExt.IncrementCounter();
924 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
925 g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
926 g_NvidiaExt[ext].src1u.xy = offset;
927 g_NvidiaExt[ext].numOutputsForIncCounter = 4;
928 value.x = asint(g_NvidiaExt.IncrementCounter());
929 value.y = asint(g_NvidiaExt.IncrementCounter());
930 value.z = asint(g_NvidiaExt.IncrementCounter());
931 value.w = asint(g_NvidiaExt.IncrementCounter());
932 return value;
933}
934
935uint NvEvaluateAttributeSnappedForVPRS(uint attrib, uint2 offset)
936{
937 uint value = (uint)0;
938 uint ext = g_NvidiaExt.IncrementCounter();
939 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
940 g_NvidiaExt[ext].src0u.x = asuint(attrib.x);
941 g_NvidiaExt[ext].src1u.xy = offset;
942 g_NvidiaExt[ext].numOutputsForIncCounter = 1;
943 value.x = asuint(g_NvidiaExt.IncrementCounter());
944 return value;
945}
946
947uint2 NvEvaluateAttributeSnappedForVPRS(uint2 attrib, uint2 offset)
948{
949 uint2 value = (uint2)0;
950 uint ext = g_NvidiaExt.IncrementCounter();
951 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
952 g_NvidiaExt[ext].src0u.xy = asuint(attrib.xy);
953 g_NvidiaExt[ext].src1u.xy = offset;
954 g_NvidiaExt[ext].numOutputsForIncCounter = 2;
955 value.x = asuint(g_NvidiaExt.IncrementCounter());
956 value.y = asuint(g_NvidiaExt.IncrementCounter());
957 return value;
958}
959
960uint3 NvEvaluateAttributeSnappedForVPRS(uint3 attrib, uint2 offset)
961{
962 uint3 value = (uint3)0;
963 uint ext = g_NvidiaExt.IncrementCounter();
964 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
965 g_NvidiaExt[ext].src0u.xyz = asuint(attrib.xyz);
966 g_NvidiaExt[ext].src1u.xy = offset;
967 g_NvidiaExt[ext].numOutputsForIncCounter = 3;
968 value.x = asuint(g_NvidiaExt.IncrementCounter());
969 value.y = asuint(g_NvidiaExt.IncrementCounter());
970 value.z = asuint(g_NvidiaExt.IncrementCounter());
971 return value;
972}
973
974uint4 NvEvaluateAttributeSnappedForVPRS(uint4 attrib, uint2 offset)
975{
976 uint4 value = (uint4)0;
977 uint ext = g_NvidiaExt.IncrementCounter();
978 g_NvidiaExt[ext].opcode = NV_EXTN_OP_VPRS_EVAL_ATTRIB_SNAPPED;
979 g_NvidiaExt[ext].src0u.xyzw = asuint(attrib.xyzw);
980 g_NvidiaExt[ext].src1u.xy = offset;
981 g_NvidiaExt[ext].numOutputsForIncCounter = 4;
982 value.x = asuint(g_NvidiaExt.IncrementCounter());
983 value.y = asuint(g_NvidiaExt.IncrementCounter());
984 value.z = asuint(g_NvidiaExt.IncrementCounter());
985 value.w = asuint(g_NvidiaExt.IncrementCounter());
986 return value;
987}
988
989// MATCH instruction variants
990uint NvWaveMatch(uint value)
991{
992 uint index = g_NvidiaExt.IncrementCounter();
993 g_NvidiaExt[index].src0u.x = value;
994 g_NvidiaExt[index].src1u.x = 1;
995 g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
996 // result is returned as the return value of IncrementCounter on fake UAV slot
997 return g_NvidiaExt.IncrementCounter();
998}
999
1000uint NvWaveMatch(uint2 value)
1001{
1002 uint index = g_NvidiaExt.IncrementCounter();
1003 g_NvidiaExt[index].src0u.xy = value.xy;
1004 g_NvidiaExt[index].src1u.x = 2;
1005 g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
1006 // result is returned as the return value of IncrementCounter on fake UAV slot
1007 return g_NvidiaExt.IncrementCounter();
1008}
1009
1010uint NvWaveMatch(uint4 value)
1011{
1012 uint index = g_NvidiaExt.IncrementCounter();
1013 g_NvidiaExt[index].src0u = value;
1014 g_NvidiaExt[index].src1u.x = 4;
1015 g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
1016 // result is returned as the return value of IncrementCounter on fake UAV slot
1017 return g_NvidiaExt.IncrementCounter();
1018}
1019
1020uint NvWaveMatch(float value)
1021{
1022 uint index = g_NvidiaExt.IncrementCounter();
1023 g_NvidiaExt[index].src0u.x = asuint(value);
1024 g_NvidiaExt[index].src1u.x = 1;
1025 g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
1026 // result is returned as the return value of IncrementCounter on fake UAV slot
1027 return g_NvidiaExt.IncrementCounter();
1028}
1029
1030uint NvWaveMatch(float2 value)
1031{
1032 uint index = g_NvidiaExt.IncrementCounter();
1033 g_NvidiaExt[index].src0u.xy = asuint(value);
1034 g_NvidiaExt[index].src1u.x = 2;
1035 g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
1036 // result is returned as the return value of IncrementCounter on fake UAV slot
1037 return g_NvidiaExt.IncrementCounter();
1038}
1039
1040uint NvWaveMatch(float4 value)
1041{
1042 uint index = g_NvidiaExt.IncrementCounter();
1043 g_NvidiaExt[index].src0u = asuint(value);
1044 g_NvidiaExt[index].src1u.x = 4;
1045 g_NvidiaExt[index].opcode = NV_EXTN_OP_MATCH_ANY;
1046 // result is returned as the return value of IncrementCounter on fake UAV slot
1047 return g_NvidiaExt.IncrementCounter();
1048}
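
// [Editor's note] Usage sketch, not part of the original header: the returned
// match mask has one bit per active lane holding the same value, so countbits()
// gives the number of peers. The helper name is illustrative.
uint NvCountPeersExample(uint key)
{
    return countbits(NvWaveMatch(key));
}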
1049
1050
1051//----------------------------------------------------------------------------//
1052//------------------------------ Footprint functions -------------------------//
1053//----------------------------------------------------------------------------//
1054// texSpace and smpSpace must be immediates; texIndex and smpIndex can be variable.
1055// offset must be an immediate.
1056// The required components of the location and offset fields can be filled in depending on the dimension/type of the texture.
1057// texType should be one of 2D or 3D as defined in nvShaderExtnEnums.h and should be an immediate literal.
1058// If the above restrictions are not met, the behavior of this instruction is undefined.
1059
1060uint4 NvFootprintFine(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, int3 offset = int3(0, 0, 0))
1061{
1062 return __NvFootprint(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, offset);
1063}
1064
1065uint4 NvFootprintCoarse(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, int3 offset = int3(0, 0, 0))
1066{
1067 return __NvFootprint(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, offset);
1068}
1069
1070
1071
1072uint4 NvFootprintFineBias(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float bias, int3 offset = int3(0, 0, 0))
1073{
1074 return __NvFootprintBias(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, bias, offset);
1075}
1076
1077uint4 NvFootprintCoarseBias(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float bias, int3 offset = int3(0, 0, 0))
1078{
1079 return __NvFootprintBias(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, bias, offset);
1080}
1081
1082
1083
1084uint4 NvFootprintFineLevel(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float lodLevel, int3 offset = int3(0, 0, 0))
1085{
1086 return __NvFootprintLevel(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, lodLevel, offset);
1087}
1088
1089uint4 NvFootprintCoarseLevel(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float lodLevel, int3 offset = int3(0, 0, 0))
1090{
1091 return __NvFootprintLevel(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, lodLevel, offset);
1092}
1093
1094
1095
1096uint4 NvFootprintFineGrad(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float3 ddx, float3 ddy, int3 offset = int3(0, 0, 0))
1097{
1098 return __NvFootprintGrad(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, ddx, ddy, offset);
1099}
1100
1101uint4 NvFootprintCoarseGrad(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float3 ddx, float3 ddy, int3 offset = int3(0, 0, 0))
1102{
1103 return __NvFootprintGrad(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, ddx, ddy, offset);
1104}
1105
1106uint NvFootprintExtractLOD(uint4 blob)
1107{
1108 return ((blob.w & 0xF000) >> 12);
1109}
1110
1111uint NvFootprintExtractReturnGran(uint4 blob)
1112{
1113 return ((blob.z & 0xF000000) >> 24);
1114}
1115
1116uint2 NvFootprintExtractAnchorTileLoc2D(uint4 blob)
1117{
1118 uint2 loc;
1119 loc.x = (blob.w & 0xFFF);
1120 loc.y = (blob.z & 0xFFF);
1121 return loc;
1122}
1123
1124uint3 NvFootprintExtractAnchorTileLoc3D(uint4 blob)
1125{
1126 uint3 loc;
1127 loc.x = (blob.w & 0xFFF);
1128 loc.y = ((blob.w & 0xFFF0000) >> 16);
1129 loc.z = (blob.z & 0x1FFF);
1130 return loc;
1131}
1132
1133uint2 NvFootprintExtractOffset2D(uint4 blob)
1134{
1135 uint2 loc;
1136 loc.x = ((blob.z & 0x070000) >> 16);
1137 loc.y = ((blob.z & 0x380000) >> 19);
1138 return loc;
1139}
1140
1141uint3 NvFootprintExtractOffset3D(uint4 blob)
1142{
1143 uint3 loc;
1144 loc.x = ((blob.z & 0x030000) >> 16);
1145 loc.y = ((blob.z & 0x0C0000) >> 18);
1146 loc.z = ((blob.z & 0x300000) >> 20);
1147 return loc;
1148}
1149
1150uint2 NvFootprintExtractBitmask(uint4 blob)
1151{
1152 return blob.xy;
1153}
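
// [Editor's note] Usage sketch, not part of the original header: decoding the
// 2D footprint blob returned by the queries above using only the extraction
// helpers defined in this header. Helper and parameter names are illustrative.
void NvDecodeFootprint2DExample(uint4 blob, out uint2 anchorTileLoc,
                                out uint2 tileOffset, out uint2 bitmask, out uint lod)
{
    anchorTileLoc = NvFootprintExtractAnchorTileLoc2D(blob);
    tileOffset    = NvFootprintExtractOffset2D(blob);
    bitmask       = NvFootprintExtractBitmask(blob);
    lod           = NvFootprintExtractLOD(blob);
}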
1154
1155
1156// Variants of the footprint extensions which return isSingleLod (out parameter).
1157// isSingleLod = true -> this footprint request touched texels from only a single LOD.
1158uint4 NvFootprintFine(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1159{
1160 uint4 res = __NvFootprint(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, offset);
1161 isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1162 return res;
1163}
1164
1165uint4 NvFootprintCoarse(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1166{
1167 uint4 res = __NvFootprint(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, offset);
1168 isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1169 return res;
1170}
1171
1172
1173
1174uint4 NvFootprintFineBias(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float bias, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1175{
1176 uint4 res = __NvFootprintBias(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, bias, offset);
1177 isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1178 return res;
1179}
1180
1181uint4 NvFootprintCoarseBias(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float bias, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1182{
1183 uint4 res = __NvFootprintBias(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, bias, offset);
1184 isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1185 return res;
1186}
1187
1188
1189
1190uint4 NvFootprintFineLevel(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float lodLevel, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1191{
1192 uint4 res = __NvFootprintLevel(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, lodLevel, offset);
1193 isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1194 return res;
1195}
1196
1197uint4 NvFootprintCoarseLevel(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float lodLevel, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1198{
1199 uint4 res = __NvFootprintLevel(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, lodLevel, offset);
1200 isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1201 return res;
1202}
1203
1204
1205
1206uint4 NvFootprintFineGrad(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float3 ddx, float3 ddy, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1207{
1208 uint4 res = __NvFootprintGrad(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_FINE, gran, ddx, ddy, offset);
1209 isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1210 return res;
1211}
1212
1213uint4 NvFootprintCoarseGrad(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint gran, float3 ddx, float3 ddy, out uint isSingleLod, int3 offset = int3(0, 0, 0))
1214{
1215 uint4 res = __NvFootprintGrad(texSpace, texIndex, smpSpace, smpIndex, texType, location, NV_EXTN_FOOTPRINT_MODE_COARSE, gran, ddx, ddy, offset);
1216 isSingleLod = __NvGetSpecial(NV_SPECIALOP_FOOTPRINT_SINGLELOD_PRED);
1217 return res;
1218}
1219
1220
1221uint NvActiveThreads()
1222{
1223 return NvBallot(1);
1224}
1225
1226
1227//----------------------------------------------------------------------------//
1228//------------------------------ WaveMultiPrefix functions -------------------//
1229//----------------------------------------------------------------------------//
1230
1231// Following are the WaveMultiPrefix functions for different operations (Add, BitAnd, BitOr, BitXor) and different datatypes (uint, uint2, uint4).
1232// This is a set of functions which implement multi-prefix operations among the set of active lanes in the current wave (warp).
1233// A multi-prefix operation comprises a set of prefix operations, executed in parallel within subsets of lanes identified with the provided bitmasks.
1234// These bitmasks represent a partitioning of the set of active lanes in the current wave into N groups (where N is the number of unique masks across all lanes in the wave).
1235// N prefix operations are then performed, each within its corresponding group.
1236// The groups are assumed to be non-intersecting (that is, a given lane can be a member of one and only one group),
1237// and the bitmasks in all lanes belonging to the same group are required to be the same.
1238// There are two types of functions - exclusive and inclusive prefix operations.
1239// e.g. for the NvWaveMultiPrefixInclusiveAdd(val, mask) operation, for each of the groups (for which the mask input is the same) the expected output is:
1240// the i-th thread in a group has value = sum(values of threads 0 to i).
1241// For the exclusive version of the same operation,
1242// the i-th thread in a group has value = sum(values of threads 0 to i-1), and the 0th thread in the group has value 0.
1243
1244// Extensions for Add
1245uint NvWaveMultiPrefixInclusiveAdd(uint val, uint mask)
1246{
1247 uint temp;
1248 uint a = NvActiveThreads();
1249 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1250 uint nextLane = firstbithigh(remainingThreads);
1251 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1252 {
1253 temp = NvShfl(val, nextLane);
1254 uint laneValid;
1255        // Because remainingThreads only contains threads in the group with smaller thread ids than the caller's, nextLane can never be 31 for any thread in the group except the smallest one.
1256        // For the smallest thread in the group, remainingThreads is 0 --> nextLane is ~0 (i.e. 31 when only the last 5 bits are considered).
1257        // So by passing maskClampValue=30 to __NvShflGeneric, it returns laneValid=false for the smallest thread in the group; val and nextLane are then updated based on laneValid.
1258 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1259 if (laneValid) // if nextLane's nextLane is valid
1260 {
1261 val = val + temp;
1262 nextLane = newLane;
1263 }
1264 }
1265 return val;
1266}
1267
1268uint NvWaveMultiPrefixExclusiveAdd(uint val, uint mask)
1269{
1270 uint temp;
1271 uint a = NvActiveThreads();
1272 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1273 uint lane = firstbithigh(remainingThreads);
1274 temp = NvShfl(val, lane);
1275 val = remainingThreads != 0 ? temp : 0;
1276 return NvWaveMultiPrefixInclusiveAdd(val, mask);
1277}
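
// [Editor's note] Usage sketch, not part of the original header: a typical
// pairing of NvWaveMatch with the multi-prefix add - lanes sharing a key form a
// group, and each lane obtains its rank within that group. Helper name is
// illustrative.
uint NvRankWithinKeyGroupExample(uint key)
{
    uint groupMask = NvWaveMatch(key);                    // identical in all peers
    return NvWaveMultiPrefixExclusiveAdd(1u, groupMask);  // 0, 1, 2, ... per group
}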
1278
1279uint2 NvWaveMultiPrefixInclusiveAdd(uint2 val, uint mask)
1280{
1281 uint2 temp;
1282 uint a = NvActiveThreads();
1283 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1284 uint nextLane = firstbithigh(remainingThreads);
1285 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1286 {
1287 temp = NvShfl(val, nextLane);
1288 uint laneValid;
1289 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1290 if (laneValid) // if nextLane's nextLane is valid
1291 {
1292 val = val + temp;
1293 nextLane = newLane;
1294 }
1295 }
1296 return val;
1297}
1298
1299uint2 NvWaveMultiPrefixExclusiveAdd(uint2 val, uint mask)
1300{
1301 uint2 temp;
1302 uint a = NvActiveThreads();
1303 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1304 uint lane = firstbithigh(remainingThreads);
1305 temp = NvShfl(val, lane);
1306 val = remainingThreads != 0 ? temp : uint2(0, 0);
1307 return NvWaveMultiPrefixInclusiveAdd(val, mask);
1308}
1309
1310uint4 NvWaveMultiPrefixInclusiveAdd(uint4 val, uint mask)
1311{
1312 uint4 temp;
1313 uint a = NvActiveThreads();
1314 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1315 uint nextLane = firstbithigh(remainingThreads);
1316 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1317 {
1318 temp = NvShfl(val, nextLane);
1319 uint laneValid;
1320 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1321 if (laneValid) // if nextLane's nextLane is valid
1322 {
1323 val = val + temp;
1324 nextLane = newLane;
1325 }
1326 }
1327 return val;
1328}
1329
1330uint4 NvWaveMultiPrefixExclusiveAdd(uint4 val, uint mask)
1331{
1332 uint4 temp;
1333 uint a = NvActiveThreads();
1334 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1335 uint lane = firstbithigh(remainingThreads);
1336 temp = NvShfl(val, lane);
1337 val = remainingThreads != 0 ? temp : uint4(0, 0, 0, 0);
1338 return NvWaveMultiPrefixInclusiveAdd(val, mask);
1339}
1340
1341// MultiPrefix extensions for Bitand
1342uint NvWaveMultiPrefixInclusiveAnd(uint val, uint mask)
1343{
1344 uint temp;
1345 uint a = NvActiveThreads();
1346 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1347 uint nextLane = firstbithigh(remainingThreads);
1348 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1349 {
1350 temp = NvShfl(val, nextLane);
1351 uint laneValid;
1352 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1353 if (laneValid) // if nextLane's nextLane is valid
1354 {
1355 val = val & temp;
1356 nextLane = newLane;
1357 }
1358 }
1359 return val;
1360}
1361
1362uint NvWaveMultiPrefixExclusiveAnd(uint val, uint mask)
1363{
1364 uint temp;
1365 uint a = NvActiveThreads();
1366 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1367 uint lane = firstbithigh(remainingThreads);
1368 temp = NvShfl(val, lane);
1369 val = remainingThreads != 0 ? temp : ~0;
1370 return NvWaveMultiPrefixInclusiveAnd(val, mask);
1371}
1372
1373uint2 NvWaveMultiPrefixInclusiveAnd(uint2 val, uint mask)
1374{
1375 uint2 temp;
1376 uint a = NvActiveThreads();
1377 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1378 uint nextLane = firstbithigh(remainingThreads);
1379 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1380 {
1381 temp = NvShfl(val, nextLane);
1382 uint laneValid;
1383 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1384 if (laneValid) // if nextLane's nextLane is valid
1385 {
1386 val = val & temp;
1387 nextLane = newLane;
1388 }
1389 }
1390 return val;
1391}
1392
1393uint2 NvWaveMultiPrefixExclusiveAnd(uint2 val, uint mask)
1394{
1395 uint2 temp;
1396 uint a = NvActiveThreads();
1397 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1398 uint lane = firstbithigh(remainingThreads);
1399 temp = NvShfl(val, lane);
1400 val = remainingThreads != 0 ? temp : uint2(~0, ~0);
1401 return NvWaveMultiPrefixInclusiveAnd(val, mask);
1402}
1403
1404
1405uint4 NvWaveMultiPrefixInclusiveAnd(uint4 val, uint mask)
1406{
1407 uint4 temp;
1408 uint a = NvActiveThreads();
1409 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1410 uint nextLane = firstbithigh(remainingThreads);
1411 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1412 {
1413 temp = NvShfl(val, nextLane);
1414 uint laneValid;
1415 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1416 if (laneValid) // if nextLane's nextLane is valid
1417 {
1418 val = val & temp;
1419 nextLane = newLane;
1420 }
1421 }
1422 return val;
1423}
1424
1425uint4 NvWaveMultiPrefixExclusiveAnd(uint4 val, uint mask)
1426{
1427 uint4 temp;
1428 uint a = NvActiveThreads();
1429 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1430 uint lane = firstbithigh(remainingThreads);
1431 temp = NvShfl(val, lane);
1432 val = remainingThreads != 0 ? temp : uint4(~0, ~0, ~0, ~0);
1433 return NvWaveMultiPrefixInclusiveAnd(val, mask);
1434}
1435
1436
1437// MultiPrefix extensions for BitOr
1438uint NvWaveMultiPrefixInclusiveOr(uint val, uint mask)
1439{
1440 uint temp;
1441 uint a = NvActiveThreads();
1442 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1443 uint nextLane = firstbithigh(remainingThreads);
1444 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1445 {
1446 temp = NvShfl(val, nextLane);
1447 uint laneValid;
1448 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1449 if (laneValid) // if nextLane's nextLane is valid
1450 {
1451 val = val | temp;
1452 nextLane = newLane;
1453 }
1454 }
1455 return val;
1456}
1457
1458uint NvWaveMultiPrefixExclusiveOr(uint val, uint mask)
1459{
1460 uint temp;
1461 uint a = NvActiveThreads();
1462 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1463 uint lane = firstbithigh(remainingThreads);
1464 temp = NvShfl(val, lane);
1465 val = remainingThreads != 0 ? temp : 0;
1466 return NvWaveMultiPrefixInclusiveOr(val, mask);
1467}
1468
1469uint2 NvWaveMultiPrefixInclusiveOr(uint2 val, uint mask)
1470{
1471 uint2 temp;
1472 uint a = NvActiveThreads();
1473 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1474 uint nextLane = firstbithigh(remainingThreads);
1475 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1476 {
1477 temp = NvShfl(val, nextLane);
1478 uint laneValid;
1479 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1480 if (laneValid) // if nextLane's nextLane is valid
1481 {
1482 val = val | temp;
1483 nextLane = newLane;
1484 }
1485 }
1486 return val;
1487}
1488
1489uint2 NvWaveMultiPrefixExclusiveOr(uint2 val, uint mask)
1490{
1491 uint2 temp;
1492 uint a = NvActiveThreads();
1493 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1494 uint lane = firstbithigh(remainingThreads);
1495 temp = NvShfl(val, lane);
1496 val = remainingThreads != 0 ? temp : uint2(0, 0);
1497 return NvWaveMultiPrefixInclusiveOr(val, mask);
1498}
1499
1500
1501uint4 NvWaveMultiPrefixInclusiveOr(uint4 val, uint mask)
1502{
1503 uint4 temp;
1504 uint a = NvActiveThreads();
1505 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1506 uint nextLane = firstbithigh(remainingThreads);
1507 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1508 {
1509 temp = NvShfl(val, nextLane);
1510 uint laneValid;
1511 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1512 if (laneValid) // if nextLane's nextLane is valid
1513 {
1514 val = val | temp;
1515 nextLane = newLane;
1516 }
1517 }
1518 return val;
1519}
1520
1521uint4 NvWaveMultiPrefixExclusiveOr(uint4 val, uint mask)
1522{
1523 uint4 temp;
1524 uint a = NvActiveThreads();
1525 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1526 uint lane = firstbithigh(remainingThreads);
1527 temp = NvShfl(val, lane);
1528 val = remainingThreads != 0 ? temp : uint4(0, 0, 0, 0);
1529 return NvWaveMultiPrefixInclusiveOr(val, mask);
1530}
1531
1532
1533// MultiPrefix extensions for BitXOr
1534uint NvWaveMultiPrefixInclusiveXOr(uint val, uint mask)
1535{
1536 uint temp;
1537 uint a = NvActiveThreads();
1538 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1539 uint nextLane = firstbithigh(remainingThreads);
1540 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1541 {
1542 temp = NvShfl(val, nextLane);
1543 uint laneValid;
1544 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1545 if (laneValid) // if nextLane's nextLane is valid
1546 {
1547 val = val ^ temp;
1548 nextLane = newLane;
1549 }
1550 }
1551 return val;
1552}
1553
1554uint NvWaveMultiPrefixExclusiveXOr(uint val, uint mask)
1555{
1556 uint temp;
1557 uint a = NvActiveThreads();
1558 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1559 uint lane = firstbithigh(remainingThreads);
1560 temp = NvShfl(val, lane);
1561 val = remainingThreads != 0 ? temp : 0;
1562 return NvWaveMultiPrefixInclusiveXOr(val, mask);
1563}
1564
1565uint2 NvWaveMultiPrefixInclusiveXOr(uint2 val, uint mask)
1566{
1567 uint2 temp;
1568 uint a = NvActiveThreads();
1569 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1570 uint nextLane = firstbithigh(remainingThreads);
1571 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1572 {
1573 temp = NvShfl(val, nextLane);
1574 uint laneValid;
1575 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1576 if (laneValid) // if nextLane's nextLane is valid
1577 {
1578 val = val ^ temp;
1579 nextLane = newLane;
1580 }
1581 }
1582 return val;
1583}
1584
1585uint2 NvWaveMultiPrefixExclusiveXOr(uint2 val, uint mask)
1586{
1587 uint2 temp;
1588 uint a = NvActiveThreads();
1589 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1590 uint lane = firstbithigh(remainingThreads);
1591 temp = NvShfl(val, lane);
1592 val = remainingThreads != 0 ? temp : uint2(0, 0);
1593 return NvWaveMultiPrefixInclusiveXOr(val, mask);
1594}
1595
1596
1597uint4 NvWaveMultiPrefixInclusiveXOr(uint4 val, uint mask)
1598{
1599 uint4 temp;
1600 uint a = NvActiveThreads();
1601 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1602 uint nextLane = firstbithigh(remainingThreads);
1603 for (uint i = 0; i < NV_WARP_SIZE_LOG2; i++)
1604 {
1605 temp = NvShfl(val, nextLane);
1606 uint laneValid;
1607 uint newLane = asuint(__NvShflGeneric(nextLane, nextLane, 30, laneValid));
1608 if (laneValid) // if nextLane's nextLane is valid
1609 {
1610 val = val ^ temp;
1611 nextLane = newLane;
1612 }
1613 }
1614 return val;
1615}
1616
1617uint4 NvWaveMultiPrefixExclusiveXOr(uint4 val, uint mask)
1618{
1619 uint4 temp;
1620 uint a = NvActiveThreads();
1621 uint remainingThreads = a & __NvGetSpecial(NV_SPECIALOP_THREADLTMASK) & mask;
1622 uint lane = firstbithigh(remainingThreads);
1623 temp = NvShfl(val, lane);
1624 val = remainingThreads != 0 ? temp : uint4(0, 0, 0, 0);
1625 return NvWaveMultiPrefixInclusiveXOr(val, mask);
1626}
1627
1628
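// Usage sketch (editorial addition): the bitwise variants follow the same pattern as
// the Add variants, only the combining operator changes. Here each lane receives the
// OR of the flag bits contributed by itself and by all lower-indexed active lanes in
// its group. The flags buffer names and their meaning are assumptions for
// illustration only.
//
//   StructuredBuffer<uint>   g_LaneFlags    : register(t1);
//   RWStructuredBuffer<uint> g_RunningFlags : register(u1);
//
//   [numthreads(32, 1, 1)]
//   void RunningFlagsCS(uint tid : SV_DispatchThreadID)
//   {
//       uint flags = g_LaneFlags[tid];
//       uint mask  = NvActiveThreads();
//       g_RunningFlags[tid] = NvWaveMultiPrefixInclusiveOr(flags, mask);
//   }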
1629//----------------------------------------------------------------------------//
1630//------------------------- DXR Micro-map Extension --------------------------//
1631//----------------------------------------------------------------------------//
1632
1633float3x3 NvRtTriangleObjectPositions()
1634{
1635 uint index = g_NvidiaExt.IncrementCounter();
1636 g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_TRIANGLE_OBJECT_POSITIONS;
1637
1638 float3x3 ret;
1639 ret[0][0] = asfloat(g_NvidiaExt.IncrementCounter());
1640 ret[0][1] = asfloat(g_NvidiaExt.IncrementCounter());
1641 ret[0][2] = asfloat(g_NvidiaExt.IncrementCounter());
1642 ret[1][0] = asfloat(g_NvidiaExt.IncrementCounter());
1643 ret[1][1] = asfloat(g_NvidiaExt.IncrementCounter());
1644 ret[1][2] = asfloat(g_NvidiaExt.IncrementCounter());
1645 ret[2][0] = asfloat(g_NvidiaExt.IncrementCounter());
1646 ret[2][1] = asfloat(g_NvidiaExt.IncrementCounter());
1647 ret[2][2] = asfloat(g_NvidiaExt.IncrementCounter());
1648 return ret;
1649}
1650
1651float3x3 NvRtMicroTriangleObjectPositions()
1652{
1653 uint index = g_NvidiaExt.IncrementCounter();
1654 g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_MICRO_TRIANGLE_OBJECT_POSITIONS;
1655
1656 float3x3 ret;
1657 ret[0][0] = asfloat(g_NvidiaExt.IncrementCounter());
1658 ret[0][1] = asfloat(g_NvidiaExt.IncrementCounter());
1659 ret[0][2] = asfloat(g_NvidiaExt.IncrementCounter());
1660 ret[1][0] = asfloat(g_NvidiaExt.IncrementCounter());
1661 ret[1][1] = asfloat(g_NvidiaExt.IncrementCounter());
1662 ret[1][2] = asfloat(g_NvidiaExt.IncrementCounter());
1663 ret[2][0] = asfloat(g_NvidiaExt.IncrementCounter());
1664 ret[2][1] = asfloat(g_NvidiaExt.IncrementCounter());
1665 ret[2][2] = asfloat(g_NvidiaExt.IncrementCounter());
1666 return ret;
1667}
1668
1669float3x2 NvRtMicroTriangleBarycentrics()
1670{
1671 uint index = g_NvidiaExt.IncrementCounter();
1672 g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_MICRO_TRIANGLE_BARYCENTRICS;
1673
1674 float3x2 ret;
1675 ret[0][0] = asfloat(g_NvidiaExt.IncrementCounter());
1676 ret[0][1] = asfloat(g_NvidiaExt.IncrementCounter());
1677 ret[1][0] = asfloat(g_NvidiaExt.IncrementCounter());
1678 ret[1][1] = asfloat(g_NvidiaExt.IncrementCounter());
1679 ret[2][0] = asfloat(g_NvidiaExt.IncrementCounter());
1680 ret[2][1] = asfloat(g_NvidiaExt.IncrementCounter());
1681 return ret;
1682}
1683
1684bool NvRtIsMicroTriangleHit()
1685{
1686 uint index = g_NvidiaExt.IncrementCounter();
1687 g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_IS_MICRO_TRIANGLE_HIT;
1688 uint ret = g_NvidiaExt.IncrementCounter();
1689 return ret != 0;
1690}
1691
1692bool NvRtIsBackFacing()
1693{
1694 uint index = g_NvidiaExt.IncrementCounter();
1695 g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_IS_BACK_FACING;
1696 uint ret = g_NvidiaExt.IncrementCounter();
1697 return ret != 0;
1698}
1699
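// Usage sketch (editorial addition): the micro-map queries above are meant to be
// called from DXR hit shaders. The payload type MyPayload and its color field are
// assumptions for illustration only.
//
//   struct MyPayload { float3 color; };
//
//   [shader("closesthit")]
//   void MyClosestHit(inout MyPayload payload, in BuiltInTriangleIntersectionAttributes attr)
//   {
//       if (NvRtIsMicroTriangleHit())
//       {
//           float3x2 microBary = NvRtMicroTriangleBarycentrics();    // per-vertex barycentrics of the hit micro-triangle
//           float3x3 microPos  = NvRtMicroTriangleObjectPositions(); // object-space positions of its three vertices
//           payload.color = NvRtIsBackFacing() ? float3(1, 0, 0) : float3(microBary[0], 0);
//       }
//       else
//       {
//           float3x3 triPos = NvRtTriangleObjectPositions();         // object-space positions of the base triangle
//           payload.color = triPos[0];
//       }
//   }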
1700#if __SHADER_TARGET_MAJOR > 6 || (__SHADER_TARGET_MAJOR == 6 && __SHADER_TARGET_MINOR >= 5)
1701
1702float3 NvRtMicroVertexObjectPosition(RaytracingAccelerationStructure AccelerationStructure, uint InstanceIndex, uint GeometryIndex, uint PrimitiveIndex, uint2 UV)
1703{
1704 uint index = g_NvidiaExt.IncrementCounter();
1705 g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_MICRO_VERTEX_OBJECT_POSITION;
1706 g_NvidiaExt[index].src0u.x = InstanceIndex;
1707 g_NvidiaExt[index].src0u.y = GeometryIndex;
1708 g_NvidiaExt[index].src0u.z = PrimitiveIndex;
1709 g_NvidiaExt[index].src0u.w = UV.x;
1710 g_NvidiaExt[index].src1u.x = UV.y;
1711 uint handle = g_NvidiaExt.IncrementCounter();
1712 float3 ret;
1713 ret.x = asfloat(g_NvidiaExt.IncrementCounter());
1714 ret.y = asfloat(g_NvidiaExt.IncrementCounter());
1715 ret.z = asfloat(g_NvidiaExt.IncrementCounter());
1716
1717 RayQuery<0> rq;
1718 rq.TraceRayInline(AccelerationStructure, 0, handle, (RayDesc)0);
1719
1720 return ret;
1721}
1722
1723float2 NvRtMicroVertexBarycentrics(RaytracingAccelerationStructure AccelerationStructure, uint InstanceIndex, uint GeometryIndex, uint PrimitiveIndex, uint2 UV)
1724{
1725 uint index = g_NvidiaExt.IncrementCounter();
1726 g_NvidiaExt[index].opcode = NV_EXTN_OP_RT_MICRO_VERTEX_BARYCENTRICS;
1727 g_NvidiaExt[index].src0u.x = InstanceIndex;
1728 g_NvidiaExt[index].src0u.y = GeometryIndex;
1729 g_NvidiaExt[index].src0u.z = PrimitiveIndex;
1730 g_NvidiaExt[index].src0u.w = UV.x;
1731 g_NvidiaExt[index].src1u.x = UV.y;
1732 uint handle = g_NvidiaExt.IncrementCounter();
1733 float2 ret;
1734 ret.x = asfloat(g_NvidiaExt.IncrementCounter());
1735 ret.y = asfloat(g_NvidiaExt.IncrementCounter());
1736
1737 RayQuery<0> rq;
1738 rq.TraceRayInline(AccelerationStructure, 0, handle, (RayDesc)0);
1739
1740 return ret;
1741}
1742
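// Usage sketch (editorial addition, requires shader model 6.5+): fetching a single
// micro-vertex of a displaced triangle directly from the acceleration structure.
// The resource binding g_SceneAS and the helper name are assumptions for
// illustration only; the index and UV arguments identify the micro-vertex exactly
// as in the function signatures above.
//
//   RaytracingAccelerationStructure g_SceneAS : register(t0);
//
//   float3 FetchMicroVertexObjectPosition(uint instanceIdx, uint geometryIdx, uint primitiveIdx, uint2 microVertexUV)
//   {
//       float2 baseBary = NvRtMicroVertexBarycentrics(g_SceneAS, instanceIdx, geometryIdx, primitiveIdx, microVertexUV);
//       float3 objPos   = NvRtMicroVertexObjectPosition(g_SceneAS, instanceIdx, geometryIdx, primitiveIdx, microVertexUV);
//       return objPos;   // baseBary gives the location within the base triangle (unused here)
//   }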
1743#endif
1744
1745//----------------------------------------------------------------------------//
1746//------------------------- DXR HitObject Extension --------------------------//
1747//----------------------------------------------------------------------------//
1748
1749// Support for templates in HLSL requires HLSL 2021+. When using dxc,
1750// pass the -HV 2021 command-line argument to enable this language version.
1751#if defined(__HLSL_VERSION) && (__HLSL_VERSION >= 2021) && !defined(NV_HITOBJECT_USE_MACRO_API)
1752
1753struct NvHitObject {
1754 uint _handle;
1755
1756 bool IsMiss()
1757 {
1758 uint index = g_NvidiaExt.IncrementCounter();
1759 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_MISS;
1760 g_NvidiaExt[index].src0u.x = _handle;
1761 uint ret = g_NvidiaExt.IncrementCounter();
1762 return ret != 0;
1763 }
1764
1765 bool IsHit()
1766 {
1767 uint index = g_NvidiaExt.IncrementCounter();
1768 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_HIT;
1769 g_NvidiaExt[index].src0u.x = _handle;
1770 uint ret = g_NvidiaExt.IncrementCounter();
1771 return ret != 0;
1772 }
1773
1774 bool IsNop()
1775 {
1776 uint index = g_NvidiaExt.IncrementCounter();
1777 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_NOP;
1778 g_NvidiaExt[index].src0u.x = _handle;
1779 uint ret = g_NvidiaExt.IncrementCounter();
1780 return ret != 0;
1781 }
1782
1783 uint GetInstanceID()
1784 {
1785 uint index = g_NvidiaExt.IncrementCounter();
1786 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_INSTANCE_ID;
1787 g_NvidiaExt[index].src0u.x = _handle;
1788 return g_NvidiaExt.IncrementCounter();
1789 }
1790
1791 uint GetInstanceIndex()
1792 {
1793 uint index = g_NvidiaExt.IncrementCounter();
1794 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_INSTANCE_INDEX;
1795 g_NvidiaExt[index].src0u.x = _handle;
1796 return g_NvidiaExt.IncrementCounter();
1797 }
1798
1799 uint GetPrimitiveIndex()
1800 {
1801 uint index = g_NvidiaExt.IncrementCounter();
1802 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_PRIMITIVE_INDEX;
1803 g_NvidiaExt[index].src0u.x = _handle;
1804 return g_NvidiaExt.IncrementCounter();
1805 }
1806
1807 uint GetGeometryIndex()
1808 {
1809 uint index = g_NvidiaExt.IncrementCounter();
1810 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_GEOMETRY_INDEX;
1811 g_NvidiaExt[index].src0u.x = _handle;
1812 return g_NvidiaExt.IncrementCounter();
1813 }
1814
1815 uint GetHitKind()
1816 {
1817 uint index = g_NvidiaExt.IncrementCounter();
1818 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_HIT_KIND;
1819 g_NvidiaExt[index].src0u.x = _handle;
1820 return g_NvidiaExt.IncrementCounter();
1821 }
1822
1823 RayDesc GetRayDesc()
1824 {
1825 uint index = g_NvidiaExt.IncrementCounter();
1826 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_RAY_DESC;
1827 g_NvidiaExt[index].src0u.x = _handle;
1828
1829 uint tmin = g_NvidiaExt.IncrementCounter();
1830 uint tmax = g_NvidiaExt.IncrementCounter();
1831 uint rayOrgX = g_NvidiaExt.IncrementCounter();
1832 uint rayOrgY = g_NvidiaExt.IncrementCounter();
1833 uint rayOrgZ = g_NvidiaExt.IncrementCounter();
1834 uint rayDirX = g_NvidiaExt.IncrementCounter();
1835 uint rayDirY = g_NvidiaExt.IncrementCounter();
1836 uint rayDirZ = g_NvidiaExt.IncrementCounter();
1837
1838 RayDesc ray;
1839 ray.TMin = asfloat(tmin);
1840 ray.TMax = asfloat(tmax);
1841 ray.Origin.x = asfloat(rayOrgX);
1842 ray.Origin.y = asfloat(rayOrgY);
1843 ray.Origin.z = asfloat(rayOrgZ);
1844 ray.Direction.x = asfloat(rayDirX);
1845 ray.Direction.y = asfloat(rayDirY);
1846 ray.Direction.z = asfloat(rayDirZ);
1847
1848 return ray;
1849 }
1850
1851 template <typename T>
1852 T GetAttributes()
1853 {
1854 uint index = g_NvidiaExt.IncrementCounter();
1855 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_ATTRIBUTES;
1856 g_NvidiaExt[index].src0u.x = _handle;
1857 uint callHandle = g_NvidiaExt.IncrementCounter();
1858
1859 T attrs;
1860 CallShader(callHandle, attrs);
1861 return attrs;
1862 }
1863
1864 uint GetShaderTableIndex()
1865 {
1866 uint index = g_NvidiaExt.IncrementCounter();
1867 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_SHADER_TABLE_INDEX;
1868 g_NvidiaExt[index].src0u.x = _handle;
1869 return g_NvidiaExt.IncrementCounter();
1870 }
1871
1872 uint LoadLocalRootTableConstant(uint RootConstantOffsetInBytes)
1873 {
1874 uint index = g_NvidiaExt.IncrementCounter();
1875 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_LOAD_LOCAL_ROOT_TABLE_CONSTANT;
1876 g_NvidiaExt[index].src0u.x = _handle;
1877 g_NvidiaExt[index].src0u.y = RootConstantOffsetInBytes;
1878 return g_NvidiaExt.IncrementCounter();
1879 }
1880};
1881
1882template<typename T>
1883NvHitObject NvTraceRayHitObject(
1884 RaytracingAccelerationStructure AccelerationStructure,
1885 uint RayFlags,
1886 uint InstanceInclusionMask,
1887 uint RayContributionToHitGroupIndex,
1888 uint MultiplierForGeometryContributionToHitGroupIndex,
1889 uint MissShaderIndex,
1890 RayDesc Ray,
1891 inout T Payload)
1892{
1893 uint index = g_NvidiaExt.IncrementCounter();
1894 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_TRACE_RAY;
1895 g_NvidiaExt[index].numOutputsForIncCounter = 2;
1896 g_NvidiaExt[index].src0u.x = MissShaderIndex;
1897 uint hitHandle = g_NvidiaExt.IncrementCounter();
1898 uint traceHandle = g_NvidiaExt.IncrementCounter();
1899
1900 TraceRay(AccelerationStructure, RayFlags, InstanceInclusionMask, RayContributionToHitGroupIndex, MultiplierForGeometryContributionToHitGroupIndex, traceHandle, Ray, Payload);
1901
1902 NvHitObject hitObj;
1903 hitObj._handle = hitHandle;
1904 return hitObj;
1905}
1906
1907template <typename T>
1908NvHitObject NvMakeHit(
1909 RaytracingAccelerationStructure AccelerationStructure,
1910 uint InstanceIndex,
1911 uint GeometryIndex,
1912 uint PrimitiveIndex,
1913 uint HitKind,
1914 uint RayContributionToHitGroupIndex,
1915 uint MultiplierForGeometryContributionToHitGroupIndex,
1916 RayDesc Ray,
1917 T Attributes)
1918{
1919 uint index = g_NvidiaExt.IncrementCounter();
1920 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_HIT;
1921 g_NvidiaExt[index].numOutputsForIncCounter = 2;
1922 g_NvidiaExt[index].src0u.x = InstanceIndex;
1923 g_NvidiaExt[index].src0u.y = GeometryIndex;
1924 g_NvidiaExt[index].src0u.z = PrimitiveIndex;
1925 g_NvidiaExt[index].src0u.w = HitKind;
1926 g_NvidiaExt[index].src1u.x = RayContributionToHitGroupIndex;
1927 g_NvidiaExt[index].src1u.y = MultiplierForGeometryContributionToHitGroupIndex;
1928 uint hitHandle = g_NvidiaExt.IncrementCounter();
1929 uint traceHandle = g_NvidiaExt.IncrementCounter();
1930
1931 struct AttrWrapper { T Attrs; };
1932 AttrWrapper wrapper;
1933 wrapper.Attrs = Attributes;
1934 CallShader(traceHandle, wrapper);
1935
1936 struct DummyPayload { int a; };
1937 DummyPayload payload;
1938 TraceRay(AccelerationStructure, 0, 0, 0, 0, traceHandle, Ray, payload);
1939
1940 NvHitObject hitObj;
1941 hitObj._handle = hitHandle;
1942 return hitObj;
1943}
1944
1945template <typename T>
1946NvHitObject NvMakeHitWithRecordIndex(
1947 uint HitGroupRecordIndex,
1948 RaytracingAccelerationStructure AccelerationStructure,
1949 uint InstanceIndex,
1950 uint GeometryIndex,
1951 uint PrimitiveIndex,
1952 uint HitKind,
1953 RayDesc Ray,
1954 T Attributes)
1955{
1956 uint index = g_NvidiaExt.IncrementCounter();
1957 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_HIT_WITH_RECORD_INDEX;
1958 g_NvidiaExt[index].numOutputsForIncCounter = 2;
1959 g_NvidiaExt[index].src0u.x = InstanceIndex;
1960 g_NvidiaExt[index].src0u.y = GeometryIndex;
1961 g_NvidiaExt[index].src0u.z = PrimitiveIndex;
1962 g_NvidiaExt[index].src0u.w = HitKind;
1963 g_NvidiaExt[index].src1u.x = HitGroupRecordIndex;
1964 uint hitHandle = g_NvidiaExt.IncrementCounter();
1965 uint traceHandle = g_NvidiaExt.IncrementCounter();
1966
1967 struct AttrWrapper { T Attrs; };
1968 AttrWrapper wrapper;
1969 wrapper.Attrs = Attributes;
1970 CallShader(traceHandle, wrapper);
1971
1972 struct DummyPayload { int a; };
1973 DummyPayload payload;
1974 TraceRay(AccelerationStructure, 0, 0, 0, 0, traceHandle, Ray, payload);
1975
1976 NvHitObject hitObj;
1977 hitObj._handle = hitHandle;
1978 return hitObj;
1979}
1980
1981NvHitObject NvMakeMiss(
1982 uint MissShaderIndex,
1983 RayDesc Ray)
1984{
1985 uint index = g_NvidiaExt.IncrementCounter();
1986 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_MISS;
1987 g_NvidiaExt[index].src0u.x = MissShaderIndex;
1988 g_NvidiaExt[index].src0u.y = asuint(Ray.TMin);
1989 g_NvidiaExt[index].src0u.z = asuint(Ray.TMax);
1990 g_NvidiaExt[index].src1u.x = asuint(Ray.Origin.x);
1991 g_NvidiaExt[index].src1u.y = asuint(Ray.Origin.y);
1992 g_NvidiaExt[index].src1u.z = asuint(Ray.Origin.z);
1993 g_NvidiaExt[index].src2u.x = asuint(Ray.Direction.x);
1994 g_NvidiaExt[index].src2u.y = asuint(Ray.Direction.y);
1995 g_NvidiaExt[index].src2u.z = asuint(Ray.Direction.z);
1996 uint hitHandle = g_NvidiaExt.IncrementCounter();
1997
1998 NvHitObject hitObj;
1999 hitObj._handle = hitHandle;
2000 return hitObj;
2001}
2002
2003NvHitObject NvMakeNop()
2004{
2005 uint index = g_NvidiaExt.IncrementCounter();
2006 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_NOP;
2007 uint hitHandle = g_NvidiaExt.IncrementCounter();
2008
2009 NvHitObject hitObj;
2010 hitObj._handle = hitHandle;
2011 return hitObj;
2012}
2013
2014void NvReorderThread(uint CoherenceHint, uint NumCoherenceHintBits)
2015{
2016 uint index = g_NvidiaExt.IncrementCounter();
2017 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_REORDER_THREAD;
2018 g_NvidiaExt[index].src0u.x = 0;
2019 g_NvidiaExt[index].src0u.y = 0;
2020 g_NvidiaExt[index].src0u.z = CoherenceHint;
2021 g_NvidiaExt[index].src0u.w = NumCoherenceHintBits;
2022 g_NvidiaExt.IncrementCounter();
2023}
2024
2025void NvReorderThread(NvHitObject HitObj, uint CoherenceHint, uint NumCoherenceHintBits)
2026{
2027 uint index = g_NvidiaExt.IncrementCounter();
2028 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_REORDER_THREAD;
2029 g_NvidiaExt[index].src0u.x = 1;
2030 g_NvidiaExt[index].src0u.y = HitObj._handle;
2031 g_NvidiaExt[index].src0u.z = CoherenceHint;
2032 g_NvidiaExt[index].src0u.w = NumCoherenceHintBits;
2033 g_NvidiaExt.IncrementCounter();
2034}
2035
2036void NvReorderThread(NvHitObject HitObj)
2037{
2038 NvReorderThread(HitObj, 0, 0);
2039}
2040
2041template<typename T>
2042void NvInvokeHitObject(
2043 RaytracingAccelerationStructure AccelerationStructure,
2044 NvHitObject HitObj,
2045 inout T Payload)
2046{
2047 uint index = g_NvidiaExt.IncrementCounter();
2048 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_INVOKE;
2049 g_NvidiaExt[index].src0u.x = HitObj._handle;
2050 uint handle = g_NvidiaExt.IncrementCounter();
2051
2052 TraceRay(AccelerationStructure, 0, 0, 0, 0, handle, (RayDesc)0, Payload);
2053}
2054
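// Usage sketch (editorial addition): the HitObject API above is typically used in a
// trace / reorder / invoke sequence from a raygeneration shader (Shader Execution
// Reordering). The payload type, acceleration-structure binding, ray values and the
// choice of coherence hint are assumptions for illustration only.
//
//   struct MyPayload { float3 color; };
//   RaytracingAccelerationStructure g_SceneAS : register(t0);
//
//   [shader("raygeneration")]
//   void MyRayGen()
//   {
//       RayDesc ray;
//       ray.Origin    = float3(0.0f, 0.0f, 0.0f);
//       ray.Direction = float3(0.0f, 0.0f, 1.0f);
//       ray.TMin      = 0.0f;
//       ray.TMax      = 1e30f;
//
//       MyPayload payload = (MyPayload)0;
//
//       // 1. Trace without invoking hit/miss shading; the result is a hit object.
//       NvHitObject hit = NvTraceRayHitObject(g_SceneAS, RAY_FLAG_NONE, 0xFF, 0, 1, 0, ray, payload);
//
//       // 2. Reorder threads for shading coherence, using the hit object plus an
//       //    app-defined hint (here: the instance ID in the low 4 hint bits).
//       uint hint = hit.IsHit() ? hit.GetInstanceID() : 0;
//       NvReorderThread(hit, hint, 4);
//
//       // 3. Invoke the closest-hit or miss shading recorded in the hit object.
//       NvInvokeHitObject(g_SceneAS, hit, payload);
//   }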
2055// Macro-based version of the HitObject API. Use this when HLSL 2021 is not available.
2056// Enable by specifying #define NV_HITOBJECT_USE_MACRO_API before including this header.
2057#elif defined(NV_HITOBJECT_USE_MACRO_API)
2058
2059struct NvHitObject {
2060 uint _handle;
2061
2062 bool IsMiss()
2063 {
2064 uint index = g_NvidiaExt.IncrementCounter();
2065 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_MISS;
2066 g_NvidiaExt[index].src0u.x = _handle;
2067 uint ret = g_NvidiaExt.IncrementCounter();
2068 return ret != 0;
2069 }
2070
2071 bool IsHit()
2072 {
2073 uint index = g_NvidiaExt.IncrementCounter();
2074 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_HIT;
2075 g_NvidiaExt[index].src0u.x = _handle;
2076 uint ret = g_NvidiaExt.IncrementCounter();
2077 return ret != 0;
2078 }
2079
2080 bool IsNop()
2081 {
2082 uint index = g_NvidiaExt.IncrementCounter();
2083 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_IS_NOP;
2084 g_NvidiaExt[index].src0u.x = _handle;
2085 uint ret = g_NvidiaExt.IncrementCounter();
2086 return ret != 0;
2087 }
2088
2089 uint GetInstanceID()
2090 {
2091 uint index = g_NvidiaExt.IncrementCounter();
2092 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_INSTANCE_ID;
2093 g_NvidiaExt[index].src0u.x = _handle;
2094 return g_NvidiaExt.IncrementCounter();
2095 }
2096
2097 uint GetInstanceIndex()
2098 {
2099 uint index = g_NvidiaExt.IncrementCounter();
2100 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_INSTANCE_INDEX;
2101 g_NvidiaExt[index].src0u.x = _handle;
2102 return g_NvidiaExt.IncrementCounter();
2103 }
2104
2105 uint GetPrimitiveIndex()
2106 {
2107 uint index = g_NvidiaExt.IncrementCounter();
2108 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_PRIMITIVE_INDEX;
2109 g_NvidiaExt[index].src0u.x = _handle;
2110 return g_NvidiaExt.IncrementCounter();
2111 }
2112
2113 uint GetGeometryIndex()
2114 {
2115 uint index = g_NvidiaExt.IncrementCounter();
2116 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_GEOMETRY_INDEX;
2117 g_NvidiaExt[index].src0u.x = _handle;
2118 return g_NvidiaExt.IncrementCounter();
2119 }
2120
2121 uint GetHitKind()
2122 {
2123 uint index = g_NvidiaExt.IncrementCounter();
2124 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_HIT_KIND;
2125 g_NvidiaExt[index].src0u.x = _handle;
2126 return g_NvidiaExt.IncrementCounter();
2127 }
2128
2129 RayDesc GetRayDesc()
2130 {
2131 uint index = g_NvidiaExt.IncrementCounter();
2132 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_RAY_DESC;
2133 g_NvidiaExt[index].src0u.x = _handle;
2134
2135 uint tmin = g_NvidiaExt.IncrementCounter();
2136 uint tmax = g_NvidiaExt.IncrementCounter();
2137 uint rayOrgX = g_NvidiaExt.IncrementCounter();
2138 uint rayOrgY = g_NvidiaExt.IncrementCounter();
2139 uint rayOrgZ = g_NvidiaExt.IncrementCounter();
2140 uint rayDirX = g_NvidiaExt.IncrementCounter();
2141 uint rayDirY = g_NvidiaExt.IncrementCounter();
2142 uint rayDirZ = g_NvidiaExt.IncrementCounter();
2143
2144 RayDesc ray;
2145 ray.TMin = asfloat(tmin);
2146 ray.TMax = asfloat(tmax);
2147 ray.Origin.x = asfloat(rayOrgX);
2148 ray.Origin.y = asfloat(rayOrgY);
2149 ray.Origin.z = asfloat(rayOrgZ);
2150 ray.Direction.x = asfloat(rayDirX);
2151 ray.Direction.y = asfloat(rayDirY);
2152 ray.Direction.z = asfloat(rayDirZ);
2153
2154 return ray;
2155 }
2156
2157 uint GetShaderTableIndex()
2158 {
2159 uint index = g_NvidiaExt.IncrementCounter();
2160 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_SHADER_TABLE_INDEX;
2161 g_NvidiaExt[index].src0u.x = _handle;
2162 return g_NvidiaExt.IncrementCounter();
2163 }
2164
2165 uint LoadLocalRootTableConstant(uint RootConstantOffsetInBytes)
2166 {
2167 uint index = g_NvidiaExt.IncrementCounter();
2168 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_LOAD_LOCAL_ROOT_TABLE_CONSTANT;
2169 g_NvidiaExt[index].src0u.x = _handle;
2170 g_NvidiaExt[index].src0u.y = RootConstantOffsetInBytes;
2171 return g_NvidiaExt.IncrementCounter();
2172 }
2173};
2174
2175#define NvTraceRayHitObject(AccelerationStructure,RayFlags,InstanceInclusionMask,RayContributionToHitGroupIndex,MultiplierForGeometryContributionToHitGroupIndex,MissShaderIndex,Ray,Payload,ResultHitObj) \
2176do { \
2177 uint _rayFlags = RayFlags; \
2178 uint _instanceInclusionMask = InstanceInclusionMask; \
2179 uint _rayContributionToHitGroupIndex = RayContributionToHitGroupIndex; \
2180 uint _multiplierForGeometryContributionToHitGroupIndex = MultiplierForGeometryContributionToHitGroupIndex; \
2181 uint _missShaderIndex = MissShaderIndex; \
2182 RayDesc _ray = Ray; \
2183 uint _index = g_NvidiaExt.IncrementCounter(); \
2184 g_NvidiaExt[_index].opcode = NV_EXTN_OP_HIT_OBJECT_TRACE_RAY; \
2185 g_NvidiaExt[_index].numOutputsForIncCounter = 2; \
2186 g_NvidiaExt[_index].src0u.x = _missShaderIndex; \
2187 uint _hitHandle = g_NvidiaExt.IncrementCounter(); \
2188 uint _traceHandle = g_NvidiaExt.IncrementCounter(); \
2189 TraceRay(AccelerationStructure, _rayFlags, _instanceInclusionMask, _rayContributionToHitGroupIndex, _multiplierForGeometryContributionToHitGroupIndex, _traceHandle, _ray, Payload); \
2190 ResultHitObj._handle = _hitHandle; \
2191} while(0)
2192
2193struct NvHitObjectMacroDummyPayloadType { int a; };
2194
2195#define NvMakeHit(AccelerationStructure,InstanceIndex,GeometryIndex,PrimitiveIndex,HitKind,RayContributionToHitGroupIndex,MultiplierForGeometryContributionToHitGroupIndex,Ray,Attributes,ResultHitObj) \
2196do { \
2197 uint _instanceIndex = InstanceIndex; \
2198 uint _geometryIndex = GeometryIndex; \
2199 uint _primitiveIndex = PrimitiveIndex; \
2200 uint _hitKind = HitKind; \
2201 uint _rayContributionToHitGroupIndex = RayContributionToHitGroupIndex; \
2202 uint _multiplierForGeometryContributionToHitGroupIndex = MultiplierForGeometryContributionToHitGroupIndex; \
2203 RayDesc _ray = Ray; \
2204 uint _index = g_NvidiaExt.IncrementCounter(); \
2205 g_NvidiaExt[_index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_HIT; \
2206 g_NvidiaExt[_index].numOutputsForIncCounter = 2; \
2207 g_NvidiaExt[_index].src0u.x = _instanceIndex; \
2208 g_NvidiaExt[_index].src0u.y = _geometryIndex; \
2209 g_NvidiaExt[_index].src0u.z = _primitiveIndex; \
2210 g_NvidiaExt[_index].src0u.w = _hitKind; \
2211 g_NvidiaExt[_index].src1u.x = _rayContributionToHitGroupIndex; \
2212 g_NvidiaExt[_index].src1u.y = _multiplierForGeometryContributionToHitGroupIndex; \
2213 uint _hitHandle = g_NvidiaExt.IncrementCounter(); \
2214 uint _traceHandle = g_NvidiaExt.IncrementCounter(); \
2215 CallShader(_traceHandle, Attributes); \
2216 NvHitObjectMacroDummyPayloadType _payload; \
2217 TraceRay(AccelerationStructure, 0, 0, 0, 0, _traceHandle, _ray, _payload); \
2218 ResultHitObj._handle = _hitHandle; \
2219} while(0)
2220
2221#define NvMakeHitWithRecordIndex(HitGroupRecordIndex,AccelerationStructure,InstanceIndex,GeometryIndex,PrimitiveIndex,HitKind,Ray,Attributes,ResultHitObj) \
2222do { \
2223 uint _hitGroupRecordIndex = HitGroupRecordIndex; \
2224 uint _instanceIndex = InstanceIndex; \
2225 uint _geometryIndex = GeometryIndex; \
2226 uint _primitiveIndex = PrimitiveIndex; \
2227 uint _hitKind = HitKind; \
2228 RayDesc _ray = Ray; \
2229 uint _index = g_NvidiaExt.IncrementCounter(); \
2230 g_NvidiaExt[_index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_HIT_WITH_RECORD_INDEX; \
2231 g_NvidiaExt[_index].numOutputsForIncCounter = 2; \
2232 g_NvidiaExt[_index].src0u.x = _instanceIndex; \
2233 g_NvidiaExt[_index].src0u.y = _geometryIndex; \
2234 g_NvidiaExt[_index].src0u.z = _primitiveIndex; \
2235 g_NvidiaExt[_index].src0u.w = _hitKind; \
2236 g_NvidiaExt[_index].src1u.x = _hitGroupRecordIndex; \
2237 uint _hitHandle = g_NvidiaExt.IncrementCounter(); \
2238 uint _traceHandle = g_NvidiaExt.IncrementCounter(); \
2239 CallShader(_traceHandle, Attributes); \
2240 NvHitObjectMacroDummyPayloadType _payload; \
2241 TraceRay(AccelerationStructure, 0, 0, 0, 0, _traceHandle, _ray, _payload); \
2242 ResultHitObj._handle = _hitHandle; \
2243} while(0)
2244
2245NvHitObject NvMakeMiss(
2246 uint MissShaderIndex,
2247 RayDesc Ray)
2248{
2249 uint index = g_NvidiaExt.IncrementCounter();
2250 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_MISS;
2251 g_NvidiaExt[index].src0u.x = MissShaderIndex;
2252 g_NvidiaExt[index].src0u.y = asuint(Ray.TMin);
2253 g_NvidiaExt[index].src0u.z = asuint(Ray.TMax);
2254 g_NvidiaExt[index].src1u.x = asuint(Ray.Origin.x);
2255 g_NvidiaExt[index].src1u.y = asuint(Ray.Origin.y);
2256 g_NvidiaExt[index].src1u.z = asuint(Ray.Origin.z);
2257 g_NvidiaExt[index].src2u.x = asuint(Ray.Direction.x);
2258 g_NvidiaExt[index].src2u.y = asuint(Ray.Direction.y);
2259 g_NvidiaExt[index].src2u.z = asuint(Ray.Direction.z);
2260 uint hitHandle = g_NvidiaExt.IncrementCounter();
2261
2262 NvHitObject hitObj;
2263 hitObj._handle = hitHandle;
2264 return hitObj;
2265}
2266
2267NvHitObject NvMakeNop()
2268{
2269 uint index = g_NvidiaExt.IncrementCounter();
2270 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_MAKE_NOP;
2271 uint hitHandle = g_NvidiaExt.IncrementCounter();
2272
2273 NvHitObject hitObj;
2274 hitObj._handle = hitHandle;
2275 return hitObj;
2276}
2277
2278#define NvGetAttributesFromHitObject(HitObj,ResultAttributes) \
2279do { \
2280 uint _index = g_NvidiaExt.IncrementCounter(); \
2281 g_NvidiaExt[_index].opcode = NV_EXTN_OP_HIT_OBJECT_GET_ATTRIBUTES; \
2282 g_NvidiaExt[_index].src0u.x = HitObj._handle; \
2283 uint _callHandle = g_NvidiaExt.IncrementCounter(); \
2284 CallShader(_callHandle, ResultAttributes); \
2285} while(0)
2286
2287void NvReorderThread(uint CoherenceHint, uint NumCoherenceHintBits)
2288{
2289 uint index = g_NvidiaExt.IncrementCounter();
2290 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_REORDER_THREAD;
2291 g_NvidiaExt[index].src0u.x = 0;
2292 g_NvidiaExt[index].src0u.y = 0;
2293 g_NvidiaExt[index].src0u.z = CoherenceHint;
2294 g_NvidiaExt[index].src0u.w = NumCoherenceHintBits;
2295 g_NvidiaExt.IncrementCounter();
2296}
2297
2298void NvReorderThread(NvHitObject HitObj, uint CoherenceHint, uint NumCoherenceHintBits)
2299{
2300 uint index = g_NvidiaExt.IncrementCounter();
2301 g_NvidiaExt[index].opcode = NV_EXTN_OP_HIT_OBJECT_REORDER_THREAD;
2302 g_NvidiaExt[index].src0u.x = 1;
2303 g_NvidiaExt[index].src0u.y = HitObj._handle;
2304 g_NvidiaExt[index].src0u.z = CoherenceHint;
2305 g_NvidiaExt[index].src0u.w = NumCoherenceHintBits;
2306 g_NvidiaExt.IncrementCounter();
2307}
2308
2309void NvReorderThread(NvHitObject HitObj)
2310{
2311 NvReorderThread(HitObj, 0, 0);
2312}
2313
2314#define NvInvokeHitObject(AccelerationStructure,HitObj,Payload) \
2315do { \
2316 uint _index = g_NvidiaExt.IncrementCounter(); \
2317 g_NvidiaExt[_index].opcode = NV_EXTN_OP_HIT_OBJECT_INVOKE; \
2318 g_NvidiaExt[_index].src0u.x = HitObj._handle; \
2319 uint _handle = g_NvidiaExt.IncrementCounter(); \
2320 TraceRay(AccelerationStructure, 0, 0, 0, 0, _handle, (RayDesc)0, Payload); \
2321} while(0)
2322
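// Usage sketch (editorial addition): the macro forms mirror the template API but
// return the hit object through a trailing output argument instead of a return
// value. The payload type, acceleration-structure binding and ray values are
// assumptions for illustration only.
//
//   struct MyPayload { float3 color; };
//   RaytracingAccelerationStructure g_SceneAS : register(t0);
//
//   [shader("raygeneration")]
//   void MyRayGenMacro()
//   {
//       RayDesc ray;
//       ray.Origin    = float3(0.0f, 0.0f, 0.0f);
//       ray.Direction = float3(0.0f, 0.0f, 1.0f);
//       ray.TMin      = 0.0f;
//       ray.TMax      = 1e30f;
//
//       MyPayload payload = (MyPayload)0;
//
//       NvHitObject hit;
//       NvTraceRayHitObject(g_SceneAS, RAY_FLAG_NONE, 0xFF, 0, 1, 0, ray, payload, hit);
//       NvReorderThread(hit);
//       NvInvokeHitObject(g_SceneAS, hit, payload);
//   }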
2323#endif