nvHLSLExtnsInternal.h
/************************************************************************************************************************************\
|*                                                                                                                                  *|
|*  Copyright © 2012 NVIDIA Corporation. All rights reserved.                                                                       *|
|*                                                                                                                                  *|
|*  NOTICE TO USER:                                                                                                                 *|
|*                                                                                                                                  *|
|*  This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws.                                *|
|*                                                                                                                                  *|
|*  This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA                                  *|
|*  and are being provided solely under the terms and conditions of an NVIDIA software license agreement.                           *|
|*  Otherwise, you have no rights to use or access this software in any manner.                                                     *|
|*                                                                                                                                  *|
|*  If not covered by the applicable NVIDIA software license agreement:                                                             *|
|*  NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE.                                          *|
|*  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.                                                         *|
|*  NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,                                                                   *|
|*  INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.                     *|
|*  IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,                             *|
|*  OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,                        *|
|*  NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE.           *|
|*                                                                                                                                  *|
|*  U.S. Government End Users.                                                                                                      *|
|*  This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995),                                     *|
|*  consisting of "commercial computer software" and "commercial computer software documentation"                                   *|
|*  as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item.    *|
|*  Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995),                                       *|
|*  all U.S. Government End Users acquire the software with only those rights set forth herein.                                     *|
|*                                                                                                                                  *|
|*  Any use of this software in individual and commercial software must include,                                                    *|
|*  in the user documentation and internal comments to the code,                                                                    *|
|*  the above Disclaimer (as applicable) and U.S. Government End Users Notice.                                                      *|
|*                                                                                                                                  *|
\************************************************************************************************************************************/

// internal functions
// Functions in this file are not expected to be called by apps directly

#include "nvShaderExtnEnums.h"

struct NvShaderExtnStruct
{
    uint   opcode;                  // opcode
    uint   rid;                     // resource ID
    uint   sid;                     // sampler ID

    uint4  dst1u;                   // destination operand 1 (for instructions that need extra destination operands)
    uint4  src3u;                   // source operand 3
    uint4  src4u;                   // source operand 4
    uint4  src5u;                   // source operand 5

    uint4  src0u;                   // uint source operand 0
    uint4  src1u;                   // uint source operand 1
    uint4  src2u;                   // uint source operand 2
    uint4  dst0u;                   // uint destination operand

    uint   markUavRef;              // the next store to the UAV is fake and is used only to identify the UAV slot
    uint   numOutputsForIncCounter; // number of output values returned via IncrementCounter
    float  padding1[27];            // pads the struct size to 256 bytes
};
61
62// RW structured buffer for Nvidia shader extensions
63
64// Application needs to define NV_SHADER_EXTN_SLOT as a unused slot, which should be
65// set using NvAPI_D3D11_SetNvShaderExtnSlot() call before creating the first shader that
66// uses nvidia shader extensions. E.g before including this file in shader define it as:
67// #define NV_SHADER_EXTN_SLOT u7
68
69// For SM5.1, application needs to define NV_SHADER_EXTN_REGISTER_SPACE as register space
70// E.g. before including this file in shader define it as:
71// #define NV_SHADER_EXTN_REGISTER_SPACE space2
72
73// Note that other operations to this UAV will be ignored so application
74// should bind a null resource
75
#ifdef NV_SHADER_EXTN_REGISTER_SPACE
RWStructuredBuffer<NvShaderExtnStruct> g_NvidiaExt : register( NV_SHADER_EXTN_SLOT, NV_SHADER_EXTN_REGISTER_SPACE );
#else
RWStructuredBuffer<NvShaderExtnStruct> g_NvidiaExt : register( NV_SHADER_EXTN_SLOT );
#endif
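
// Illustrative sketch (not part of the original header): how a shader that uses these
// extensions might set up the macros above before including the public header nvHLSLExtns.h,
// which builds on this internal header. The slot (u7) and register space (space2) below are
// arbitrary example choices; the matching host-side NvAPI_D3D11_SetNvShaderExtnSlot() call
// must pass the same slot index (7 here).
//
//     #define NV_SHADER_EXTN_SLOT            u7      // an otherwise unused UAV slot
//     #define NV_SHADER_EXTN_REGISTER_SPACE  space2  // SM5.1 / explicit register spaces only
//     #include "nvHLSLExtns.h"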

//----------------------------------------------------------------------------//
// the exposed SHFL instructions accept a mask parameter in src2
// To compute the lane mask from the width of a segment:
//   minLaneId : currentLaneId & src2[12:8]
//   maxLaneId : minLaneId | (src2[4:0] & ~src2[12:8])
// where [minLaneId, maxLaneId] defines the segment where currentLaneId belongs
// we always set src2[4:0] to 11111 (0x1F), and set src2[12:8] as (32 - width)
int __NvGetShflMaskFromWidth(uint width)
{
    return ((NV_WARP_SIZE - width) << 8) | 0x1F;
}
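
// Worked example of the encoding above (illustrative, assuming NV_WARP_SIZE == 32):
// for width = 8, __NvGetShflMaskFromWidth returns ((32 - 8) << 8) | 0x1F = 0x181F.
// A thread in lane 13 then gets minLaneId = 13 & 0x18 = 8 and
// maxLaneId = 8 | (0x1F & ~0x18) = 15, i.e. its segment is lanes [8, 15].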

//----------------------------------------------------------------------------//

// Each overload below records a marker entry (markUavRef) and issues a dummy write to the
// given UAV so the driver can identify which UAV slot the following extension op targets.
void __NvReferenceUAVForOp(RWByteAddressBuffer uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav.Store(index, 0);
}

void __NvReferenceUAVForOp(RWTexture1D<float2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = float2(0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<float2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = float2(0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<float2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = float2(0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<float4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = float4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<float4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = float4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<float4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = float4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<float> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = 0.0f;
}

void __NvReferenceUAVForOp(RWTexture2D<float> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = 0.0f;
}

void __NvReferenceUAVForOp(RWTexture3D<float> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = 0.0f;
}


void __NvReferenceUAVForOp(RWTexture1D<uint2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = uint2(0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<uint2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = uint2(0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<uint2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = uint2(0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<uint4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = uint4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<uint4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = uint4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<uint4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = uint4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<uint> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = 0;
}

void __NvReferenceUAVForOp(RWTexture2D<uint> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = 0;
}

void __NvReferenceUAVForOp(RWTexture3D<uint> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = 0;
}

void __NvReferenceUAVForOp(RWTexture1D<int2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = int2(0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<int2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = int2(0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<int2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = int2(0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<int4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = int4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<int4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = int4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<int4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = int4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<int> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = 0;
}

void __NvReferenceUAVForOp(RWTexture2D<int> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = 0;
}

void __NvReferenceUAVForOp(RWTexture3D<int> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = 0;
}

//----------------------------------------------------------------------------//
// ATOMIC op sub-opcodes
#define NV_EXTN_ATOM_AND    0
#define NV_EXTN_ATOM_OR     1
#define NV_EXTN_ATOM_XOR    2

#define NV_EXTN_ATOM_ADD    3
#define NV_EXTN_ATOM_MAX    6
#define NV_EXTN_ATOM_MIN    7

#define NV_EXTN_ATOM_SWAP   8
#define NV_EXTN_ATOM_CAS    9

//----------------------------------------------------------------------------//

// performs an atomic operation on two consecutive fp16 values in the given UAV
// the uint parameter 'fp16x2Val' is treated as two fp16 values
// the sub-opcode 'atomicOpType' should be an immediate constant
// byteAddress must be a multiple of 4
// the return value is the two fp16 values packed into a single uint
uint __NvAtomicOpFP16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = byteAddress;
    g_NvidiaExt[index].src1u.x = fp16x2Val;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP16_ATOMIC;

    return g_NvidiaExt[index].dst0u.x;
}
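
// Illustrative example (not part of the original header): a caller might pack two fp32
// values into an fp16x2 uint with f32tof16 and atomically combine them with a pair of
// fp16 values in a byte address buffer. The buffer layout (consecutive fp16 pairs) and
// the function name are assumptions made for this sketch.
float2 ExampleFp16x2AtomicAdd(RWByteAddressBuffer buf, uint pairIndex, float2 val)
{
    uint packed = (f32tof16(val.y) << 16) | f32tof16(val.x);                  // pack two fp32 into fp16x2
    uint prev   = __NvAtomicOpFP16x2(buf, pairIndex * 4, packed, NV_EXTN_ATOM_ADD); // byteAddress is a multiple of 4
    // unpack the returned uint (assumed here to hold the pre-op contents)
    return float2(f16tof32(prev & 0xFFFF), f16tof32(prev >> 16));
}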

//----------------------------------------------------------------------------//

// performs an atomic operation on an R16G16_FLOAT UAV at the given address
// the uint parameter 'fp16x2Val' is treated as two fp16 values
// the sub-opcode 'atomicOpType' should be an immediate constant
// the return value is the two fp16 values (.x and .y components) packed into a single uint
// Warning: behavior of this set of functions is undefined if the UAV is not
// of R16G16_FLOAT format (might result in an app crash or TDR)

uint __NvAtomicOpFP16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].src1u.x = fp16x2Val;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP16_ATOMIC;

    return g_NvidiaExt[index].dst0u.x;
}

uint __NvAtomicOpFP16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].src1u.x  = fp16x2Val;
    g_NvidiaExt[index].src2u.x  = atomicOpType;
    g_NvidiaExt[index].opcode   = NV_EXTN_OP_FP16_ATOMIC;

    return g_NvidiaExt[index].dst0u.x;
}

uint __NvAtomicOpFP16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].src1u.x   = fp16x2Val;
    g_NvidiaExt[index].src2u.x   = atomicOpType;
    g_NvidiaExt[index].opcode    = NV_EXTN_OP_FP16_ATOMIC;

    return g_NvidiaExt[index].dst0u.x;
}

//----------------------------------------------------------------------------//

// performs an atomic operation on an R16G16B16A16_FLOAT UAV at the given address
// the uint2 parameter 'fp16x2Val' is treated as four fp16 values,
// i.e., fp16x2Val.x packs uav.xy and fp16x2Val.y packs uav.zw
// the sub-opcode 'atomicOpType' should be an immediate constant
// the return value is the four fp16 values (.xyzw components) packed into a uint2
// Warning: behavior of this set of functions is undefined if the UAV is not
// of R16G16B16A16_FLOAT format (might result in an app crash or TDR)

uint2 __NvAtomicOpFP16x2(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);

    // break it down into two fp16x2 atomic ops
    uint2 retVal;

    // first op has x-coordinate = x * 2
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address * 2;
    g_NvidiaExt[index].src1u.x = fp16x2Val.x;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP16_ATOMIC;
    retVal.x = g_NvidiaExt[index].dst0u.x;

    // second op has x-coordinate = x * 2 + 1
    index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address * 2 + 1;
    g_NvidiaExt[index].src1u.x = fp16x2Val.y;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP16_ATOMIC;
    retVal.y = g_NvidiaExt[index].dst0u.x;

    return retVal;
}

uint2 __NvAtomicOpFP16x2(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);

    // break it down into two fp16x2 atomic ops
    uint2 retVal;

    // first op has x-coordinate = x * 2
    uint2 addressTemp = uint2(address.x * 2, address.y);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = addressTemp;
    g_NvidiaExt[index].src1u.x  = fp16x2Val.x;
    g_NvidiaExt[index].src2u.x  = atomicOpType;
    g_NvidiaExt[index].opcode   = NV_EXTN_OP_FP16_ATOMIC;
    retVal.x = g_NvidiaExt[index].dst0u.x;

    // second op has x-coordinate = x * 2 + 1
    addressTemp.x++;
    index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = addressTemp;
    g_NvidiaExt[index].src1u.x  = fp16x2Val.y;
    g_NvidiaExt[index].src2u.x  = atomicOpType;
    g_NvidiaExt[index].opcode   = NV_EXTN_OP_FP16_ATOMIC;
    retVal.y = g_NvidiaExt[index].dst0u.x;

    return retVal;
}

uint2 __NvAtomicOpFP16x2(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);

    // break it down into two fp16x2 atomic ops
    uint2 retVal;

    // first op has x-coordinate = x * 2
    uint3 addressTemp = uint3(address.x * 2, address.y, address.z);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = addressTemp;
    g_NvidiaExt[index].src1u.x   = fp16x2Val.x;
    g_NvidiaExt[index].src2u.x   = atomicOpType;
    g_NvidiaExt[index].opcode    = NV_EXTN_OP_FP16_ATOMIC;
    retVal.x = g_NvidiaExt[index].dst0u.x;

    // second op has x-coordinate = x * 2 + 1
    addressTemp.x++;
    index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = addressTemp;
    g_NvidiaExt[index].src1u.x   = fp16x2Val.y;
    g_NvidiaExt[index].src2u.x   = atomicOpType;
    g_NvidiaExt[index].opcode    = NV_EXTN_OP_FP16_ATOMIC;
    retVal.y = g_NvidiaExt[index].dst0u.x;

    return retVal;
}

// packs two fp32 values into a single uint holding two fp16 values (val.x in the low 16 bits)
uint __fp32x2Tofp16x2(float2 val)
{
    return (f32tof16(val.y) << 16) | f32tof16(val.x);
}

// packs four fp32 values into a uint2 holding four fp16 values (xy in .x, zw in .y)
uint2 __fp32x4Tofp16x4(float4 val)
{
    return uint2( (f32tof16(val.y) << 16) | f32tof16(val.x),
                  (f32tof16(val.w) << 16) | f32tof16(val.z) );
}
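
// Illustrative example (not part of the original header): packing a float4 with
// __fp32x4Tofp16x4 and issuing the fp16x2-pair atomic MAX against an R16G16B16A16_FLOAT
// texture UAV. The texture parameter and function name are assumptions made for this sketch.
float4 ExampleFp16x4AtomicMax(RWTexture2D<float4> tex, uint2 coord, float4 val)
{
    uint2 packed = __fp32x4Tofp16x4(val);                                      // .x packs xy, .y packs zw
    uint2 prev   = __NvAtomicOpFP16x2(tex, coord, packed, NV_EXTN_ATOM_MAX);   // two fp16x2 atomics under the hood
    // unpack the returned uint2 (assumed here to hold the pre-op texel contents)
    return float4(f16tof32(prev.x & 0xFFFF), f16tof32(prev.x >> 16),
                  f16tof32(prev.y & 0xFFFF), f16tof32(prev.y >> 16));
}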

//----------------------------------------------------------------------------//

// FP32 atomic functions
// perform an atomic add treating the UAV contents as float (fp32) values
// the sub-opcode is fixed to NV_EXTN_ATOM_ADD
// byteAddress must be a multiple of 4
float __NvAtomicAddFP32(RWByteAddressBuffer uav, uint byteAddress, float val)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = byteAddress;
    g_NvidiaExt[index].src1u.x = asuint(val); // passed as uint to make it more convenient for the driver to translate
    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP32_ATOMIC;

    return asfloat(g_NvidiaExt[index].dst0u.x);
}

float __NvAtomicAddFP32(RWTexture1D<float> uav, uint address, float val)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].src1u.x = asuint(val);
    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode  = NV_EXTN_OP_FP32_ATOMIC;

    return asfloat(g_NvidiaExt[index].dst0u.x);
}

float __NvAtomicAddFP32(RWTexture2D<float> uav, uint2 address, float val)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].src1u.x  = asuint(val);
    g_NvidiaExt[index].src2u.x  = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode   = NV_EXTN_OP_FP32_ATOMIC;

    return asfloat(g_NvidiaExt[index].dst0u.x);
}

float __NvAtomicAddFP32(RWTexture3D<float> uav, uint3 address, float val)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].src1u.x   = asuint(val);
    g_NvidiaExt[index].src2u.x   = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode    = NV_EXTN_OP_FP32_ATOMIC;

    return asfloat(g_NvidiaExt[index].dst0u.x);
}
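
// Illustrative example (not part of the original header): accumulating per-thread weights
// into a float accumulation texture with the fp32 atomic add. The texture, coordinate, and
// function name are assumptions made for this sketch.
float ExampleAccumulateWeightFP32(RWTexture2D<float> accumTex, uint2 pixel, float weight)
{
    // each invocation adds its weight to the texel; the value returned by the extension op
    // (assumed here to be the pre-add contents) is passed back to the caller
    return __NvAtomicAddFP32(accumTex, pixel, weight);
}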

//----------------------------------------------------------------------------//

// UINT64 atomic functions
// The functions below perform an atomic operation on the given UAV treating the value as a uint64
// byteAddress must be a multiple of 8
// The returned value is the value present in the memory location before the atomic operation
// A uint2 is used to represent a single uint64 value, with the x component holding the low 32 bits
// and the y component the high 32 bits.

uint2 __NvAtomicCompareExchangeUINT64(RWByteAddressBuffer uav, uint byteAddress, uint2 compareValue, uint2 value)
{
    __NvReferenceUAVForOp(uav);

    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x  = byteAddress;
    g_NvidiaExt[index].src1u.xy = compareValue;
    g_NvidiaExt[index].src1u.zw = value;
    g_NvidiaExt[index].src2u.x  = NV_EXTN_ATOM_CAS;
    g_NvidiaExt[index].opcode   = NV_EXTN_OP_UINT64_ATOMIC;

    return g_NvidiaExt[index].dst0u.xy;
}

uint2 __NvAtomicOpUINT64(RWByteAddressBuffer uav, uint byteAddress, uint2 value, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);

    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x  = byteAddress;
    g_NvidiaExt[index].src1u.xy = value;
    g_NvidiaExt[index].src2u.x  = atomicOpType;
    g_NvidiaExt[index].opcode   = NV_EXTN_OP_UINT64_ATOMIC;

    return g_NvidiaExt[index].dst0u.xy;
}

uint2 __NvAtomicCompareExchangeUINT64(RWTexture1D<uint2> uav, uint address, uint2 compareValue, uint2 value)
{
    __NvReferenceUAVForOp(uav);

    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x  = address;
    g_NvidiaExt[index].src1u.xy = compareValue;
    g_NvidiaExt[index].src1u.zw = value;
    g_NvidiaExt[index].src2u.x  = NV_EXTN_ATOM_CAS;
    g_NvidiaExt[index].opcode   = NV_EXTN_OP_UINT64_ATOMIC;

    return g_NvidiaExt[index].dst0u.xy;
}

uint2 __NvAtomicOpUINT64(RWTexture1D<uint2> uav, uint address, uint2 value, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);

    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x  = address;
    g_NvidiaExt[index].src1u.xy = value;
    g_NvidiaExt[index].src2u.x  = atomicOpType;
    g_NvidiaExt[index].opcode   = NV_EXTN_OP_UINT64_ATOMIC;

    return g_NvidiaExt[index].dst0u.xy;
}

uint2 __NvAtomicCompareExchangeUINT64(RWTexture2D<uint2> uav, uint2 address, uint2 compareValue, uint2 value)
{
    __NvReferenceUAVForOp(uav);

    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].src1u.xy = compareValue;
    g_NvidiaExt[index].src1u.zw = value;
    g_NvidiaExt[index].src2u.x  = NV_EXTN_ATOM_CAS;
    g_NvidiaExt[index].opcode   = NV_EXTN_OP_UINT64_ATOMIC;

    return g_NvidiaExt[index].dst0u.xy;
}

uint2 __NvAtomicOpUINT64(RWTexture2D<uint2> uav, uint2 address, uint2 value, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);

    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].src1u.xy = value;
    g_NvidiaExt[index].src2u.x  = atomicOpType;
    g_NvidiaExt[index].opcode   = NV_EXTN_OP_UINT64_ATOMIC;

    return g_NvidiaExt[index].dst0u.xy;
}

uint2 __NvAtomicCompareExchangeUINT64(RWTexture3D<uint2> uav, uint3 address, uint2 compareValue, uint2 value)
{
    __NvReferenceUAVForOp(uav);

    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].src1u.xy  = compareValue;
    g_NvidiaExt[index].src1u.zw  = value;
    g_NvidiaExt[index].src2u.x   = NV_EXTN_ATOM_CAS;
    g_NvidiaExt[index].opcode    = NV_EXTN_OP_UINT64_ATOMIC;

    return g_NvidiaExt[index].dst0u.xy;
}

uint2 __NvAtomicOpUINT64(RWTexture3D<uint2> uav, uint3 address, uint2 value, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);

    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].src1u.xy  = value;
    g_NvidiaExt[index].src2u.x   = atomicOpType;
    g_NvidiaExt[index].opcode    = NV_EXTN_OP_UINT64_ATOMIC;

    return g_NvidiaExt[index].dst0u.xy;
}
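
// Illustrative examples (not part of the original header): treating a RWByteAddressBuffer
// as an array of uint64 values, each represented as uint2(low 32 bits, high 32 bits). The
// buffer layout, element indexing, and function names are assumptions made for this sketch.
uint2 ExampleUint64Max(RWByteAddressBuffer buf, uint elemIndex, uint2 value64)
{
    // byteAddress = elemIndex * 8 keeps the required 8-byte alignment
    return __NvAtomicOpUINT64(buf, elemIndex * 8, value64, NV_EXTN_ATOM_MAX);
}

uint2 ExampleUint64CompareExchange(RWByteAddressBuffer buf, uint elemIndex, uint2 expected, uint2 newValue)
{
    // writes newValue only if the current 64-bit contents equal expected; returns the prior contents
    return __NvAtomicCompareExchangeUINT64(buf, elemIndex * 8, expected, newValue);
}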


uint4 __NvFootprint(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint footprintmode, uint gran, int3 offset = int3(0, 0, 0))
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x   = texIndex;
    g_NvidiaExt[index].src0u.y   = smpIndex;
    g_NvidiaExt[index].src1u.xyz = asuint(location);
    g_NvidiaExt[index].src1u.w   = gran;
    g_NvidiaExt[index].src3u.x   = texSpace;
    g_NvidiaExt[index].src3u.y   = smpSpace;
    g_NvidiaExt[index].src3u.z   = texType;
    g_NvidiaExt[index].src3u.w   = footprintmode;
    g_NvidiaExt[index].src4u.xyz = asuint(offset);

    g_NvidiaExt[index].opcode = NV_EXTN_OP_FOOTPRINT;
    g_NvidiaExt[index].numOutputsForIncCounter = 4;

    // result is returned as the return value of IncrementCounter on the fake UAV slot
    uint4 op;
    op.x = g_NvidiaExt.IncrementCounter();
    op.y = g_NvidiaExt.IncrementCounter();
    op.z = g_NvidiaExt.IncrementCounter();
    op.w = g_NvidiaExt.IncrementCounter();
    return op;
}

uint4 __NvFootprintBias(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint footprintmode, uint gran, float bias, int3 offset = int3(0, 0, 0))
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x   = texIndex;
    g_NvidiaExt[index].src0u.y   = smpIndex;
    g_NvidiaExt[index].src1u.xyz = asuint(location);
    g_NvidiaExt[index].src1u.w   = gran;
    g_NvidiaExt[index].src2u.x   = asuint(bias);
    g_NvidiaExt[index].src3u.x   = texSpace;
    g_NvidiaExt[index].src3u.y   = smpSpace;
    g_NvidiaExt[index].src3u.z   = texType;
    g_NvidiaExt[index].src3u.w   = footprintmode;
    g_NvidiaExt[index].src4u.xyz = asuint(offset);

    g_NvidiaExt[index].opcode = NV_EXTN_OP_FOOTPRINT_BIAS;
    g_NvidiaExt[index].numOutputsForIncCounter = 4;

    // result is returned as the return value of IncrementCounter on the fake UAV slot
    uint4 op;
    op.x = g_NvidiaExt.IncrementCounter();
    op.y = g_NvidiaExt.IncrementCounter();
    op.z = g_NvidiaExt.IncrementCounter();
    op.w = g_NvidiaExt.IncrementCounter();
    return op;
}

uint4 __NvFootprintLevel(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint footprintmode, uint gran, float lodLevel, int3 offset = int3(0, 0, 0))
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x   = texIndex;
    g_NvidiaExt[index].src0u.y   = smpIndex;
    g_NvidiaExt[index].src1u.xyz = asuint(location);
    g_NvidiaExt[index].src1u.w   = gran;
    g_NvidiaExt[index].src2u.x   = asuint(lodLevel);
    g_NvidiaExt[index].src3u.x   = texSpace;
    g_NvidiaExt[index].src3u.y   = smpSpace;
    g_NvidiaExt[index].src3u.z   = texType;
    g_NvidiaExt[index].src3u.w   = footprintmode;
    g_NvidiaExt[index].src4u.xyz = asuint(offset);

    g_NvidiaExt[index].opcode = NV_EXTN_OP_FOOTPRINT_LEVEL;
    g_NvidiaExt[index].numOutputsForIncCounter = 4;

    // result is returned as the return value of IncrementCounter on the fake UAV slot
    uint4 op;
    op.x = g_NvidiaExt.IncrementCounter();
    op.y = g_NvidiaExt.IncrementCounter();
    op.z = g_NvidiaExt.IncrementCounter();
    op.w = g_NvidiaExt.IncrementCounter();
    return op;
}

uint4 __NvFootprintGrad(uint texSpace, uint texIndex, uint smpSpace, uint smpIndex, uint texType, float3 location, uint footprintmode, uint gran, float3 ddx, float3 ddy, int3 offset = int3(0, 0, 0))
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x   = texIndex;
    g_NvidiaExt[index].src0u.y   = smpIndex;
    g_NvidiaExt[index].src1u.xyz = asuint(location);
    g_NvidiaExt[index].src1u.w   = gran;
    g_NvidiaExt[index].src2u.xyz = asuint(ddx);
    g_NvidiaExt[index].src5u.xyz = asuint(ddy);
    g_NvidiaExt[index].src3u.x   = texSpace;
    g_NvidiaExt[index].src3u.y   = smpSpace;
    g_NvidiaExt[index].src3u.z   = texType;
    g_NvidiaExt[index].src3u.w   = footprintmode;
    g_NvidiaExt[index].src4u.xyz = asuint(offset);
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FOOTPRINT_GRAD;
    g_NvidiaExt[index].numOutputsForIncCounter = 4;

    // result is returned as the return value of IncrementCounter on the fake UAV slot
    uint4 op;
    op.x = g_NvidiaExt.IncrementCounter();
    op.y = g_NvidiaExt.IncrementCounter();
    op.z = g_NvidiaExt.IncrementCounter();
    op.w = g_NvidiaExt.IncrementCounter();
    return op;
}

// returns the value of a special register - specify the sub-opcode using one of the NV_SPECIALOP_*
// values defined in nvShaderExtnEnums.h - behavior is undefined for other sub-opcodes
uint __NvGetSpecial(uint subOpCode)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].opcode  = NV_EXTN_OP_GET_SPECIAL;
    g_NvidiaExt[index].src0u.x = subOpCode;
    return g_NvidiaExt.IncrementCounter();
}
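
// Illustrative example (not part of the original header): reading one special register.
// NV_SPECIALOP_GLOBAL_TIMER_LO is assumed here to be one of the NV_SPECIALOP_* values
// defined in nvShaderExtnEnums.h; substitute whichever sub-opcode is actually needed.
uint ExampleReadGlobalTimerLo()
{
    return __NvGetSpecial(NV_SPECIALOP_GLOBAL_TIMER_LO); // assumed sub-opcode name
}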

// the predicate 'laneValid' indicates whether srcLane is in range; the value from the specified lane is returned
int __NvShflGeneric(int val, uint srcLane, uint maskClampVal, out uint laneValid)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = val;      // variable to be shuffled
    g_NvidiaExt[index].src0u.y = srcLane;  // source lane
    g_NvidiaExt[index].src0u.z = maskClampVal;
    g_NvidiaExt[index].opcode  = NV_EXTN_OP_SHFL_GENERIC;
    g_NvidiaExt[index].numOutputsForIncCounter = 2;

    laneValid = asuint(g_NvidiaExt.IncrementCounter());
    return g_NvidiaExt.IncrementCounter();
}
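
// Illustrative example (not part of the original header): a width-limited shuffle built from
// __NvShflGeneric and __NvGetShflMaskFromWidth, returning 0 when the source lane falls outside
// the caller's segment. The wrapper shape and name are assumptions made for this sketch.
int ExampleShflWithinWidth(int val, uint srcLane, uint width)
{
    uint laneValid;
    int  result = __NvShflGeneric(val, srcLane, __NvGetShflMaskFromWidth(width), laneValid);
    return laneValid ? result : 0; // out-of-range source lanes yield 0 in this sketch
}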