#ifndef NON_POWER_OF_TWO #define NON_POWER_OF_TWO 0 #endif #ifndef CONVERT_TO_SRGB #define CONVERT_TO_SRGB 0 #endif #ifndef IMG_FORMAT #define IMG_FORMAT rgba8 #endif layout(IMG_FORMAT) uniform writeonly image2DArray OutMip0; layout(IMG_FORMAT) uniform writeonly image2DArray OutMip1; layout(IMG_FORMAT) uniform writeonly image2DArray OutMip2; layout(IMG_FORMAT) uniform writeonly image2DArray OutMip3; uniform sampler2DArray SrcMip; uniform CB { int SrcMipLevel; // Texture level of source mip int NumMipLevels; // Number of OutMips to write: [1, 4] int FirstArraySlice; int Dummy; vec2 TexelSize; // 1.0 / OutMip1.Dimensions }; // // The reason for separating channels is to reduce bank conflicts in the // local data memory controller. A large stride will cause more threads // to collide on the same memory bank. shared float gs_R[64]; shared float gs_G[64]; shared float gs_B[64]; shared float gs_A[64]; void StoreColor( uint Index, vec4 Color ) { gs_R[Index] = Color.r; gs_G[Index] = Color.g; gs_B[Index] = Color.b; gs_A[Index] = Color.a; } vec4 LoadColor( uint Index ) { return vec4( gs_R[Index], gs_G[Index], gs_B[Index], gs_A[Index]); } float LinearToSRGB(float x) { // This is exactly the sRGB curve //return x < 0.0031308 ? 12.92 * x : 1.055 * pow(abs(x), 1.0 / 2.4) - 0.055; // This is cheaper but nearly equivalent return x < 0.0031308 ? 12.92 * x : 1.13005 * sqrt(abs(x - 0.00228)) - 0.13448 * x + 0.005719; } vec4 PackColor(vec4 Linear) { #if CONVERT_TO_SRGB return vec4(LinearToSRGB(Linear.r), LinearToSRGB(Linear.g), LinearToSRGB(Linear.b), Linear.a); #else return Linear; #endif } void GroupMemoryBarrierWithGroupSync() { // OpenGL.org: groupMemoryBarrier() waits on the completion of all memory accesses // performed by an invocation of a compute shader relative to the same access performed // by other invocations in the same work group and then returns with no other effect. // groupMemoryBarrier() acts like memoryBarrier(), ordering memory writes for all kinds // of variables, but it only orders read/writes for the current work group. groupMemoryBarrier(); // OpenGL.org: memoryBarrierShared() waits on the completion of // all memory accesses resulting from the use of SHARED variables // and then returns with no other effect. memoryBarrierShared(); // Thread execution barrier barrier(); } layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in; void main() { uint LocalInd = gl_LocalInvocationIndex; uvec3 GlobalInd = gl_GlobalInvocationID; ivec3 SrcMipSize = textureSize(SrcMip, 0); // SrcMip is the view of the source mip level bool IsValidThread = GlobalInd.x < uint(SrcMipSize.x) && GlobalInd.y < uint(SrcMipSize.y); int ArraySlice = FirstArraySlice + int(GlobalInd.z); vec4 Src1 = vec4(0.0, 0.0, 0.0, 0.0); float fSrcMipLevel = 0.0; // SrcMip is the view of the source mip level if (IsValidThread) { // One bilinear sample is insufficient when scaling down by more than 2x. // You will slightly undersample in the case where the source dimension // is odd. This is why it's a really good idea to only generate mips on // power-of-two sized textures. Trying to handle the undersampling case // will force this shader to be slower and more complicated as it will // have to take more source texture samples. #if NON_POWER_OF_TWO == 0 vec2 UV = TexelSize * (vec2(GlobalInd.xy) + vec2(0.5, 0.5)); Src1 = textureLod(SrcMip, vec3(UV, ArraySlice), fSrcMipLevel); #elif NON_POWER_OF_TWO == 1 // > 2:1 in X dimension // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x // horizontally. vec2 UV1 = TexelSize * (vec2(GlobalInd.xy) + vec2(0.25, 0.5)); vec2 Off = TexelSize * vec2(0.5, 0.0); Src1 = 0.5 * (textureLod(SrcMip, vec3(UV1, ArraySlice), fSrcMipLevel) + textureLod(SrcMip, vec3(UV1 + Off, ArraySlice), fSrcMipLevel)); #elif NON_POWER_OF_TWO == 2 // > 2:1 in Y dimension // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x // vertically. vec2 UV1 = TexelSize * (vec2(GlobalInd.xy) + vec2(0.5, 0.25)); vec2 Off = TexelSize * vec2(0.0, 0.5); Src1 = 0.5 * (textureLod(SrcMip, vec3(UV1, ArraySlice), fSrcMipLevel) + textureLod(SrcMip, vec3(UV1 + Off, ArraySlice), fSrcMipLevel)); #elif NON_POWER_OF_TWO == 3 // > 2:1 in in both dimensions // Use 4 bilinear samples to guarantee we don't undersample when downsizing by more than 2x // in both directions. vec2 UV1 = TexelSize * (vec2(GlobalInd.xy) + vec2(0.25, 0.25)); vec2 Off = TexelSize * 0.5; Src1 += textureLod(SrcMip, vec3(UV1, ArraySlice), fSrcMipLevel); Src1 += textureLod(SrcMip, vec3(UV1 + vec2(Off.x, 0.0), ArraySlice), fSrcMipLevel); Src1 += textureLod(SrcMip, vec3(UV1 + vec2(0.0, Off.y), ArraySlice), fSrcMipLevel); Src1 += textureLod(SrcMip, vec3(UV1 + vec2(Off.x, Off.y), ArraySlice), fSrcMipLevel); Src1 *= 0.25; #endif imageStore(OutMip0, ivec3(GlobalInd.xy, ArraySlice), PackColor(Src1)); } // A scalar (constant) branch can exit all threads coherently. if (NumMipLevels == 1) return; if (IsValidThread) { // Without lane swizzle operations, the only way to share data with other // threads is through LDS. StoreColor(LocalInd, Src1); } // This guarantees all LDS writes are complete and that all threads have // executed all instructions so far (and therefore have issued their LDS // write instructions.) GroupMemoryBarrierWithGroupSync(); if (IsValidThread) { // With low three bits for X and high three bits for Y, this bit mask // (binary: 001001) checks that X and Y are even. if ((LocalInd & 0x9u) == 0u) { vec4 Src2 = LoadColor(LocalInd + 0x01u); vec4 Src3 = LoadColor(LocalInd + 0x08u); vec4 Src4 = LoadColor(LocalInd + 0x09u); Src1 = 0.25 * (Src1 + Src2 + Src3 + Src4); imageStore(OutMip1, ivec3(GlobalInd.xy / 2u, ArraySlice), PackColor(Src1)); StoreColor(LocalInd, Src1); } } if (NumMipLevels == 2) return; GroupMemoryBarrierWithGroupSync(); if( IsValidThread ) { // This bit mask (binary: 011011) checks that X and Y are multiples of four. if ((LocalInd & 0x1Bu) == 0u) { vec4 Src2 = LoadColor(LocalInd + 0x02u); vec4 Src3 = LoadColor(LocalInd + 0x10u); vec4 Src4 = LoadColor(LocalInd + 0x12u); Src1 = 0.25 * (Src1 + Src2 + Src3 + Src4); imageStore(OutMip2, ivec3(GlobalInd.xy / 4u, ArraySlice), PackColor(Src1)); StoreColor(LocalInd, Src1); } } if (NumMipLevels == 3) return; GroupMemoryBarrierWithGroupSync(); if( IsValidThread ) { // This bit mask would be 111111 (X & Y multiples of 8), but only one // thread fits that criteria. if (LocalInd == 0u) { vec4 Src2 = LoadColor(LocalInd + 0x04u); vec4 Src3 = LoadColor(LocalInd + 0x20u); vec4 Src4 = LoadColor(LocalInd + 0x24u); Src1 = 0.25 * (Src1 + Src2 + Src3 + Src4); imageStore(OutMip3, ivec3(GlobalInd.xy / 8u, ArraySlice), PackColor(Src1)); } } }