Hi,
I'm trying to port a gaussian blur fragment shader to a gaussian blur compute shader.
According to GPU Perf Studio, the original fragment shader is bandwidth-bound on an HD 7750, so such a port makes sense.
However, the new compute shader is 4 times slower than the fragment shader.
I use a local size of 8x8 with a shared memory array of 48x32 vec4 (am I reducing occupancy by allocating too much LDS?). The dispatch grid is 210x32.
According to GPU Perf Studio, I'm ALU-bound, and reducing the loop count in the kernel reduces the execution time.
This is surprising, as the computations are the same as in the original fragment shader.
Here are the fragment shader and the compute shader.
I tried to output 4 pixels in the compute shader but it didn't change anything.
Is there something I'm doing wrong?
________________________
// Horizontal Gaussian blur (fragment shader).
// For each fragment, sums the centre texel plus 8 texels on either side along
// X, weighted by a Gaussian of standard deviation `sigma`.

uniform sampler2D tex;     // image to blur
uniform vec2 pixel;        // scales gl_FragCoord.xy to UVs; pixel.x is the step between taps
                           // (presumably 1 / texture size -- confirm against the caller)
uniform float sigma = 5.;  // standard deviation of the Gaussian

out vec4 FragColor;

void main()
{
    vec2 uv = gl_FragCoord.xy * pixel;
    float X = uv.x;
    float Y = uv.y;

    // Incremental Gaussian: g0 holds the weight of the current tap, g1 the
    // ratio to the next tap's weight, g2 the constant factor by which that
    // ratio shrinks. This avoids calling exp() once per tap.
    float g0, g1, g2;
    g0 = 1.0 / (sqrt(2.0 * 3.14) * sigma);  // weight at offset 0
    g1 = exp(-0.5 / (sigma * sigma));
    g2 = g1 * g1;

    // Centre tap.
    vec4 sum = texture(tex, vec2(X, Y)) * g0;
    g0 *= g1;
    g1 *= g2;

    // Symmetric taps at offsets 1..8; both sides share the same weight.
    for (int i = 1; i < 9; i++) {
        sum += texture(tex, vec2(X - i * pixel.x, Y)) * g0;
        sum += texture(tex, vec2(X + i * pixel.x, Y)) * g0;
        g0 *= g1;
        g1 *= g2;
    }

    FragColor = sum;
}
________________________
// Debug Name:
/*------------------- Shader 141 -------------------*/
#version 430
//C:\Users\vljn_000\Documents\GitHub\stk-code\bld\bin\Release/../../../data/shaders/gaussian.comp
#define VSLayer
// Horizontal Gaussian blur (compute shader).
//
// Each 8x8 work group stages a tile of `source` into shared memory and then
// blurs it along X. Every invocation handles ROWS_PER_THREAD rows, so a group
// covers 8 columns x 32 rows of output. The tile carries TILE extra columns on
// each side (the halo) so that taps up to RADIUS texels away are available.

const int TILE = 8;             // work-group width and height; must match local_size_x/y below
const int ROWS_PER_THREAD = 4;  // output rows produced by each invocation
const int RADIUS = 8;           // taps on each side of the centre; must be <= TILE (halo width)

uniform layout(size1x16) restrict readonly image2D source;
uniform layout(size1x16) volatile restrict writeonly image2D dest;
uniform vec2 pixel;        // not used by this shader; kept so the uniform interface is unchanged
uniform float sigma = 5.;  // standard deviation of the Gaussian

layout (local_size_x = 8, local_size_y = 8) in;

// Indexed [column][row]: left halo | centre | right halo columns, by
// TILE * ROWS_PER_THREAD rows.
shared vec4 local_src[TILE + 2 * TILE][TILE * ROWS_PER_THREAD];

void main()
{
    int x = int(gl_LocalInvocationID.x), y = int(gl_LocalInvocationID.y);
    ivec2 gid = ivec2(gl_GlobalInvocationID.xy);

    // Stage the tile: for each of its rows, an invocation loads the texel one
    // tile to the left, its own texel, and the texel one tile to the right.
    for (int i = 0; i < ROWS_PER_THREAD; i++)
    {
        ivec2 uv = ivec2(gid.x, gid.y * ROWS_PER_THREAD + i);
        int row = y + i * TILE;
        local_src[x][row]            = imageLoad(source, uv - ivec2(TILE, 0));
        local_src[x + TILE][row]     = imageLoad(source, uv);
        local_src[x + 2 * TILE][row] = imageLoad(source, uv + ivec2(TILE, 0));
    }

    // Make the shared-memory writes visible to the whole work group, then wait
    // until every invocation has finished staging.
    memoryBarrierShared();
    barrier();

    // Seeds of the incremental Gaussian. They depend only on `sigma`, so they
    // are computed once rather than once per row.
    float w0 = 1.0 / (sqrt(2.0 * 3.14) * sigma);  // weight at offset 0
    float r0 = exp(-0.5 / (sigma * sigma));       // ratio from tap 0 to tap 1
    float g2 = r0 * r0;                           // factor by which the ratio shrinks per tap

    for (int i = 0; i < ROWS_PER_THREAD; i++)
    {
        int row = y + i * TILE;
        int cx = x + TILE;  // this invocation's column inside the tile

        // g0 is the weight of the current tap, g1 the ratio to the next one.
        float g0 = w0;
        float g1 = r0;

        // Centre tap.
        vec4 sum = local_src[cx][row] * g0;
        g0 *= g1;
        g1 *= g2;

        // Symmetric taps at offsets 1..RADIUS. The fragment-shader version of
        // this blur uses 8 taps per side; the previous bound (j < 8) stopped
        // at 7 and produced a narrower kernel.
        for (int j = 1; j <= RADIUS; j++) {
            sum += local_src[cx - j][row] * g0;
            sum += local_src[cx + j][row] * g0;
            g0 *= g1;
            g1 *= g2;
        }

        ivec2 uv = ivec2(gid.x, gid.y * ROWS_PER_THREAD + i);
        imageStore(dest, uv, sum);
    }
}
________________________
Vincent