I'm trying to improve Henry Thasler's GLSL implementation of double-single arithmetic (from his GLSL Mandelbrot demo) to work reliably on NVIDIA graphics on Linux. I've recently learned that since OpenGL 4.0 (§4.7 The Precise Qualifier in the spec) or with GL_ARB_gpu_shader5
extension (spec) we can use precise
qualifier to make the calculations follow the exact sequence of arithmetic operations specified in the GLSL source.
But the following attempt appears to not give any improvement:
#version 330
#extension GL_ARB_gpu_shader5 : require
// Double-single addition: each vec2 packs a value as the unevaluated sum of a
// high part (.x) and a low correction (.y), giving roughly twice the precision
// of a single float. The body is the classic TwoSum / Dekker renormalization,
// which is only correct if the compiler evaluates these additions and
// subtractions in exactly this order — hence the `precise` qualifiers.
vec2 ds_add(vec2 dsa, vec2 dsb)
{
precise float t1 = dsa.x + dsb.x; // high-order sum (rounded)
precise float e = t1 - dsa.x; // portion of dsb.x that was absorbed into t1
precise float t2 = ((dsb.x - e) + (dsa.x - (t1 - e))) + dsa.y + dsb.y; // exact rounding error of t1, plus both low parts
precise vec2 dsc;
dsc.x = t1 + t2; // renormalize: new high part
dsc.y = t2 - (dsc.x - t1); // new low part = what didn't fit into dsc.x
return dsc;
}
The result is the same as if there were no precise
added. I've checked that the algorithm itself is correct: it works as is (even without precise
) on Intel Core i7-4765T built-in graphics, and if I hide some variables to inhibit optimizations, then NVidia also gives the correct results. Here's how I inhibit the optimizations:
#version 330
#define hide(x) ((x)*one)
uniform float one=1;
// Same double-single addition, but instead of `precise` it defeats the
// optimizer by multiplying key intermediates by a uniform whose value (1.0)
// the compiler cannot know at compile time, so it cannot algebraically
// simplify e.g. (t1 - dsa.x) down to dsb.x.
vec2 ds_add(vec2 dsa, vec2 dsb)
{
float t1 = dsa.x + dsb.x; // high-order sum (rounded)
float e = hide(t1) - dsa.x; // hide() blocks cancellation of t1 - dsa.x
float t2 = ((dsb.x - e) + (dsa.x - (t1 - e))) + dsa.y + dsb.y; // rounding error of t1 + low parts
vec2 dsc;
dsc.x = t1 + t2; // renormalized high part
dsc.y = t2 - (hide(dsc.x) - t1); // renormalized low part; hide() again blocks simplification
return dsc;
}
So, apparently, I'm using the precise
qualifier incorrectly. But what exactly is wrong here?
For reference, I'm using NVidia GeForce GTX 750Ti with binary nvidia driver 390.116. Here's the full C++ test:
#include <cassert>
#include <cmath>
#include <iomanip>
#include <iostream>
#include <limits>
#include <string>
#include <vector>
// glad.h is generated by the following command:
// glad --out-path=. --generator=c --omit-khrplatform --api="gl=3.3" --profile=core --extensions=
#include "glad/glad.h"
#include <GL/freeglut.h>
#include <glm/glm.hpp>
using glm::vec4;
GLuint vao, vbo;      // full-screen quad geometry
GLuint texFBO;        // RGBA32F texture backing the FBO (shader output target)
GLuint program;       // the vertex+fragment shader program under test
GLuint fbo;           // off-screen framebuffer we render into and read back from
int width=1, height=2; // row 0: inputs (a,b); row 1: ds_add(a,b) result
void printShaderOutput(int texW, int texH)
{
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, texFBO);
std::vector<vec4> data(texW*texH);
glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_FLOAT, data.data());
std::cout << "a,b,sum,relError(sum),note\n";
for(int i=0;i<width;++i)
{
const auto a=double(data[i+width*0].x)+double(data[i+width*0].y);
const auto b=double(data[i+width*0].z)+double(data[i+width*0].w);
const auto sum=double(data[i+width*1].x)+double(data[i+width*1].y);
const auto trueSum=a+b;
const auto sumErr=(sum-trueSum)/trueSum;
std::cout << std::setprecision(std::numeric_limits<double>::max_digits10)
<< a << ',' << b << ','
<< sum << ','
<< std::setprecision(3)
<< sumErr << ','
<< (std::abs(sumErr)>1e-14 ? "WARN" : "OK")
<< '\n';
}
std::cout.flush();
}
// Compile a single shader stage of the given type from GLSL source.
// Aborts via assert() on GL errors or compile failure; returns the shader id.
GLuint makeShader(GLenum type, std::string const& srcStr)
{
    const GLuint handle = glCreateShader(type);
    const GLchar* source = srcStr.c_str();
    const GLint length = srcStr.size();
    glShaderSource(handle, 1, &source, &length);
    glCompileShader(handle);
    GLint compiled = -1;
    glGetShaderiv(handle, GL_COMPILE_STATUS, &compiled);
    assert(glGetError() == GL_NO_ERROR);
    assert(compiled);
    return handle;
}
// Build the test program: a pass-through vertex shader plus a fragment shader
// that writes the inputs (a,b) into texture row 0 and ds_add(a,b) into row 1.
// The "1+" before each raw string skips the leading newline of the literal so
// "#version" lands on the first line, as GLSL requires.
void loadShaders()
{
program=glCreateProgram();
const auto vertexShader=makeShader(GL_VERTEX_SHADER, 1+R"(
#version 330
in vec4 vertex;
void main() { gl_Position=vertex; }
)");
glAttachShader(program, vertexShader);
const auto fragmentShader=makeShader(GL_FRAGMENT_SHADER, 1+R"(
#version 330
#extension GL_ARB_gpu_shader5 : require
vec2 ds_add(vec2 dsa, vec2 dsb)
{
precise float t1 = dsa.x + dsb.x;
precise float e = t1 - dsa.x;
precise float t2 = ((dsb.x - e) + (dsa.x - (t1 - e))) + dsa.y + dsb.y;
precise vec2 dsc;
dsc.x = t1 + t2;
dsc.y = t2 - (dsc.x - t1);
return dsc;
}
uniform vec2 a, b;
out vec4 color;
void main()
{
if(gl_FragCoord.y<1) // first row
color=vec4(a,b);
else if(gl_FragCoord.y<2) // second row
color=vec4(ds_add(a,b),0,0);
}
)");
glAttachShader(program, fragmentShader);
glLinkProgram(program);
GLint status=0;
glGetProgramiv(program, GL_LINK_STATUS, &status);
assert(glGetError()==GL_NO_ERROR);
assert(status);
// Shaders are no longer needed once the program is linked.
glDetachShader(program, fragmentShader);
glDeleteShader(fragmentShader);
glDetachShader(program, vertexShader);
glDeleteShader(vertexShader);
}
void setupBuffers()
{
glGenVertexArrays(1, &vao);
glBindVertexArray(vao);
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
const GLfloat vertices[]=
{
-1, -1,
1, -1,
-1, 1,
1, 1,
};
glBufferData(GL_ARRAY_BUFFER, sizeof vertices, vertices, GL_STATIC_DRAW);
constexpr GLuint attribIndex=0;
constexpr int coordsPerVertex=2;
glVertexAttribPointer(attribIndex, coordsPerVertex, GL_FLOAT, false, 0, 0);
glEnableVertexAttribArray(attribIndex);
glBindVertexArray(0);
}
bool init()
{
if(!gladLoadGL())
{
std::cerr << "Failed to initialize GLAD\n";
return false;
}
if(!GLAD_GL_VERSION_3_3)
{
std::cerr << "OpenGL 3.3 not supported\n";
return false;
}
glGenTextures(1, &texFBO);
glGenFramebuffers(1,&fbo);
loadShaders();
setupBuffers();
glViewport(0,0,width,height);
glBindTexture(GL_TEXTURE_2D,texFBO);
glTexImage2D(GL_TEXTURE_2D,0,GL_RGBA32F,width,height,0,GL_RGBA,GL_UNSIGNED_BYTE,nullptr);
glBindTexture(GL_TEXTURE_2D,0);
glBindFramebuffer(GL_FRAMEBUFFER,fbo);
glFramebufferTexture2D(GL_FRAMEBUFFER,GL_COLOR_ATTACHMENT0,GL_TEXTURE_2D,texFBO,0);
const auto status=glCheckFramebufferStatus(GL_FRAMEBUFFER);
assert(status==GL_FRAMEBUFFER_COMPLETE);
glBindFramebuffer(GL_FRAMEBUFFER,0);
return true;
}
// One-shot GLUT display callback: lazily initialize, upload the two test
// values as double-single (hi,lo) uniform pairs, render the quad into the
// FBO, read the results back, print them, and exit the process.
void display()
{
    const static bool inited=init();
    if(!inited) std::exit(1);
    glBindFramebuffer(GL_FRAMEBUFFER,fbo);
    glUseProgram(program);
    // Split a double into a double-single pair: high float + residual low float.
#define SPLIT_DOUBLE_TO_FLOATS(x) GLfloat(x),GLfloat(x-GLfloat(x))
    glUniform2f(glGetUniformLocation(program,"a"),SPLIT_DOUBLE_TO_FLOATS(3.1415926535897932));
    glUniform2f(glGetUniformLocation(program,"b"),SPLIT_DOUBLE_TO_FLOATS(2.7182818284590452));
    // NOTE(review): "rtWidth" is not declared in the shader, so the location is
    // -1 and this call is silently ignored — leftover from an earlier version?
    glUniform1f(glGetUniformLocation(program,"rtWidth"),width);
    glBindVertexArray(vao);
    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
    glBindVertexArray(0);
    // BUGFIX: glFinish() was previously placed after std::exit(0), making it
    // unreachable. Sync before reading the texture back instead.
    glFinish();
    printShaderOutput(width, height);
    std::exit(0);
}
// Entry point: request an OpenGL 3.3 core-profile context via freeglut,
// create a tiny 1x2 window matching the render target, and enter the GLUT
// main loop; all work happens (once) in the display() callback.
int main(int argc, char** argv)
{
// Context hints must be set before glutCreateWindow().
glutInitContextVersion(3,3);
glutInitContextProfile(GLUT_CORE_PROFILE);
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_RGB);
glutInitWindowSize(width, height);
glutCreateWindow("Test");
glutDisplayFunc(display);
glutMainLoop(); // never returns; display() calls std::exit()
}
I've been able to extract the NVfp5.0 assembly from the GLSL program binaries in the different cases:
hide
and without precise
:!!NVfp5.0
OPTION NV_internal;
OPTION NV_bindless_texture;
PARAM c[2] = { program.local[0..1] };
TEMP R0;
TEMP T;
TEMP RC, HC;
OUTPUT result_color0 = result.color;
SLT.F R0.x, fragment.position.y, {1, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
MOV.F result_color0.xy, c[0];
MOV.F result_color0.zw, c[1].xyxy;
ELSE;
SLT.F R0.x, fragment.position.y, {2, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
ADD.F R0.y, -c[0].x, c[0].x;
ADD.F R0.x, -c[1], c[1];
ADD.F R0.x, R0, R0.y;
ADD.F R0.x, R0, c[0].y;
ADD.F R0.y, R0.x, c[1];
ADD.F R0.x, c[0], c[1];
ADD.F result_color0.x, R0, R0.y;
ADD.F result_color0.y, R0, -R0;
MOV.F result_color0.zw, {0, 0, 0, 0}.x;
ENDIF;
ENDIF;
END
precise
(notice that nothing changes except .PREC
suffix in the "instructions"):!!NVfp5.0
OPTION NV_internal;
OPTION NV_bindless_texture;
PARAM c[2] = { program.local[0..1] };
TEMP R0;
TEMP T;
TEMP RC, HC;
OUTPUT result_color0 = result.color;
SLT.F R0.x, fragment.position.y, {1, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
MOV.F result_color0.xy, c[0];
MOV.F result_color0.zw, c[1].xyxy;
ELSE;
SLT.F R0.x, fragment.position.y, {2, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
ADD.F.PREC R0.y, -c[0].x, c[0].x;
ADD.F.PREC R0.x, -c[1], c[1];
ADD.F.PREC R0.x, R0, R0.y;
ADD.F.PREC R0.x, R0, c[0].y;
ADD.F.PREC R0.y, R0.x, c[1];
ADD.F.PREC R0.x, c[0], c[1];
ADD.F.PREC result_color0.x, R0, R0.y;
ADD.F.PREC result_color0.y, R0, -R0;
MOV.F result_color0.zw, {0, 0, 0, 0}.x;
ENDIF;
ENDIF;
END
hide
, which does work, and obviously has a different sequence of arithmetic operations:!!NVfp5.0
OPTION NV_internal;
OPTION NV_bindless_texture;
PARAM c[3] = { program.local[0..2] };
TEMP R0, R1;
TEMP T;
TEMP RC, HC;
OUTPUT result_color0 = result.color;
SLT.F R0.x, fragment.position.y, {1, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
MOV.F result_color0.xy, c[1];
MOV.F result_color0.zw, c[2].xyxy;
ELSE;
SLT.F R0.x, fragment.position.y, {2, 0, 0, 0};
TRUNC.U.CC HC.x, R0;
IF NE.x;
ADD.F R0.x, c[1], c[2];
MAD.F R0.y, R0.x, c[0].x, -c[1].x;
ADD.F R0.z, R0.x, -R0.y;
ADD.F R0.z, -R0, c[1].x;
ADD.F R0.y, -R0, c[2].x;
ADD.F R0.y, R0, R0.z;
ADD.F R0.y, R0, c[1];
ADD.F R0.y, R0, c[2];
ADD.F R1.x, R0, R0.y;
MAD.F R0.x, R1, c[0], -R0;
MOV.F R1.zw, {0, 0, 0, 0}.x;
ADD.F R1.y, R0, -R0.x;
MOV.F result_color0, R1;
ENDIF;
ENDIF;
END
I've never used precise myself, though you may benefit from learning OpenCL or CUDA here.
In any case, your GLSL version is 3.30, which corresponds to OpenGL 3.3. The precise qualifier is available there only through an extension, but I would always prefer a built-in feature of OpenGL when one is available.
The extension may not be implemented in the same manner, I suggest you try using at least GLSL version 4.0, ideally the latest OpenGL / GLSL version.
Sometimes these old extensions can have regressions on newer GPUs if no one is using them.
GPU compilers tend to be more liberal with optimizations. You may benefit from seeing the output from the compiled shader, there may be some way to view the PTX assembly output from the Nvidia compiler with GLSL. With CUDA, you can definitely preview the assembly output to ensure the operations are not being re-ordered by the compiler.
The spec mentions MAD as being the main reason for the qualifier--it will force the compiler not to use the MAD instruction. Perhaps little testing was done with addition / subtraction with the precise qualifier.
If hide solves it for you, it's probably best to just call it a day, I doubt the precise qualifier has been thoroughly checked on the GLSL side. I highly recommend CUDA or OpenCL for this, you can use CL-GL interop if you want to display the texture quickly as well, which isn't terribly painful.
The precise qualifier ensures there is no re-ordering of operations, but doesn't mention optimizations which don't affect the ordering. It seems like AMD just turns off optimizations when using it. It's still possible that Nvidia still applies optimizations which are affecting your result which aren't related to the order of operations but rather to specific optimizations to addition being performed.
precise float t1 = dsa.x + dsb.x;
precise float e = t1 - dsa.x;
This will probably compute e
as simply dsb.x
. The compiler may potentially still be adding optimizations which don't affect the order of operations, as that's all that the spec guarantees. I can't think of anything other than the operations being re-ordered that would affect this result, but I'm no expert here.
Another thing to note is that based on my cursory reading of the spec, the result from the ds_add may need to be stored into a precise variable as well in order for the computation to be precise. The function may be inlined only on Nvidia (they have much better optimizations, at least historically), so I imagine that the compiler may perform the inlining and then if you store the result into a non-precise variable then all the existing precise qualifiers are ignored.
If you found this helpful, you can donate via PayPal or buy me a coffee so we can maintain and grow. Thank you!