书上讲常量内存的那一章里，有一个用光线追踪画球的例子。
我先不用常量内存（改用全局内存）把它实现了一遍。
另外，不得已自己写了一个 mvec3 结构体。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>
#include <cstdio>
#include "myGL.h"
using namespace std;
// Image/window width in pixels; main passes it to glfwStart, drawPixels and mkTex.
const GLuint WIDTH = 1280;
// Image/window height in pixels; main passes it to glfwStart, drawPixels and mkTex.
const GLuint HEIGHT = 960;
// Sentinel distance returned by Sphere::calc when the ray misses the sphere.
__device__ const float INF = 123456.0f;
// Initial "nearest hit" distance used by kernel. It is slightly smaller than INF,
// so a miss (INF) never satisfies the `gg < miniLen` comparison.
__device__ const float SINF = 123455.0f;
// Minimal 3-component float vector.
//
// Every member is __host__ __device__: the host default-constructs Sphere
// objects (which contain an mvec3) and fills in x/y/z while parsing the input
// file, and the device uses the arithmetic while tracing rays.
struct mvec3 {
    float x, y, z;

    // Builds the vector (_x, _y, _z); all components default to 0.
    __host__ __device__ mvec3(float _x = 0, float _y = 0, float _z = 0) : x(_x), y(_y), z(_z) {}

    // Euclidean length.
    __host__ __device__ float length() const {
        return sqrtf(x * x + y * y + z * z);
    }

    // Squared Euclidean length (no square root).
    __host__ __device__ float square() const {
        return x * x + y * y + z * z;
    }

    // Component-wise sum.
    __host__ __device__ mvec3 operator + (const mvec3 &t) const {
        return mvec3(x + t.x, y + t.y, z + t.z);
    }

    // Component-wise difference.
    __host__ __device__ mvec3 operator - (const mvec3 &t) const {
        return mvec3(x - t.x, y - t.y, z - t.z);
    }

    // Scales every component by t.
    __host__ __device__ mvec3 operator * (float t) const {
        return mvec3(x * t, y * t, z * t);
    }

    // Divides every component by t. No check is made for t == 0.
    __host__ __device__ mvec3 operator / (float t) const {
        return mvec3(x / t, y / t, z / t);
    }

    // Dot product of a and b.
    __host__ __device__ friend float dot(const mvec3 &a, const mvec3 &b) {
        return a.x * b.x + a.y * b.y + a.z * b.z;
    }

    // Cross product a x b.
    __host__ __device__ friend mvec3 cross(const mvec3 &a, const mvec3 &b) {
        return mvec3(
            a.y * b.z - a.z * b.y,
            a.z * b.x - a.x * b.z,
            a.x * b.y - a.y * b.x
        );
    }

    // Returns a scaled to unit length. A zero-length input divides by zero.
    __host__ __device__ friend mvec3 normalize(const mvec3 &a) {
        return a / a.length();
    }
};
// A coloured sphere. Rays are assumed to start at the coordinate origin.
struct Sphere {
    mvec3 site;            // centre of the sphere
    float r;               // radius
    unsigned char rgb[4];  // colour; only rgb[0..2] are meaningful, rgb[3] is always 0

    // Reads "x y z r" followed by three integer colour components from `is`.
    // Fields are zeroed first so a failed or short read never leaves them
    // indeterminate.
    void readData(ifstream &is) {
        site.x = site.y = site.z = 0.0f;
        r = 0.0f;
        is >> site.x >> site.y >> site.z >> r;
        for (int i = 0; i < 3; ++i) {
            int s = 0;
            is >> s;
            rgb[i] = (unsigned char)s;
        }
        rgb[3] = 0;
    }

    // Distance from the origin along `ray` to the first intersection with this
    // sphere, or INF when there is none.
    //
    // `ray` must be a unit vector. Spheres whose centre lies behind the origin,
    // and spheres that contain the origin, are reported as misses. The previous
    // formula took sqrt(|site|^2 - lh), which equals |g|, so a sphere behind the
    // viewer was rendered as if it were in front.
    __device__ float calc(const mvec3 &ray) const {
        // Signed distance along the ray to the point closest to the centre.
        const float g = dot(ray, site);
        if (g <= 0.0f) return INF;

        // Squared distance from the centre to the ray.
        const float lh = (site - ray * g).square();
        const float rr = r * r;
        if (lh >= rr) return INF;

        // Step back from the closest point by half the chord length.
        const float hit = g - sqrtf(rr - lh);
        return hit > 0.0f ? hit : INF;
    }

    // Cosine of the angle between `ray` and the vector from the hit point
    // (ray * dis) to the sphere centre. The kernel uses it as a brightness
    // factor.
    __device__ float calcCos(float dis, const mvec3 &ray) const {
        mvec3 rToC = normalize(site - ray * dis);  // unit vector from hit point to centre
        return dot(rToC, ray);
    }
};
// Forward declarations; both functions are defined after main.
// HANDLE_ERROR terminates the program when `status` is not cudaSuccess.
void HANDLE_ERROR(cudaError_t status);
// drawPixels renders `spnum` spheres into `res` (4 bytes per pixel, width*height pixels).
void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height);
// Loads the scene from "sphere.in": a sphere count followed by one record per
// sphere (see Sphere::readData).
//
// On return `res` always points to an array allocated with new[] (possibly of
// length 0) and `n` is the number of spheres that were read successfully. A
// missing file, an invalid count or a truncated file is reported on stderr
// instead of leaving `n` or the array contents indeterminate.
void ReadData(Sphere* &res, int &n) {
    n = 0;
    ifstream is("sphere.in");
    if (!is) {
        cerr << "Cannot open sphere.in" << endl;
    } else if (!(is >> n) || n < 0) {
        cerr << "sphere.in: invalid sphere count" << endl;
        n = 0;
    }

    res = new Sphere[n];
    for (int i = 0; i < n; ++i) {
        res[i].readData(is);
        if (!is) {
            // Record i was incomplete; keep only the spheres read so far.
            cerr << "sphere.in: data ended while reading sphere " << i << endl;
            n = i;
            break;
        }
    }
    cout << "Get " << n << " spheres." << endl;
}
// Program entry: loads the spheres, renders them once with drawPixels, uploads
// the result as a texture and draws it every frame until the window closes.
int main() {
    Sphere *spheres;
    int sphereCount;
    ReadData(spheres, sphereCount);

    // Host-side pixel buffer, 4 bytes per pixel.
    unsigned char *pixels = new unsigned char[WIDTH*HEIGHT * 4];

    GLFWwindow *window = glfwStart(WIDTH, HEIGHT, "ray-tracing");
    Shader shader;
    shader.mkShader("shader.vert", NULL, "shader.frag");
    GLuint vao = mkVAO();

    // Render once, then hand the pixels to mkTex.
    drawPixels(pixels, spheres, sphereCount, WIDTH, HEIGHT);
    GLuint tex = mkTex(GL_RGBA, WIDTH, HEIGHT, pixels);

    for (;;) {
        if (glfwWindowShouldClose(window))
            break;

        glfwPollEvents();
        glClearColor(0,0,0,0);
        glClear(GL_COLOR_BUFFER_BIT);

        shader.Use();
        glBindTexture(GL_TEXTURE_2D, tex);
        glBindVertexArray(vao);
        glDrawArrays(GL_TRIANGLES, 0, 6);
        glfwSwapBuffers(window);

        // Report any OpenGL error raised during this frame.
        const GLuint glErr = glGetError();
        if (glErr != 0) {
            cout << "Error: " << glErr << endl;
        }
    }

    glDeleteTextures(1, &tex);
    delete[] pixels;
    delete[] spheres;
    glfwTerminate();
    return 0;
}
// Aborts the program when a CUDA runtime call failed.
//
// Prints the runtime's description of `status` and exits with a non-zero code
// so callers and scripts can detect the failure. The previous exit(0) reported
// success.
void HANDLE_ERROR(cudaError_t status) {
    if (status == cudaSuccess)
        return;
    fprintf(stderr, "Error~ %s\n", cudaGetErrorString(status));
    exit(1);
}
// Ray-traces one pixel per thread.
//
// Launch layout: 2D grid of 2D blocks covering at least width x height
// threads; surplus threads exit without touching memory.
//   res   - device buffer of width*height*4 bytes (R, G, B, A per pixel)
//   sp    - device array of `spnum` spheres
// For pixel (x, y) the ray leaves the origin towards
// ((width/2 - x) / height, y / height - 0.5, 1). The nearest sphere hit is
// shaded with its colour scaled by Sphere::calcCos; pixels with no hit are
// black. The fourth byte of every pixel is written as 0.
__global__ void kernel(unsigned char *res, Sphere *sp, int spnum, int width, int height) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Check both coordinates. Testing only the flattened offset lets a thread
    // with x >= width alias a pixel in the next row, and the old fall-through
    // branch wrote to memory for threads outside the image.
    if (x >= width || y >= height)
        return;

    const size_t offset = ((size_t)y * width + x) * 4;

    const float fx = 1.0f * (width/2 - x) / height;
    const float fy = 1.0f * y / height - 0.5f;
    const mvec3 ray = normalize(mvec3(fx, fy, 1));

    // Find the closest sphere hit by this ray; misses return INF > SINF.
    float miniLen = SINF;
    int miniNum = -1;
    for (int i = 0; i < spnum; ++i) {
        const float gg = sp[i].calc(ray);
        if (gg < miniLen) {
            miniLen = gg;
            miniNum = i;
        }
    }

    unsigned char out[3] = { 0, 0, 0 };
    if (miniNum != -1) {
        const unsigned char *rgb = sp[miniNum].rgb;
        // Clamp to [0, 1]: converting a negative float to unsigned char is undefined.
        const float light = fminf(fmaxf(sp[miniNum].calcCos(miniLen, ray), 0.0f), 1.0f);
        out[0] = (unsigned char)(light * rgb[0]);
        out[1] = (unsigned char)(light * rgb[1]);
        out[2] = (unsigned char)(light * rgb[2]);
    }

    res[offset]     = out[0];
    res[offset + 1] = out[1];
    res[offset + 2] = out[2];
    res[offset + 3] = 0;
}
// Reports a failed CUDA step on stderr; returns true when `status` is cudaSuccess.
static bool cudaStepOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "ERROR: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Renders the scene on device 0 and copies the image back to the host.
//
//   res    - host buffer of at least width*height*4 bytes, receives the pixels
//   sp     - host array of `spnum` spheres
//   width, height - image size in pixels
//
// Steps: allocate device buffers, upload the spheres, launch `kernel` with one
// thread per pixel, wait for it, download the pixels, free the buffers and
// reset the device. Any failing step is reported on stderr and skips the
// remaining work; in that case `res` is filled with zeros so the caller never
// sees uninitialised pixels. A failing cudaSetDevice/cudaDeviceReset ends the
// program through HANDLE_ERROR.
void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height) {
    HANDLE_ERROR(cudaSetDevice(0));

    unsigned char *devPixels = 0;
    Sphere *devSpheres = 0;
    size_t pixelBytes = 0;
    size_t sphereBytes = 0;

    bool ok = width > 0 && height > 0 && spnum >= 0;
    if (ok) {
        // size_t arithmetic avoids int overflow for large images.
        pixelBytes = (size_t)width * height * 4;
        sphereBytes = (size_t)spnum * sizeof(Sphere);
    } else {
        fprintf(stderr, "ERROR: invalid image size or sphere count.\n");
    }

    // With zero spheres nothing is allocated; the kernel then never reads `sp`.
    if (ok && sphereBytes > 0) {
        ok = cudaStepOk(cudaMalloc((void**)&devSpheres, sphereBytes), "Malloc for Sphere")
          && cudaStepOk(cudaMemcpy(devSpheres, sp, sphereBytes, cudaMemcpyHostToDevice),
                        "Memcpy for Sphere");
    }
    ok = ok && cudaStepOk(cudaMalloc((void**)&devPixels, pixelBytes), "Malloc for pixels");

    if (ok) {
        // 16x16 = 256 threads per block; the grid is rounded up to cover the image.
        const dim3 block(16, 16);
        const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
        kernel<<<grid, block>>>(devPixels, devSpheres, spnum, width, height);
        ok = cudaStepOk(cudaGetLastError(), "Kernel launch");
    }
    // Faults inside the kernel surface at the next synchronising call.
    ok = ok && cudaStepOk(cudaDeviceSynchronize(), "Kernel run");
    ok = ok && cudaStepOk(cudaMemcpy(res, devPixels, pixelBytes, cudaMemcpyDeviceToHost),
                          "Memcpy of pixels");

    // cudaFree accepts null pointers, so this is safe after a partial failure.
    cudaFree(devPixels);
    cudaFree(devSpheres);
    HANDLE_ERROR(cudaDeviceReset());

    if (!ok) {
        for (size_t i = 0; i < pixelBytes; ++i)
            res[i] = 0;
    }
}
sphere.in文件
2
0 0 10 3
255 0 0
3 3 10 3
0 255 0
换成用常量内存存储Spheres,然后球数目增加到了200,渲染2560*1920的图
实测用常量内存还是全局内存速度都几乎一样都是1170ms
倒是如果在每一个线程块里用__shared__把Spheres数组复制一遍（__shared__内存是按线程块共享的），能够将时间缩减到1080ms
差不多9%的性能提升
于是用__constant__存储球然后再加个球与光线的判定优化勉强达到了970ms
于是最后达不到书上所说的近50%的性能提升,这个日后再细究。。
以及:感觉我的代码内存泄漏有点严重Orz
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>
#include <cstdio>
#include "myGL.h"
using namespace std;
// Window width in pixels passed to glfwStart; main renders the image at WIDTH*2.
const GLuint WIDTH = 1280;
// Window height in pixels passed to glfwStart; main renders the image at HEIGHT*2.
const GLuint HEIGHT = 960;
// Sentinel distance returned by Sphere::calc when the ray misses the sphere.
__device__ const float INF = 123456.0f;
// Initial "nearest hit" distance used by kernel. It is slightly smaller than INF,
// so a miss (INF) never satisfies the `gg < miniLen` comparison.
__device__ const float SINF = 123455.0f;
// Minimal 3-component float vector.
//
// The struct deliberately has no constructors, so it stays trivially
// constructible and arrays of types containing it can live in __constant__
// memory. Use Mvec3(...) below to build values. Every function is
// __host__ __device__ so the same arithmetic works on both sides.
struct mvec3 {
    float x, y, z;

    // Euclidean length.
    __host__ __device__ float length() const {
        return sqrtf(x * x + y * y + z * z);
    }

    // Squared Euclidean length (no square root).
    __host__ __device__ float square() const {
        return x * x + y * y + z * z;
    }

    // Component-wise sum.
    __host__ __device__ mvec3 operator + (const mvec3 &t) const {
        mvec3 res = { x + t.x, y + t.y, z + t.z };
        return res;
    }

    // Component-wise difference.
    __host__ __device__ mvec3 operator - (const mvec3 &t) const {
        mvec3 res = { x - t.x, y - t.y, z - t.z };
        return res;
    }

    // Scales every component by t.
    __host__ __device__ mvec3 operator * (const float &t) const {
        mvec3 res = { x * t, y * t, z * t };
        return res;
    }

    // Divides every component by t. No check is made for t == 0.
    __host__ __device__ mvec3 operator / (const float &t) const {
        mvec3 res = { x / t, y / t, z / t };
        return res;
    }

    // Dot product of a and b.
    __host__ __device__ friend float dot(const mvec3 &a, const mvec3 &b) {
        return a.x * b.x + a.y * b.y + a.z * b.z;
    }

    // Cross product a x b.
    __host__ __device__ friend mvec3 cross(const mvec3 &a, const mvec3 &b) {
        mvec3 res = {
            a.y * b.z - a.z * b.y,
            a.z * b.x - a.x * b.z,
            a.x * b.y - a.y * b.x
        };
        return res;
    }

    // Returns a scaled to unit length. A zero-length input divides by zero.
    __host__ __device__ friend mvec3 normalize(const mvec3 &a) {
        return a / a.length();
    }
};

// Factory for mvec3; all components default to 0.
//
// This is a namespace-scope function rather than an in-class friend. A friend
// defined inside the class is only found through argument-dependent lookup,
// and a call such as Mvec3(1.0f, 2.0f, 3.0f) has no mvec3 argument, so
// standard-conforming compilers reject it.
__host__ __device__ inline mvec3 Mvec3(float _x = 0, float _y = 0, float _z = 0) {
    mvec3 res = { _x, _y, _z };
    return res;
}
// A coloured sphere. Rays are assumed to start at the coordinate origin.
// The struct has no constructors so that arrays of it can be placed in
// __constant__ memory.
struct Sphere {
    mvec3 site;            // centre of the sphere
    float r;               // radius
    unsigned char rgb[4];  // colour; only rgb[0..2] are meaningful, rgb[3] is always 0

    // Reads "x y z r" followed by three integer colour components from `is`.
    // Fields are zeroed first so a failed or short read never leaves them
    // indeterminate.
    void readData(ifstream &is) {
        site.x = site.y = site.z = 0.0f;
        r = 0.0f;
        is >> site.x >> site.y >> site.z >> r;
        for (int i = 0; i < 3; ++i) {
            int s = 0;
            is >> s;
            rgb[i] = (unsigned char)s;
        }
        rgb[3] = 0;
    }

    // Distance from the origin along `ray` to the first intersection with this
    // sphere, or INF when there is none or when the sphere cannot beat the
    // current best distance `nowLen`.
    //
    // `ray` must be a unit vector. Spheres whose centre lies behind the origin,
    // and spheres that contain the origin, are reported as misses.
    __device__ float calc(const mvec3 &ray, const float &nowLen) const {
        // Early out: the nearest point of the sphere is at distance |site| - r,
        // so it cannot improve on nowLen when |site| >= nowLen + r. Both sides
        // are compared squared to avoid a square root. The previous test mixed
        // a squared distance with a plain length and culled nearer spheres.
        const float reach = nowLen + r;
        if (reach <= 0.0f || site.square() >= reach * reach)
            return INF;

        // Signed distance along the ray to the point closest to the centre.
        const float g = dot(ray, site);
        if (g <= 0.0f) return INF;

        // Squared distance from the centre to the ray.
        const float lh = (site - ray * g).square();
        const float rr = r * r;
        if (lh >= rr) return INF;

        // Step back from the closest point by half the chord length.
        const float hit = g - sqrtf(rr - lh);
        return hit > 0.0f ? hit : INF;
    }

    // Cosine of the angle between `ray` and the vector from the hit point
    // (ray * dis) to the sphere centre. The kernel uses it as a brightness
    // factor.
    __device__ float calcCos(const float &dis, const mvec3 &ray) const {
        mvec3 rToC = normalize(site - ray * dis);  // unit vector from hit point to centre
        return dot(rToC, ray);
    }
};
// Forward declarations; both functions are defined after main.
// HANDLE_ERROR terminates the program when `status` is not cudaSuccess.
void HANDLE_ERROR(cudaError_t status);
// drawPixels renders `spnum` spheres into `res` (4 bytes per pixel, width*height pixels).
void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height);
// Loads the scene from "sphere.in": a sphere count followed by one record per
// sphere (see Sphere::readData).
//
// On return `res` always points to an array allocated with new[] (possibly of
// length 0) and `n` is the number of spheres that were read successfully. A
// missing file, an invalid count or a truncated file is reported on stderr
// instead of leaving `n` or the array contents indeterminate.
void ReadData(Sphere* &res, int &n) {
    n = 0;
    ifstream is("sphere.in");
    if (!is) {
        cerr << "Cannot open sphere.in" << endl;
    } else if (!(is >> n) || n < 0) {
        cerr << "sphere.in: invalid sphere count" << endl;
        n = 0;
    }

    res = new Sphere[n];
    for (int i = 0; i < n; ++i) {
        res[i].readData(is);
        if (!is) {
            // Record i was incomplete; keep only the spheres read so far.
            cerr << "sphere.in: data ended while reading sphere " << i << endl;
            n = i;
            break;
        }
    }
    cout << "Get " << n << " spheres." << endl;
}
// Program entry: loads the spheres, renders them once at twice the window
// resolution with drawPixels, uploads the result as a texture and draws it
// every frame until the window closes.
int main() {
    Sphere *spheres;
    int sphereCount;
    ReadData(spheres, sphereCount);

    // The image is rendered at 2x the window size in each dimension.
    const GLuint renderW = WIDTH * 2;
    const GLuint renderH = HEIGHT * 2;

    // Host-side pixel buffer, 4 bytes per pixel.
    unsigned char *pixels = new unsigned char[renderW * renderH * 4];

    GLFWwindow *window = glfwStart(WIDTH, HEIGHT, "ray-tracing");
    Shader shader;
    shader.mkShader("shader.vert", NULL, "shader.frag");
    GLuint vao = mkVAO();

    // Render once, then hand the pixels to mkTex.
    drawPixels(pixels, spheres, sphereCount, renderW, renderH);
    GLuint tex = mkTex(GL_RGBA, renderW, renderH, pixels);

    for (;;) {
        if (glfwWindowShouldClose(window))
            break;

        glfwPollEvents();
        glClearColor(0,0,0,0);
        glClear(GL_COLOR_BUFFER_BIT);

        shader.Use();
        glBindTexture(GL_TEXTURE_2D, tex);
        glBindVertexArray(vao);
        glDrawArrays(GL_TRIANGLES, 0, 6);
        glfwSwapBuffers(window);

        // Report any OpenGL error raised during this frame.
        const GLuint glErr = glGetError();
        if (glErr != 0) {
            cout << "Error: " << glErr << endl;
        }
    }

    glDeleteTextures(1, &tex);
    delete[] pixels;
    delete[] spheres;
    glfwTerminate();
    return 0;
}
// Scene spheres in constant memory, capacity 200. drawPixels fills it with
// cudaMemcpyToSymbol and kernel reads pp[0..spnum-1].
__constant__ Sphere pp[200];
// Aborts the program when a CUDA runtime call failed.
//
// Prints the runtime's description of `status` and exits with a non-zero code
// so callers and scripts can detect the failure. The previous exit(0) reported
// success.
void HANDLE_ERROR(cudaError_t status) {
    if (status == cudaSuccess)
        return;
    fprintf(stderr, "Error~ %s\n", cudaGetErrorString(status));
    exit(1);
}
// Ray-traces one pixel per thread, reading the spheres from the __constant__
// array `pp`.
//
// Launch layout: 2D grid of 2D blocks covering at least width x height
// threads; surplus threads exit without touching memory.
//   res   - device buffer of width*height*4 bytes (R, G, B, A per pixel)
//   spnum - number of valid entries in pp
// For pixel (x, y) the ray leaves the origin towards
// ((width/2 - x) / height, y / height - 0.5, 1). The nearest sphere hit is
// shaded with its colour scaled by Sphere::calcCos; pixels with no hit are
// black. The fourth byte of every pixel is written as 0.
__global__ void kernel(unsigned char *res, int spnum, int width, int height) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Check both coordinates. Testing only the flattened offset lets a thread
    // with x >= width alias a pixel in the next row, and the old fall-through
    // branch wrote to memory for threads outside the image.
    if (x >= width || y >= height)
        return;

    const size_t offset = ((size_t)y * width + x) * 4;

    const float fx = 1.0f * (width/2 - x) / height;
    const float fy = 1.0f * y / height - 0.5f;
    const mvec3 ray = normalize(Mvec3(fx, fy, 1));

    // Find the closest sphere hit by this ray. The current best distance is
    // passed to calc so it can reject spheres that cannot be closer.
    float miniLen = SINF;
    int miniNum = -1;
    for (int i = 0; i < spnum; ++i) {
        const float gg = pp[i].calc(ray, miniLen);
        if (gg < miniLen) {
            miniLen = gg;
            miniNum = i;
        }
    }

    unsigned char out[3] = { 0, 0, 0 };
    if (miniNum != -1) {
        const unsigned char *rgb = pp[miniNum].rgb;
        // Clamp to [0, 1]: converting a negative float to unsigned char is undefined.
        const float light = fminf(fmaxf(pp[miniNum].calcCos(miniLen, ray), 0.0f), 1.0f);
        out[0] = (unsigned char)(light * rgb[0]);
        out[1] = (unsigned char)(light * rgb[1]);
        out[2] = (unsigned char)(light * rgb[2]);
    }

    res[offset]     = out[0];
    res[offset + 1] = out[1];
    res[offset + 2] = out[2];
    res[offset + 3] = 0;
}
// Reports a failed CUDA step on stderr; returns true when `status` is cudaSuccess.
static bool cudaStepOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "ERROR: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Renders the scene on device 0 and copies the image back to the host.
//
//   res    - host buffer of at least width*height*4 bytes, receives the pixels
//   sp     - host array of `spnum` spheres; spnum must not exceed the capacity
//            of the __constant__ array `pp`
//   width, height - image size in pixels
//
// Steps: allocate the device pixel buffer, copy the spheres into `pp`, launch
// `kernel` with one thread per pixel, wait for it, download the pixels, free
// the buffer and reset the device. Any failing step is reported on stderr and
// skips the remaining work; in that case `res` is filled with zeros so the
// caller never sees uninitialised pixels. A failing
// cudaSetDevice/cudaDeviceReset ends the program through HANDLE_ERROR.
void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height) {
    HANDLE_ERROR(cudaSetDevice(0));

    unsigned char *devPixels = 0;
    size_t pixelBytes = 0;
    const int capacity = (int)(sizeof(pp) / sizeof(pp[0]));

    bool ok = width > 0 && height > 0 && spnum >= 0;
    if (ok) {
        // size_t arithmetic avoids int overflow for large images.
        pixelBytes = (size_t)width * height * 4;
    } else {
        fprintf(stderr, "ERROR: invalid image size or sphere count.\n");
    }
    if (ok && spnum > capacity) {
        fprintf(stderr, "ERROR: %d spheres exceed the constant-memory capacity of %d.\n",
                spnum, capacity);
        ok = false;
    }

    ok = ok && cudaStepOk(cudaMalloc((void**)&devPixels, pixelBytes), "Malloc for pixels");
    if (ok && spnum > 0) {
        ok = cudaStepOk(cudaMemcpyToSymbol(pp, sp, (size_t)spnum * sizeof(Sphere)),
                        "MemcpyToSymbol");
    }

    if (ok) {
        // 16x16 = 256 threads per block; the grid is rounded up to cover the image.
        const dim3 block(16, 16);
        const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
        kernel<<<grid, block>>>(devPixels, spnum, width, height);
        ok = cudaStepOk(cudaGetLastError(), "Kernel launch");
    }
    // Faults inside the kernel surface at the next synchronising call.
    ok = ok && cudaStepOk(cudaDeviceSynchronize(), "Kernel run");
    ok = ok && cudaStepOk(cudaMemcpy(res, devPixels, pixelBytes, cudaMemcpyDeviceToHost),
                          "Memcpy of pixels");

    // Only the pixel buffer came from cudaMalloc. `pp` is a statically
    // allocated __constant__ symbol and must not be passed to cudaFree.
    cudaFree(devPixels);
    HANDLE_ERROR(cudaDeviceReset());

    if (!ok) {
        for (size_t i = 0; i < pixelBytes; ++i)
            res[i] = 0;
    }
}