光线追踪

书上讲常量内存的那章有个用光线追踪画球的东西。
这里暂时先不用常量内存,用全局内存实现了一下。
不得已自己写了个mvec3结构体

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>

#include "myGL.h"
using namespace std;
// Dimensions handed to glfwStart, drawPixels and mkTex in main().
const GLuint WIDTH = 1280;
const GLuint HEIGHT = 960;
// Sentinel distance returned by Sphere::calc when the ray misses the sphere.
__device__ const float INF  = 123456.0f;
// Starting value for the "closest hit so far" in the kernel. It is slightly
// smaller than INF, so a miss (INF) never passes the `gg < miniLen` test.
__device__ const float SINF = 123455.0f;
// Minimal 3-component float vector.
//
// Every member is __host__ __device__: kernels use the type for the ray math,
// and host code must be able to run the constructor as well, because Sphere
// embeds an mvec3 and is allocated with new[] on the host.
struct mvec3 {
    float x, y, z;

    // Builds (_x, _y, _z); every component defaults to 0.
    __host__ __device__ mvec3(float _x = 0, float _y = 0, float _z = 0) : x(_x), y(_y), z(_z) {}

    // Euclidean length.
    __host__ __device__ float length() const {
        return sqrtf(x*x + y*y + z*z);
    }
    // Squared Euclidean length (no square root).
    __host__ __device__ float square() const {
        return x*x + y*y + z*z;
    }

    // Component-wise sum / difference.
    __host__ __device__ mvec3 operator + (const mvec3 &t) const {
        return mvec3(x + t.x, y + t.y, z + t.z);
    }
    __host__ __device__ mvec3 operator - (const mvec3 &t) const {
        return mvec3(x - t.x, y - t.y, z - t.z);
    }

    // Scaling by / division through a scalar.
    __host__ __device__ mvec3 operator * (float t) const {
        return mvec3(x * t, y * t, z * t);
    }
    __host__ __device__ mvec3 operator / (float t) const {
        return mvec3(x / t, y / t, z / t);
    }

    // Dot product. Defined as an in-class friend, so it is found through
    // argument-dependent lookup on its mvec3 arguments.
    __host__ __device__ friend float dot(const mvec3 &a, const mvec3 &b) {
        return a.x*b.x + a.y*b.y + a.z*b.z;
    }
    // Cross product a x b.
    __host__ __device__ friend mvec3 cross(const mvec3 &a, const mvec3 &b) {
        return mvec3(
            a.y * b.z - a.z * b.y,
            a.z * b.x - a.x * b.z,
            a.x * b.y - a.y * b.x
        );
    }
    // Returns a scaled to unit length. The zero vector is not special-cased:
    // it divides by zero and produces non-finite components.
    __host__ __device__ friend mvec3 normalize(const mvec3 &a) {
        return a / a.length();
    }
};
// One sphere of the scene: centre, radius and an RGB colour.
struct Sphere {
    mvec3 site;             // centre of the sphere
    float r;                // radius
    unsigned char rgb[4];   // only rgb[0..2] are used; rgb[3] is always 0

    // Reads "x y z r" followed by three integer colour components from `is`.
    // Stream errors are not reported here; the caller can test `is` afterwards.
    void readData(ifstream &is) {
        is >> site.x >> site.y >> site.z >> r;
        for (int i = 0; i < 3; ++i) {
            int s = 0;  // stays 0 when the stream is already in a failed state
            is >> s;
            rgb[i] = (unsigned char)s;
        }
        rgb[3] = 0;
    }

    // Distance from the origin along `ray` to the nearest intersection with
    // this sphere, or INF when there is no intersection in front of the origin.
    //
    // `ray` must have unit length (the kernel passes a normalized direction);
    // the perpendicular-distance computation below relies on it.
    __device__ float calc(const mvec3 &ray) const {
        float g = dot(ray, site);               // signed distance to the closest approach
        float lh = (site - ray * g).square();   // squared distance from the centre to the ray
        if (lh >= r * r) return INF;            // the ray passes outside the sphere
        // The nearest intersection lies `half` before the closest approach.
        // Using the signed g (rather than its magnitude) rejects spheres that
        // lie behind the origin; an origin inside the sphere also counts as a
        // miss instead of producing a negative distance.
        float half = sqrtf(r * r - lh);
        float t = g - half;
        return t > 0.0f ? t : INF;
    }

    // Cosine of the angle between `ray` and the direction from the hit point
    // (origin + ray * dis) towards the sphere centre. The kernel uses it as
    // the brightness factor.
    __device__ float calcCos(float dis, const mvec3 &ray) const {
        mvec3 rToC = normalize(site - ray * dis);  // unit vector from the hit point to the centre
        return dot(rToC, ray);
    }
};
// Terminates the process when `status` is not cudaSuccess (defined after main).
void HANDLE_ERROR(cudaError_t status);
// Renders `spnum` spheres on the GPU and copies the width*height*4-byte image
// into the host buffer `res` (defined after main).
void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height);
// Loads the scene description from "sphere.in".
//
// The file starts with the sphere count, followed by one record per sphere in
// the format consumed by Sphere::readData.
//
// res: receives a new[]-allocated array owned by the caller (release it with
//      delete[]); NULL when the file cannot be opened or has no valid count.
// n:   receives the number of spheres that were read completely; 0 on failure.
void ReadData(Sphere* &res, int &n) {
    res = NULL;
    n = 0;

    ifstream is("sphere.in");
    if (!is) {
        cerr << "ERROR: cannot open sphere.in" << endl;
        return;
    }

    int count = 0;
    if (!(is >> count) || count < 0) {
        cerr << "ERROR: sphere.in does not start with a valid sphere count" << endl;
        return;
    }
    cout << "Get " << count << " spheres." << endl;

    res = new Sphere[count];
    for (int i = 0; i < count; ++i) {
        res[i].readData(is);
        if (!is) {
            // Truncated or malformed record: keep only the complete spheres.
            cerr << "ERROR: sphere.in ends after " << n << " complete spheres" << endl;
            break;
        }
        n = i + 1;
    }
}
// Loads the scene, renders it once on the GPU into a host pixel buffer,
// uploads that buffer as a texture and shows it until the window is closed.
int main() {
    Sphere *sp = NULL;
    int spnum = 0;
    ReadData(sp, spnum);

    // Value-initialised (all zero) so that a failed render shows a black image
    // instead of uninitialised memory.
    unsigned char *p = new unsigned char[WIDTH*HEIGHT * 4]();

    GLFWwindow *window = glfwStart(WIDTH, HEIGHT, "ray-tracing");
    // NOTE(review): assumes glfwStart returns NULL on failure - confirm in myGL.h.
    if (window == NULL) {
        cerr << "Error: could not create the window" << endl;
        delete[] p;
        delete[] sp;
        glfwTerminate();
        return 1;
    }
    Shader shader;
    shader.mkShader("shader.vert", NULL, "shader.frag");
    GLuint vao = mkVAO();

    drawPixels(p, sp, spnum, WIDTH, HEIGHT);
    GLuint tex = mkTex(GL_RGBA, WIDTH, HEIGHT, p);

    while (!glfwWindowShouldClose(window)) {
        glfwPollEvents();
        glClearColor(0,0,0,0);
        glClear(GL_COLOR_BUFFER_BIT);

        // Draw the six vertices of `vao` with the rendered image bound as texture.
        shader.Use();
        glBindTexture(GL_TEXTURE_2D, tex);
        glBindVertexArray(vao);
        glDrawArrays(GL_TRIANGLES, 0, 6);
        glfwSwapBuffers(window);

        GLenum err = glGetError();
        if (err)
            cout << "Error: " << err << endl;
    }

    // Release GL objects while the context still exists, then the host memory.
    glDeleteTextures(1, &tex);
    glDeleteVertexArrays(1, &vao);
    delete[] p;
    delete[] sp;
    glfwTerminate();
    return 0;
}

// Terminates the process when a CUDA runtime call failed.
//
// status: return value of the CUDA call. Nothing happens for cudaSuccess;
//         otherwise the CUDA error text is written to stderr and the process
//         exits with a non-zero code, so the failure is visible to the caller
//         of the program.
void HANDLE_ERROR(cudaError_t status) {
    if (status == cudaSuccess)
        return;
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
// Ray-casts one pixel per thread.
//
// Launch layout: 2D grid of 2D blocks covering at least width x height
// threads. Any block shape works; threads outside the image return without
// touching memory.
//
// res:    device buffer of width*height*4 bytes, written as R,G,B,0 per pixel.
// sp:     device array of `spnum` spheres.
// width,
// height: image size in pixels.
//
// The ray of pixel (x, y) starts at the origin and points along
// normalize((width/2 - x)/height, y/height - 0.5, 1).
__global__ void kernel(unsigned char *res, Sphere *sp, int spnum, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Test both coordinates: a flat-offset test would let a thread with
    // x >= width alias a pixel of the next row and would index past the
    // buffer for the threads below the last row.
    if (x >= width || y >= height)
        return;

    int offset = (y * width + x) * 4;
    float fx = 1.0f * (width/2 - x) / height;
    float fy = 1.0f * y / height - 0.5f;

    mvec3 ray = normalize(mvec3(fx, fy, 1));

    // Find the sphere with the smallest hit distance along the ray.
    float miniLen = SINF;
    int miniNum = -1;
    for (int i = 0; i < spnum; ++i) {
        float gg = sp[i].calc(ray);
        if (gg < miniLen) {
            miniLen = gg;
            miniNum = i;
        }
    }

    unsigned char red = 0, green = 0, blue = 0;  // background stays black
    if (miniNum != -1) {
        const unsigned char *rgb = sp[miniNum].rgb;
        float light = sp[miniNum].calcCos(miniLen, ray);
        // Keep the factor in [0, 1] so the byte conversion cannot wrap;
        // fmaxf also maps a NaN to 0.
        light = fminf(fmaxf(light, 0.0f), 1.0f);
        red   = (unsigned char)(light * rgb[0]);
        green = (unsigned char)(light * rgb[1]);
        blue  = (unsigned char)(light * rgb[2]);
    }

    res[offset]     = red;
    res[offset + 1] = green;
    res[offset + 2] = blue;
    res[offset + 3] = 0;
}
// Renders the scene on GPU 0 and copies the image back to the host.
//
// res:    host buffer of at least width*height*4 bytes; filled with the image
//         (4 bytes per pixel) on success and left untouched on failure.
// sp:     host array of `spnum` spheres.
// width,
// height: image size in pixels.
//
// Failures are reported on stderr. The device buffers are always released and
// the device is reset before returning.
void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height) {
    HANDLE_ERROR(cudaSetDevice(0));

    cudaError_t status = cudaSuccess;
    unsigned char *p = 0;   // device image
    Sphere *pp = 0;         // device copy of the spheres

    // size_t arithmetic so large images cannot overflow int.
    const size_t pixelBytes  = (size_t)width * height * 4;
    const size_t sphereBytes = (size_t)spnum * sizeof(Sphere);

    // 16x16 = 256 threads per block; the grid is rounded up so it covers the
    // whole image.
    const dim3 block(16, 16);
    const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);

    // Single-pass loop: `break` leaves for the common cleanup below without
    // jumping over any variable initialisation.
    do {
        status = cudaMalloc((void**)&pp, sphereBytes);
        if (status != cudaSuccess) {
            fprintf(stderr, "ERROR: Malloc for Sphere failed.\n");
            break;
        }
        status = cudaMalloc((void**)&p, pixelBytes);
        if (status != cudaSuccess) {
            fprintf(stderr, "ERROR: Malloc for pixels failed.\n");
            break;
        }
        status = cudaMemcpy(pp, sp, sphereBytes, cudaMemcpyHostToDevice);
        if (status != cudaSuccess) {
            fprintf(stderr, "ERROR: Memcpy for Sphere failed.\n");
            break;
        }

        kernel<<<grid, block>>>(p, pp, spnum, width, height);

        // Launch-configuration errors are reported by cudaGetLastError,
        // faults during execution by the synchronisation.
        status = cudaGetLastError();
        if (status != cudaSuccess) {
            fprintf(stderr, "Build kernel failed.\n");
            break;
        }
        status = cudaDeviceSynchronize();
        if (status != cudaSuccess) {
            fprintf(stderr, "kernel run failed.\n");
            break;
        }
        status = cudaMemcpy(res, p, pixelBytes, cudaMemcpyDeviceToHost);
        if (status != cudaSuccess) {
            fprintf(stderr, "Memcpy failed.\n");
            break;
        }
    } while (0);

    if (status != cudaSuccess)
        fprintf(stderr, "CUDA: %s\n", cudaGetErrorString(status));

    cudaFree(p);    // cudaFree accepts a null pointer
    cudaFree(pp);
    HANDLE_ERROR(cudaDeviceReset());
}

sphere.in文件

2

0 0 10 3
255 0 0

3 3 10 3
0 255 0

也就是站在(0,0,0)点向z轴正方向看。
效果图:
光线追踪_第1张图片

换成用常量内存存储Spheres,然后球数目增加到了200,渲染2560*1920的图
实测用常量内存还是全局内存速度都几乎一样都是1170ms
倒是如果在每一个线程块里用__shared__复制一遍Spheres数组,能够将时间缩减到1080ms
差不多9%的性能提升
于是用__constant__存储球然后再加个球与光线的判定优化勉强达到了970ms
于是最后达不到书上所说的近50%的性能提升,这个日后再细究。。
以及:感觉我的代码内存泄漏有点严重Orz

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include 
#include 
#include 

#include "myGL.h"
using namespace std;
// Window size handed to glfwStart; main() renders at twice this size in both
// dimensions.
const GLuint WIDTH = 1280;
const GLuint HEIGHT = 960;
// Sentinel distance returned by Sphere::calc when the ray misses the sphere.
__device__ const float INF  = 123456.0f;
// Starting value for the "closest hit so far" in the kernel. It is slightly
// smaller than INF, so a miss (INF) never passes the `gg < miniLen` test.
__device__ const float SINF = 123455.0f;
struct mvec3;

// Factory replacing a constructor: returns the vector (_x, _y, _z), every
// component defaulting to 0.
//
// It is declared at namespace scope before the struct so that ordinary name
// lookup finds it. An in-class friend that takes no mvec3 argument is
// invisible to both ordinary and argument-dependent lookup in standard C++.
__host__ __device__ inline mvec3 Mvec3(float _x = 0, float _y = 0, float _z = 0);

// Minimal 3-component float vector usable from host and device code.
//
// The struct deliberately has no user-declared constructor: Sphere embeds it
// and is stored in a __constant__ array, and CUDA does not allow dynamic
// initialisation of __constant__ variables.
struct mvec3 {
    float x, y, z;

    // Euclidean length.
    __host__ __device__ float length() const {
        return sqrtf(x*x + y*y + z*z);
    }
    // Squared Euclidean length (no square root).
    __host__ __device__ float square() const {
        return x*x + y*y + z*z;
    }

    // Component-wise sum / difference.
    __host__ __device__ mvec3 operator + (const mvec3 &t) const {
        return Mvec3(x + t.x, y + t.y, z + t.z);
    }
    __host__ __device__ mvec3 operator - (const mvec3 &t) const {
        return Mvec3(x - t.x, y - t.y, z - t.z);
    }

    // Scaling by / division through a scalar.
    __host__ __device__ mvec3 operator * (const float &t) const {
        return Mvec3(x * t, y * t, z * t);
    }
    __host__ __device__ mvec3 operator / (const float &t) const {
        return Mvec3(x / t, y / t, z / t);
    }

    // Dot product; found through argument-dependent lookup on its arguments.
    __host__ __device__ friend float dot(const mvec3 &a, const mvec3 &b) {
        return a.x*b.x + a.y*b.y + a.z*b.z;
    }
    // Cross product a x b.
    __host__ __device__ friend mvec3 cross(const mvec3 &a, const mvec3 &b) {
        return Mvec3(
            a.y * b.z - a.z * b.y,
            a.z * b.x - a.x * b.z,
            a.x * b.y - a.y * b.x
        );
    }
    // Returns a scaled to unit length. The zero vector is not special-cased:
    // it divides by zero and produces non-finite components.
    __host__ __device__ friend mvec3 normalize(const mvec3 &a) {
        return a / a.length();
    }
};

__host__ __device__ inline mvec3 Mvec3(float _x, float _y, float _z) {
    mvec3 res;
    res.x = _x;
    res.y = _y;
    res.z = _z;
    return res;
}
// One sphere of the scene: centre, radius and an RGB colour.
struct Sphere {
    mvec3 site;             // centre of the sphere
    float r;                // radius
    unsigned char rgb[4];   // only rgb[0..2] are used; rgb[3] is always 0

    // Reads "x y z r" followed by three integer colour components from `is`.
    // Stream errors are not reported here; the caller can test `is` afterwards.
    void readData(ifstream &is) {
        is >> site.x >> site.y >> site.z >> r;
        for (int i = 0; i < 3; ++i) {
            int s = 0;  // stays 0 when the stream is already in a failed state
            is >> s;
            rgb[i] = (unsigned char)s;
        }
        rgb[3] = 0;
    }

    // Distance from the origin along `ray` to the nearest intersection with
    // this sphere, or INF when there is none in front of the origin or when
    // the sphere cannot beat `nowLen`.
    //
    // ray:    unit-length direction (the kernel passes a normalized vector).
    // nowLen: closest hit distance found so far; used only to skip work.
    __device__ float calc(const mvec3 &ray, const float &nowLen) const {
        // Early out. No point of the sphere is closer to the origin than
        // |site| - r, so the sphere is irrelevant when |site| - r > nowLen.
        // Comparing the squares, |site|^2 > (nowLen + r)^2, avoids the square
        // root. Mixing the squared distance with plain lengths would also
        // reject spheres that are in fact nearer.
        float reach = nowLen + r;
        if (reach >= 0.0f && site.square() > reach * reach)
            return INF;

        float g = dot(ray, site);               // signed distance to the closest approach
        float lh = (site - ray * g).square();   // squared distance from the centre to the ray
        if (lh >= r * r) return INF;            // the ray passes outside the sphere
        // The nearest intersection lies `half` before the closest approach.
        // Using the signed g rejects spheres behind the origin; an origin
        // inside the sphere also counts as a miss.
        float half = sqrtf(r * r - lh);
        float t = g - half;
        return t > 0.0f ? t : INF;
    }

    // Cosine of the angle between `ray` and the direction from the hit point
    // (origin + ray * dis) towards the sphere centre. The kernel uses it as
    // the brightness factor.
    __device__ float calcCos(const float &dis, const mvec3 &ray) const {
        mvec3 rToC = normalize(site - ray * dis);  // unit vector from the hit point to the centre
        return dot(rToC, ray);
    }
};
// Terminates the process when `status` is not cudaSuccess (defined after main).
void HANDLE_ERROR(cudaError_t status);
// Renders `spnum` spheres on the GPU and copies the width*height*4-byte image
// into the host buffer `res` (defined after main).
void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height);
// Loads the scene description from "sphere.in".
//
// The file starts with the sphere count, followed by one record per sphere in
// the format consumed by Sphere::readData.
//
// res: receives a new[]-allocated array owned by the caller (release it with
//      delete[]); NULL when the file cannot be opened or has no valid count.
// n:   receives the number of spheres that were read completely; 0 on failure.
void ReadData(Sphere* &res, int &n) {
    res = NULL;
    n = 0;

    ifstream is("sphere.in");
    if (!is) {
        cerr << "ERROR: cannot open sphere.in" << endl;
        return;
    }

    int count = 0;
    if (!(is >> count) || count < 0) {
        cerr << "ERROR: sphere.in does not start with a valid sphere count" << endl;
        return;
    }
    cout << "Get " << count << " spheres." << endl;

    res = new Sphere[count];
    for (int i = 0; i < count; ++i) {
        res[i].readData(is);
        if (!is) {
            // Truncated or malformed record: keep only the complete spheres.
            cerr << "ERROR: sphere.in ends after " << n << " complete spheres" << endl;
            break;
        }
        n = i + 1;
    }
}
// Loads the scene, renders it once on the GPU at twice the window size in
// both dimensions, uploads the image as a texture and shows it until the
// window is closed.
int main() {
    Sphere *sp = NULL;
    int spnum = 0;
    ReadData(sp, spnum);

    // The image is rendered at 2x the window size in each dimension.
    const GLuint renderW = WIDTH * 2;
    const GLuint renderH = HEIGHT * 2;

    // Value-initialised (all zero) so that a failed render shows a black image
    // instead of uninitialised memory.
    unsigned char *p = new unsigned char[(size_t)renderW * renderH * 4]();

    GLFWwindow *window = glfwStart(WIDTH, HEIGHT, "ray-tracing");
    // NOTE(review): assumes glfwStart returns NULL on failure - confirm in myGL.h.
    if (window == NULL) {
        cerr << "Error: could not create the window" << endl;
        delete[] p;
        delete[] sp;
        glfwTerminate();
        return 1;
    }
    Shader shader;
    shader.mkShader("shader.vert", NULL, "shader.frag");
    GLuint vao = mkVAO();

    drawPixels(p, sp, spnum, renderW, renderH);
    GLuint tex = mkTex(GL_RGBA, renderW, renderH, p);

    while (!glfwWindowShouldClose(window)) {
        glfwPollEvents();
        glClearColor(0,0,0,0);
        glClear(GL_COLOR_BUFFER_BIT);

        // Draw the six vertices of `vao` with the rendered image bound as texture.
        shader.Use();
        glBindTexture(GL_TEXTURE_2D, tex);
        glBindVertexArray(vao);
        glDrawArrays(GL_TRIANGLES, 0, 6);
        glfwSwapBuffers(window);

        GLenum err = glGetError();
        if (err)
            cout << "Error: " << err << endl;
    }

    // Release GL objects while the context still exists, then the host memory.
    glDeleteTextures(1, &tex);
    glDeleteVertexArrays(1, &vao);
    delete[] p;
    delete[] sp;
    glfwTerminate();
    return 0;
}

// Scene spheres in constant memory: filled by drawPixels through
// cudaMemcpyToSymbol and read by the kernel. The capacity is fixed at 200.
__constant__ Sphere pp[200];
// Terminates the process when a CUDA runtime call failed.
//
// status: return value of the CUDA call. Nothing happens for cudaSuccess;
//         otherwise the CUDA error text is written to stderr and the process
//         exits with a non-zero code, so the failure is visible to the caller
//         of the program.
void HANDLE_ERROR(cudaError_t status) {
    if (status == cudaSuccess)
        return;
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
// Ray-casts one pixel per thread against the spheres stored in the
// __constant__ array `pp`.
//
// Launch layout: 2D grid of 2D blocks covering at least width x height
// threads. Any block shape works; threads outside the image return without
// touching memory.
//
// res:    device buffer of width*height*4 bytes, written as R,G,B,0 per pixel.
// spnum:  number of valid entries in `pp`.
// width,
// height: image size in pixels.
//
// The ray of pixel (x, y) starts at the origin and points along
// normalize((width/2 - x)/height, y/height - 0.5, 1).
__global__ void kernel(unsigned char *res, int spnum, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Test both coordinates: a flat-offset test would let a thread with
    // x >= width alias a pixel of the next row and would index past the
    // buffer for the threads below the last row.
    if (x >= width || y >= height)
        return;

    int offset = (y * width + x) * 4;
    float fx = 1.0f * (width/2 - x) / height;
    float fy = 1.0f * y / height - 0.5f;

    mvec3 ray = normalize(Mvec3(fx, fy, 1));

    // Find the sphere with the smallest hit distance; the running minimum is
    // handed to calc so it can skip spheres that cannot be closer.
    float miniLen = SINF;
    int miniNum = -1;
    for (int i = 0; i < spnum; ++i) {
        float gg = pp[i].calc(ray, miniLen);
        if (gg < miniLen) {
            miniLen = gg;
            miniNum = i;
        }
    }

    unsigned char red = 0, green = 0, blue = 0;  // background stays black
    if (miniNum != -1) {
        const unsigned char *rgb = pp[miniNum].rgb;
        float light = pp[miniNum].calcCos(miniLen, ray);
        // Keep the factor in [0, 1] so the byte conversion cannot wrap;
        // fmaxf also maps a NaN to 0.
        light = fminf(fmaxf(light, 0.0f), 1.0f);
        red   = (unsigned char)(light * rgb[0]);
        green = (unsigned char)(light * rgb[1]);
        blue  = (unsigned char)(light * rgb[2]);
    }

    res[offset]     = red;
    res[offset + 1] = green;
    res[offset + 2] = blue;
    res[offset + 3] = 0;
}
// Renders the scene on GPU 0 using the __constant__ sphere array `pp` and
// copies the image back to the host.
//
// res:    host buffer of at least width*height*4 bytes; filled with the image
//         (4 bytes per pixel) on success and left untouched on failure.
// sp:     host array of `spnum` spheres; spnum must not exceed the capacity
//         of `pp`.
// width,
// height: image size in pixels.
//
// Failures are reported on stderr. The device buffer is always released and
// the device is reset before returning.
void drawPixels(unsigned char *res, Sphere *sp, int spnum, int width, int height) {
    HANDLE_ERROR(cudaSetDevice(0));

    cudaError_t status = cudaSuccess;
    unsigned char *p = 0;   // device image

    // size_t arithmetic so large images cannot overflow int.
    const size_t pixelBytes = (size_t)width * height * 4;
    const int capacity = (int)(sizeof(pp) / sizeof(pp[0]));

    // 16x16 = 256 threads per block; the grid is rounded up so it covers the
    // whole image.
    const dim3 block(16, 16);
    const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);

    // Single-pass loop: `break` leaves for the common cleanup below without
    // jumping over any variable initialisation.
    do {
        if (spnum < 0 || spnum > capacity) {
            fprintf(stderr, "ERROR: %d spheres do not fit into constant memory (max %d).\n",
                    spnum, capacity);
            status = cudaErrorInvalidValue;
            break;
        }
        status = cudaMalloc((void**)&p, pixelBytes);
        if (status != cudaSuccess) {
            fprintf(stderr, "ERROR: Malloc for pixels failed.\n");
            break;
        }
        status = cudaMemcpyToSymbol(pp, sp, (size_t)spnum * sizeof(Sphere));
        if (status != cudaSuccess) {
            fprintf(stderr, "ERROR: MemcpyToSymbol failed.\n");
            break;
        }

        kernel<<<grid, block>>>(p, spnum, width, height);

        // Launch-configuration errors are reported by cudaGetLastError,
        // faults during execution by the synchronisation.
        status = cudaGetLastError();
        if (status != cudaSuccess) {
            fprintf(stderr, "Build kernel failed.\n");
            break;
        }
        status = cudaDeviceSynchronize();
        if (status != cudaSuccess) {
            fprintf(stderr, "kernel run failed.\n");
            break;
        }
        status = cudaMemcpy(res, p, pixelBytes, cudaMemcpyDeviceToHost);
        if (status != cudaSuccess) {
            fprintf(stderr, "Memcpy failed.\n");
            break;
        }
    } while (0);

    if (status != cudaSuccess)
        fprintf(stderr, "CUDA: %s\n", cudaGetErrorString(status));

    // Only `p` came from cudaMalloc. `pp` is a statically allocated
    // __constant__ symbol and must not be passed to cudaFree.
    cudaFree(p);
    HANDLE_ERROR(cudaDeviceReset());
}

效果图:
光线追踪_第2张图片

你可能感兴趣的:(cuda,openGL)