CUDA By Example(二)

这里是书上的一个例子,Julia集,实现并行的部分主要是计算每个像素点的值。

#ifndef __BITMAP_H__
#define __BITMAP_H__
#include <windows.h>
#include <GL/glut.h>

/**
 * Host-side RGBA image buffer with a minimal GLUT viewer.
 *
 * The constructor allocates width * height * 4 bytes (one byte per channel).
 * display_and_exit() opens a window, draws the buffer with glDrawPixels and
 * hands control to the GLUT main loop; pressing ESC invokes the optional
 * cleanup callback and terminates the process.
 */
class Bitmap {
private:
	unsigned char *pixels;      // x * y * 4 bytes, owned by this object
	int  x, y;                  // image width and height in pixels
	void  *dataBlock;           // opaque pointer passed to bitmapExit when ESC is pressed
	void (*bitmapExit)(void*);  // optional cleanup callback; NULL when not set

	// Non-copyable: the object owns `pixels`, so an implicit copy would make
	// two destructors delete[] the same buffer. Declared, never defined.
	Bitmap( const Bitmap& );
	Bitmap& operator=( const Bitmap& );

	// Returns the address of a function-local static that holds the Bitmap
	// currently being displayed. The GLUT callbacks are plain function
	// pointers without a user argument, so they find the object through here.
	static Bitmap** get_bitmap_ptr( void ) {
		static Bitmap   *gBitmap;
		return &gBitmap;
	}

	// GLUT display callback: clears to opaque black and draws the pixel buffer.
	static void Draw( void ) {
		Bitmap*   bitmap = *(get_bitmap_ptr());
		if (bitmap == NULL)
			return;
		glClearColor( 0.0, 0.0, 0.0, 1.0 );
		glClear( GL_COLOR_BUFFER_BIT );
		glDrawPixels( bitmap->x, bitmap->y, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
		glFlush();
	}

	// GLUT keyboard callback: on ESC (27) runs the cleanup callback, if both
	// it and dataBlock are set, then exits the process. Other keys are ignored.
	static void Key(unsigned char key, int x, int y) {
		(void)x;
		(void)y;
		switch (key) {
			case 27: {  // ESC; braces give the local its own scope
				Bitmap*  bitmap = *(get_bitmap_ptr());
				if (bitmap != NULL && bitmap->dataBlock != NULL && bitmap->bitmapExit != NULL)
					bitmap->bitmapExit( bitmap->dataBlock );
				exit(0);
			}
			default:
				break;
		}
	}

public:
	// Allocates an uninitialised width x height RGBA buffer. `d` is stored
	// and later passed to the cleanup callback given to display_and_exit().
	Bitmap( int width, int height, void *d = NULL )
		: pixels( new unsigned char[width * height * 4] ),
		  x( width ),
		  y( height ),
		  dataBlock( d ),
		  bitmapExit( NULL ) {
	}

	~Bitmap() {
		delete[] pixels;
	}

	// Pointer to the first byte of the RGBA buffer.
	unsigned char* get_ptr( void ) const   { return pixels; }
	// Size of the buffer in bytes (x * y * 4).
	long image_size( void ) const { return x * y * 4; }

	// Registers this object as the displayed bitmap, stores the optional
	// cleanup callback `e`, creates a single-buffered RGBA window of the
	// image size and enters the GLUT main loop.
	// NOTE(review): with classic GLUT, glutMainLoop() does not return, so
	// code placed after this call is not expected to run - confirm for the
	// GLUT implementation in use.
	void display_and_exit( void(*e)(void*) = NULL ) {
		Bitmap**  bitmap = get_bitmap_ptr();
		*bitmap = this;
		bitmapExit = e;

		// glutInit wants a writable argv; a string literal must not be bound
		// to a non-const char*, so use a real (empty) char array instead.
		int argc = 1;
		char arg0[] = "";
		char* argv[] = { arg0, NULL };
		glutInit( &argc, argv );
		glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
		glutInitWindowSize( x, y );
		glutCreateWindow( "bitmap" );
		// Draw and Key must be static: a non-static member would have type
		// void (Bitmap::*)() rather than the void (*)() that GLUT expects.
		glutDisplayFunc(Draw);
		glutKeyboardFunc(Key);
		glutMainLoop();
	}

};

#endif

 

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "bitmap.h"

#define DIM 1000
 
/**
 * Minimal single-precision complex number with the operations needed to
 * iterate z = z * z + c.
 *
 * All members are usable from both host and device code, and the read-only
 * operations are const-qualified so they can be applied to const values and
 * temporaries.
 */
struct cuComplex {
	float r;  // real part
	float i;  // imaginary part

	// Builds the complex number a + b*i.
	__host__ __device__ cuComplex( float a, float b ) : r(a), i(b) {}

	// Squared magnitude r^2 + i^2 (no square root is taken).
	__host__ __device__ float magnitude2( void ) const { return r * r + i * i; }

	// Complex product: (r + i*j)(a.r + a.i*j).
	__host__ __device__ cuComplex operator*( const cuComplex& a ) const {
		return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
	}

	// Component-wise complex sum.
	__host__ __device__ cuComplex operator+( const cuComplex& a ) const {
		return cuComplex( r+a.r, i+a.i );
	}
};
 
// Julia-set membership test for pixel (x, y) of a DIM x DIM image.
// The pixel is mapped to a point of the complex plane centred on the image
// and scaled by 1.5, then z = z * z + c is iterated with c = -0.8 + 0.156i.
// Returns 0 as soon as |z|^2 exceeds 1000 (the point escapes), and 1 if it
// is still bounded after 200 iterations.
__device__ int julia( int x, int y ){
	const float scale = 1.5;
	const int half = DIM/2;
	const float re = scale * (float)(half - x)/half;
	const float im = scale * (float)(half - y)/half;

	cuComplex c(-0.8, 0.156);
	cuComplex z(re, im);

	int remaining = 200;
	while (remaining-- > 0){
		z = z * z + c;
		if (z.magnitude2() > 1000)
			return 0;
	}
	return 1;
}

// Colours one RGBA pixel per block: red = 255 where julia() returns 1,
// otherwise 0; green and blue are 0; alpha is 255.
// The pixel coordinates come from blockIdx alone and the row stride from
// gridDim.x, so the kernel expects a 2D grid with one block per pixel.
// ptr must hold at least gridDim.x * gridDim.y * 4 bytes.
__global__ void kernel( unsigned char * ptr ){
	const int col = blockIdx.x;
	const int row = blockIdx.y;
	const int pixelIndex = col + row * gridDim.x;

	unsigned char *pixel = ptr + pixelIndex * 4;
	const int inSet = julia( col, row );

	pixel[0] = 255 * inSet;
	pixel[1] = 0;
	pixel[2] = 0;
	pixel[3] = 255;
}
 
// Renders the Julia set on the GPU into a DIM x DIM RGBA image, copies it
// back to the host and shows it in a window.
// Returns 1 (after printing the CUDA error string to stderr) if the
// allocation, the kernel launch or the copy fails.
int main(){
	Bitmap bitmap( DIM, DIM );
	unsigned char * dev_bitmap = NULL;

	// Allocate the image buffer on the GPU.
	cudaError_t status = cudaMalloc((void**)&dev_bitmap, bitmap.image_size());
	if (status != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(status));
		return 1;
	}

	// 2D grid with one single-thread block per pixel.
	dim3 grid(DIM, DIM);
	kernel<<<grid, 1>>>(dev_bitmap);

	// A launch does not return an error itself; query it explicitly.
	status = cudaGetLastError();
	if (status != cudaSuccess) {
		fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
		cudaFree(dev_bitmap);
		return 1;
	}

	// Blocking copy back to the host image buffer.
	status = cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(), cudaMemcpyDeviceToHost);

	// The host copy is all that is displayed, so release the device buffer
	// now, before handing control to the viewer.
	cudaFree(dev_bitmap);

	if (status != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(status));
		return 1;
	}

	bitmap.display_and_exit();

	return 0;
}

 整个程序是比较简单的,不熟悉的部分是配合OpenGL的使用。运行结果如下:


CUDA By Example(二)_第1张图片
 

 接下来的实例中,有没有__syncthreads()对结果有直接的影响。

#include "bitmap.h"
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define DIM 1024
#define PI 3.1415926535897932

// Writes a green sine-product pattern, one RGBA pixel per thread.
// Each thread stores its own intensity in a 16x16 shared tile and, after a
// block-wide barrier, outputs the value written by the thread at the
// mirrored position (15 - threadIdx.x, 15 - threadIdx.y).
// The tile size and the "15 -" mirror index are fixed, so the kernel expects
// 16x16 thread blocks. ptr must hold 4 bytes for every launched thread.
__global__ void kernel(unsigned char * ptr){
	// Global pixel coordinates of this thread.
	const int col=blockIdx.x*blockDim.x+threadIdx.x;
	const int row=blockIdx.y*blockDim.y+threadIdx.y;
	// Row-major pixel index; one image row spans blockDim.x*gridDim.x pixels.
	const int pixelIndex=col+row*blockDim.x*gridDim.x;

	__shared__ float tile[16][16];  // per-block intensities
	const float period=128.0f;
	// PI is a double-precision macro, so each phase is evaluated in double
	// before sinf() receives it as a float.
	const float waveX=sinf(col*2.0f*PI/period)+1.0f;
	const float waveY=sinf(row*2.0f*PI/period)+1.0f;
	tile[threadIdx.x][threadIdx.y]=255*waveX*waveY/4.0f;

	// Barrier: every thread of the block must have written its tile element
	// before any thread reads the element written by another thread below.
	__syncthreads();

	unsigned char *pixel=ptr+pixelIndex*4;
	pixel[0]=0;
	pixel[1]=tile[15-threadIdx.x][15-threadIdx.y];
	pixel[2]=0;
	pixel[3]=255;
}

// Runs the shared-memory demo kernel over a DIM x DIM RGBA image, copies
// the result back to the host and shows it in a window.
// Returns 1 (after printing the CUDA error string to std::cerr) if the
// allocation, the kernel launch or the copy fails.
int main(){
	Bitmap bitmap(DIM,DIM);
	unsigned char* dev_bitmap=NULL;

	cudaError_t status=cudaMalloc((void**)&dev_bitmap,bitmap.image_size());
	if(status!=cudaSuccess){
		std::cerr<<"cudaMalloc failed: "<<cudaGetErrorString(status)<<std::endl;
		return 1;
	}

	// 2D grid of 2D blocks: (DIM/16 x DIM/16) blocks of 16x16 threads,
	// i.e. one thread per pixel.
	dim3 grids(DIM/16,DIM/16);
	dim3 blocks(16,16);
	kernel<<<grids,blocks>>>(dev_bitmap);

	// A launch does not return an error itself; query it explicitly.
	status=cudaGetLastError();
	if(status!=cudaSuccess){
		std::cerr<<"kernel launch failed: "<<cudaGetErrorString(status)<<std::endl;
		cudaFree(dev_bitmap);
		return 1;
	}

	// Blocking copy of the device image into the host buffer returned by get_ptr().
	status=cudaMemcpy(bitmap.get_ptr(),dev_bitmap,bitmap.image_size(),cudaMemcpyDeviceToHost);

	// Free the device buffer before entering the viewer: only the host copy
	// is displayed, and a free placed after display_and_exit() would sit
	// behind the GLUT main loop.
	cudaFree(dev_bitmap);

	if(status!=cudaSuccess){
		std::cerr<<"cudaMemcpy failed: "<<cudaGetErrorString(status)<<std::endl;
		return 1;
	}

	bitmap.display_and_exit();
	return 0;
}

 运行结果:


CUDA By Example(二)_第2张图片
 

 如果没有同步函数,那么负责写入到shared的线程可能还没有完成写入操作。这种情况下,运行结果为:


CUDA By Example(二)_第3张图片
 

 

 

你可能感兴趣的:(example)