这里只写了方阵之间的乘法,但是本质上都是一样的。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include
#include
#include
// Kernal:
__global__ void MatrixMultiply(int *a, int * b, int *c, int N) {
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int ty = threadIdx.y + blockIdx.y * blockDim.y;
if (tx < N && ty < N) {
int sum = 0;
for (int k = 0; k < N; ++k) {
int adata = a[tx * N + k];
int bdata = b[k * N + ty];
sum += adata * bdata;
}
c[tx * N + ty] = sum;
}
}
cudaError_t matrixMultiplyWithCuda(int *a, int *b, int *c, size_t size);
int main()
{
std::ifstream in("data.txt");
int N;
in >> N;
if (in.fail()) {
printf("Something wrong\n");
}
else {
printf("Success read\n");
}
// host initial
int *a = new int[N * N];
int *b = new int[N * N];
int *c = new int[N * N];
// read
for (int i = 0; i < N; ++i)
for (int j = 0; j < N; ++j) in >> a[i * N + j];
for (int i = 0; i < N; ++i)
for (int j = 0; j < N; ++j) in >> b[i * N + j];
cudaError_t cudaStatus = matrixMultiplyWithCuda(a, b, c, N);
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) std::cout << c[i * N + j]<<" ";
std::cout << std::endl;
}
cudaStatus = cudaThreadExit();
// host free
delete[] a;
delete[] b;
delete[] c;
return 0;
}
cudaError_t matrixMultiplyWithCuda(int *a, int *b, int *c, size_t N) {
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaError_t cudaStatus;
cudaStatus = cudaMalloc((void**)&dev_a, N * N * sizeof(int));
cudaStatus = cudaMalloc((void**)&dev_b, N * N * sizeof(int));
cudaStatus = cudaMalloc((void**)&dev_c, N * N * sizeof(int));
cudaStatus = cudaMemcpy(dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
printf("Something wrong\n");
goto Error;
}
// kernal invocation
dim3 threadPerBlock(32, 32);
dim3 numBlocks(N / threadPerBlock.x + 1, N / threadPerBlock.y + 1);
MatrixMultiply<<<numBlocks, threadPerBlock>>>(dev_a, dev_b, dev_c, N);
if (cudaStatus != cudaSuccess) {
printf( "Calculate wrong\n");
goto Error;
}
cudaStatus = cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost);
Error:
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return cudaStatus;
}
(也改成了浮点数运算了)
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include
#include
#include
// Kernal:
__global__ void MatrixMultiply(float *a, float * b, float *c, int N) {
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int ty = threadIdx.y + blockIdx.y * blockDim.y;
if (tx < N && ty < N) {
float sum = 0;
for (int k = 0; k < N; ++k) {
float adata = a[tx * N + k];
float bdata = b[k * N + ty];
sum += adata * bdata;
}
c[tx * N + ty] = sum;
}
}
cudaError_t matrixMultiplyWithCuda(float *a, float *b, float *c, size_t size);
int main()
{
std::ifstream in("data.txt");
int N;
in >> N;
if (in.fail()) {
printf("Something wrong\n");
}
else {
printf("Success read\n");
}
// host initial
float *a = new float[N * N];
float *b = new float[N * N];
float *c = new float[N * N];
// read
for (int i = 0; i < N; ++i)
for (int j = 0; j < N; ++j) in >> a[i * N + j];
for (int i = 0; i < N; ++i)
for (int j = 0; j < N; ++j) in >> b[i * N + j];
cudaError_t cudaStatus = matrixMultiplyWithCuda(a, b, c, N);
std::ofstream out("output.txt");
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) out << c[i * N + j]<<" ";
out << std::endl;
}
cudaStatus = cudaThreadExit();
// host free
delete[] a;
delete[] b;
delete[] c;
return 0;
}
cudaError_t matrixMultiplyWithCuda(float *a, float *b, float *c, size_t N) {
float *dev_a = 0;
float *dev_b = 0;
float *dev_c = 0;
cudaError_t cudaStatus;
cudaStatus = cudaMalloc((void**)&dev_a, N * N * sizeof(int));
cudaStatus = cudaMalloc((void**)&dev_b, N * N * sizeof(int));
cudaStatus = cudaMalloc((void**)&dev_c, N * N * sizeof(int));
cudaStatus = cudaMemcpy(dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
printf("Something wrong\n");
goto Error;
}
// kernal invocation
dim3 threadPerBlock(32, 32);
dim3 numBlocks(N / threadPerBlock.x + 1, N / threadPerBlock.y + 1);
MatrixMultiply<<<numBlocks, threadPerBlock>>>(dev_a, dev_b, dev_c, N);
if (cudaStatus != cudaSuccess) {
printf( "Calculate wrong\n");
goto Error;
}
cudaStatus = cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost);
Error:
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return cudaStatus;
}