源码:
#include
#include
#include
#include
#include
#include
#include
// Nanoseconds per second: divides a timespec tv_nsec difference to get seconds.
// Deliberately has no trailing semicolon so the macro can appear inside any
// expression (e.g. as a function argument or the left operand of an operator).
#define BILLION 1000000000L
void print_vector(float* tau, int n){
for(int i=0; i lwork_orgqr) ? lwork_geqrf : lwork_orgqr;
// ---- QR factorization phase (visible tail of main) ----
// NOTE(review): cudaStat and cusolver_status are assigned after every call in
// this section but never inspected, so a failing call goes unnoticed here.
// device memory for workspace
// (lwork floats; the same buffer is passed to both geqrf and orgqr below)
cudaStat = cudaMalloc((void **)&d_work, sizeof(float) * lwork);
// QR factorization for d_A
// The timed interval covers the geqrf call plus the device synchronize that
// follows it, i.e. the clock is read only after the device has finished.
clock_gettime(CLOCK_REALTIME, &start); // start timer
cusolver_status = cusolverDnSgeqrf(cusolverH, m, n, d_A, lda,
d_tau, d_work, lwork, devInfo);
cudaStat = cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &stop); // stop timer
accum = (stop.tv_sec - start.tv_sec) + // elapsed time
(stop.tv_nsec - start.tv_nsec) / (double)BILLION;
printf(" Sgeqrf time : %lf sec .\n", accum); // print elapsed time
cudaStat = cudaMemcpy(&info_gpu, devInfo, sizeof(int),
cudaMemcpyDeviceToHost); // copy devInfo -> info_gpu
// check geqrf error code
// (the code is only printed; execution continues regardless of its value)
printf("\n after geqrf : info_gpu = %d\n", info_gpu);
///
// Debug dump: print host A as it is before the copy-back, then overwrite it
// with the geqrf output from d_A and print that under the label "V+R-I".
printf("\nA =\n");print_matrix(A, m, n, lda);
cudaStat = cudaMemcpy(A, d_A, sizeof(float) * lda * n,
cudaMemcpyDeviceToHost);
printf("\nV+R-I =\n");print_matrix(A, m, n, lda);
// Temporary host copy of the n reflector scalars in d_tau, for printing only.
// NOTE(review): the malloc result is not checked before it is used as a
// cudaMemcpy destination.
float* tau = nullptr;
tau = (float*)malloc(n*sizeof(float));
cudaStat = cudaMemcpy(tau, d_tau, n*sizeof(float), cudaMemcpyDeviceToHost);
printf("\ntau = ");print_vector(tau, n);
// tau_matrix is defined outside this view; it receives the factored A, not the
// tau buffer — presumably it recomputes tau on the host for comparison (TODO confirm).
tau_matrix(A, m, n, lda);
free(tau);
///
// apply orgqr function to compute the orthogonal matrix Q
// using elementary reflectors vectors stored in d_A and
// elementary reflectors scalars stored in d_tau ,
// The result is left in d_A (it is copied from d_A into Q below).
cusolver_status = cusolverDnSorgqr(cusolverH, m, n, n, d_A,
lda, d_tau, d_work, lwork, devInfo);
cudaStat = cudaDeviceSynchronize();
cudaStat = cudaMemcpy(&info_gpu, devInfo, sizeof(int),
cudaMemcpyDeviceToHost); // copy devInfo -> info_gpu
// check orgqr error code
// (printed only, as with geqrf above)
printf("\n after orgqr : info_gpu = %d\n", info_gpu);
cudaStat = cudaMemcpy(Q, d_A, sizeof(float) * lda * n,
cudaMemcpyDeviceToHost); // copy d_A ->Q
// Build the n x n identity matrix in host buffer R.
// R is written with float values here and copied to the device as
// sizeof(float) * n * n bytes, so exactly that many bytes are cleared.
// Clearing sizeof(double) * n * n would touch twice the bytes that are used
// and overrun a float-sized allocation.
memset(R, 0, sizeof(float) * n * n); // nxn matrix of zeros
for (int j = 0; j < n; j++)
{
    R[j + n * j] = 1.0f; // ones on the diagonal (element (j, j), leading dimension n)
}
// ---- Orthogonality check and teardown (visible tail of main) ----
// Upload the host matrix R (n x n floats) so the GEMM below can accumulate into it.
cudaStat = cudaMemcpy(d_R, R, sizeof(float) * n * n,
cudaMemcpyHostToDevice); // copy R-> d_R
// compute R = -Q**T*Q + I
// i.e. d_R = h_minus_one * d_A^T * d_A + h_one * d_R, with d_A used as both
// operands (n x m times m x n). h_minus_one / h_one are declared outside this
// view — presumably -1.0f and 1.0f, as the formula above implies (TODO confirm).
cublas_status = cublasSgemm_v2(cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
n, n, m, &h_minus_one, d_A, lda, d_A, lda, &h_one, d_R, n);
float dR_nrm2 = 0.0; // norm value
// compute the norm of R = -Q**T*Q + I
// d_R is passed as a flat vector of n * n elements with stride 1, so this is
// the 2-norm over all matrix entries.
// NOTE(review): cublas_status is overwritten here without the GEMM status
// having been checked.
cublas_status = cublasSnrm2_v2(cublasH, n * n, d_R, 1, &dR_nrm2);
printf("||I - Q^T*Q|| = %E\n", dR_nrm2); // print the norm
// free memory
// Device buffers first, then the library handles, then the device itself.
// NOTE(review): host buffers A, Q and R are not released in the visible code —
// confirm whether they are heap-allocated.
cudaFree(d_A);
cudaFree(d_tau);
cudaFree(devInfo);
cudaFree(d_work);
cudaFree(d_R);
cublasDestroy(cublasH);
cusolverDnDestroy(cusolverH);
cudaDeviceReset();
return 0;
}
// Sgeqrf time : 0.434779 sec .
// after geqrf : info_gpu = 0
// after orgqr : info_gpu = 0
// ||I - Q^T*Q|| = 2.515004E-04
//
//
Makefile:
# Builds each target in TARGETS from the .cpp file of the same name.
TARGETS = qr_cusolver_sgeqrf

# Default goal: build every target.
all: $(TARGETS)

# CUDA runtime, cuSOLVER and cuBLAS libraries from the default toolkit location.
LD_FLAGS = -L/usr/local/cuda/lib64 \
	-lcudart -lcudadevrt \
	-lcusolver -lcublas \
	-lcublasLt -lpthread

# Pattern rule: compile and link <name>.cpp into the executable <name>.
# The recipe line must begin with a TAB character, otherwise make stops with
# "missing separator".
%: %.cpp
	g++ -o $@ $< -I/usr/local/cuda/include $(LD_FLAGS) -fopenmp -I./cblas_source -L./cblas_source/CBLAS/lib -lcblas_LINUX -L/usr/local/lib -lblas -lgfortran

.PHONY:clean
# Remove the built executables; the leading '-' makes make ignore rm errors.
clean:
	-rm -f $(TARGETS)
可知 tau的计算公式如代码中所示。