cuBLAS Level-3 Function
Level-3 进行的是矩阵和矩阵之间的操作。
具体请参考nvidia官方文档:
https://docs.nvidia.com/cuda/cublas/index.html
- cublas
gemm()
cublasStatus_t cublasSgemm(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const float *alpha,
const float *A, int lda,
const float *B, int ldb,
const float *beta,
float *C, int ldc)
cublasStatus_t cublasDgemm(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const double *alpha,
const double *A, int lda,
const double *B, int ldb,
const double *beta,
double *C, int ldc)
cublasStatus_t cublasCgemm(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const cuComplex *alpha,
const cuComplex *A, int lda,
const cuComplex *B, int ldb,
const cuComplex *beta,
cuComplex *C, int ldc)
cublasStatus_t cublasZgemm(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const cuDoubleComplex *alpha,
const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb,
const cuDoubleComplex *beta,
cuDoubleComplex *C, int ldc)
cublasStatus_t cublasHgemm(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const __half *alpha,
const __half *A, int lda,
const __half *B, int ldb,
const __half *beta,
__half *C, int ldc)
该函数进行,其中
- cublas
gemm3m()
cublasStatus_t cublasCgemm3m(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const cuComplex *alpha,
const cuComplex *A, int lda,
const cuComplex *B, int ldb,
const cuComplex *beta,
cuComplex *C, int ldc)
cublasStatus_t cublasZgemm3m(cublasHandle_t handle,
cublasOperation_t transa, cublasOperation_t transb,
int m, int n, int k,
const cuDoubleComplex *alpha,
const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb,
const cuDoubleComplex *beta,
cuDoubleComplex *C, int ldc)
该函数进行复数矩阵与矩阵相乘,使用Gauss complexity reduction算法,效率可以提高25%。
- cublas
gemmBatched()
cublasStatus_t cublasHgemmBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m, int n, int k,
const __half *alpha,
const __half *Aarray[], int lda,
const __half *Barray[], int ldb,
const __half *beta,
__half *Carray[], int ldc,
int batchCount)
cublasStatus_t cublasSgemmBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m, int n, int k,
const float *alpha,
const float *Aarray[], int lda,
const float *Barray[], int ldb,
const float *beta,
float *Carray[], int ldc,
int batchCount)
cublasStatus_t cublasDgemmBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m, int n, int k,
const double *alpha,
const double *Aarray[], int lda,
const double *Barray[], int ldb,
const double *beta,
double *Carray[], int ldc,
int batchCount)
cublasStatus_t cublasCgemmBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m, int n, int k,
const cuComplex *alpha,
const cuComplex *Aarray[], int lda,
const cuComplex *Barray[], int ldb,
const cuComplex *beta,
cuComplex *Carray[], int ldc,
int batchCount)
cublasStatus_t cublasZgemmBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m, int n, int k,
const cuDoubleComplex *alpha,
const cuDoubleComplex *Aarray[], int lda,
const cuDoubleComplex *Barray[], int ldb,
const cuDoubleComplex *beta,
cuDoubleComplex *Carray[], int ldc,
int batchCount)
该函数进行一个batch的矩阵和矩阵相乘。batch中每一个元素(矩阵)都应该有相同的形状。
- cublas
symm()
cublasStatus_t cublasSsymm(cublasHandle_t handle,
cublasSideMode_t side, cublasFillMode_t uplo,
int m, int n,
const float *alpha,
const float *A, int lda,
const float *B, int ldb,
const float *beta,
float *C, int ldc)
cublasStatus_t cublasDsymm(cublasHandle_t handle,
cublasSideMode_t side, cublasFillMode_t uplo,
int m, int n,
const double *alpha,
const double *A, int lda,
const double *B, int ldb,
const double *beta,
double *C, int ldc)
cublasStatus_t cublasCsymm(cublasHandle_t handle,
cublasSideMode_t side, cublasFillMode_t uplo,
int m, int n,
const cuComplex *alpha,
const cuComplex *A, int lda,
const cuComplex *B, int ldb,
const cuComplex *beta,
cuComplex *C, int ldc)
cublasStatus_t cublasZsymm(cublasHandle_t handle,
cublasSideMode_t side, cublasFillMode_t uplo,
int m, int n,
const cuDoubleComplex *alpha,
const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb,
const cuDoubleComplex *beta,
cuDoubleComplex *C, int ldc)
进行对称矩阵相乘。