cublas<t>gemvStridedBatched()
cublasStatus_t cublasSgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m, int n,
const float *alpha,
const float *A, int lda,
long long int strideA,
const float *x, int incx,
long long int stridex,
const float *beta,
float *y, int incy,
long long int stridey,
int batchCount)
cublasStatus_t cublasDgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m, int n,
const double *alpha,
const double *A, int lda,
long long int strideA,
const double *x, int incx,
long long int stridex,
const double *beta,
double *y, int incy,
long long int stridey,
int batchCount)
cublasStatus_t cublasCgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m, int n,
const cuComplex *alpha,
const cuComplex *A, int lda,
long long int strideA,
const cuComplex *x, int incx,
long long int stridex,
const cuComplex *beta,
cuComplex *y, int incy,
long long int stridey,
int batchCount)
cublasStatus_t cublasZgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m, int n,
const cuDoubleComplex *alpha,
const cuDoubleComplex *A, int lda,
long long int strideA,
const cuDoubleComplex *x, int incx,
long long int stridex,
const cuDoubleComplex *beta,
cuDoubleComplex *y, int incy,
long long int stridey,
int batchCount)
cublasStatus_t cublasHSHgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m, int n,
const float *alpha,
const __half *A, int lda,
long long int strideA,
const __half *x, int incx,
long long int stridex,
const float *beta,
__half *y, int incy,
long long int stridey,
int batchCount)
cublasStatus_t cublasHSSgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m, int n,
const float *alpha,
const __half *A, int lda,
long long int strideA,
const __half *x, int incx,
long long int stridex,
const float *beta,
float *y, int incy,
long long int stridey,
int batchCount)
cublasStatus_t cublasTSTgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m, int n,
const float *alpha,
const __nv_bfloat16 *A, int lda,
long long int strideA,
const __nv_bfloat16 *x, int incx,
long long int stridex,
const float *beta,
__nv_bfloat16 *y, int incy,
long long int stridey,
int batchCount)
cublasStatus_t cublasTSSgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m, int n,
const float *alpha,
const __nv_bfloat16 *A, int lda,
long long int strideA,
const __nv_bfloat16 *x, int incx,
long long int stridex,
const float *beta,
float *y, int incy,
long long int stridey,
int batchCount)
This function performs a matrix-vector multiplication for a batch of matrices and vectors. The batch is considered to be "uniform", i.e. all instances have the same dimensions (m, n), leading dimension (lda), increments (incx, incy) and transposition (trans) for their respective A matrices and x, y vectors. The input matrix A and vector x, and the output vector y, of each instance of the batch are located at a fixed offset, in number of elements, from their locations in the previous instance. The user passes pointers to the A matrix, x vector and y vector of the first instance to the function, together with the offsets in number of elements - strideA, stridex and stridey - that determine the locations of the input matrices and vectors, and of the output vectors, in the subsequent instances.
$y + i*stridey = \alpha \, op(A + i*strideA)(x + i*stridex) + \beta\,(y + i*stridey), \quad \text{for } i \in [0, batchCount - 1]$
where $\alpha$ and $\beta$ are scalars, A points to the first of the batch's matrices A[i], each stored in column-major format with dimensions m x n, and x and y point to the first of the batch's x[i] and y[i] vectors. Furthermore, for matrix A[i],
$op(A[i]) = \begin{cases} A[i] & \text{if } trans == CUBLAS\_OP\_N,\\ A[i]^T & \text{if } trans == CUBLAS\_OP\_T,\\ A[i]^H & \text{if } trans == CUBLAS\_OP\_C \end{cases}$
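As a hedged illustration of how the strides fit together, the sketch below calls the single-precision variant on a batch of contiguously packed column-major matrices and vectors. The buffer names (d_A, d_x, d_y), the problem sizes, and the choices strideA = lda * n, stridex = n, stridey = m are assumptions made for this example, not requirements of the API.

```c
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <stdio.h>

int main(void) {
    const int m = 4, n = 3, batchCount = 8;
    const int lda = m, incx = 1, incy = 1;
    const long long strideA = (long long)lda * n;  /* one packed m x n matrix per instance */
    const long long stridex = n;                   /* x[i] has n elements (trans == CUBLAS_OP_N) */
    const long long stridey = m;                   /* y[i] has m elements */
    const float alpha = 1.0f, beta = 0.0f;

    float *d_A, *d_x, *d_y;
    cudaMalloc((void **)&d_A, sizeof(float) * strideA * batchCount);
    cudaMalloc((void **)&d_x, sizeof(float) * stridex * batchCount);
    cudaMalloc((void **)&d_y, sizeof(float) * stridey * batchCount);
    /* ... fill d_A and d_x here, e.g. with cudaMemcpy ... */

    cublasHandle_t handle;
    cublasCreate(&handle);

    /* One call performs all batchCount products y[i] = alpha*A[i]*x[i] + beta*y[i]. */
    cublasStatus_t status = cublasSgemvStridedBatched(
        handle, CUBLAS_OP_N, m, n,
        &alpha, d_A, lda, strideA,
        d_x, incx, stridex,
        &beta, d_y, incy, stridey,
        batchCount);
    if (status != CUBLAS_STATUS_SUCCESS)
        printf("cublasSgemvStridedBatched failed with status %d\n", status);

    cublasDestroy(handle);
    cudaFree(d_A);
    cudaFree(d_x);
    cudaFree(d_y);
    return 0;
}
```

Built with nvcc and linked against cuBLAS (e.g. `nvcc example.cu -lcublas`).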
Note: The y[i] vectors must not overlap, i.e. the individual gemv operations must be computable independently; otherwise, undefined behavior is expected.
Note: On certain problem sizes, it might be advantageous to make multiple calls to cublas<t>gemv in different CUDA streams, rather than use this API; a rough sketch of that alternative follows this note.
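The snippet below is only a sketch of that multi-stream alternative, reusing the handle, device buffers, strides and scalars assumed in the earlier example; the stream count of 4 is an arbitrary choice.

```c
/* Issue one cublasSgemv per batch instance, round-robin over a small pool of streams,
   so that independent GEMVs can overlap on the GPU. */
enum { NUM_STREAMS = 4 };
cudaStream_t streams[NUM_STREAMS];
for (int s = 0; s < NUM_STREAMS; ++s)
    cudaStreamCreate(&streams[s]);

for (int i = 0; i < batchCount; ++i) {
    cublasSetStream(handle, streams[i % NUM_STREAMS]);
    cublasSgemv(handle, CUBLAS_OP_N, m, n,
                &alpha, d_A + i * strideA, lda,
                d_x + i * stridex, incx,
                &beta, d_y + i * stridey, incy);
}
cudaDeviceSynchronize();

for (int s = 0; s < NUM_STREAMS; ++s)
    cudaStreamDestroy(streams[s]);
```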
Note: In the table below we use A[i], x[i], y[i] to denote the A matrix and the x and y vectors of the i-th instance of the batch, implicitly assuming they are offset by strideA, stridex, stridey elements, respectively, from A[i-1], x[i-1], y[i-1]. The unit of the offsets is the number of elements, and the offsets must not be zero.
Param. | Memory | In/out | Meaning |
---|---|---|---|
handle | | input | Handle to the cuBLAS library context. |
trans | | input | Operation op(A[i]) that is non- or (conj.) transpose. |
m | | input | Number of rows of matrix A[i]. |
n | | input | Number of columns of matrix A[i]. |
alpha | host or device | input | Scalar used for multiplication. |
A | device | input | Pointer to the A matrix corresponding to the first instance of the batch, with dimensions lda x n and lda >= max(1, m). |
lda | | input | Leading dimension of the two-dimensional array used to store each matrix A[i]. |
strideA | | input | Value of type long long int that gives the offset in number of elements between A[i] and A[i+1]. |
x | device | input | Pointer to the x vector corresponding to the first instance of the batch, with dimension n if trans == CUBLAS_OP_N and m otherwise. |
incx | | input | Stride between consecutive elements of x[i]. |
stridex | | input | Value of type long long int that gives the offset in number of elements between x[i] and x[i+1]. |
beta | host or device | input | Scalar used for multiplication. If beta == 0, y does not have to be a valid input. |
y | device | in/out | Pointer to the y vector corresponding to the first instance of the batch, with dimension m if trans == CUBLAS_OP_N and n otherwise. Vectors y[i] should not overlap; otherwise, undefined behavior is expected. |
incy | | input | Stride between consecutive elements of y[i]. |
stridey | | input | Value of type long long int that gives the offset in number of elements between y[i] and y[i+1]. |
batchCount | | input | Number of GEMV operations to perform in the batch. |
The possible error values returned by this function and their meanings are listed in the following table:
Error Value | Meaning |
---|---|
CUBLAS_STATUS_SUCCESS | The operation completed successfully. |
CUBLAS_STATUS_NOT_INITIALIZED | The library was not initialized. |
CUBLAS_STATUS_INVALID_VALUE | The parameters m, n, batchCount < 0. |
CUBLAS_STATUS_EXECUTION_FAILED | The function failed to launch on the GPU. |
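For completeness, a minimal, purely illustrative sketch of mapping the returned status to the meanings in the table above (variable names carried over from the earlier example):

```c
cublasStatus_t st = cublasSgemvStridedBatched(
    handle, CUBLAS_OP_N, m, n,
    &alpha, d_A, lda, strideA,
    d_x, incx, stridex,
    &beta, d_y, incy, stridey,
    batchCount);

switch (st) {
    case CUBLAS_STATUS_SUCCESS:          /* the operation completed successfully */        break;
    case CUBLAS_STATUS_NOT_INITIALIZED:  printf("cuBLAS was not initialized\n");            break;
    case CUBLAS_STATUS_INVALID_VALUE:    printf("invalid m, n or batchCount\n");            break;
    case CUBLAS_STATUS_EXECUTION_FAILED: printf("the kernel failed to launch on the GPU\n"); break;
    default:                             printf("unexpected status %d\n", (int)st);          break;
}
```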