check_cuda_error(cublasGemmEx(param_.cublas_handle,
CUBLAS_OP_N, CUBLAS_OP_N,
n, m, k,
&alpha,
param_.attr_kernel_Q, AType_, n,
param_.from_tensor, BType_, k,
&beta,
query_buf_, CType_, n,
computeType_,
static_cast<cublasGemmAlgo_t>(cublasAlgo_[0])));
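// K projection (row-major view): key_buf_ = to_tensor * attr_kernel_K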
check_cuda_error(cublasGemmEx(param_.cublas_handle,
CUBLAS_OP_N, CUBLAS_OP_N,
n, m, k,
&alpha,
param_.attr_kernel_K, AType_, n,
param_.to_tensor, BType_, k,
&beta,
key_buf_, CType_, n,
computeType_,
static_cast<cublasGemmAlgo_t>(cublasAlgo_[0])));
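// V projection (row-major view): value_buf_ = to_tensor * attr_kernel_V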
check_cuda_error(cublasGemmEx(param_.cublas_handle,
CUBLAS_OP_N, CUBLAS_OP_N,
n, m, k,
&alpha,
param_.attr_kernel_V, AType_, n,
param_.to_tensor, BType_, k,
&beta,
value_buf_, CType_, n,
computeType_,
static_cast<cublasGemmAlgo_t>(cublasAlgo_[0])));
cublasGemmEx is a function from the NVIDIA cuBLAS library that performs a general matrix multiplication (GEMM) on two matrices A and B and accumulates the scaled result into a third matrix C, i.e. C = alpha * op(A) * op(B) + beta * C. The "Ex" suffix indicates that this is an extended version of the basic cublas<t>gemm routines (cublasSgemm, cublasHgemm, ...): it lets the caller choose the input, output, and compute data types independently, select the GEMM algorithm explicitly, and take advantage of Tensor Cores. The function signature of cublasGemmEx is as follows:
cublasStatus_t cublasGemmEx(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void *alpha,
const void *A,
cudaDataType_t Atype,
int lda,
const void *B,
cudaDataType_t Btype,
int ldb,
const void *beta,
void *C,
cudaDataType_t Ctype,
int ldc,
cudaDataType_t computeType,
cublasGemmAlgo_t algo);
Here’s a brief overview of the input parameters:
handle: A handle to the cuBLAS library context.
transa and transb: The transpose operations to apply to matrices A and B, respectively, before the GEMM.
m, n, and k: The dimensions of the product: op(A) is m x k, op(B) is k x n, and C is m x n.
alpha: Scalar used to scale the product of matrices A and B.
A, B, and C: Pointers to the device memory storing matrices A, B, and C.
Atype, Btype, and Ctype: Data types of matrices A, B, and C, respectively.
lda, ldb, and ldc: The leading dimensions of matrices A, B, and C, respectively.
beta: Scalar used to scale matrix C before accumulation.
computeType: The data type used for intermediate computations.
algo: The algorithm used for the GEMM operation.
The function returns a cublasStatus_t value indicating whether the operation was successful or if an error occurred.
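To make the parameter mapping concrete, here is a minimal standalone sketch (not FasterTransformer code; the sizes, the all-FP32 types, and the CUBLAS_GEMM_DEFAULT algorithm are illustrative assumptions) that computes C = A * B in cuBLAS's column-major convention. Note that on CUDA 11 and later the computeType argument is declared as cublasComputeType_t (e.g. CUBLAS_COMPUTE_32F); the sketch follows the older cudaDataType_t signature shown above.

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    // C (m x n) = A (m x k) * B (k x n), all FP32, column-major storage.
    const int m = 4, n = 3, k = 2;
    float *A, *B, *C;
    cudaMalloc(&A, sizeof(float) * m * k);
    cudaMalloc(&B, sizeof(float) * k * n);
    cudaMalloc(&C, sizeof(float) * m * n);
    cudaMemset(A, 0, sizeof(float) * m * k);
    cudaMemset(B, 0, sizeof(float) * k * n);
    cudaMemset(C, 0, sizeof(float) * m * n);

    cublasHandle_t handle;
    cublasCreate(&handle);

    const float alpha = 1.0f, beta = 0.0f;
    cublasStatus_t status = cublasGemmEx(handle,
                                         CUBLAS_OP_N, CUBLAS_OP_N,  // no transpose on A or B
                                         m, n, k,
                                         &alpha,
                                         A, CUDA_R_32F, m,          // lda = m (column-major)
                                         B, CUDA_R_32F, k,          // ldb = k
                                         &beta,
                                         C, CUDA_R_32F, m,          // ldc = m
                                         CUDA_R_32F,                // compute in FP32
                                         CUBLAS_GEMM_DEFAULT);      // let cuBLAS pick the algorithm
    printf("cublasGemmEx returned %d\n", (int)status);

    cublasDestroy(handle);
    cudaFree(A); cudaFree(B); cudaFree(C);
    return 0;
}

In the FasterTransformer calls at the top of this section, m and n are swapped and the weight matrix is passed as A, which is the usual trick for multiplying row-major activations with a column-major library without an explicit transpose.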
void multiHeadAttr_nofuse_kernelLauncher(
cudaStream_t stream,
cublasHandle_t handle,
DataType_* Q,
const DataType_* bias_Q,
DataType_* K,
const DataType_* bias_K,
DataType_* V,
const DataType_* bias_V,
const DataType_* attr_mask,
DataType_* dst,
const int batch_size,
const int seq_len,
const int head_num,
const int size_per_head,
const DataType_ scaler);
template void OpenMultiHeadAttention<OperationType::FP32>::multiHeadAttr_nofuse_kernelLauncher(
cudaStream_t stream,
cublasHandle_t handle,
float* Q,
const float* bias_Q,
float* K,
const float* bias_K,
float* V,
const float* bias_V,
const float* attr_mask,
float* dst,
const int batch_size,
const int seq_len,
const int head_num,
const int size_per_head,
const float scaler);
template void OpenMultiHeadAttention<OperationType::HALF>::multiHeadAttr_nofuse_kernelLauncher(
cudaStream_t stream,
cublasHandle_t handle,
__half* Q,
const __half* bias_Q,
__half* K,
const __half* bias_K,
__half* V,
const __half* bias_V,
const __half* attr_mask,
__half* dst,
const int batch_size,
const int seq_len,
const int head_num,
const int size_per_head,
const __half scaler);
}//namespace cuda
template<OperationType OpType_>
void OpenMultiHeadAttention<OpType_>::multiHeadAttr_nofuse_kernelLauncher(
cudaStream_t stream,
cublasHandle_t cublas_handle,
DataType_* Q,
const DataType_* bias_Q,
DataType_* K,
const DataType_* bias_K,
DataType_* V,
const DataType_* bias_V,
const DataType_* attr_mask,
DataType_* dst,
const int batch_size,
const int seq_len,
const int head_num,
const int size_per_head,
const DataType_ scaler)
{
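/* m = rows of the flattened activations (batch_size * seq_len),
   k = hidden size (head_num * size_per_head) */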
int m = batch_size * seq_len;
int k = head_num * size_per_head;
dim3 grid;
dim3 block;
if(OpType_ == OperationType::FP32)
{
const int word_per_block = 1;
assert(k <= 1024);
assert(m / word_per_block * 3 <= 65536);
dim3 grid(m / word_per_block * 3);
dim3 block(k);
add_QKV_bias<<<grid, block, 0, stream>>>(Q, bias_Q, K, bias_K, V, bias_V, q_buf_, k_buf_, v_buf_,
batch_size, seq_len, head_num, size_per_head, word_per_block);
}
else
{
const int word_per_block = 1;
grid.x = batch_size * seq_len / word_per_block;
block.x = head_num * size_per_head * word_per_block / 2;
add_QKV_bias<<<grid, block, 0, stream>>>(Q, bias_Q, K, bias_K, V, bias_V, q_buf_, k_buf_,
v_buf_, batch_size, seq_len, head_num, size_per_head / 2, word_per_block);
}
DataType_ alpha = (DataType_)1.0f, beta = (DataType_)0.0f;
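/* batched GEMM #1: for every (batch, head) pair, qk_buf_ = Q * K^T
   (the scaling factor `scaler` is applied later inside the softmax kernel) */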
check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle,
CUBLAS_OP_T, CUBLAS_OP_N,
seq_len, seq_len, size_per_head,
&alpha,
k_buf_, AType_, size_per_head, seq_len * size_per_head,
q_buf_, BType_, size_per_head, seq_len * size_per_head,
&beta,
qk_buf_, CType_, seq_len, seq_len * seq_len,
batch_size * head_num,
computeType_,
static_cast<cublasGemmAlgo_t>(cublasAlgo_[1])));
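/* choose the softmax thread-block size: the smallest of {32, 64, 128, 256, 512, 1024} that covers seq_len */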
if(seq_len <= 32)
block.x = 32;
else if(seq_len > 32 && seq_len <= 64)
block.x = 64;
else if(seq_len > 64 && seq_len <= 128)
block.x = 128;
else if(seq_len > 128 && seq_len <= 256)
block.x = 256;
else if(seq_len > 256 && seq_len <= 512)
block.x = 512;
else
block.x = 1024;
if(batch_size * head_num <= 120)
{
grid.x = batch_size * head_num * seq_len;
softmax_kernel_v2<<<grid, block, 0, stream>>>(qk_buf_, attr_mask, batch_size, head_num, seq_len, scaler);
}
else
{
grid.x = batch_size * head_num;
softmax_kernel<<<grid, block, 0, stream>>>(qk_buf_, attr_mask, batch_size, head_num, seq_len, scaler);
}
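/* batched GEMM #2: for every (batch, head) pair, transpose_dst_ = softmax(Q * K^T) * V */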
check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle,
CUBLAS_OP_N, CUBLAS_OP_N,
size_per_head, seq_len, seq_len,
&alpha,
v_buf_, AType_, size_per_head, seq_len * size_per_head,
qk_buf_, BType_, seq_len, seq_len * seq_len,
&beta,
transpose_dst_, CType_, size_per_head, seq_len * size_per_head,
batch_size * head_num,
computeType_,
static_cast<cublasGemmAlgo_t>(cublasAlgo_[2])));
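/* reshape the per-head context from [batch, head, seq_len, size_per_head]
   back to [batch, seq_len, head, size_per_head] */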
/* for half2 only */
if(OpType_ == OperationType::HALF)
{
const int seq_per_block = 4;
grid.x = batch_size * head_num * seq_len / seq_per_block;
block.x = seq_per_block * size_per_head / 2;
assert(grid.x * seq_per_block == batch_size * head_num * seq_len);
transpose<<<grid, block, 0, stream>>>(transpose_dst_, dst,
batch_size, seq_len, head_num, size_per_head / 2);
}
else
{
const int seq_per_block = 1;
grid.x = batch_size * head_num * seq_len / seq_per_block;
block.x = seq_per_block * size_per_head;
transpose<<<grid, block, 0, stream>>>(transpose_dst_, dst,
batch_size, seq_len, head_num, size_per_head);
}
}
cublasGemmStridedBatchedEx is a cuBLAS function that performs strided batched matrix multiplication: it takes multiple sets of input matrices, performs the same matrix multiplication on every set in parallel, and writes the results back to memory. The "Ex" suffix in the name indicates that this is an extended version of the basic cublasGemmStridedBatched function, with additional options for specifying data types, scaling factors, and other settings.
In deep learning, cublasGemmStridedBatchedEx is commonly used to implement multi-head self-attention in Transformer networks. That operation multiplies sets of query, key, and value matrices to produce attention scores, which are then used to weight the values and compute the final output. Executing these matrix multiplications efficiently is critical for high performance on large Transformer models.
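To connect this to the code above, here is a small standalone sketch of the first batched GEMM (the QK^T score computation). The function name batched_qk_scores, the FP32 types, and the contiguous [batch * head, seq_len, size_per_head] buffer layout are assumptions made for this example; on CUDA 11+ the compute type argument would be CUBLAS_COMPUTE_32F.

#include <cublas_v2.h>

// For every (batch, head) pair b: scores[b][i][j] = dot(Q row i, K row j).
void batched_qk_scores(cublasHandle_t handle,
                       const float* q_buf, const float* k_buf, float* qk_buf,
                       int batch_size, int head_num, int seq_len, int size_per_head)
{
    const float alpha = 1.0f, beta = 0.0f;
    const long long qk_stride = (long long)seq_len * size_per_head;  // one seq_len x size_per_head matrix per batch*head
    const long long score_stride = (long long)seq_len * seq_len;     // one seq_len x seq_len score matrix per batch*head

    cublasGemmStridedBatchedEx(handle,
                               CUBLAS_OP_T, CUBLAS_OP_N,
                               seq_len, seq_len, size_per_head,
                               &alpha,
                               k_buf, CUDA_R_32F, size_per_head, qk_stride,
                               q_buf, CUDA_R_32F, size_per_head, qk_stride,
                               &beta,
                               qk_buf, CUDA_R_32F, seq_len, score_stride,
                               batch_size * head_num,  // batchCount
                               CUDA_R_32F,             // compute type
                               CUBLAS_GEMM_DEFAULT);
}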
cuda inline
The add_bias_input_layernorm_kernelLauncher method is called at https://github1s.com/NVIDIA/FasterTransformer/blob/v1.0/fastertransformer/bert_encoder_transformer.h#L208-L281.
An example to help understand multiHeadAttr_nofuse_kernelLauncher:
#include <iostream>
#include <cstdio>
#include <cstdlib>
// using OperationType = int;
enum class OperationType{FP32, HALF};
template<OperationType N> class First{};
template< OperationType N, template<OperationType> class XXX >
class Second;
template< OperationType N, template<OperationType> class XXX >
class Second{
public:
XXX<N> b;
Second(){
std::cout<<"NNN";
}
};
// template< OperationType OpType_, template<OperationType> class MultiHeadAttention_ >
// class BertEncoderTransformerTraits
template< template<OperationType> class XXX >
class Second< OperationType::FP32, XXX >{
public:
XXX<OperationType::FP32> b;
Second(){
std::cout<<"SSSSS";
}
};
// template< template<OperationType> class XXX >
// class Second< OperationType::HALF, XXX >{
// public:
// XXX<OperationType::HALF> b;
// Second(){
// std::cout<<"HALF";
// }
// };
int main()
{
printf("Hello World\n");
// Second< OperationType::HALF, First > *second = new Second< OperationType::HALF, First >();
Second< OperationType::FP32, First > *second = new Second< OperationType::FP32, First >();
return 0;
}
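As written, the instantiation in main picks the OperationType::FP32 partial specialization of Second, so the program prints "Hello World" followed by "SSSSS"; switching to the commented-out HALF instantiation (while the HALF specialization stays commented out) would fall back to the primary template and print "NNN" instead. This mirrors the BertEncoderTransformerTraits declaration referenced in the comment above, where an OperationType and a template template parameter together choose the concrete multi-head attention type at compile time.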