In Part I of this discussion (一道简单面试题引出的优化方法讨论 (Ⅰ)) we already covered how to solve this problem with SIMD and SMT; in this second part I will continue with SIMT-based implementations.

Let's first review the problem: given a file that is already in memory, find the starting pointer of every line that ends with a Windows line break (`\r\n`) and store the pointers in an array; the results do not need to be ordered.

SIMT as a concept usually maps to development on GPUs, so the examples below use the most popular heterogeneous-computing interfaces: CUDA, Thrust, and OpenACC.
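As a reference point before the GPU ports, here is a minimal scalar sketch of the naive scan (assumed to match the naive algorithm from Part I; returning the count is my addition):

```cpp
#include <cstddef>

// Naive scan: check every position for a "\r\n" pair and record the offset of
// the byte that follows it, i.e. the start of the next line.
static size_t tokenize_naive(const char *buffer, size_t buffer_size,
                             size_t *tokens, size_t token_size)
{
    size_t count = 0;
    for (size_t i = 0; i + 1 < buffer_size && count < token_size; ++i)
    {
        if (buffer[i] == '\r' && buffer[i + 1] == '\n')
            tokens[count++] = i + 2;
    }
    return count; // number of line starts found
}
```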
Porting the naive algorithm to CUDA
```cpp
static const size_t BLOCK_SIZE = 128;

// One byte per thread: each thread checks whether its byte starts a "\r\n"
// pair, with the block's bytes staged in shared memory.
__global__ void foo(const char *array, size_t *tokens, int *token_index)
{
    __shared__ char s_array[BLOCK_SIZE + 1];
    size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadIdx.x == BLOCK_SIZE - 1)
    {
        // The last thread also loads the byte just past this block,
        // so the pair straddling the block boundary can be checked.
        s_array[threadIdx.x] = array[offset];
        s_array[threadIdx.x + 1] = array[offset + 1];
    }
    else
    {
        s_array[threadIdx.x] = array[offset];
    }
    __syncthreads();
    if (s_array[threadIdx.x] == '\r' && s_array[threadIdx.x + 1] == '\n')
    {
        // Reserve a slot in the result array with a global atomic counter.
        int index = atomicAdd(token_index, 1);
        tokens[index] = offset + 2;
    }
}

void tokenize(const char *buffer, size_t buffer_size, size_t *tokens, size_t token_size)
{
    const char *d_buffer;
    size_t *d_tokens;
    int *d_token_index;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaMalloc((void **)&d_buffer, buffer_size + 1);
    cudaMalloc((void **)&d_tokens, token_size * sizeof(size_t));
    cudaMalloc((void **)&d_token_index, sizeof(int));
    cudaMemcpy((void *)d_buffer, buffer, buffer_size, cudaMemcpyHostToDevice);
    cudaMemset(d_token_index, 0, sizeof(int));
    size_t blocks = buffer_size / BLOCK_SIZE;
    // Record events around the launch to time the kernel alone.
    cudaEventRecord(start);
    foo<<<blocks, BLOCK_SIZE>>>(d_buffer, d_tokens, d_token_index);
    cudaEventRecord(stop);
    cudaMemcpy(tokens, d_tokens, token_size * sizeof(size_t), cudaMemcpyDeviceToHost);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Kernel took %.4f milliseconds\n", milliseconds);
    cudaFree((void *)d_buffer);
    cudaFree((void *)d_tokens);
    cudaFree((void *)d_token_index);
}
```
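The listings here stay close to the original code and omit error handling. In practice every CUDA runtime call returns a `cudaError_t` that should be checked; a minimal sketch of such a check (the `CUDA_CHECK` name is mine, not part of the original code):

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a readable message whenever a CUDA runtime call fails.
#define CUDA_CHECK(call)                                              \
    do                                                                \
    {                                                                 \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess)                                      \
        {                                                             \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Example usage:
// CUDA_CHECK(cudaMalloc((void **)&d_tokens, token_size * sizeof(size_t)));
```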
Porting Method 2 to CUDA
The trick carried over from Method 2 is to examine only even offsets: one byte of the two-byte `\r\n` pattern always falls on an even offset, so each thread loads two bytes and handles both the '\r'-at-even and '\n'-at-even cases.
```cpp
static const size_t BLOCK_SIZE = 128;

// Each thread owns two bytes; only the even offset needs an explicit check.
__global__ void foo(const char *array, size_t *tokens, int *token_index)
{
    __shared__ char s_array[BLOCK_SIZE * 2 + 1];
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
    size_t s_offset = threadIdx.x * 2;
    // Stage the two bytes with a single 16-bit load; the last thread also
    // fetches the first byte of the next block's range.
    *reinterpret_cast<short *>(&s_array[s_offset]) = *reinterpret_cast<const short *>(&array[offset]);
    if (threadIdx.x == BLOCK_SIZE - 1)
        s_array[BLOCK_SIZE * 2] = array[offset + 2];
    __syncthreads();
    if (threadIdx.x == 0)
    {
        // Thread 0 checks its own pair plus the pair straddling the block boundary.
        if (s_array[0] == '\r' && s_array[1] == '\n')
        {
            int index = atomicAdd(token_index, 1);
            tokens[index] = offset + 2;
        }
        if (s_array[BLOCK_SIZE * 2 - 1] == '\r' && s_array[BLOCK_SIZE * 2] == '\n')
        {
            int index = atomicAdd(token_index, 1);
            tokens[index] = offset + 2 * BLOCK_SIZE + 1;
        }
    }
    else
    {
        // Every other thread checks the even byte it owns: either it is the
        // '\r' of a pair, or it is the '\n' and the preceding odd byte is '\r'.
        if (s_array[s_offset] == '\r')
        {
            if (s_array[s_offset + 1] == '\n')
            {
                int index = atomicAdd(token_index, 1);
                tokens[index] = offset + 2;
            }
        }
        else if (s_array[s_offset] == '\n')
        {
            if (s_array[s_offset - 1] == '\r')
            {
                int index = atomicAdd(token_index, 1);
                tokens[index] = offset + 1;
            }
        }
    }
}

void tokenize(const char *buffer, size_t buffer_size, size_t *tokens, size_t token_size)
{
    const char *d_buffer;
    size_t *d_tokens;
    int *d_token_index;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaMalloc((void **)&d_buffer, buffer_size + 1);
    cudaMalloc((void **)&d_tokens, token_size * sizeof(size_t));
    cudaMalloc((void **)&d_token_index, sizeof(int));
    cudaMemcpy((void *)d_buffer, buffer, buffer_size, cudaMemcpyHostToDevice);
    cudaMemset(d_token_index, 0, sizeof(int));
    // Each thread covers two bytes, so half as many threads are needed.
    int blocks = buffer_size / 2 / BLOCK_SIZE;
    cudaEventRecord(start);
    foo<<<blocks, BLOCK_SIZE>>>(d_buffer, d_tokens, d_token_index);
    cudaEventRecord(stop);
    cudaMemcpy(tokens, d_tokens, token_size * sizeof(size_t), cudaMemcpyDeviceToHost);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Kernel took %.3f milliseconds\n", milliseconds);
    cudaFree((void *)d_buffer);
    cudaFree((void *)d_tokens);
    cudaFree((void *)d_token_index);
}
```
Further optimizing Method 6: increase the per-thread workload and reduce the number of threads
```cpp
static const size_t BLOCK_SIZE = 128;
static const size_t SEGMENT_SIZE = sizeof(int);

// SEGMENT_SIZE bytes per thread: each thread stages four consecutive bytes
// with one 32-bit load and then scans them sequentially.
__global__ void foo(const char *array, size_t *tokens, int *token_index)
{
    __shared__ char s_array[SEGMENT_SIZE * BLOCK_SIZE + 1];
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x) * SEGMENT_SIZE;
    size_t s_offset = threadIdx.x * SEGMENT_SIZE;
    *reinterpret_cast<int *>(&s_array[s_offset]) = *reinterpret_cast<const int *>(&array[offset]);
    // The last thread also fetches one byte past the block's segment.
    if (threadIdx.x == BLOCK_SIZE - 1)
        s_array[s_offset + SEGMENT_SIZE] = array[offset + SEGMENT_SIZE];
    __syncthreads();
    // Scan the four bytes this thread owns, looking one byte ahead for '\n'.
    for (size_t i = 0; i < SEGMENT_SIZE; i++)
    {
        if (s_array[s_offset + i] == '\r' && s_array[s_offset + i + 1] == '\n')
        {
            int index = atomicAdd(token_index, 1);
            tokens[index] = offset + i + 2;
        }
    }
}

void tokenize(const char *buffer, size_t buffer_size, size_t *tokens, size_t token_size)
{
    const char *d_buffer;
    size_t *d_tokens;
    int *d_token_index;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaMalloc((void **)&d_buffer, buffer_size + 1);
    cudaMalloc((void **)&d_tokens, token_size * sizeof(size_t));
    cudaMalloc((void **)&d_token_index, sizeof(int));
    cudaMemcpy((void *)d_buffer, buffer, buffer_size, cudaMemcpyHostToDevice);
    cudaMemset(d_token_index, 0, sizeof(int));
    size_t blocks = buffer_size / BLOCK_SIZE / SEGMENT_SIZE;
    cudaEventRecord(start);
    foo<<<blocks, BLOCK_SIZE>>>(d_buffer, d_tokens, d_token_index);
    cudaEventRecord(stop);
    cudaMemcpy(tokens, d_tokens, token_size * sizeof(size_t), cudaMemcpyDeviceToHost);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Kernel took %.4f milliseconds\n", milliseconds);
    cudaFree((void *)d_buffer);
    cudaFree((void *)d_tokens);
    cudaFree((void *)d_token_index);
}
```
Implementing it with Thrust, the parallel-algorithms library that ships with CUDA
```cpp
// File contents on the host (filled elsewhere, not shown in this listing).
static thrust::host_vector<char> h_input;

// Maps (index, current byte, next byte) to the line-start offset, or to
// (size_t)-1 when the position does not start a "\r\n" pair.
struct transform
{
    template <typename Tuple>
    __host__ __device__ size_t operator()(Tuple t)
    {
        if (thrust::get<1>(t) == '\r' && thrust::get<2>(t) == '\n')
        {
            return thrust::get<0>(t) + 2;
        }
        else
        {
            return (size_t)-1;
        }
    }
};

// Keeps only the real line-start offsets.
struct predicate
{
    __host__ __device__ bool operator()(size_t token)
    {
        return token != (size_t)-1;
    }
};

void tokenize(const char *buffer, size_t buffer_size, size_t *tokens, size_t token_size)
{
    thrust::device_vector<size_t> d_tokens(token_size);
    thrust::device_vector<char> d_input = h_input;
    // Zip a counting iterator with the byte stream and the byte stream shifted
    // by one, so each element sees (index, buffer[i], buffer[i + 1]).
    auto zip_it = thrust::make_zip_iterator(
        thrust::make_tuple(
            thrust::counting_iterator<size_t>(0),
            d_input.begin(),
            d_input.begin() + 1));
    auto transform_it = thrust::make_transform_iterator(zip_it, transform());
    thrust::copy_if(transform_it, transform_it + buffer_size, d_tokens.begin(), predicate());
    thrust::copy(d_tokens.begin(), d_tokens.end(), tokens);
}
```
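A small note on this listing: `thrust::copy_if` returns an iterator one past the last element it wrote, which is the natural way to learn how many line starts were found; the code above discards it and copies the whole `d_tokens` vector back. A hedged variant of the last two lines (`d_end` and `count` are names I'm introducing):

```cpp
// Sketch: keep copy_if's return value to get the match count, and copy back
// only the entries that were actually written.
auto d_end = thrust::copy_if(transform_it, transform_it + buffer_size,
                             d_tokens.begin(), predicate());
size_t count = d_end - d_tokens.begin();   // number of line starts found
thrust::copy(d_tokens.begin(), d_end, tokens);
```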
Porting the naive algorithm with the OpenACC programming interface
```cpp
void tokenize(const char *restrict buffer, size_t buffer_size, size_t *restrict tokens, size_t token_size)
{
    size_t token_index = 0;
    #pragma acc parallel loop copyin(buffer [0:buffer_size]) copyin(token_index) copyout(tokens [0:token_size])
    for (size_t i = 0; i < buffer_size - 1; ++i)
    {
        if (buffer[i] == '\r' && buffer[i + 1] == '\n')
        {
            size_t index;
            // Atomically reserve the next slot in the output array.
            #pragma acc atomic capture
            {
                index = token_index;
                ++token_index;
            }
            tokens[index] = i + 2;
        }
    }
}
```
Porting Method 2 with the OpenACC programming interface
```cpp
void tokenize(const char *buffer, size_t buffer_size, size_t *tokens, size_t token_size)
{
    size_t token_index = 0;
    if (buffer_size > 1 && buffer[0] == '\r' && buffer[1] == '\n')
        tokens[token_index++] = 2;
    #pragma acc parallel loop copyin(buffer [0:buffer_size]) copyin(token_index) copyout(tokens [token_index:token_size])
    for (size_t i = 2; i < buffer_size; i += 2)
    {
        if (buffer[i] == '\r')
        {
            if (buffer[i + 1] == '\n')
            {
                size_t index;
                #pragma acc atomic capture
                {
                    index = token_index;
                    ++token_index;
                }
                tokens[index] = i + 2;
            }
        }
        else if (buffer[i] == '\n')
        {
            if (buffer[i - 1] == '\r')
            {
                size_t index;
                #pragma acc atomic capture
                {
                    index = token_index;
                    ++token_index;
                }
                tokens[index] = i + 1;
            }
        }
    }
    if (buffer_size > 1 && (buffer_size & 0x01) == 0x01)
    {
        if (buffer[buffer_size - 2] == '\r' && buffer[buffer_size - 1] == '\n')
            tokens[token_index++] = buffer_size;
    }
}
```
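One caveat about both OpenACC ports: `copyin(token_index)` creates a shared device copy of the counter, but its final value is not copied back to the host when the region ends, so the caller never learns how many line starts were recorded (the CUDA versions share this property, since `d_token_index` is never copied back either). If that count were needed, the counter's data clause would have to be `copy` instead; a sketch of the changed directive for the naive port only:

```cpp
// Sketch: use copy() for the counter so its final value returns to the host.
#pragma acc parallel loop copyin(buffer [0:buffer_size]) copy(token_index) copyout(tokens [0:token_size])
```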
Let's see how the six methods above actually perform. The CUDA programs use CUDA-specific syntax and therefore have to be built with the bundled nvcc compiler; the OpenACC programs are built with the pgc++ compiler.

Data: a randomly generated 1 GB file in which `\r` and `\n` each appear with probability 1/256, giving 16432 lines in total.
| Method | Time (ms) | Kernel time (ms) |
| --- | --- | --- |
| Method 6 | 302 | 35 |
| Method 7 | 288 | 23 |
| Method 8 | 281 | 13 |
| Method 9 | 344 | - |
| Method 10 | 218 | - |
| Method 11 | 200 | - |
Notice that the `tokenize` calls themselves are not fast. Most of the time goes into copying data between CPU memory and GPU memory: because our example is so simple, the memory copies already cost more than the computation itself. For that reason, the CUDA programs also measure the kernel time separately.
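If the transfer cost mattered in a real workload, the usual first step would be to keep the host buffer in pinned (page-locked) memory, which speeds up `cudaMemcpy` and is required for asynchronous copies that can overlap with kernel execution. A minimal sketch of the allocation side only (how the file contents get into `h_buffer` is left as a placeholder):

```cpp
// Sketch: page-locked host allocation for the input buffer. Pinned memory
// transfers faster over PCIe and enables overlap via cudaMemcpyAsync.
char *h_buffer = nullptr;
cudaMallocHost((void **)&h_buffer, buffer_size + 1);
/* ... read the file into h_buffer (placeholder) ... */
cudaMemcpy((void *)d_buffer, h_buffer, buffer_size, cudaMemcpyHostToDevice);
/* ... launch the kernel, copy the results back ... */
cudaFreeHost(h_buffer);
```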
Test machine: CPU E5-2690 v3 @ 2.60GHz, 6 cores / 6 threads; GPU: NVIDIA Tesla K80
Compilers: gcc 7.2, nvcc 9.1 and pgc++ 17.10
Compile options: -O2
This is where the discussion of this problem pauses for now. Across these two articles we have attacked a seemingly simple problem with twelve different methods; I hope this serves as a starting point for further ideas and proves helpful in interviews and in everyday work.