一道简单面试题引出的优化方法讨论 (Ⅱ)

从上一篇一道简单面试题引出的优化方法讨论 (Ⅰ)中,我们已经了解到了这个问题使用SIMD和SMT进行优化的实现方法,我会在第二篇中继续探讨使用SIMT优化的实现方法。

我们再来回顾下问题

在一个内存文件中找出所有以 Windows换行符(\r\n)结尾的行首指针,并保存在数组中,结果不要求有序

SIMT的概念通常对应于GPU上的开发,我们选用目前最为流行的异构计算库CUDA、Thrust和OpenAcc来示例。

方法六 SIMT

将朴素算法移植到CUDA上

// Threads per CUDA block; the host launcher assumes buffer_size is a
// multiple of this (any tail remainder is not scanned).
static const size_t BLOCK_SIZE = 128;

// Naive scan ported to CUDA: one thread per input byte.
// Each block stages BLOCK_SIZE + 1 bytes into shared memory (one byte of
// look-ahead overlap so the last thread can test the following byte), then
// every thread checks for a "\r\n" pair starting at its own byte and
// appends the offset of the next line start via an atomic counter.
// Results are unordered, which the problem statement allows.
// NOTE(review): assumes a 1-D launch with blockDim.x == BLOCK_SIZE; the
// look-ahead read array[offset + 1] in the final block touches one byte
// past the copied data (the allocation is buffer_size + 1, but that byte
// is not initialized by the host code shown) — confirm against the host setup.
__global__ void foo(const char *array, size_t *tokens, int *token_index)
{
    // Staging tile: BLOCK_SIZE bytes plus one look-ahead byte.
    __shared__ char s_array[BLOCK_SIZE + 1];

    size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadIdx.x == BLOCK_SIZE - 1)
    {
        // Last thread of the block also loads the overlap byte.
        s_array[threadIdx.x] = array[offset];
        s_array[threadIdx.x + 1] = array[offset + 1];
    }
    else
    {
        s_array[threadIdx.x] = array[offset];
    }
    __syncthreads(); // tile fully populated before reading a neighbor's byte

    // Found "\r\n" at this byte: publish the offset of the following line start.
    if (s_array[threadIdx.x] == '\r' && s_array[threadIdx.x + 1] == '\n')
    {
        int index = atomicAdd(token_index, 1);
        tokens[index] = offset + 2;
    }
}

// Host driver for method six (one thread per byte): copies the buffer to the
// device, launches the scanning kernel, and copies the token array back.
// Fixes vs. the published listing:
//  - the kernel launch configuration was missing ("foo<<>>", angle-bracket
//    content stripped by markup) — restored as <<<blocks, BLOCK_SIZE>>>;
//  - the one look-ahead byte past the copied data is zeroed so the kernel's
//    out-of-range read can never spuriously match '\n';
//  - the timing events are destroyed (they leaked in the original).
// NOTE(review): blocks is a truncating division, so buffer_size is assumed
// to be a multiple of BLOCK_SIZE; a tail remainder would go unscanned.
void tokenize(const char *buffer, size_t buffer_size, size_t *tokens, size_t token_size)
{
    const char *d_buffer;
    size_t *d_tokens;
    int *d_token_index;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // One extra byte backs the kernel's look-ahead read.
    cudaMalloc((void **)&d_buffer, buffer_size + 1);
    cudaMalloc((void **)&d_tokens, token_size * sizeof(size_t));
    cudaMalloc((void **)&d_token_index, sizeof(int));
    cudaMemcpy((void *)d_buffer, buffer, buffer_size, cudaMemcpyHostToDevice);
    cudaMemset((void *)(d_buffer + buffer_size), 0, 1); // neutralize look-ahead byte
    cudaMemset(d_token_index, 0, sizeof(int));

    size_t blocks = buffer_size / BLOCK_SIZE;

    cudaEventRecord(start);
    foo<<<blocks, BLOCK_SIZE>>>(d_buffer, d_tokens, d_token_index);
    cudaEventRecord(stop);

    // Blocking copy: also synchronizes with the kernel before reading results.
    cudaMemcpy(tokens, d_tokens, token_size * sizeof(size_t), cudaMemcpyDeviceToHost);

    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Kernel took %.4f milliseconds\n", milliseconds);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree((void *)d_buffer);
    cudaFree((void *)d_tokens);
    cudaFree((void *)d_token_index);
}

方法七 SIMT

将方法二移植到CUDA上

// Threads per CUDA block (each thread handles two input bytes here).
static const size_t BLOCK_SIZE = 128;

// CUDA port of "method two" (stride-2 scan): each thread inspects one
// 2-byte pair. The block stages 2*BLOCK_SIZE bytes plus one look-ahead byte
// into shared memory using 16-bit loads. A "\r\n" can lie either inside a
// pair or straddle two adjacent pairs, hence the '\r' / '\n' case split.
// NOTE(review): thread 0 additionally checks the pair straddling the end of
// this block's tile; its reported offset (offset + 2*BLOCK_SIZE + 1) relies
// on offset being the block-base offset when threadIdx.x == 0 — confirm
// against the host launch configuration.
__global__ void foo(const char *array, size_t *tokens, int *token_index)
{
    // Staging tile: 2*BLOCK_SIZE bytes plus one look-ahead byte.
    __shared__ char s_array[BLOCK_SIZE * 2 + 1];

    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
    size_t s_offset = threadIdx.x * 2;
    // 16-bit staging load of this thread's two bytes.
    *reinterpret_cast<short *>(&s_array[s_offset]) = *reinterpret_cast<const short *>(&array[offset]);
    if (threadIdx.x == BLOCK_SIZE - 1)
        s_array[BLOCK_SIZE * 2] = array[offset + 2];
    __syncthreads();

    if (threadIdx.x == 0)
    {
        // Thread 0: its own pair, plus the pair straddling the tile boundary.
        if (s_array[0] == '\r' && s_array[1] == '\n')
        {
            int index = atomicAdd(token_index, 1);
            tokens[index] = offset + 2;
        }
        if (s_array[BLOCK_SIZE * 2 - 1] == '\r' && s_array[BLOCK_SIZE * 2] == '\n')
        {
            int index = atomicAdd(token_index, 1);
            tokens[index] = offset + 2 * BLOCK_SIZE + 1;
        }
    }
    else
    {
        if (s_array[s_offset] == '\r')
        {
            // "\r\n" aligned with this thread's pair.
            if (s_array[s_offset + 1] == '\n')
            {
                int index = atomicAdd(token_index, 1);
                tokens[index] = offset + 2;
            }
        }
        else if (s_array[s_offset] == '\n')
        {
            // "\r\n" straddling the previous pair and this one.
            if (s_array[s_offset - 1] == '\r')
            {
                int index = atomicAdd(token_index, 1);
                tokens[index] = offset + 1;
            }
        }
    }
}

// Host driver for method seven (two bytes per thread).
// Fixes vs. the published listing:
//  - the kernel launch configuration was missing ("foo<<>>", angle-bracket
//    content stripped by markup) — restored as <<<blocks, BLOCK_SIZE>>>;
//  - the one look-ahead byte past the copied data is zeroed so the kernel's
//    out-of-range read can never spuriously match '\n';
//  - the timing events are destroyed (they leaked in the original).
// NOTE(review): buffer_size is assumed to be a multiple of 2 * BLOCK_SIZE;
// the truncating division below drops any tail remainder.
void tokenize(const char *buffer, size_t buffer_size, size_t *tokens, size_t token_size)
{
    const char *d_buffer;
    size_t *d_tokens;
    int *d_token_index;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // One extra byte backs the kernel's look-ahead read.
    cudaMalloc((void **)&d_buffer, buffer_size + 1);
    cudaMalloc((void **)&d_tokens, token_size * sizeof(size_t));
    cudaMalloc((void **)&d_token_index, sizeof(int));
    cudaMemcpy((void *)d_buffer, buffer, buffer_size, cudaMemcpyHostToDevice);
    cudaMemset((void *)(d_buffer + buffer_size), 0, 1); // neutralize look-ahead byte
    cudaMemset(d_token_index, 0, sizeof(int));

    int blocks = buffer_size / 2 / BLOCK_SIZE; // each block covers 2*BLOCK_SIZE bytes

    cudaEventRecord(start);
    foo<<<blocks, BLOCK_SIZE>>>(d_buffer, d_tokens, d_token_index);
    cudaEventRecord(stop);

    // Blocking copy: also synchronizes with the kernel before reading results.
    cudaMemcpy(tokens, d_tokens, token_size * sizeof(size_t), cudaMemcpyDeviceToHost);

    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Kernel took %.3f milliseconds\n", milliseconds);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree((void *)d_buffer);
    cudaFree((void *)d_tokens);
    cudaFree((void *)d_token_index);
}

方法八 SIMT

对方法六进一步优化,增加单个线程的计算量,减少线程数目

// Threads per CUDA block.
static const size_t BLOCK_SIZE = 128;
// Bytes handled per thread: one 32-bit staging load each.
static const size_t SEGMENT_SIZE = sizeof(int);

// Refinement of method six: fewer threads, each scanning a SEGMENT_SIZE-byte
// segment staged into shared memory with a single 32-bit load. One byte of
// look-ahead is kept at the end of the tile so the last in-segment position
// can test the byte that follows it.
// NOTE(review): the look-ahead load reads array[offset + SEGMENT_SIZE]; for
// the final block that is one byte past the copied data — confirm the host
// allocates and initializes that byte.
__global__ void foo(const char *array, size_t *tokens, int *token_index)
{
    // Staging tile: SEGMENT_SIZE * BLOCK_SIZE bytes plus one look-ahead byte.
    __shared__ char s_array[SEGMENT_SIZE * BLOCK_SIZE + 1];

    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x) * SEGMENT_SIZE;
    size_t s_offset = threadIdx.x * SEGMENT_SIZE;
    // 32-bit staging load of this thread's segment.
    *reinterpret_cast<int *>(&s_array[s_offset]) = *reinterpret_cast<const int *>(&array[offset]);
    if (threadIdx.x == BLOCK_SIZE - 1)
        s_array[s_offset + SEGMENT_SIZE] = array[offset + SEGMENT_SIZE]; // look-ahead byte
    __syncthreads();

    // Scan the segment; pairs straddling into the next segment are covered
    // because the neighbor's bytes live in the same shared tile.
    for (size_t i = 0; i < SEGMENT_SIZE; i++)
    {
        if (s_array[s_offset + i] == '\r' && s_array[s_offset + i + 1] == '\n')
        {
            int index = atomicAdd(token_index, 1);
            tokens[index] = offset + i + 2;
        }
    }
}

// Host driver for method eight (SEGMENT_SIZE bytes per thread).
// Fixes vs. the published listing:
//  - the kernel launch configuration was missing ("foo<<>>", angle-bracket
//    content stripped by markup) — restored as <<<blocks, BLOCK_SIZE>>>;
//  - the one look-ahead byte past the copied data is zeroed so the kernel's
//    out-of-range read can never spuriously match '\n';
//  - the timing events are destroyed (they leaked in the original).
// NOTE(review): buffer_size is assumed to be a multiple of
// BLOCK_SIZE * SEGMENT_SIZE; the truncating division drops any remainder.
void tokenize(const char *buffer, size_t buffer_size, size_t *tokens, size_t token_size)
{
    const char *d_buffer;
    size_t *d_tokens;
    int *d_token_index;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // One extra byte backs the kernel's look-ahead read.
    cudaMalloc((void **)&d_buffer, buffer_size + 1);
    cudaMalloc((void **)&d_tokens, token_size * sizeof(size_t));
    cudaMalloc((void **)&d_token_index, sizeof(int));
    cudaMemcpy((void *)d_buffer, buffer, buffer_size, cudaMemcpyHostToDevice);
    cudaMemset((void *)(d_buffer + buffer_size), 0, 1); // neutralize look-ahead byte
    cudaMemset(d_token_index, 0, sizeof(int));

    size_t blocks = buffer_size / BLOCK_SIZE / SEGMENT_SIZE;

    cudaEventRecord(start);
    foo<<<blocks, BLOCK_SIZE>>>(d_buffer, d_tokens, d_token_index);
    cudaEventRecord(stop);

    // Blocking copy: also synchronizes with the kernel before reading results.
    cudaMemcpy(tokens, d_tokens, token_size * sizeof(size_t), cudaMemcpyDeviceToHost);

    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Kernel took %.4f milliseconds\n", milliseconds);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree((void *)d_buffer);
    cudaFree((void *)d_tokens);
    cudaFree((void *)d_token_index);
}

方法九 SIMT

使用CUDA自带的并行算法库Thrust来实现

// Host-side staging copy of the input file, filled in elsewhere before
// tokenize() runs. Fix: the element type was missing from the published
// listing ("host_vector h_input" — angle brackets stripped by markup).
static thrust::host_vector<char> h_input;

// Maps a (offset, byte, next-byte) tuple to the offset of the next line
// start when the two bytes form "\r\n", or to (size_t)-1 as a "no match"
// sentinel that the predicate filters out.
// Fix: the template parameter list was mangled in the published listing
// ("template Tuple>" — restored to "template <typename Tuple>").
struct transform
{
    template <typename Tuple>
    __host__ __device__ size_t operator()(Tuple t)
    {
        if (thrust::get<1>(t) == '\r' && thrust::get<2>(t) == '\n')
        {
            return thrust::get<0>(t) + 2; // next line starts right after "\r\n"
        }
        else
        {
            return (size_t)-1; // sentinel: no line ending here
        }
    }
};

// Keeps only real matches: rejects the (size_t)-1 sentinel that the
// transform functor emits for positions without a "\r\n" pair.
struct predicate
{
    __host__ __device__ bool operator()(size_t candidate)
    {
        const size_t no_match = static_cast<size_t>(-1);
        return candidate != no_match;
    }
};

// Thrust implementation (method nine): builds a fused transform/zip iterator
// over (offset, byte, next-byte) triples and stream-compacts the matching
// offsets with copy_if, then copies the device result back to tokens.
// Fix: the template arguments were missing from the vector and iterator
// declarations in the published listing (angle brackets stripped by markup).
// NOTE(review): iterating buffer_size elements dereferences
// d_input.begin() + buffer_size for the last triple; this is in bounds only
// if h_input holds at least buffer_size + 1 bytes — confirm how h_input is
// filled elsewhere in the program.
void tokenize(const char *buffer, size_t buffer_size, size_t *tokens, size_t token_size)
{
    thrust::device_vector<size_t> d_tokens(token_size);
    thrust::device_vector<char> d_input = h_input;

    // (offset, byte[i], byte[i+1]) triples over the whole buffer.
    auto zip_it = thrust::make_zip_iterator(
        thrust::make_tuple(
            thrust::counting_iterator<size_t>(0),
            d_input.begin(),
            d_input.begin() + 1));
    auto transform_it = thrust::make_transform_iterator(zip_it, transform());

    // Compact matching offsets to the front of d_tokens, then copy back.
    thrust::copy_if(transform_it, transform_it + buffer_size, d_tokens.begin(), predicate());
    thrust::copy(d_tokens.begin(), d_tokens.end(), tokens);
}

方法十 SIMT

使用OpenAcc编程接口移植朴素算法

// OpenACC port of the naive scan (method ten): the parallel loop checks
// every byte for a "\r\n" pair; matched offsets are appended through an
// atomically captured counter, so the output order is unspecified (which
// the problem statement allows).
// NOTE(review): token_index is transferred with copyin, so the final match
// count is not copied back to the host and callers cannot learn how many of
// the token_size slots were filled — confirm this is intentional.
void tokenize(const char *restrict buffer, size_t buffer_size, size_t *restrict tokens, size_t token_size)
{
    size_t token_index = 0;
#pragma acc parallel loop copyin(buffer [0:buffer_size]) copyin(token_index) copyout(tokens [0:token_size])
    for (size_t i = 0; i < buffer_size - 1; ++i)
    {
        if (buffer[i] == '\r' && buffer[i + 1] == '\n')
        {
            size_t index;
            // Atomic fetch-and-increment of the output slot counter.
#pragma acc atomic capture
            {
                index = token_index;
                ++token_index;
            }
            tokens[index] = i + 2;
        }
    }
}

方法十一 SIMT

使用OpenAcc编程接口移植方法二

// OpenACC port of "method two" (method eleven): the device loop scans only
// even offsets, distinguishing a "\r\n" aligned at i from one straddling
// i-1/i. The pair at the start of the buffer and a possible pair at the tail
// of an odd-sized buffer are handled serially on the host.
// NOTE(review): the device loop sees only a copyin snapshot of token_index,
// so device-side increments never reach the host; the tail check at the
// bottom then reuses the pre-loop value and may overwrite a device-written
// slot — confirm the intended slot bookkeeping against the original article.
void tokenize(const char *buffer, size_t buffer_size, size_t *tokens, size_t token_size)
{
    size_t token_index = 0;

    // Pair at the very start of the buffer (i == 0), handled on the host.
    if (buffer_size > 1 && buffer[0] == '\r' && buffer[1] == '\n')
        tokens[token_index++] = 2;

#pragma acc parallel loop copyin(buffer [0:buffer_size]) copyin(token_index) copyout(tokens [token_index:token_size])
    for (size_t i = 2; i < buffer_size; i += 2)
    {
        if (buffer[i] == '\r')
        {
            // "\r\n" starting at the even offset i.
            if (buffer[i + 1] == '\n')
            {
                size_t index;
#pragma acc atomic capture
                {
                    index = token_index;
                    ++token_index;
                }
                tokens[index] = i + 2;
            }
        }
        else if (buffer[i] == '\n')
        {
            // "\r\n" straddling the odd offset i - 1.
            if (buffer[i - 1] == '\r')
            {
                size_t index;
#pragma acc atomic capture
                {
                    index = token_index;
                    ++token_index;
                }
                tokens[index] = i + 1;
            }
        }
    }

    // Odd-sized buffer: a final pair ending at buffer_size - 1 is
    // unreachable by the stride-2 loop, so check it on the host.
    if (buffer_size > 1 && (buffer_size & 0x01) == 0x01)
    {
        if (buffer[buffer_size - 2] == '\r' && buffer[buffer_size - 1] == '\n')
            tokens[token_index++] = buffer_size;
    }
}

测试结论

对于以上六种方法,我们看一下实际运行情况
CUDA的程序使用了特殊的语法,需要使用其自带的nvcc编译器,OpenAcc的程序使用了pgc++编译器

数据:构造1G长的随机文件,其中 \r 与 \n 出现的概率各为1/256,总行数16432

方法 用时(ms) Kernel用时(ms)
方法六 302 35
方法七 288 23
方法八 281 13
方法九 344 -
方法十 218 -
方法十一 200 -

注意到tokenize方法的用时并不低,其中主要的耗时来源于CPU内存与GPU内存之间数据的拷贝上,因为我们的例子实在太过简单,所以内存拷贝消耗的时间已经高于计算本身的时间。
为此,我在CUDA程序中特地单独测量了kernel的耗时。

测试机器 CPU:E5-2690 v3 @ 2.60GHz,6核6线程,GPU:NVIDIA Tesla K80
编译器 gcc 7.2、nvcc 9.1、pgc++ 17.10,编译选项 -O2

关于这个问题的讨论就要暂时告一段落了,这两篇文章使用了十二种不同的方法来解决一道看似简单的问题,希望以此抛砖引玉,对大家在面试和工作中有所帮助。

你可能感兴趣的:(优化,面试)