字符串查找是信息安全、信息过滤领域的重要操作,在对大文本进行实时处理时尤为关键。本文以精确模式串查找为例,介绍如何使用GPU OpenCL对其进行加速。
1.加速方法
(1)将少量常量数据,如模式串长度、文本长度等,保存在线程的private memory中。
(2)将模式串保存在GPU的local memory中,加速线程对模式串的访问。
(3)将待查找的文本保存在global memory中,使用尽可能多的线程并行访问global memory,减小线程的平均访存时间。
(4)每个work-group中的线程共同处理文本中的一段,多个work-group并行处理整个大文本。
2.同步
(1)work-group内,使用barrier(CLK_LOCAL_MEM_FENCE)、barrier(CLK_GLOBAL_MEM_FENCE)进行同步;同一work-group内的所有线程都必须执行到同一个barrier。
(2)全局使用对__global int 的原子操作,来保证每个线程将结果写到全局内存的正确位置。设备支持的操作可以通过查询设备的扩展获得,如下图,可知核函数支持原子操作、printf操作:
3.代码实例,大文本精确模式串搜索
3.1 核函数(string_search_kernel.cl):
1 int compare(__global const uchar* text, __local const uchar* pattern, uint length){ 2 for(uint l=0; l<length; ++l){ 3 if (text[l] != pattern[l]) 4 return 0; 5 } 6 return 1; 7 } 8 9 __kernel void 10 StringSearch ( 11 __global uchar* text, //Input Text 12 const uint textLength, //Length of the text 13 __global const uchar* pattern, //Pattern string 14 const uint patternLength, //Pattern length 15 const uint maxSearchLength, //Maximum search positions for each work-group 16 __global int* resultCount, //Result counts (global) 17 __global int* resultBuffer, //Save the match result 18 __local uchar* localPattern) //local buffer for the search pattern 19 { 20 21 int localIdx = get_local_id(0); 22 int localSize = get_local_size(0); 23 int groupIdx = get_group_id(0); 24 25 uint lastSearchIdx = textLength - patternLength + 1; 26 uint beginSearchIdx = groupIdx * maxSearchLength; 27 uint endSearchIdx = beginSearchIdx + maxSearchLength; 28 if(beginSearchIdx > lastSearchIdx) 29 return; 30 if(endSearchIdx > lastSearchIdx) 31 endSearchIdx = lastSearchIdx; 32 33 for(int idx = localIdx; idx < patternLength; idx+=localSize) 34 localPattern[idx] = pattern[idx]; 35 barrier(CLK_LOCAL_MEM_FENCE); 36 37 for(uint stringPos=beginSearchIdx+localIdx; stringPos<endSearchIdx; stringPos+=localSize){ 38 if (compare(text+stringPos, localPattern, patternLength) == 1){ 39 int count = atomic_inc(resultCount); 40 resultBuffer[count] = stringPos; 41 //printf("%d ",stringPos); 42 } 43 barrier(CLK_LOCAL_MEM_FENCE); 44 } 45 }
3.2.tool.h 、tool.cpp
见:http://www.cnblogs.com/xudong-bupt/p/3582780.html
3.3 StringSearch.cpp
1 #include <CL/cl.h> 2 #include "tool.h" 3 #include <string.h> 4 #include <stdio.h> 5 #include <stdlib.h> 6 #include <iostream> 7 #include <string> 8 #include <fstream> 9 using namespace std; 10 11 12 int main(int argc, char* argv[]) 13 { 14 cl_int status; 15 /**Step 1: Getting platforms and choose an available one(first).*/ 16 cl_platform_id platform; 17 getPlatform(platform); 18 19 /**Step 2:Query the platform and choose the first GPU device if has one.*/ 20 cl_device_id *devices=getCl_device_id(platform); 21 22 /**Step 3: Create context.*/ 23 cl_context context = clCreateContext(NULL,1, devices,NULL,NULL,NULL); 24 25 /**Step 4: Creating command queue associate with the context.*/ 26 cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL); 27 28 /**Step 5: Create program object */ 29 const char *filename = "string_search_kernel.cl"; 30 string sourceStr; 31 status = convertToString(filename, sourceStr); 32 const char *source = sourceStr.c_str(); 33 size_t sourceSize[] = {strlen(source)}; 34 cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL); 35 36 /**Step 6: Build program. 
*/ 37 status=clBuildProgram(program, 1,devices,NULL,NULL,NULL); 38 39 40 /**Step 7: Initial input,output for the host and create memory objects for the kernel*/ 41 string textStr; //StringSearch_Input.txt 42 convertToString("StringSearch_Input.txt", textStr); 43 const char * text = textStr.c_str(); 44 int textlen=strlen(text); 45 46 char * pattern="info"; 47 int patternlen=strlen(pattern); 48 int maxSearchLength=256*64; 49 int * resultCount=new int[1]; 50 *resultCount=0; 51 int * result=new int[textlen]; 52 memset(result,0,sizeof(int)*textlen); 53 54 cl_mem textBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(char)*textlen,(void *)text, NULL); //global memory 55 cl_mem patternBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR ,sizeof(char)*patternlen, (void *)pattern, NULL); 56 cl_mem resultCountBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR ,sizeof(int), (void *)resultCount, NULL); 57 cl_mem resultBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR ,sizeof(int)*textlen, (void *)result, NULL); 58 59 /**Step 8: Create kernel object */ 60 cl_kernel kernel = clCreateKernel(program,"StringSearch", NULL); 61 62 /**Step 9: Sets Kernel arguments.*/ 63 status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&textBuffer); //global 64 status = clSetKernelArg(kernel, 1, sizeof(int), &textlen); //private 65 status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&patternBuffer); //global 66 status = clSetKernelArg(kernel, 3, sizeof(int), &patternlen); //private 67 status = clSetKernelArg(kernel, 4, sizeof(int), &maxSearchLength); //private 68 status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&resultCountBuffer); //global 69 status = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&resultBuffer); //global 70 status = clSetKernelArg(kernel, 7, sizeof(char)*patternlen, NULL); //local 71 72 /**Step 10: Running the kernel.*/ 73 cl_event enentPoint; 74 int 
globalWorkItem=textlen/64; 75 76 if(textlen%64 != 0) 77 globalWorkItem++; 78 size_t groupNUm[1]={globalWorkItem}; 79 size_t localNUm[1]={256}; 80 81 status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, groupNUm, localNUm, 0, NULL, &enentPoint); 82 83 clWaitForEvents(1,&enentPoint); ///wait 84 clReleaseEvent(enentPoint); 85 int count=0; 86 status = clEnqueueReadBuffer(commandQueue, resultCountBuffer, CL_TRUE, 0, sizeof(int), &count, 0, NULL, NULL); 87 cout<<"\nNumber of matches:"<<count<<endl; 88 89 /**Step 12: Clean the resources.*/ 90 status = clReleaseKernel(kernel);//*Release kernel. 91 status = clReleaseProgram(program); //Release the program object. 92 status = clReleaseMemObject(resultBuffer);//Release mem object. 93 status = clReleaseMemObject(textBuffer);//Release mem object. 94 status = clReleaseMemObject(resultCountBuffer);//Release mem object. 95 status = clReleaseMemObject(patternBuffer);//Release mem object. 96 status = clReleaseCommandQueue(commandQueue);//Release Command queue. 97 status = clReleaseContext(context);//Release context. 98 99 free(devices); 100 free(result); 101 free(resultCount); 102 103 getchar(); 104 return 0; 105 }