前面介绍了几个简单的CUDA程序,这里再举两个更具体的应用。为简单记,用python写。
图像滤波是用一个带参数滤波器(也可以称为核,也可以称为模板)对图像逐像素点处理,通常是对像素邻域进行加权和。
这里以能够提取边缘的索贝尔算子作为例子。
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import cv2
import math
import numpy as np
"""
通用的图像滤波核函数
"""
mod = SourceModule("""
__global__ void applyFilter(const unsigned char *input, unsigned char *output, const unsigned int width, const unsigned int height, const float *kernel, const unsigned int kernelWidth) {
const unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;
const unsigned int row = threadIdx.y + blockIdx.y * blockDim.y;
if(row < height && col < width) {
const int half = kernelWidth / 2;
float blur = 0.0;
for(int i = -half; i <= half; i++) {
for(int j = -half; j <= half; j++) {
const unsigned int y = max(0, min(height - 1, row + i));
const unsigned int x = max(0, min(width - 1, col + j));
const float w = kernel[(j + half) + (i + half) * kernelWidth];
blur += w * input[x + y * width];
}
}
output[col + row * width] = static_cast(blur);
}
}
""" )
applyFilter = mod.get_function("applyFilter")
#滤波器的大小设为3x3
kernel_width = 3
#索贝尔算子
kernel_matrix = np.array([[-1,0,1],[-2,0,2],[-1,0,1]]).astype(np.float32)
#示例图像读取,灰度图
img = cv2.imread('example.jpg',cv2.IMREAD_GRAYSCALE)
height, width = img.shape
out = np.empty_like(img)
#一个像素对应一个线程
# block_dim没什么讲究,这里设为32
block_dim = 32
#要让线程数多于像素数,才能保证每个像素都被处理
#对于多出来的那部分线程,可以在核函数中用条件语句让它啥都不用干
grid_dim_x = (width+block_dim-1)//block_dim
grid_dim_y = (height+block_dim-1)//block_dim
applyFilter(
drv.In(img),
drv.InOut(out),
np.uint32(width),
np.uint32(height),
drv.In(kernel_matrix),
np.uint32(kernel_width),
block=(block_dim, block_dim, 1),
grid=(grid_dim_x, grid_dim_y)
)
# out就是滤波后的图像
K-means是机器学习中经典的聚类算法,注意别跟KNN搞混
假设所有样本点可分为K类,随机选取K个点分别作为这K类的中心点,①计算所有点到K个中心点的距离,把每个点分配给距离它最近的中心。②更新聚类的中心为现有点的平均值。不断迭代直至收敛。
假设有N个点,则第①步的计算次数为N*K,第②步的计算次数为N。故第①步是优化的主要目标。
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import cv2
import math
import numpy as np
mod = SourceModule("""
__device__ void loadVector(float *target, float* source)
{
target[0] = source[0];
target[1] = source[1];
}
//计算样本点所属的类别
__global__ void find_cluster(float *data, int *cluster_ids,
float *cluster, int n_clusters, int n_points) {
int index = blockIdx.x * blockDim.x + threadIdx.x ;
__shared__ float center[2];
float min_distance = 1e8;
int cluster_id = 0;
if(index < n_points){
for(int k=0;k)
find_cluster = mod.get_function("find_cluster")
# 类别数 K
n_clusters = 256
# 样本数 N
n_points = 25600
# 生成二维平面上的N个样本
data = (np.random.randn(n_points, 2)+np.random.randint(0,n_clusters,(n_points,1))).astype(np.float32)
# 初始化样本的类别
data_ids = np.zeros(n_points).astype(np.int32)
# 随机抽取聚类中心
clusters = np.take(data, np.random.randint(0, n_points, n_clusters), 0)
# 一个样本点对应一个线程
block_dim = 512
grid_dim = (n_points+block_dim-1)//block_dim
# 最大迭代次数为60
# 值得一提的是,K-means肯定会收敛,这里限制迭代次数是出于时间的考虑
for i in range(60):
find_cluster(drv.In(data),
drv.InOut(data_ids),
drv.In(clusters),
np.int32(n_clusters),
np.int32(n_points),
block=(block_dim,1,1),
grid=(grid_dim,1))
old_clusters = clusters.copy()
# 更新聚类中心
for j in range(n_clusters):
data_j = np.compress(np.equal(data_ids, j), data, 0)
if data_j.shape[0]>0:
clusters[j]=np.mean(data_j,0)
ave_center_move = np.mean(np.abs(old_clusters-clusters))
print(ave_center_move)
# 判断是否满足收敛条件
if ave_center_move<1e-6:
break