Hand-Written CPU Convolution Kernels for Accelerating Neural Network Computation (2): Simple OpenCL Implementations of Convolution, Pooling, Activation, Fully Connected, and Batch Normalization (in Python)

This post uses pyopencl, a library that lets you run OpenCL from Python, to implement five operators — convolution, pooling, activation, fully connected, and batch normalization — in a straightforward way, with no algorithmic or memory optimizations.
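
The host wrappers in section 6 assume that a pyopencl context (`ctx`), a command queue (`queue`), the `mf` alias for the memory flags, and the compiled kernel objects (`Conv2D`, `BatchNorm2D`, `Linear`, `MaxPool2D`, `ReluD1`, `ReluD3`) already exist. In a notebook they come from the `%%cl_kernel` cell magic of `pyopencl.ipython_ext`; outside a notebook, a minimal setup sketch might look like the following, where `kernel_src` is a placeholder for the concatenated OpenCL source of the kernels below:

import time

import numpy as np
import pyopencl as cl
import torch
import torch.nn as nn

# Create the context/queue that the wrappers refer to as `ctx` and `queue`;
# `mf` is the usual shorthand for pyopencl's memory flags.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

# `kernel_src` is assumed to hold the OpenCL source of the six kernels below.
prg = cl.Program(ctx, kernel_src).build(options="-cl-fast-relaxed-math")
Conv2D = prg.Conv2D
BatchNorm2D = prg.BatchNorm2D
Linear = prg.Linear
MaxPool2D = prg.MaxPool2D
ReluD1 = prg.ReluD1
ReluD3 = prg.ReluD3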

1 Conv2d

%%cl_kernel -o "-cl-fast-relaxed-math"

__kernel void Conv2D(__global const float *ift, 
                     __global float *weight, __global float *bias,
                     __global int *output_channel, __global int *output_height, __global int *output_width,
                     __global int *input_channel, __global int *input_height, __global int *input_width,
                     __global int *feature_height, __global int *feature_width,
                     __global float *oft) {
    int Co = *output_channel, Ho = *output_height, Wo = *output_width;
    int Ci = *input_channel, Hi = *input_height, Wi = *input_width;
    int Hf = *feature_height, Wf = *feature_width;
    // one work-item per output element: (posc, posh, posw) ranges over (Co, Ho, Wo)
    int posc = get_global_id(0), posh = get_global_id(1), posw = get_global_id(2);
    int So = Wo*Ho, Sf = Wf*Hf, Si = Wi*Hi;  // plane sizes of output, filter and input
    int Vf = Sf*Ci;                          // number of weights in one filter
    int i = posc*(So) + (posh*Wo+posw);      // flat CHW-style index of this output element
    
    oft[i] = bias[posc];
    for(int l = 0; l < Hf; l++) {
        for(int m = 0; m < Wf; m++) {
            for(int n = 0; n < Ci; n++) {
                oft[i] += ift[(n*Si)+((posh+l)*Wi)+(posw+m)]*weight[(posc*Vf)+(n*Sf)+(l*Wf)+(m)];
            }
        }
    }
}
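
The kernel launches one work-item per output element over a (Co, Ho, Wo) global range and assumes stride 1, with any padding already applied on the host. As a sanity check of the index arithmetic, the same computation written as a plain NumPy reference (a sketch, not part of the OpenCL path) is:

def conv2d_reference(ift, weight, bias):
    # ift: (Ci, Hi, Wi), weight: (Co, Ci, Hf, Wf), bias: (Co,); stride 1, no padding
    Co, Ci, Hf, Wf = weight.shape
    _, Hi, Wi = ift.shape
    Ho, Wo = Hi - Hf + 1, Wi - Wf + 1
    oft = np.zeros((Co, Ho, Wo), dtype=np.float32)
    for c in range(Co):
        for h in range(Ho):
            for w in range(Wo):
                window = ift[:, h:h + Hf, w:w + Wf]
                oft[c, h, w] = bias[c] + np.sum(window * weight[c])
    return oft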

2 BatchNorm2D

%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void BatchNorm2D(__global const float *ift, 
        __global float *mean, __global float *std,
        __global float *eps, __global float *weight, __global float *bias,
        __global int *channel, __global int *height, __global int *width,
        __global float *oft) {
    int c = *channel, h = *height, w = *width;
    int posc = get_global_id(0), posh = get_global_id(1), posw = get_global_id(2);
    int i = posc*(w*h) + (posh*w+posw);
    float e = *eps;
    float res1 = ift[i] - mean[posc];
    float res2 = sqrt(std[posc] + e);
    float res3 = res1 / res2;
    oft[i] = res3 * weight[posc] + bias[posc];
}
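
For reference, every work-item evaluates the standard batch-norm transform on one element. Note that the `std` argument actually holds the per-channel variance, because the kernel adds `eps` before taking the square root (the host wrapper in section 6 passes it accordingly):

$$ y_{c,h,w} = \frac{x_{c,h,w} - \mu_c}{\sqrt{\sigma_c^2 + \epsilon}} \cdot \gamma_c + \beta_c $$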

3 Linear

%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void Linear(__global const float *ift, 
                     __global float *weight, __global float *bias,
                     __global int *output_channel, __global int *input_channel,
                     __global float *oft) {
    int Co = *output_channel, Ci = *input_channel;
    int posCo = get_global_id(0);
    
    oft[posCo] = bias[posCo];
    for(int k = 0; k < Ci; k++) {
        oft[posCo] += ift[k]*weight[posCo*Ci+k];
    }
}

4 MaxPool2D

%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void MaxPool2D(__global const float *ift, 
                        __global int *size, __global int *stride,
                        __global int *channel, 
                        __global int *input_height, __global int *input_width, 
                        __global int *output_height, __global int *output_width, 
                        __global float *oft) {
    int sz = *size, sd = *stride;
    int C = *channel, Hi = *input_height, Wi = *input_width, Ho = *output_height, Wo = *output_width;
    int posc = get_global_id(0), posh = get_global_id(1), posw = get_global_id(2);
    
    int So = Ho*Wo, Si = Hi*Wi;
    int i = (posc*(So))+(posh*Wo)+(posw);
    int startX = posw*sd, startY = posh*sd;  // top-left corner of this pooling window
    
    oft[i] = ift[(posc*(Si))+(startY*Wi)+startX];  // initialise the max with the first element of the window
    for(int y = 0; y < sz; y++) {
        for(int x = 0; x < sz; x++) {
            oft[i] = max(oft[i], ift[(posc*(Si))+((startY+y)*Wi)+(startX+x)]);
        }
    }
}

5 ReLU

ReLU is split into two kernels, one for 1-D tensors and one for 3-D tensors:

%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void ReluD1(__global const float *ift, __global float *oft) {
    int i = get_global_id(0);
    oft[i] = max((float)0, ift[i]);
}

%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void ReluD3(__global const float *ift, __global float *oft,
                  __global int *channel, __global int *height, __global int *width) {
    int c = *channel;
    int h = *height;
    int w = *width;
    int posc = get_global_id(0);
    int posh = get_global_id(1);
    int posw = get_global_id(2);
    int i = posc*(w*h) + (posh*w+posw);
    oft[i] = max((float)0, ift[i]);
}
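
Since the index computed in ReluD3, posc*(w*h) + posh*w + posw, is exactly the flat offset into a contiguous CHW array, the 1-D kernel alone would also be enough if the host flattened the tensor first. A sketch of that alternative (using the ctx/queue/mf/ReluD1 names from the setup above):

def relu3d_via_1d(x):
    # Flatten the CHW tensor, run the 1-D kernel, and restore the shape.
    flat = np.ascontiguousarray(x, dtype=np.float32).ravel()
    out = np.empty_like(flat)
    in_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=flat)
    out_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, out.nbytes)
    ReluD1(queue, flat.shape, None, in_gpu, out_gpu)
    cl.enqueue_copy(queue, out, out_gpu)
    return out.reshape(x.shape)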

6 Host-side wrappers for the OpenCL kernels (the data handling here leaves room for optimization)

def conv2d(input_numpy, kernel_weight_numpy, kernel_bias_numpy, padding = 0):
    # Preprocessing before the convolution: zero padding and output allocation on the host
    weight_cpu = kernel_weight_numpy
    bias_cpu = kernel_bias_numpy

    Ci, Hi, Wi = input_numpy.shape
    input_cpu = np.zeros((Ci, Hi+2*padding, Wi+2*padding)).astype(np.float32)
    if padding > 0:
        input_cpu[:, padding:-padding, padding:-padding] = input_numpy
    else:
        input_cpu = input_numpy

    Ci, Hi, Wi = input_cpu.shape
    Co, Ci, Hf, Wf = weight_cpu.shape
    Ho, Wo = Hi - Hf + 1, Wi - Wf + 1
    output_cpu = np.zeros((Co,Ho,Wo)).astype(np.float32)
    # Copy the data from the host to the device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_cpu)

    kernel_weight_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = weight_cpu)
    kernel_bias_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = bias_cpu)

    output_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Co))
    output_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Ho))
    output_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Wo))
    input_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Ci))
    input_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Hi))
    input_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Wi))
    feature_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Hf))
    feature_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Wf))

    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    # Launch the kernel on the device (one work-item per output element)
    Conv2D(queue, output_cpu.shape, None, 
           input_gpu, 
           kernel_weight_gpu, kernel_bias_gpu,
           output_channel_gpu, output_height_gpu, output_width_gpu,
           input_channel_gpu, input_height_gpu, input_width_gpu,
           feature_height_gpu, feature_width_gpu,
           output_gpu)
    # Read the result back from the device to the host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    # Return the result
    return output_cpu
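
A quick way to check the wrapper against PyTorch's own convolution (a verification sketch; the shapes match the first LeNet layer, and torch.nn.functional is imported as F):

import torch.nn.functional as F

x = np.random.randn(1, 28, 28).astype(np.float32)
w = np.random.randn(6, 1, 5, 5).astype(np.float32)
b = np.random.randn(6).astype(np.float32)
ours = conv2d(x, w, b, padding=2)
ref = F.conv2d(torch.from_numpy(x).unsqueeze(0), torch.from_numpy(w),
               torch.from_numpy(b), padding=2)[0].numpy()
print(np.abs(ours - ref).max())   # expect only a tiny floating-point difference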

def batchnorm2d(input_numpy, eps, weight, bias):
    # Host-side preprocessing: compute the per-channel batch statistics
    mean = input_numpy.mean(axis=(1,2))
    std = input_numpy.std(axis=(1,2))**2    # actually the variance; the kernel adds eps and takes sqrt
    output_cpu = np.empty_like(input_numpy)
    # Copy the data from the host to the device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_numpy)

    mean_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = mean)
    std_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = std)

    eps_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.float32(eps))  # eps is declared as float* in the kernel
    weight_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = weight)
    bias_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = bias)

    channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[0]))
    height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[1]))
    width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[2]))

    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    # Launch the kernel on the device
    BatchNorm2D(queue, input_numpy.shape, None, input_gpu, 
                mean_gpu, std_gpu, 
                eps_gpu, weight_gpu, bias_gpu, 
                channel_gpu, height_gpu, width_gpu, 
                output_gpu)
    # Read the result back from the device to the host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    # Return the result
    return output_cpu
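
Because this wrapper recomputes mean and variance from the current input, it behaves like BatchNorm in training mode. To reproduce PyTorch's eval-mode behaviour, the same kernel could instead be fed the layer's stored running statistics; a sketch (the data flow mirrors batchnorm2d() above, only the statistics change):

def batchnorm2d_eval(input_numpy, bn_layer):
    # Use the layer's running statistics (eval-mode behaviour) instead of batch statistics.
    mean = np.ascontiguousarray(bn_layer.running_mean.detach().numpy(), dtype=np.float32)
    var = np.ascontiguousarray(bn_layer.running_var.detach().numpy(), dtype=np.float32)
    weight = bn_layer.weight.detach().numpy()
    bias = bn_layer.bias.detach().numpy()
    output_cpu = np.empty_like(input_numpy)

    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_numpy)
    mean_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = mean)
    var_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = var)
    eps_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.float32(bn_layer.eps))
    weight_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = weight)
    bias_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = bias)
    channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[0]))
    height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[1]))
    width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[2]))
    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)

    BatchNorm2D(queue, input_numpy.shape, None, input_gpu,
                mean_gpu, var_gpu, eps_gpu, weight_gpu, bias_gpu,
                channel_gpu, height_gpu, width_gpu, output_gpu)
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    return output_cpu

Note that this only matches the end-to-end PyTorch output if the baseline model is also run in eval() mode.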

def relu(input_numpy):
    output_cpu = np.empty_like(input_numpy)
    
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_numpy)
    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    if len(input_numpy.shape) == 3:
        channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[0]))
        height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[1]))
        width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[2]))
        
        ReluD3(queue, input_numpy.shape, None, input_gpu, output_gpu, channel_gpu, height_gpu, width_gpu)
    else:
        ReluD1(queue, input_numpy.shape, None, input_gpu, output_gpu)
        
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    return output_cpu

def maxpool2d(input_numpy,size=2,stride=2):
    # Host side: zero-pad so every pooling window stays in range, and allocate the output
    # (zero padding may differ from PyTorch's ceil_mode when a window holds only negative values)
    C, Hi, Wi = input_numpy.shape
    Ho, Wo = int(np.floor(Hi/stride)),int(np.floor(Wi/stride))
    row_remainder,col_remainder = Hi%stride, Wi%stride
    Ho += int(row_remainder!=0)
    Wo += int(col_remainder!=0)
    input_cpu = np.zeros((C, Hi+size-row_remainder, Wi+size-col_remainder)).astype(np.float32)
    input_cpu[:, :Hi, :Wi] = input_numpy

    C, Hi, Wi = input_cpu.shape

    output_cpu = np.zeros((C,Ho,Wo)).astype(np.float32)
    # Copy the data from the host to the device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_cpu)

    size_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(size))
    stride_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(stride))

    channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(C))
    input_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Hi))
    input_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Wi))
    output_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Ho))
    output_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Wo))

    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    # Launch the kernel on the device
    MaxPool2D(queue, output_cpu.shape, None, 
               input_gpu, 
               size_gpu, stride_gpu,
               channel_gpu,
               input_height_gpu, input_width_gpu,
               output_height_gpu, output_width_gpu,
               output_gpu)
    # Read the result back from the device to the host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    # Return the result
    return output_cpu

def linear(input_numpy, weight_numpy, bias_numpy):
    # Minimal host-side preprocessing
    input_cpu = input_numpy
    weight_cpu = weight_numpy
    bias_cpu = bias_numpy

    Co, Ci = weight_cpu.shape

    output_cpu = np.zeros((Co,)).astype(np.float32)
    # Copy the data from the host to the device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_cpu)

    weight_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = weight_cpu)
    bias_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = bias_cpu)

    output_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Co))
    input_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Ci))

    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    # Launch the kernel on the device
    Linear(queue, output_cpu.shape, None, 
               input_gpu, 
               weight_gpu, bias_gpu,
               output_channel_gpu, input_channel_gpu,
               output_gpu)
    # Read the result back from the device to the host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    # Return the result
    return output_cpu
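
A one-line NumPy cross-check of the fully connected wrapper (sketch; the shapes match the first LeNet FC layer):

x = np.random.randn(400).astype(np.float32)
w = np.random.randn(120, 400).astype(np.float32)
b = np.random.randn(120).astype(np.float32)
print(np.abs(linear(x, w, b) - (w @ x + b)).max())   # expect only a tiny floating-point difference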

7 Load the PyTorch model and extract its parameters

class LeNet(nn.Module):
    def __init__(self, num_classes):
        super(LeNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(6),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1 ),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.fclayer = nn.Sequential(
            nn.Linear(16*5*5, 120),
            nn.ReLU(),
            nn.Linear(120,84),
            nn.ReLU(),
            nn.Linear(84,num_classes)
        ) 
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x=x.view(-1, 16*5*5)
        x=self.fclayer(x)
        return x

model = LeNet(10)
model.load_state_dict(torch.load('model.pth'))

8 LeNet inference with the OpenCL operators

def opencl_LeNet(model, input_numpy):
    layer1_Conv2d = model.layer1[0]
    layer1_BatchNorm2d = model.layer1[1]
    layer1_ReLU = model.layer1[2]
    layer1_MaxPool2d = model.layer1[3]

    layer2_Conv2d = model.layer2[0]
    layer2_BatchNorm2d = model.layer2[1]
    layer2_ReLU = model.layer2[2]
    layer2_MaxPool2d = model.layer2[3]

    fclayer_Linear_1 = model.fclayer[0]
    fclayer_ReLU_1 = model.fclayer[1]
    fclayer_Linear_2 = model.fclayer[2]
    fclayer_ReLU_2 = model.fclayer[3]
    fclayer_Linear_3 = model.fclayer[4]

    
    
    layer1_Conv2d_numpy_out = conv2d(input_numpy, layer1_Conv2d.weight.detach().numpy(), layer1_Conv2d.bias.detach().numpy(), padding = 2)
    layer1_BatchNorm2d_numpy_out = batchnorm2d(layer1_Conv2d_numpy_out, layer1_BatchNorm2d.eps, layer1_BatchNorm2d.weight.detach().numpy(), layer1_BatchNorm2d.bias.detach().numpy())
    layer1_ReLU_numpy_out = relu(layer1_BatchNorm2d_numpy_out)
    layer1_MaxPool2d_numpy_out = maxpool2d(layer1_ReLU_numpy_out, 2)

    layer2_Conv2d_numpy_out = conv2d(layer1_MaxPool2d_numpy_out, layer2_Conv2d.weight.detach().numpy(), layer2_Conv2d.bias.detach().numpy(), padding = 0)
    layer2_BatchNorm2d_numpy_out = batchnorm2d(layer2_Conv2d_numpy_out, layer2_BatchNorm2d.eps, layer2_BatchNorm2d.weight.detach().numpy(), layer2_BatchNorm2d.bias.detach().numpy())
    layer2_ReLU_numpy_out = relu(layer2_BatchNorm2d_numpy_out)
    layer2_MaxPool2d_numpy_out = maxpool2d(layer2_ReLU_numpy_out, 2)

    layer2_MaxPool2d_numpy_out = layer2_MaxPool2d_numpy_out.reshape(-1, 16*5*5)

    fclayer_Linear_1_numpy_out = linear(layer2_MaxPool2d_numpy_out, fclayer_Linear_1.weight.detach().numpy(), fclayer_Linear_1.bias.detach().numpy())
    fclayer_ReLU_1_numpy_out = relu(fclayer_Linear_1_numpy_out)
    fclayer_Linear_2_numpy_out = linear(fclayer_ReLU_1_numpy_out, fclayer_Linear_2.weight.detach().numpy(), fclayer_Linear_2.bias.detach().numpy())
    fclayer_ReLU_2_numpy_out = relu(fclayer_Linear_2_numpy_out)
    fclayer_Linear_3_numpy_out = linear(fclayer_ReLU_2_numpy_out, fclayer_Linear_3.weight.detach().numpy(), fclayer_Linear_3.bias.detach().numpy())

    return fclayer_Linear_3_numpy_out

input = torch.randn(1,1,28,28)
input_numpy = input.numpy()[0,:,:,:]

# OpenCL timing test
time1 = time.time()
out = opencl_LeNet(model,input_numpy)
time2 = time.time()
print('{} ms'.format(1000*(time2-time1)))
print(out)

# PyTorch CPU baseline
time1 = time.time()
out = model(input)
time2 = time.time()
print('{} ms'.format(1000*(time2-time1)))
print(out.detach().numpy())
  • The batch dimension is dropped. An OpenCL NDRange can only be 1-, 2-, or 3-dimensional, and in NCHW the C, H, W dimensions are the large ones, so parallelizing over them gives the best payoff; the N dimension can simply be handled by pushing samples through one at a time, so it was removed.
  • Every layer takes host-side (CPU) data in and returns host-side data. Each layer needs a fair amount of preprocessing that would be awkward to do in OpenCL, so every wrapper preprocesses on the CPU, copies the data to the device, runs the kernel, copies the result back to the host, and returns it. As a consequence, data is shuttled between host and device at every layer, and that is essentially where the bottleneck is: the pure device execution time across all layers is about 6 ms, roughly twice as slow as the PyTorch CPU run. An obvious optimization is to do all the preprocessing on the CPU up front and hand the data to the device in one go, keeping intermediate results on the device, which should give a noticeable speedup; see the sketch after this list.
  • Each operator is a direct transcription of the earlier for-loop implementation; there are still plenty of details inside each kernel that could be optimized.
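
As an illustration of the last two points, one way to cut the host/device traffic is to keep intermediate feature maps on the device and only copy the network input in and the final result out. A minimal sketch covering a single conv + ReLU pair (same ctx/queue/kernels as above; the small ibuf() helper exists only for brevity here):

def conv_relu_on_device(x, w, b, padding):
    # Host-side padding and shape bookkeeping, as before.
    Ci, Hi, Wi = x.shape
    xp = np.zeros((Ci, Hi + 2*padding, Wi + 2*padding), dtype=np.float32)
    xp[:, padding:padding + Hi, padding:padding + Wi] = x
    Co, Ci, Hf, Wf = w.shape
    Ho, Wo = xp.shape[1] - Hf + 1, xp.shape[2] - Wf + 1

    def ibuf(v):
        # The kernels take scalars as int pointers, so wrap each one in a tiny buffer.
        return cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(v))

    in_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=xp)
    w_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=w)
    b_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
    nbytes = Co * Ho * Wo * np.dtype(np.float32).itemsize
    conv_out = cl.Buffer(ctx, mf.READ_WRITE, nbytes)   # stays on the device
    relu_out = cl.Buffer(ctx, mf.READ_WRITE, nbytes)

    Conv2D(queue, (Co, Ho, Wo), None, in_gpu, w_gpu, b_gpu,
           ibuf(Co), ibuf(Ho), ibuf(Wo),
           ibuf(Ci), ibuf(xp.shape[1]), ibuf(xp.shape[2]),
           ibuf(Hf), ibuf(Wf), conv_out)
    ReluD3(queue, (Co, Ho, Wo), None, conv_out, relu_out, ibuf(Co), ibuf(Ho), ibuf(Wo))

    out = np.empty((Co, Ho, Wo), dtype=np.float32)
    cl.enqueue_copy(queue, out, relu_out)   # only the final result is copied back
    return out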
