Implementing a PyTorch-style deep learning framework, SimilarTorch, in NumPy

https://github.com/kaszperro/slick-dnn

Project repository: https://github.com/leeguandong/SimilarWork

SimilarTorch is a NumPy-based deep learning framework written in the style of PyTorch. Whether you look at TensorFlow or PyTorch, the underlying principles are much the same; the real difference is static-graph versus dynamic-graph design, while backpropagation works similarly in both. Following PyTorch, SimilarTorch has four main parts: autograd, nn, utils and tensor. autograd implements automatic differentiation, i.e. backpropagation, but unlike a static graph there is no need to first look up the operations on a graph (via a session) and then have the optimizer compute the backward gradients. nn consists of functional and modules: modules defines containers around Module that allow a model to be built in different ways, for example sequentially, and functional exposes the classes in modules in function form. When defining such a class you must implement forward and backward, both of which are invoked automatically during automatic differentiation. tensor is the carrier for all data entering the framework; it converts directly to and from NumPy and does not compute gradients by default. utils mainly defines the data iterators, of which there are two kinds: dataset and dataloader.

Defining Tensor. A tensor is the data carrier and the core data structure of deep learning; the key piece is the backward machinery. The Tensor definition itself is fairly involved and has a relatively large number of attributes.

import numpy as np
from typing import Type

from .nn import Add, Subtract, Multiply, Divide, Power, Positive, Negative, MatMul, SwapAxes
from .autograd import Autograd


class Tensor(object):
    def __init__(self, data: np.array, requires_grad=False):
        self.data = data
        self.requires_grad = requires_grad
        self.grad = None

        if requires_grad:
            self.grad = np.zeros_like(self.data, dtype=np.float32)

        self.backward_function = None
        self.backward_tensor = []

    def backward(self, grad=np.array([1])):
        if self.requires_grad:
            self.grad = grad + self.grad
            sum_ax = tuple(range(len(self.grad.shape) - len(self.data.shape)))
            self.grad = np.sum(self.grad, sum_ax)

        if self.backward_function is not None:
            accumulated = self.backward_function(grad)
            if len(self.backward_tensor) == 1:
                accumulated = accumulated,
            for bv, ac in zip(self.backward_tensor, accumulated):
                bv.backward(ac)

    @classmethod
    def _op(cls, Op: Type[Autograd], *input_vars):
        f = Op()
        return f(*input_vars)

    def __str__(self):
        return "\n" + self.data.__str__()

    def __add__(self, other):
        return self._op(Add, self, other)

    def __radd__(self, other):
        return self._op(Add, other, self)

    def __sub__(self, other):
        return self._op(Subtract, self, other)

    def __rsub__(self, other):
        return self._op(Subtract, other, self)

    def __matmul__(self, other):
        return self._op(MatMul, self, other)

    def __rmatmul__(self, other):
        return self._op(MatMul, other, self)

    def __mul__(self, other):
        return self._op(Multiply, self, other)

    def __rmul__(self, other):
        return self._op(Multiply, other, self)

    def __copy__(self):
        """Copy the current Tensor's data, requires_grad and grad;
        if the Tensor has no gradient, the copy's grad stays None.
        """
        copy = Tensor(np.copy(self.data), requires_grad=self.requires_grad)
        if self.grad is not None and copy.grad is not None:
            copy.grad[:] = self.grad[:]
        return copy

    def copy(self):
        return self.__copy__()

    def numpy(self):
        return self.data.copy()

    def __len__(self):
        return len(self.data)

    @property
    def size(self):
        return self.data.size

    @property
    def ndim(self):
        return self.data.ndim

    @property
    def shape(self):
        return self.data.shape

    @property
    def T(self):
        # minimal implementation: swap the last two axes through the SwapAxes op
        return self.swapaxes(-1, -2)

    def swapaxes(self, axis1, axis2):
        return SwapAxes(axis1, axis2)(self)
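
As a quick illustration of the Tensor API and of the broadcast handling in backward (the sum over sum_ax), here is a small hedged example; it assumes the package exposes Tensor at the top level, as the later code in this post does.

import numpy as np
from similartorch import Tensor

# A (3,)-shaped bias added to a (2, 3) activation: in the backward pass the
# bias gradient is summed over the broadcast (batch) axis by the sum_ax logic above.
x = Tensor(np.ones((2, 3)), requires_grad=True)
b = Tensor(np.ones(3), requires_grad=True)

y = x + b                      # Add op, b is broadcast over the first axis
y.backward(np.ones((2, 3)))    # upstream gradient of ones

print(x.grad.shape)            # (2, 3)
print(b.grad)                  # [2. 2. 2.] -- summed over the broadcast axis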

The nn module. In SimilarTorch it consists of two parts: modules and functional, where functional is simply a function-style wrapper around the classes in modules. modules contains activation, the sequential container, conv, flatten, img2col, init, linear, loss, pooling, the Module base class, basic constructors such as similartorch.ones, and elementary operators such as add and matmul.

mathematical: defines the commonly used arithmetic operations such as add and mul. Each operation is a class inheriting from Autograd that implements forward and backward. NumPy already provides all of these, of course, but re-wrapping them in the framework with an explicit backward method means that when a model is built out of them, backpropagation can chain through their derivatives. This is similar to how TF1 deeply decoupled the operator layer: you can compose the function you want from these primitives, or you can define the function you want directly in nn and write its backward method by hand; the latter gives coarser-grained operators and is not particularly flexible.

import numpy as np
from similartorch.autograd import Autograd


class Add(Autograd):
    def forward(self, ctx, x, y):
        return x + y

    def backward(self, ctx, grad):
        return grad, grad


class Subtract(Autograd):
    def forward(self, ctx, x, y):
        return x - y

    def backward(self, ctx, grad):
        return grad, -grad


class MatMul(Autograd):
    def forward(self, ctx, x, y):
        ctx.save_for_back(x, y)
        return x @ y

    def backward(self, ctx, grad: np.array):
        t1, t2 = ctx.data_for_back

        grad1 = grad @ np.swapaxes(t2, -1, -2)
        grad2 = np.swapaxes(t1, -1, -2) @ grad

        return grad1, grad2


class Multiply(Autograd):
    def forward(self, ctx, x, y):
        ctx.save_for_back(x, y)
        return x * y

    def backward(self, ctx, grad: np.array):
        t1, t2 = ctx.data_for_back
        return grad * t2, grad * t1


class Assign(Autograd):
    def forward(self, ctx, x):
        return x

    def backward(self, ctx, grad):
        return None


class Divide(Autograd):
    def forward(self, ctx, x, y):
        ctx.save_for_back(x, y)
        return x / y

    def backward(self, ctx, grad):
        t1, t2 = ctx.data_for_back
        grad1 = grad / t2
        grad2 = -grad1 * (t1 / t2)
        return grad1, grad2


class Negative(Autograd):
    def forward(self, ctx, x):
        return -x

    def backward(self, ctx, grad):
        return -grad


class Positive(Autograd):
    def forward(self, ctx, x):
        return np.positive(x)

    def backward(self, ctx, grad):
        return np.positive(grad)


class Power(Autograd):
    def forward(self, ctx, x, y):
        ctx.save_for_back(x, y)
        return x ** y

    def backward(self, ctx, grad):
        t1, t2 = ctx.data_for_back
        grad1 = grad * t2 * (t1 ** np.where(t2, (t2 - 1), 1))
        grad2 = grad * (t1 ** t2) * np.log(np.where(t1, t1, 1))
        return grad1, grad2


# --------------------------------------------------------------------------------
class Exp(Autograd):
    def forward(self, ctx, x):
        ctx.save_for_back(x)
        return np.exp(x)

    def backward(self, ctx, grad):
        t1, = ctx.data_for_back  # a single value was saved, so unpack a 1-tuple
        return grad * np.exp(t1)


class Log(Autograd):
    def forward(self, ctx, x):
        ctx.save_for_back(x)  # the input is needed in the backward pass
        return np.log(x)

    def backward(self, ctx, grad):
        t1, = ctx.data_for_back
        return grad / t1
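
To sanity-check these backward rules, a small hedged gradient check can be chained through the Tensor operators defined above (again assuming the top-level Tensor export used elsewhere in this post):

import numpy as np
from similartorch import Tensor

# z = x * y + x builds Multiply and Add nodes; backward should give dz/dx = y + 1, dz/dy = x
x = Tensor(np.array([2.0, 3.0]), requires_grad=True)
y = Tensor(np.array([4.0, 5.0]), requires_grad=True)

z = x * y + x
z.backward(np.ones_like(z.data))

print(x.grad)   # [5. 6.]  (= y + 1)
print(y.grad)   # [2. 3.]  (= x)

# finite-difference estimate of d(sum z)/dx[0] for comparison
eps = 1e-6
f = lambda a: np.sum(a * np.array([4.0, 5.0]) + a)
a0 = np.array([2.0, 3.0])
print((f(a0 + [eps, 0.0]) - f(a0 - [eps, 0.0])) / (2 * eps))   # ~5.0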

activation: the classes and functions here are organised a bit differently from mathematical. The mathematical operations are mostly exposed as Tensor methods, since every piece of data in the framework is a Tensor and can use them directly. The activations in modules correspond one-to-one with functions in functional: in PyTorch the functions in functional are what the classes call in their forward methods, whereas in SimilarTorch the logic mostly lives in the classes' forward methods and functional simply instantiates and calls those classes.

import numpy as np
from similartorch.autograd import Autograd


class ReLU(Autograd):
    def forward(self, ctx, x):
        ctx.save_for_back(x)
        return np.clip(x, a_min=0, a_max=None)

    def backward(self, ctx, grad):
        t, = ctx.data_for_back
        return np.where(t < 0, 0, grad)


class Sigmoid(Autograd):
    def forward(self, ctx, x):
        sig = 1 / (1 + np.exp(-x))
        ctx.save_for_back(sig)
        return sig

    def backward(self, ctx, grad):
        sig, = ctx.data_for_back
        return sig * (1 - sig) * grad


class Softmax(Autograd):
    def forward(self, ctx, x):
        # subtract the row-wise max for numerical stability
        e = np.exp(x - np.max(x, axis=-1, keepdims=True))
        softm = e / np.sum(e, axis=-1, keepdims=True)
        ctx.save_for_back(softm)
        return softm

    def backward(self, ctx, grad):
        softm, = ctx.data_for_back
        # note: this keeps only the diagonal of the softmax Jacobian
        return grad * softm * (1 - softm)


class Softplus(Autograd):
    def forward(self, ctx, x):
        ctx.save_for_back(1 + np.exp(-x))
        return np.log(1 + np.exp(x))  # softplus(x) = log(1 + e^x); its derivative is 1 / (1 + e^-x)

    def backward(self, ctx, grad):
        softp, = ctx.data_for_back
        return grad / softp


class Softsign(Autograd):
    def forward(self, ctx, x):
        ctx.save_for_back(1 + np.abs(x))
        return x / (1 + np.abs(x))

    def backward(self, ctx, grad):
        softs, = ctx.data_for_back
        return grad / (softs * softs)  # d/dx x/(1+|x|) = 1/(1+|x|)^2


class ArcTan(Autograd):
    def forward(self, ctx, x):
        ctx.save_for_back(x)
        return np.arctan(x)

    def backward(self, ctx, grad):
        t, = ctx.data_for_back
        return grad / (t * t + 1)


class Tanh(Autograd):
    def forward(self, ctx, x):
        tanh = np.tanh(x)
        ctx.save_for_back(tanh)
        return tanh

    def backward(self, ctx, grad):
        tanh, = ctx.data_for_back
        return (1 - tanh * tanh) * grad
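
One caveat worth flagging: Softmax.backward above keeps only the diagonal of the softmax Jacobian, so it is an approximation. For reference, the exact Jacobian-vector product can be written directly in NumPy:

import numpy as np

def softmax_backward_exact(softm, grad):
    """Exact softmax JVP along the last axis: dx = s * (g - sum(g * s))."""
    return softm * (grad - np.sum(grad * softm, axis=-1, keepdims=True))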

The loss module

import numpy as np
from similartorch.autograd import Autograd


class MSELoss(Autograd):
    def forward(self, ctx, target, input):
        if target.shape != input.shape:
            raise ValueError("wrong shape")

        ctx.save_for_back(target, input)
        return ((target - input) ** 2).mean()

    def backward(self, ctx, grad):
        target, input = ctx.data_for_back
        n = target.size  # forward averages over all elements, so the derivative divides by the total count
        grad1 = grad * 2 * (target - input) / n
        grad2 = grad * 2 * (input - target) / n
        return grad1, grad2


class CrossEntropyLoss(Autograd):
    def forward(self, ctx, target, input):
        ctx.save_for_back(target, input)
        input = np.clip(input, 1e-15, 1 - 1e-15)
        return -target * np.log(input) - (1 - target) * np.log(1 - input)

    def backward(self, ctx, grad):
        target, input = ctx.data_for_back
        batch = target.shape[0]

        input = np.clip(input, 1e-15, 1 - 1e-15)
        grad1 = grad * (np.log(1 - input) - np.log(input)) / batch
        grad2 = grad * (- target / input + (1 - target) / (1 - input)) / batch
        return grad1, grad2
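
Because the losses are themselves Autograd subclasses, they are used exactly like any other op. A hedged usage sketch (the import path similartorch.nn for the loss classes is an assumption; note the (target, input) argument order):

import numpy as np
from similartorch import Tensor
from similartorch.nn import MSELoss   # import path assumed

pred = Tensor(np.array([[0.2, 0.8]]), requires_grad=True)
target = Tensor(np.array([[0.0, 1.0]]))

loss = MSELoss()(target, pred)   # forward returns the mean squared error
loss.backward()                  # default upstream gradient of 1
print(loss.data)                 # ~0.04
print(pred.grad)                 # gradient of the loss w.r.t. the prediction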

The pooling module. The backward pass here is simple: for max pooling, a mask records the positions of the maxima; for average pooling, the gradient is just spread uniformly over the window.

import numpy as np

from abc import ABC
from similartorch.autograd import Autograd, Context
from .img2col import Img2Col


class BasePool(Autograd, ABC):
    def __init__(self, kernel_size, stride=1):
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        if isinstance(stride, int):
            stride = (stride, stride)

        self.kernel_size = kernel_size
        self.stride = stride

    @staticmethod
    def _fill_col(to_fill, new_shape):
        repeats = new_shape[-2]
        ret = np.repeat(to_fill, repeats, -2)
        ret = np.reshape(ret, new_shape)
        return ret


class MaxPool2d(BasePool):
    def forward(self, ctx: Context, input):
        img_w = input.shape[-1]
        img_h = input.shape[-2]
        channels = input.shape[-3]

        new_w = (img_w - self.kernel_size[0]) // self.stride[0] + 1
        new_h = (img_h - self.kernel_size[1]) // self.stride[1] + 1

        img_out = Img2Col.img2col_forward(self.kernel_size, self.stride, False, input)
        maxed = np.max(img_out, -2)

        ctx.save_for_back(img_out, input.shape, maxed.shape)
        return np.reshape(maxed, (-1, channels, new_h, new_w))

    def backward(self, ctx: Context, grad: np.array = None):
        """切成一小块算max,计算完毕之后再把shape转回去
        """
        reshaped_image, back_shape, maxed_shape = ctx.data_for_back

        grad = np.reshape(grad, maxed_shape)
        mask = (reshaped_image == np.max(reshaped_image, -2, keepdims=True))
        new_grad = self._fill_col(grad, reshaped_image.shape)

        new_grad = np.where(mask, new_grad, 0)
        return Img2Col.img2col_backward(self.kernel_size, self.stride, back_shape, new_grad)


class AvgPool2d(BasePool):
    def forward(self, ctx: Context, input):
        img_w = input.shape[-1]
        img_h = input.shape[-2]
        channels = input.shape[-3]

        new_w = (img_w - self.kernel_size[0]) // self.stride[0] + 1
        new_h = (img_h - self.kernel_size[1]) // self.stride[1] + 1

        img_out = Img2Col.img2col_forward(self.kernel_size, self.stride, False, input)
        averaged = np.average(img_out, -2)
        ctx.save_for_back(img_out, input.shape, averaged.shape)
        return np.reshape(averaged, (-1, channels, new_h, new_w))

    def backward(self, ctx, grad):
        reshaped_image, back_shape, averaged_shape = ctx.data_for_back

        grad = np.reshape(grad, averaged_shape)
        new_grad = self._fill_col(grad, reshaped_image.shape) / (self.kernel_size[0] * self.kernel_size[1])

        return Img2Col.img2col_backward(self.kernel_size, self.stride, back_shape, new_grad)

The conv layer and its base class Module. Module has no backward method, and neither do the Module subclasses linear, sequential and conv. These are higher-level operators whose backward pass can be composed from the low-level operators they are built on, so no explicit backward is given for them.

import math
import numpy as np
import similartorch
from similartorch import Tensor
from .img2col import Img2Col
from .module import Module
from . import init


class Conv2d(Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding=0, add_bias=True):
        super(Conv2d, self).__init__()
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        if isinstance(stride, int):
            stride = (stride, stride)
        if isinstance(padding, int):
            padding = (padding, padding)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.padding = padding
        self.stride = stride
        self.add_bias = add_bias

        self.weight = similartorch.rands([0, 0.05, (self.out_channels, self.in_channels,
                                                    self.kernel_size[0], self.kernel_size[1])], requires_grad=True)
        if add_bias:
            self.bias = similartorch.zeros(out_channels, np.float32, requires_grad=True)
            self.register_parameter(("weight", self.weight), ("bias", self.bias))
        else:
            self.bias = None  # keep the attribute defined so reset_parameters can check it
            self.register_parameter(("weight", self.weight))

        self.img2col = Img2Col(self.kernel_size, self.stride)

    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input: Tensor) -> Tensor:
        img2col = self.img2col(input)
        output = self.weight.reshape(self.weight.shape[0], -1) @ img2col

        img_w = input.shape[-1]
        img_h = input.shape[-2]
        new_w = (img_w - self.kernel_size[0]) // self.stride[0] + 1
        new_h = (img_h - self.kernel_size[1]) // self.stride[1] + 1

        batch_input = len(input.shape) == 4
        if batch_input:
            output_shape = (input.shape[0], self.out_channels, new_h, new_w)
        else:
            output_shape = (self.out_channels, new_h, new_w)

        if self.add_bias:
            # move the channel axis last so the (out_channels,) bias broadcasts, then move it back
            output = (output.swapaxes(-1, -2) + self.bias).swapaxes(-1, -2)

        return output.reshape(*output_shape)
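
The Img2Col helper that both the pooling and convolution layers rely on is not reproduced in this post. As a rough sketch of the idea only (not the project's exact API), an img2col transform unrolls every kernel-sized window into a column, so that the convolution above reduces to a single matrix multiplication:

import numpy as np

def img2col_sketch(img, kernel_size, stride):
    """Unroll (C, H, W) patches into columns of shape (C*kh*kw, out_h*out_w).

    Simplified illustration; the project's Img2Col additionally handles
    batching and provides the matching backward (col2img) used by the pools.
    """
    c, h, w = img.shape
    kh, kw = kernel_size
    sh, sw = stride
    out_h = (h - kh) // sh + 1
    out_w = (w - kw) // sw + 1

    cols = np.empty((c * kh * kw, out_h * out_w), dtype=img.dtype)
    col = 0
    for i in range(out_h):
        for j in range(out_w):
            patch = img[:, i * sh:i * sh + kh, j * sw:j * sw + kw]
            cols[:, col] = patch.reshape(-1)
            col += 1
    return cols

# With the weights reshaped to (out_channels, C*kh*kw), the Conv2d forward
# becomes weight_2d @ cols followed by a reshape to (out_channels, out_h, out_w).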


import numpy as np

from abc import ABC, abstractmethod
from collections import OrderedDict

from similartorch.tensor import Tensor


class Module(ABC):
    def __init__(self):
        self._parameters = OrderedDict([])

    def register_parameter(self, *var_iterable):
        for var_name, var in var_iterable:
            self._parameters.update({var_name: var})

    def parameters(self) -> list:
        return list(self._parameters.values())

    def get_state_dict(self) -> OrderedDict:
        return self._parameters

    def load_state_dict(self, state_dict: OrderedDict):
        for k, val in state_dict.items():
            self._parameters[k].data = np.array(val)

    @abstractmethod
    def forward(self, *input) -> Tensor:
        raise NotImplementedError

    def __call__(self, *input) -> Tensor:
        return self.forward(*input)
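
To show how the Module contract is meant to be used, here is a hedged sketch of a minimal linear layer (the class name TinyLinear and the import path similartorch.nn.module are assumptions for illustration, not part of the project): parameters are plain Tensors registered via register_parameter, and forward is expressed with the low-level ops so the backward pass comes for free.

import numpy as np
from similartorch import Tensor
from similartorch.nn.module import Module   # import path assumed


class TinyLinear(Module):
    """Hypothetical minimal linear layer illustrating the Module contract."""

    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = Tensor(0.05 * np.random.randn(in_features, out_features).astype(np.float32),
                             requires_grad=True)
        self.bias = Tensor(np.zeros(out_features, dtype=np.float32), requires_grad=True)
        self.register_parameter(("weight", self.weight), ("bias", self.bias))

    def forward(self, x: Tensor) -> Tensor:
        # MatMul and Add nodes record everything needed for backpropagation
        return x @ self.weight + self.bias


# layer = TinyLinear(4, 2)
# out = layer(Tensor(np.random.randn(3, 4).astype(np.float32)))
# print(len(layer.parameters()))   # 2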

The autograd automatic differentiation module

from abc import ABC, abstractmethod

from similartorch.tensor import Tensor


class Context:
    def __init__(self):
        self.data_for_back = None

    def save_for_back(self, *data):
        self.data_for_back = tuple(data)


class Autograd(ABC):
    def apply(self, *tensor_list):
        ctx = Context()

        forward_tensor = self.forward(ctx, *map(lambda v: v.data, tensor_list))

        output_tensor = Tensor(forward_tensor, requires_grad=False)
        output_tensor.backward_function = lambda x: self.backward(ctx, x)
        output_tensor.backward_tensor = list(tensor_list)
        return output_tensor

    @abstractmethod
    def forward(self, ctx, *tensor_list):
        raise NotImplementedError

    @abstractmethod
    def backward(self, ctx, grad):
        raise NotImplementedError

    def __call__(self, *tensor_list):
        return self.apply(*tensor_list)
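
Defining a new differentiable operation therefore only requires subclassing Autograd; apply() wires the result into the backward graph. A hedged example (the Square op below is made up for illustration and is not part of the project):

import numpy as np
from similartorch import Tensor
from similartorch.autograd import Autograd


class Square(Autograd):
    """Hypothetical example op: implement forward/backward, get autodiff for free."""

    def forward(self, ctx, x):
        ctx.save_for_back(x)
        return x * x

    def backward(self, ctx, grad):
        x, = ctx.data_for_back
        return 2 * x * grad


x = Tensor(np.array([1.0, 2.0, 3.0]), requires_grad=True)
y = Square()(x)                  # __call__ -> apply -> forward
y.backward(np.ones_like(y.data))
print(x.grad)                    # [2. 4. 6.]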

The optimizer module

from abc import ABC, abstractmethod


class Optimizer(ABC):
    def __init__(self, param_list: list):
        self.param_list = param_list
        self.state = {}

    def zero_grad(self):
        for param in self.param_list:
            param.grad.fill(0)

    @abstractmethod
    def step(self):
        raise NotImplementedError


import numpy as np
from .optimizer import Optimizer


class Adam(Optimizer):
    def __init__(self, param_list: list, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
        super(Adam, self).__init__(param_list)

        self.lr = learning_rate

        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = epsilon

    @staticmethod
    def initialize_state(state, param):
        state["step"] = 0
        state["m"] = np.zeros(param.grad.shape)
        state["v"] = np.zeros(param.grad.shape)

    def step(self):
        for param in self.param_list:
            if param.grad is None:
                continue

            if param not in self.state:
                self.state[param] = {}

            state = self.state[param]

            if len(state) == 0:
                self.initialize_state(state, param)

            state["step"] += 1
            state["m"] = self.beta1 * state["m"] + (1 - self.beta1) * param.grad
            state["v"] = self.beta2 * state["v"] + (1 - self.beta2) * param.grad

            m_hat = state["m"] / (1 - self.beta1 ** state["step"])
            v_hat = state["v"] / (1 - self.beta2 ** state["step"])
            param.data -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)


import numpy as np
from .optimizer import Optimizer


class SGD(Optimizer):
    def __init__(self, param_list: list, learning_rate=0.01, momentum=0., decay=0.):
        super(SGD, self).__init__(param_list)

        self.lr = learning_rate
        self.decay = decay
        self.momentum = momentum

    @staticmethod
    def initialize_state(state, param):
        state["v"] = np.zeros_like(param.grad)

    def step(self):
        for param in self.param_list:
            if param.grad is None:
                continue
            if param not in self.state:
                self.state[param] = {}

            state = self.state[param]

            if len(state) == 0:
                self.initialize_state(state, param)

            state["v"] = self.momentum * state["v"] - self.lr * param.grad
            param.data += state["v"]

        self.lr = self.lr / (1 + self.decay)

The utils module mainly contains data-loading code: dataset and dataloader. The dataloader implements the __iter__ and __next__ iterator protocol, and there are concrete dataset classes for specific data such as MNIST.
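
Those classes are not reproduced in the post; a minimal sketch of the iterator pattern described above (the class names ArrayDataset and SimpleDataLoader are assumed for illustration, not the project's API) could look like this:

import numpy as np


class ArrayDataset:
    """Hypothetical dataset wrapping two numpy arrays with a torch-style __getitem__/__len__."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]


class SimpleDataLoader:
    """Hypothetical loader implementing __iter__/__next__ to yield shuffled mini-batches."""

    def __init__(self, dataset, batch_size=32, shuffle=True):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self):
        n = len(self.dataset)
        self.order = np.random.permutation(n) if self.shuffle else np.arange(n)
        self.pos = 0
        return self

    def __next__(self):
        if self.pos >= len(self.order):
            raise StopIteration
        idx = self.order[self.pos:self.pos + self.batch_size]
        self.pos += self.batch_size
        batch = [self.dataset[i] for i in idx]
        xs, ys = zip(*batch)
        return np.stack(xs), np.stack(ys)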

Taken as a whole, this is a small Torch-like deep learning framework implemented in pure NumPy. The autograd module defines the core forward and backward interface as an abstract base class; the backward pass is driven through backward_function, which is in turn implemented by each operator's backward method, and most operators inherit from Autograd and implement both forward and backward. On top of that sit the modules in nn, the functions and classes needed to build models; optim provides the optimizers; and utils provides the data-loading utilities.
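
To see how the pieces fit together end to end, here is a rough training-loop sketch under the API shown above; it reuses the hypothetical TinyLinear module sketched earlier, and the import paths for the loss and the optimizer are assumptions where the post does not show them.

import numpy as np
from similartorch import Tensor
from similartorch.nn import MSELoss    # import path assumed
from similartorch.optim import SGD     # import path assumed

model = TinyLinear(4, 1)               # the hypothetical Module sketched earlier
criterion = MSELoss()
optimizer = SGD(model.parameters(), learning_rate=0.1)

x = Tensor(np.random.randn(16, 4).astype(np.float32))
y = Tensor(np.random.randn(16, 1).astype(np.float32))

for epoch in range(10):
    optimizer.zero_grad()              # reset the accumulated gradients
    pred = model(x)                    # forward pass builds the backward graph
    loss = criterion(y, pred)          # note the (target, input) argument order
    loss.backward()                    # propagate gradients down to the parameters
    optimizer.step()                   # in-place SGD update of param.data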
