批量归一化（batch-normalization）

2022-09-07 本文已影响0人小黄不头秃

(一)批量归一化BN

（1）问题的提出

随着学习的深入，我们搭建的网络在一点一点地变深，这就带来了之前提到过的问题：有可能会造成梯度消失或者梯度爆炸。另外一个问题是，靠近输出的层梯度较大、参数更新得快，而靠近输入的层梯度往往较小、参数更新得很慢。

损失出现在最后，后面的层训练较快
数据在最底部

底层数据训练的很慢
底层一变化，所有层都得跟着变化，就像是蝴蝶效应
最后的那些层需要重新学习多次
导致收敛变慢

有什么办法能做到，学习底部层参数的时候避免变化顶部层？

（2）解决方案

会出现这样子的原因就是因为输入和输出的差别太大了，数据在网络中变化非常剧烈。前面我们在讨论数值稳定性的时候，提出了其中一部分的优化方法是，让权重初始化服从某种分布，即xavier权重初始化。那么现在的另一种解决方法是批量归一化。

我们先算出一个batch里面所有数据的均值和方差（方差后面加ε是为了防止除零错误），然后利用下面的公式对该层的输入进行归一化，再进行缩放和平移。(可以通过代码进行理解)

$\begin{aligned} \hat{\boldsymbol{\mu}}_\mathcal{B} &= \frac{1}{|\mathcal{B}|} \sum_{\mathbf{x} \in \mathcal{B}} \mathbf{x},\\ \hat{\boldsymbol{\sigma}}_\mathcal{B}^2 &= \frac{1}{|\mathcal{B}|} \sum_{\mathbf{x} \in \mathcal{B}} (\mathbf{x} - \hat{\boldsymbol{\mu}}_{\mathcal{B}})^2 + \epsilon.\end{aligned}$

$\mathrm{BN}(\mathbf{x}) = \boldsymbol{\gamma} \odot \frac{\mathbf{x} - \hat{\boldsymbol{\mu}}_\mathcal{B}}{\hat{\boldsymbol{\sigma}}_\mathcal{B}} + \boldsymbol{\beta}.$

我们可以看出里面有两个新的参数 $γ$ 和 $β$ ，这两个参数是在训练过程中学习所得的。
一般会将其放在全连接层和卷积层的输出上、激活函数之前；或者放在全连接层和卷积层的输入上。

对于全连接层来说，作用在特征维上：输入形状为 (batch, feature)，每一列是一个特征，对每一列分别计算均值和方差。
对于卷积层来说，其作用在通道维上：输入形状为 (batch, channel, h, w)，对每个通道分别计算均值和方差（例如RGB图像的三个通道）。

（3）批量归一化到底是做了些什么事情呢？

最初的论文认为，批量归一化的作用是减少网络层与层之间的内部协变量偏移（internal covariate shift）。
后续的论文指出，它其实相当于对数据加了噪音：用当前小批量的均值和方差对数据进行偏移和缩放。并且它指出没有必要和Dropout混用。

（二）代码实现

（1）从零开始

import torch
from torch import nn
from torchvision import transforms
import torchvision
from torch.utils import data
from d2l import torch as d2l
import numpy as np
import matplotlib.pyplot as plt

# Arguments: (x, gamma, beta, running mean, running variance, eps,
#             momentum used to update the running mean/variance)
def batch_norm(x, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Batch-normalize ``x`` and return the updated running statistics.

    The mode is derived from ``torch.is_grad_enabled()``: when autograd is
    disabled the running statistics are used (prediction mode); otherwise
    the statistics of the current mini-batch are used (training mode) and
    folded into the running averages.

    Args:
        x: Input tensor. In training mode it must be 2-D (statistics are
            taken over dim 0, i.e. per column) or 4-D (statistics are taken
            over dims 0, 2 and 3, i.e. per index of dim 1).
        gamma: Scale applied to the normalized input.
        beta: Shift added after scaling.
        moving_mean: Running mean used in prediction mode.
        moving_var: Running variance used in prediction mode.
        eps: Small constant added to the variance before the square root to
            avoid division by zero.
        momentum: Weight kept on the previous running statistic;
            ``1 - momentum`` is the weight of the current batch statistic.

    Returns:
        A tuple ``(y, moving_mean, moving_var)`` where ``y`` is
        ``gamma * x_hat + beta`` and the two running statistics are detached
        from the autograd graph.

    Raises:
        AssertionError: In training mode, if ``x`` is neither 2-D nor 4-D.
    """
    if not torch.is_grad_enabled():
        # Prediction mode: normalize with the running statistics passed in.
        x_hat = (x - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert x.dim() in (2, 4), "batch_norm expects a 2-D or 4-D input"
        if x.dim() == 2:
            # Fully connected layer: one mean/variance per column.
            reduce_dims, keep = 0, False
        else:
            # Convolutional layer: one mean/variance per channel; keep the
            # reduced dims so the result broadcasts against x.
            reduce_dims, keep = (0, 2, 3), True
        mean = x.mean(dim=reduce_dims, keepdim=keep)
        var = ((x - mean) ** 2).mean(dim=reduce_dims, keepdim=keep)
        x_hat = (x - mean) / torch.sqrt(var + eps)

        # Exponential moving average of the batch statistics. The batch
        # statistics are detached first so this bookkeeping does not extend
        # the autograd graph.
        moving_mean = momentum * moving_mean + (1 - momentum) * mean.detach()
        moving_var = momentum * moving_var + (1 - momentum) * var.detach()
    y = gamma * x_hat + beta  # scale and shift
    return y, moving_mean.detach(), moving_var.detach()

# A batch-normalization layer built on top of batch_norm()
class BatchNorm(nn.Module):
    """Batch-normalization layer for fully connected or convolutional inputs.

    Args:
        num_features: Number of features (fully connected) or channels
            (convolutional) of the input.
        num_dims: 2 selects parameters of shape ``(1, num_features)``; any
            other value selects shape ``(1, num_features, 1, 1)``.
        eps: Value forwarded to ``batch_norm`` as ``eps``.
        momentum: Value forwarded to ``batch_norm`` as ``momentum``.
    """

    def __init__(self, num_features, num_dims, eps=1e-5, momentum=0.9) -> None:
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        self.eps = eps
        self.momentum = momentum
        # Learnable scale (initialized to 1) and shift (initialized to 0).
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Running mean (0) and variance (1) are registered as buffers so that
        # they are included in state_dict() and follow the module in .to().
        self.register_buffer("moving_mean", torch.zeros(shape))
        self.register_buffer("moving_var", torch.ones(shape))

    def forward(self, x):
        """Normalize ``x`` and store the updated running statistics."""
        # Safety net for inputs living on a different device than the buffers.
        if self.moving_mean.device != x.device:
            self.moving_mean = self.moving_mean.to(x.device)
            self.moving_var = self.moving_var.to(x.device)
        y, self.moving_mean, self.moving_var = batch_norm(
            x, self.gamma, self.beta, self.moving_mean, self.moving_var,
            eps=self.eps, momentum=self.momentum)
        return y

下面的代码不要慌，均来自前面讲过的关于LeNet的代码。

# LeNet with the hand-written BatchNorm layer inserted before every Sigmoid
net = nn.Sequential(
    # Convolutional stage 1
    nn.Conv2d(1, 6, kernel_size=5),
    BatchNorm(6, 4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2
    nn.Conv2d(6, 16, kernel_size=5),
    BatchNorm(16, 4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Classifier head
    nn.Flatten(),
    nn.Linear(16 * 4 * 4, 120),
    BatchNorm(120, 2),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, 2),
    nn.Sigmoid(),
    nn.Linear(84, 10),
)

print(net)

# Feed a random batch of four 1x28x28 inputs through the network layer by
# layer and print the output shape after each layer.
x = torch.randn(4*28*28, dtype=torch.float32).reshape(4, 1, 28, 28)

for layer in net:
    x = layer(x)
    print(type(layer).__name__, "output shape = ", x.shape)

# Evaluate the model on the Fashion-MNIST dataset
def load_data_fashion_mnist(batch_size, resize=None, root="../data/", download=False):
    """Build the Fashion-MNIST training and test data loaders.

    Args:
        batch_size: Batch size used by both loaders.
        resize: Optional target size; when given, images are resized with
            ``transforms.Resize(resize)`` before being converted to tensors.
        root: Directory that holds (or will receive) the dataset files.
        download: Passed to ``torchvision.datasets.FashionMNIST``; set it to
            True if the dataset has not been downloaded yet.

    Returns:
        A tuple ``(train_loader, test_loader)``. The training loader is
        shuffled; the test loader is not, so evaluation is reproducible.
    """
    if resize:
        # Resize must run on the PIL image, i.e. before ToTensor.
        trans = transforms.Compose(
            [transforms.Resize(resize), transforms.ToTensor()])
    else:
        trans = transforms.ToTensor()
    mnist_train = torchvision.datasets.FashionMNIST(
        root=root,
        train=True,
        transform=trans,
        download=download,
    )
    mnist_test = torchvision.datasets.FashionMNIST(
        root=root,
        train=False,
        transform=trans,
        download=download,
    )
    return (data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=0),
            data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=0))

# Hyper-parameters: batch size, SGD learning rate, number of epochs
batch_size, learning_rate, epochs = 128, 0.3, 20

# Build the training and test data loaders
train_iter, test_iter = load_data_fashion_mnist(batch_size)

# Simplified accuracy evaluation
def evalue_acc(net, data_iter, device=None):
    """Return the classification accuracy of ``net`` over ``data_iter``.

    Accuracy is the number of correct predictions divided by the number of
    samples seen, so a smaller final batch does not skew the result.

    Args:
        net: Callable model. If it is an ``nn.Module`` it is switched to
            evaluation mode.
        data_iter: Iterable of ``(X, y)`` batches; ``X`` is a tensor or a
            list of tensors.
        device: Device the batches are moved to. When omitted and ``net`` is
            an ``nn.Module`` with parameters, the device of its first
            parameter is used; if no device can be determined the batches
            are left where they are.

    Returns:
        The accuracy as a float, or ``nan`` if ``data_iter`` yields nothing.
    """
    if isinstance(net, nn.Module):
        net.eval()
        if device is None:
            first_param = next(iter(net.parameters()), None)
            if first_param is not None:
                device = first_param.device
    num_correct, num_total = 0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if device is not None:
                if isinstance(X, list):
                    # Needed for inputs given as a list of tensors
                    # (e.g. BERT fine-tuning)
                    X = [x.to(device) for x in X]
                else:
                    X = X.to(device)
                y = y.to(device)
            y_hat = net(X)
            # Per-class scores are reduced to the predicted class index.
            if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
                y_hat = y_hat.argmax(axis=1)
            cmp = y_hat.type(y.dtype) == y
            num_correct += cmp.sum().item()
            num_total += len(y)
    if num_total == 0:
        return float("nan")
    return num_correct / num_total

# print(evalue_acc(net,train_iter,device=None))

%matplotlib inline

# Simplified training function
def _plot_history(loss_history, train_acc, test_acc):
    """Redraw the per-epoch training loss and train/test accuracy curves."""
    epoch_axis = np.arange(len(loss_history))
    plt.ion()
    plt.clf()  # clear the previous figure
    plt.plot(epoch_axis, loss_history, 'b', label="train_loss")
    plt.plot(epoch_axis, train_acc, ':g', label="train_acc")
    plt.plot(epoch_axis, test_acc, ':m', label="test_acc")
    plt.xlabel("epoch")
    plt.grid(True)
    plt.legend()
    plt.pause(0.01)  # pause briefly (0.01 s) after drawing
    plt.ioff()


def train(net, train_iter, test_iter, epochs, lr, device):
    """Train ``net`` with SGD and cross-entropy loss, plotting progress.

    Linear and Conv2d weights are re-initialized with Xavier uniform
    initialization, the model is moved to ``device``, and after every epoch
    the mean training loss and the train/test accuracies are printed/plotted.

    Args:
        net: ``nn.Module`` to train (modified in place).
        train_iter: Iterable of ``(x, y)`` training batches.
        test_iter: Iterable of ``(x, y)`` test batches.
        epochs: Number of passes over ``train_iter``.
        lr: Learning rate for ``torch.optim.SGD``.
        device: Device used for training.
    """
    def init_weights(m):
        # Exact type match on purpose: subclasses are left untouched.
        if type(m) in (nn.Linear, nn.Conv2d):
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)

    print("Network traning on",device)
    net.to(device)

    # Optimizer and loss function
    optimizer = torch.optim.SGD(net.parameters(), lr)
    loss = nn.CrossEntropyLoss()

    # Per-epoch history used for plotting
    loss_history = []
    train_acc = []
    test_acc = []

    for epoch in range(epochs):
        # Evaluation at the end of the previous epoch switched the model to
        # eval mode, so training mode is restored at the start of each epoch.
        net.train()
        batch_losses = []
        for x, y in train_iter:
            x = x.to(device)
            y = y.to(device)
            l = loss(net(x), y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            batch_losses.append(l.item())
        epoch_loss = np.mean(batch_losses)
        print(f"epoch:{epoch}, train_loss:{epoch_loss}")
        loss_history.append(epoch_loss)
        train_acc.append(evalue_acc(net, train_iter))
        test_acc.append(evalue_acc(net, test_iter))

        _plot_history(loss_history, train_acc, test_acc)

    # With epochs == 0 there is nothing to report.
    if train_acc:
        print(f"train_acc:{train_acc[-1]},test_acc:{test_acc[-1]}")

# Train the hand-written BatchNorm LeNet on the GPU picked by d2l.try_gpu()
train(
    net,
    train_iter,
    test_iter,
    epochs,
    lr=learning_rate,
    device=d2l.try_gpu(),
)

train_acc:0.91149, test_acc:0.88014

（2）简单实现

# The same LeNet, using PyTorch's built-in batch-normalization layers
net = nn.Sequential(
    # Convolutional stage 1
    nn.Conv2d(1, 6, kernel_size=5),
    nn.BatchNorm2d(6),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2
    nn.Conv2d(6, 16, kernel_size=5),
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    # Classifier head
    nn.Flatten(),
    nn.Linear(256, 120),
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10),
)

# Train the built-in BatchNorm LeNet with the d2l training helper
d2l.train_ch6(
    net,
    train_iter,
    test_iter,
    epochs,
    learning_rate,
    d2l.try_gpu(),
)

注意两条虚线颜色和上图反过来了

train acc 0.923, test acc 0.881