Backpropagation Derivation

2019-07-30  0xFFFFFG

(Textbook version)
dz^{[2]} = a^{[2]}-y
dW^{[2]} = dz^{[2]}a^{[1]T}
db^{[2]} = dz^{[2]}
dz^{[1]} = W^{[2]T}dz^{[2]} * g^{[1]\prime}(z^{[1]})
dW^{[1]} = dz^{[1]}x^{T}
db^{[1]} = dz^{[1]}
where * denotes the element-wise product.
After vectorization (over all m training examples, stacked as columns):
dZ^{[2]}=A^{[2]}-Y
dW^{[2]}={\frac{1}{m}}dZ^{[2]}{A^{[1]}}^{T}
db^{[2]} = {\frac{1}{m}}np.sum(dZ^{[2]},axis=1,keepdims=True)
\underbrace{dZ^{[1]}}_{(n^{[1]}, m)} = \underbrace{W^{[2]T}dZ^{[2]}}_{(n^{[1]}, m)} * \underbrace{g^{[1]\prime}(Z^{[1]})}_{(n^{[1]}, m)}
dW^{[1]} = {\frac{1}{m}}dZ^{[1]}X^{T}
db^{[1]} = {\frac{1}{m}}np.sum(dZ^{[1]},axis=1,keepdims=True)
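
A minimal NumPy sketch of these vectorized formulas (my own, not from the textbook; the names backward_two_layer and g1_prime are illustrative, with g1_prime standing for the hidden-activation derivative g^{[1]\prime}):

import numpy as np

def backward_two_layer(X, Y, Z1, A1, A2, W2, g1_prime):
    """Vectorized backward pass for the two-layer network above.
    Shapes: X (n_x, m), Z1/A1 (n1, m), A2/Y (1, m), W2 (1, n1)."""
    m = X.shape[1]
    dZ2 = A2 - Y                                           # (1, m)
    dW2 = (1. / m) * np.dot(dZ2, A1.T)                     # (1, n1)
    db2 = (1. / m) * np.sum(dZ2, axis=1, keepdims=True)    # (1, 1)
    dZ1 = np.dot(W2.T, dZ2) * g1_prime(Z1)                 # (n1, m)
    dW1 = (1. / m) * np.dot(dZ1, X.T)                      # (n1, n_x)
    db1 = (1. / m) * np.sum(dZ1, axis=1, keepdims=True)    # (n1, 1)
    return dW1, db1, dW2, db2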

(My own derivation)
ReLU:
forward: Y = Y(X) = X \rightarrow \left[\, X^{(i,j)} \gt 0 \;?\; X^{(i,j)} : 0 \,\right]
backward: dX = dX(dY, X) = \left[\, X^{(i,j)} \gt 0 \;?\; dY^{(i,j)} : 0 \,\right]

import numpy as np  # used by all the layers below


class Relu(AbstractLayer):
    def __init__(self):
        self.mask = None  # boolean mask of the entries where the input was <= 0

    def forward(self, X):
        self.mask = (X <= 0)
        out = X.copy()
        out[self.mask] = 0
        return out

    def backward(self, dY):
        dX = dY.copy()     # copy so the caller's dY is not modified in place
        dX[self.mask] = 0  # gradient is zero wherever the input was <= 0
        return dX
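
A quick check on a tiny array (an illustrative example, not from the original); the second call shows that gradients are zeroed exactly where the forward input was non-positive:

relu = Relu()
X = np.array([[1.0, -2.0],
              [-0.5, 3.0]])
print(relu.forward(X))                 # [[1. 0.] [0. 3.]]
print(relu.backward(np.ones_like(X)))  # [[1. 0.] [0. 1.]]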

Sigmoid:
forward: Y = Y(X) = \left[\frac{1}{1+e^{-X^{(i,j)}}}\right]
backward: dX = dX(dY, Y) = dY * (1-Y) * Y

class Sigmoid(AbstractLayer):
    def __init__(self):
        self.Y = None  # cache the forward output; the backward pass only needs Y

    def forward(self, X):
        self.Y = 1. / (1. + np.exp(-X))
        return self.Y

    def backward(self, dY):
        # dX = dY * sigmoid'(X) = dY * Y * (1 - Y)
        return dY * (1. - self.Y) * self.Y
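
The backward rule is just the chain rule with the standard sigmoid derivative; writing \sigma(x) = \frac{1}{1+e^{-x}}:

\sigma^{\prime}(x) = \frac{e^{-x}}{(1+e^{-x})^2} = \frac{1}{1+e^{-x}} \cdot \frac{e^{-x}}{1+e^{-x}} = \sigma(x)(1-\sigma(x))

so, element-wise, dX = dY * Y * (1-Y), which is exactly what backward returns.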

Affine:
init: W \in R^{h \times n},\; b \in R^{h \times 1}
forward: Y \in R^{h \times m} = Y(W, b, X \in R^{n \times m}) = W \cdot X + b
backward: \left\{\begin{array}{lcl} dW = dW(dY, X) = dY \cdot X^T \\ db = db(dY) = \sum_{axis=1} dY \\ dX = dX(W, dY) = W^T \cdot dY \end{array}\right.

class Affine(AbstractLayer):
    def __init__(self, W, b):
        self.W = W     # weights, shape (h, n)
        self.b = b     # bias, shape (h, 1), broadcast across the m columns
        self.Y = None
        self.X = None  # cache the input for the backward pass
        self.m = None

    def forward(self, X):
        self.X = X
        self.m = X.shape[1]
        self.Y = np.dot(self.W, X) + self.b
        return self.Y

    def backward(self, dY):
        # no 1/m here: the averaging factor is applied once, in the cost layer
        dW = np.dot(dY, self.X.T)
        db = np.sum(dY, axis=1, keepdims=True)
        dX = np.dot(self.W.T, dY)
        return dW, db, dX
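
As a sanity check (my own illustrative setup, not part of the original), the analytic dW can be compared against a finite difference of the scalar loss L = \sum_{i,j} Y^{(i,j)}, for which the upstream gradient is simply dY = 1 everywhere:

np.random.seed(0)
W = np.random.randn(3, 4)
b = np.random.randn(3, 1)
X = np.random.randn(4, 5)

layer = Affine(W, b)
Y = layer.forward(X)
dW, db, dX = layer.backward(np.ones_like(Y))  # upstream gradient of L = sum(Y)

# finite-difference estimate of dL/dW[i, j]
eps, i, j = 1e-6, 1, 2
W_plus, W_minus = W.copy(), W.copy()
W_plus[i, j] += eps
W_minus[i, j] -= eps
numeric = (np.sum(Affine(W_plus, b).forward(X)) -
           np.sum(Affine(W_minus, b).forward(X))) / (2 * eps)
print(np.isclose(numeric, dW[i, j]))  # expect True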

LogCost:
forward: L = L(\hat{Y}, Y) = -\frac{1}{m}\sum_{i,j}\left(Y^{(i,j)} * \log{\hat{Y}^{(i,j)}} + (1-Y^{(i,j)}) * \log{(1-\hat{Y}^{(i,j)})}\right)
backward: d\hat{Y} = d\hat{Y}(\hat{Y}, Y) = -\frac{1}{m}\left(\frac{Y}{\hat{Y}} - \frac{1-Y}{1-\hat{Y}}\right)

class LogCost(CostLayer):

    def forward(self, Y_hat, Y):
        # binary cross-entropy, averaged over the m examples (columns)
        m = Y.shape[1]
        return -(1. / m) * np.sum(Y * np.log(Y_hat) + (1. - Y) * np.log(1. - Y_hat))

    def backward(self, Y_hat, Y):
        # the 1/m factor enters the chain here, so the layers upstream
        # do not divide their gradients by m
        m = Y.shape[1]
        return -(1. / m) * (Y / Y_hat - (1. - Y) / (1. - Y_hat))
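
Note how this connects to the textbook formulas: chaining LogCost.backward through Sigmoid.backward gives dZ^{[2]} = \frac{1}{m}(\hat{Y}-Y), i.e. the textbook dZ^{[2]} = A^{[2]} - Y with the \frac{1}{m} carried here rather than inside dW and db.

Below is a minimal sketch (my own wiring, not from the original) that chains the layers above into the textbook two-layer network; the layer sizes and random data are purely illustrative.

# Assumes the classes above are defined and numpy is imported as np.
np.random.seed(1)
n_x, n_1, m = 4, 3, 5
X = np.random.randn(n_x, m)
Y = (np.random.rand(1, m) > 0.5).astype(float)

affine1 = Affine(np.random.randn(n_1, n_x) * 0.01, np.zeros((n_1, 1)))
relu1 = Relu()
affine2 = Affine(np.random.randn(1, n_1) * 0.01, np.zeros((1, 1)))
sigmoid = Sigmoid()
cost = LogCost()

# forward pass
A1 = relu1.forward(affine1.forward(X))
Y_hat = sigmoid.forward(affine2.forward(A1))
L = cost.forward(Y_hat, Y)

# backward pass: each layer consumes the gradient produced by the layer after it
dY_hat = cost.backward(Y_hat, Y)
dZ2 = sigmoid.backward(dY_hat)         # equals (1/m) * (Y_hat - Y)
dW2, db2, dA1 = affine2.backward(dZ2)  # matches (1/m) dZ2 A1^T and (1/m) sum(dZ2)
dZ1 = relu1.backward(dA1)
dW1, db1, _ = affine1.backward(dZ1)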